
Merge pull request #5124 from XiWeiGu/LoongArch64-LA264-lapack-fixed

LoongArch64: Fixed lapack test for LA264
tags/v0.3.30
Martin Kroeker (GitHub), 7 months ago
parent commit 8d487ef6eb
9 changed files with 402 additions and 2158 deletions
  1. kernel/loongarch64/amax_lsx.S  +1 -6
  2. kernel/loongarch64/cnrm2_lsx.S  +51 -28
  3. kernel/loongarch64/copy_lsx.S  +4 -4
  4. kernel/loongarch64/crot_lsx.S  +6 -591
  5. kernel/loongarch64/dot_lsx.S  +29 -55
  6. kernel/loongarch64/iamax_lsx.S  +134 -100
  7. kernel/loongarch64/rot_lsx.S  +113 -1292
  8. kernel/loongarch64/snrm2_lsx.S  +55 -26
  9. kernel/loongarch64/swap_lsx.S  +9 -56

kernel/loongarch64/amax_lsx.S  +1 -6

@@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX) LDINT INCX, 0(INCX)
#endif #endif


vxor.v VM0, VM0, VM0
bge $r0, N, .L999 bge $r0, N, .L999
bge $r0, INCX, .L999 bge $r0, INCX, .L999
li.d TEMP, 1 li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
vldrepl.d VM0, X, 0
#else
vldrepl.w VM0, X, 0
#endif
VFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20 bne INCX, TEMP, .L20


srai.d I, N, 3 srai.d I, N, 3
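
A note on the amax_lsx.S change above: the accumulator VM0 is now cleared with vxor.v before the N/INCX early-exit checks, instead of being derived from the first element via vldrepl followed by VFSUB VM0, VM0, VM0. Presumably this guarantees the accumulator starts at exactly +0 even when x[0] is Inf or NaN, where x - x yields NaN rather than zero. A standalone C illustration of that corner case (not part of the PR):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        float finite = 3.0f, inf = INFINITY;
        /* Subtracting a value from itself only yields zero for finite inputs. */
        printf("finite - finite = %f\n", finite - finite); /* 0.000000 */
        printf("inf - inf       = %f\n", inf - inf);       /* nan */
        return 0;
    }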


kernel/loongarch64/cnrm2_lsx.S  +51 -28

@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VX4 $vr21 #define VX4 $vr21
#define res1 $vr19 #define res1 $vr19
#define res2 $vr20 #define res2 $vr20
#define RCP $f2
#define VALPHA $vr3


PROLOGUE PROLOGUE


@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX) LDINT INCX, 0(INCX)
#endif #endif


vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999 bge $r0, N, .L999
beq $r0, INCX, .L999 beq $r0, INCX, .L999
addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl camax_k_LA264
#else
bl camax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, 1 li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


.L10: .L10:
vld VX0, X, 0 * SIZE vld VX0, X, 0 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vld VX0, X, 4 * SIZE
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
addi.d I, I, -1 addi.d I, I, -1
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX0, VX0, VALPHA
vfmul.s VX1, VX1, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX1, VX1, res2

addi.d X, X, 8 * SIZE addi.d X, X, 8 * SIZE
blt $r0, I, .L10 blt $r0, I, .L10
b .L996 b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3 vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX add.d X, X, INCX
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0 * SIZE ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE ld.w t2, X, 1 * SIZE
add.d X, X, INCX add.d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3 vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX add.d X, X, INCX
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2

addi.d I, I, -1 addi.d I, I, -1
blt $r0, I, .L21 blt $r0, I, .L21
b .L996 b .L996
.align 3 .align 3


.L996: .L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3 .align 3


.L997: .L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fld.s a1, X, 0 * SIZE fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE fld.s a2, X, 1 * SIZE
addi.d I, I, -1 addi.d I, I, -1
fcvt.d.s a1, a1
fcvt.d.s a2, a2
fmadd.d res, a1, a1, res
fmadd.d res, a2, a2, res
fmul.s a1, a1, RCP
fmul.s a2, a2, RCP
fmadd.s res, a1, a1, res
fmadd.s res, a2, a2, res
add.d X, X, INCX add.d X, X, INCX
blt $r0, I, .L998 blt $r0, I, .L998
.align 3 .align 3


.L999: .L999:
fsqrt.d res, res
fsqrt.s res, res
fmul.s $f0, res, $f0
move $r4, $r17 move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0
.align 3 .align 3
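
With this change cnrm2_lsx.S no longer widens to double; it calls the camax kernel (camax_k, or camax_k_LA264 under DYNAMIC_ARCH) to get a scaling factor, multiplies every element by its reciprocal (frecip.s / vfmul.s), accumulates single-precision sums of squares, and finally multiplies the square root by the scale again (fmul.s $f0, res, $f0). The comment added to snrm2_lsx.S further down in this diff explains why. A minimal C sketch of the scheme for the complex case; the function name cnrm2_scaled and the use of max(|re|, |im|) as the scale are illustrative, not what camax_k actually returns:

    #include <math.h>

    /* Scaled complex nrm2: scale so every squared term is <= 1, then
     * undo the scaling after the square root. x holds interleaved
     * (re, im) pairs; inc_x is the stride in complex elements. */
    static float cnrm2_scaled(long n, const float *x, long inc_x) {
        if (n <= 0 || inc_x == 0) return 0.0f;
        float scale = 0.0f;
        for (long i = 0; i < n; i++) {              /* role played by camax_k */
            float re = fabsf(x[2 * i * inc_x]);
            float im = fabsf(x[2 * i * inc_x + 1]);
            if (re > scale) scale = re;
            if (im > scale) scale = im;
        }
        if (scale == 0.0f) return 0.0f;             /* fcmp.ceq.s / bcnez early exit */
        float rcp = 1.0f / scale;                   /* frecip.s RCP, $f0 */
        float sum = 0.0f;
        for (long i = 0; i < n; i++) {
            float re = x[2 * i * inc_x] * rcp;      /* vfmul.s VX0, VX0, VALPHA */
            float im = x[2 * i * inc_x + 1] * rcp;
            sum += re * re + im * im;               /* vfmadd.s res1 / res2 */
        }
        return scale * sqrtf(sum);                  /* fsqrt.s + fmul.s at .L999 */
    }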




kernel/loongarch64/copy_lsx.S  +4 -4

@@ -270,9 +270,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY add.d Y, Y, INCY
ST a2, Y, 0 ST a2, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
LD a1, X, 0 LD a1, X, 0
add.d X, X, INCX add.d X, X, INCX
@@ -286,9 +286,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d Y, Y, INCY add.d Y, Y, INCY
ST a2, Y, 0 ST a2, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
ST a3, X, 0
ST a3, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
ST a4, X, 0
ST a4, Y, 0
add.d Y, Y, INCY add.d Y, Y, INCY
addi.d I, I, -1 addi.d I, I, -1
blt $r0, I, .L222 blt $r0, I, .L222


kernel/loongarch64/crot_lsx.S  +6 -591

@@ -75,6 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
slli.d TEMP, TEMP, ZBASE_SHIFT slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT
slli.d INCY, INCY, ZBASE_SHIFT slli.d INCY, INCY, ZBASE_SHIFT
move YY, Y
move XX, X
MTG t1, C MTG t1, C
MTG t2, S MTG t2, S
MTG t3, a1 MTG t3, a1
@@ -89,25 +91,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplgr2vr.w VXZ, t3 vreplgr2vr.w VXZ, t3
srai.d I, N, 2 srai.d I, N, 2
#endif #endif
bge $r0, I, .L997
beq INCX, $r0, .L996 beq INCX, $r0, .L996
beq INCY, $r0, .L996 beq INCY, $r0, .L996
bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
bne INCY, TEMP, .L22

.L11:
bge $r0, I, .L997
CMPEQ $fcc0, C, a1
bcnez $fcc0, .L110
CMPEQ $fcc0, S, a1
bcnez $fcc0, .L112 // C!=0 S==0
b .L111 // C!=0 S!=0
.align 3

.L110:
CMPEQ $fcc0, S, a1
bcnez $fcc0, .L114 // C==0 S==0
b .L113 // C==0 S!=0
.align 3
bne INCX, TEMP, .L221 // INCX!=1 or INCY!=1
bne INCY, TEMP, .L221


.L111: // C!=0 S!=0 .L111: // C!=0 S!=0
vld VX0, X, 0 * SIZE vld VX0, X, 0 * SIZE
@@ -168,151 +156,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
b .L997 b .L997
.align 3 .align 3


.L112: // C!=0 S==0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vld VX3, Y, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmul.d VX0, x1, VXC
vfmul.d VX1, x3, VXC
vfmul.d VX2, x2, VXC
vfmul.d VX3, x4, VXC
vilvl.d x1, VX2 ,VX0
vilvh.d x2, VX2, VX0
vilvl.d x3, VX3 ,VX1
vilvh.d x4, VX3, VX1
vst x1, X, 0 * SIZE
vst x3, Y, 0 * SIZE
vst x2, X, 2 * SIZE
vst x4, Y, 2 * SIZE
addi.d X, X, 4 * SIZE
addi.d Y, Y, 4 * SIZE
#else
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vpickev.w x3, VX3, VX2
vpickod.w x4, VX3, VX2
vfmul.s VX0, x1, VXC
vfmul.s VX1, x3, VXC
vfmul.s VX2, x2, VXC
vfmul.s VX3, x4, VXC
vilvl.w x1, VX2 ,VX0
vilvh.w x2, VX2, VX0
vilvl.w x3, VX3 ,VX1
vilvh.w x4, VX3, VX1
vst x1, X, 0 * SIZE
vst x3, Y, 0 * SIZE
vst x2, X, 4 * SIZE
vst x4, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3

.L113: // C==0 S!=0
vld VX0, X, 0 * SIZE
vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vld VX3, Y, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmul.d VX0, x3, VXS
vfmul.d VX1, x1, VXS
vfsub.d VX1, VXZ, VX1
vfmul.d VX2, x4, VXS
vfmul.d VX3, x2, VXS
vfsub.d VX3, VXZ, VX3
vilvl.d x1, VX2 ,VX0
vilvh.d x2, VX2, VX0
vilvl.d x3, VX3 ,VX1
vilvh.d x4, VX3, VX1
vst x1, X, 0 * SIZE
vst x3, Y, 0 * SIZE
vst x2, X, 2 * SIZE
vst x4, Y, 2 * SIZE
addi.d X, X, 4 * SIZE
addi.d Y, Y, 4 * SIZE
#else
vld VX1, X, 4 * SIZE
vld VX3, Y, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vpickev.w x3, VX3, VX2
vpickod.w x4, VX3, VX2
vfmul.s VX0, x3, VXS
vfmul.s VX1, x1, VXS
vfsub.s VX1, VXZ, VX1
vfmul.s VX2, x4, VXS
vfmul.s VX3, x2, VXS
vfsub.s VX3, VXZ, VX3
vilvl.w x1, VX2 ,VX0
vilvh.w x2, VX2, VX0
vilvl.w x3, VX3 ,VX1
vilvh.w x4, VX3, VX1
vst x1, X, 0 * SIZE
vst x3, Y, 0 * SIZE
vst x2, X, 4 * SIZE
vst x4, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3

.L114: // C==0 S==0
vst VXZ, X, 0 * SIZE
vst VXZ, Y, 0 * SIZE
#ifdef DOUBLE
vst VXZ, X, 2 * SIZE
vst VXZ, Y, 2 * SIZE
addi.d X, X, 4 * SIZE
addi.d Y, Y, 4 * SIZE
#else
vst VXZ, X, 4 * SIZE
vst VXZ, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3

.L22:
#ifdef DOUBLE
srai.d I, N, 2
#endif
bge $r0, I, .L997
move YY, Y
move XX, X
CMPEQ $fcc0, C, a1
bcnez $fcc0, .L220
CMPEQ $fcc0, S, a1
bcnez $fcc0, .L222 // C!=0 S==0
b .L221 // C!=0 S!=0
.align 3

.L220:
CMPEQ $fcc0, S, a1
bcnez $fcc0, .L224 // C==0 S==0
b .L223 // C==0 S!=0
.align 3

.L221: // C!=0 S!=0 .L221: // C!=0 S!=0
#ifdef DOUBLE #ifdef DOUBLE
ld.d t1, X, 0 * SIZE ld.d t1, X, 0 * SIZE
@@ -355,50 +198,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.d VX1, YY, 0, 1 vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1 vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY add.d YY, YY, INCY

ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, x1, VXC
vfmadd.d VX0, x3, VXS, VX0
vfmul.d VX1, x1, VXS
vfmsub.d VX1, x3, VXC, VX1
vfmul.d VX2, x2, VXC
vfmadd.d VX2, x4, VXS, VX2
vfmul.d VX3, x2, VXS
vfmsub.d VX3, x4, VXC, VX3
vstelm.d VX0, XX, 0, 0
vstelm.d VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VX0, XX, 0, 1
vstelm.d VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.d VX1, YY, 0, 0
vstelm.d VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L995
#else #else
ld.w t1, X, 0 * SIZE ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE ld.w t2, X, 1 * SIZE
@@ -473,396 +272,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w VX1, YY, 0, 3 vstelm.w VX1, YY, 0, 3
vstelm.w VX3, YY, 1 * SIZE, 3 vstelm.w VX3, YY, 1 * SIZE, 3
add.d YY, YY, INCY add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
#endif #endif
.align 3

.L222: // C!=0 S==0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, x1, VXC
vfmul.d VX1, x3, VXC
vfmul.d VX2, x2, VXC
vfmul.d VX3, x4, VXC
vstelm.d VX0, XX, 0, 0
vstelm.d VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VX0, XX, 0, 1
vstelm.d VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.d VX1, YY, 0, 0
vstelm.d VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY

ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, x1, VXC
vfmul.d VX1, x3, VXC
vfmul.d VX2, x2, VXC
vfmul.d VX3, x4, VXC
vstelm.d VX0, XX, 0, 0
vstelm.d VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VX0, XX, 0, 1
vstelm.d VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.d VX1, YY, 0, 0
vstelm.d VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY
addi.d I, I, -1 addi.d I, I, -1
blt $r0, I, .L222
b .L995
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 0
vinsgr2vr.w x4, t2, 0
vinsgr2vr.w x3, t3, 1
vinsgr2vr.w x4, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
vinsgr2vr.w x3, t1, 2
vinsgr2vr.w x4, t2, 2
vinsgr2vr.w x3, t3, 3
vinsgr2vr.w x4, t4, 3
add.d Y, Y, INCY
vfmul.s VX0, x1, VXC
vfmul.s VX1, x3, VXC
vfmul.s VX2, x2, VXC
vfmul.s VX3, x4, VXC
vstelm.w VX0, XX, 0, 0
vstelm.w VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 1
vstelm.w VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 2
vstelm.w VX2, XX, 1 * SIZE, 2
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 3
vstelm.w VX2, XX, 1 * SIZE, 3
add.d XX, XX, INCX
vstelm.w VX1, YY, 0, 0
vstelm.w VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 1
vstelm.w VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 2
vstelm.w VX3, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 3
vstelm.w VX3, YY, 1 * SIZE, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L222
b .L997
#endif
.align 3

.L223: // C==0 S!=0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, x3, VXS
vfmul.d VX1, x1, VXS
vfsub.d VX1, VXZ, VX1
vfmul.d VX2, x4, VXS
vfmul.d VX3, x2, VXS
vfsub.d VX3, VXZ, VX3
vstelm.d VX0, XX, 0, 0
vstelm.d VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VX0, XX, 0, 1
vstelm.d VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.d VX1, YY, 0, 0
vstelm.d VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY

ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, x3, VXS
vfmul.d VX1, x1, VXS
vfsub.d VX1, VXZ, VX1
vfmul.d VX2, x4, VXS
vfmul.d VX3, x2, VXS
vfsub.d VX3, VXZ, VX3
vstelm.d VX0, XX, 0, 0
vstelm.d VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VX0, XX, 0, 1
vstelm.d VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.d VX1, YY, 0, 0
vstelm.d VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VX1, YY, 0, 1
vstelm.d VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L223
b .L995
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 0
vinsgr2vr.w x4, t2, 0
vinsgr2vr.w x3, t3, 1
vinsgr2vr.w x4, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
vinsgr2vr.w x3, t1, 2
vinsgr2vr.w x4, t2, 2
vinsgr2vr.w x3, t3, 3
vinsgr2vr.w x4, t4, 3
add.d Y, Y, INCY
vfmul.s VX0, x3, VXS
vfmul.s VX1, x1, VXS
vfsub.s VX1, VXZ, VX1
vfmul.s VX2, x4, VXS
vfmul.s VX3, x2, VXS
vfsub.s VX3, VXZ, VX3
vstelm.w VX0, XX, 0, 0
vstelm.w VX2, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 1
vstelm.w VX2, XX, 1 * SIZE, 1
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 2
vstelm.w VX2, XX, 1 * SIZE, 2
add.d XX, XX, INCX
vstelm.w VX0, XX, 0, 3
vstelm.w VX2, XX, 1 * SIZE, 3
add.d XX, XX, INCX
vstelm.w VX1, YY, 0, 0
vstelm.w VX3, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 1
vstelm.w VX3, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 2
vstelm.w VX3, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w VX1, YY, 0, 3
vstelm.w VX3, YY, 1 * SIZE, 3
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L223
b .L997
#endif
.align 3

.L224: // C==0 S==0
#ifdef DOUBLE
vstelm.d VXZ, XX, 0, 0
vstelm.d VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VXZ, XX, 0, 0
vstelm.d VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VXZ, XX, 0, 0
vstelm.d VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VXZ, XX, 0, 0
vstelm.d VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.d VXZ, YY, 0, 0
vstelm.d VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
vstelm.d VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
vstelm.d VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d VXZ, YY, 0, 0
vstelm.d VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
move X, XX
move Y, YY
b .L995
#else
vstelm.w VXZ, XX, 0, 0
vstelm.w VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 0
vstelm.w VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 0
vstelm.w VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VXZ, XX, 0, 0
vstelm.w VXZ, XX, 1 * SIZE, 0
add.d XX, XX, INCX
vstelm.w VXZ, YY, 0, 0
vstelm.w VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
vstelm.w VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
vstelm.w VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w VXZ, YY, 0, 0
vstelm.w VXZ, YY, 1 * SIZE, 0
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
move X, XX
move Y, YY
blt $r0, I, .L221
b .L997 b .L997
#endif
.align 3 .align 3


#ifdef DOUBLE
.L995:
andi I, N, 3
bge $r0, I, .L999
b .L998
.align 3

#endif
.L996: .L996:
move I, N move I, N
b .L998 b .L998


kernel/loongarch64/dot_lsx.S  +29 -55

@@ -53,8 +53,8 @@ PROLOGUE
#endif #endif


/* init $f8 and $f9 to zero */ /* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
vxor.v $vr8, $vr8, $vr8
vxor.v $vr9, $vr9, $vr9
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT
@@ -64,20 +64,6 @@ PROLOGUE


/* !((inc_x == 1) && (inc_y == 1)) */ /* !((inc_x == 1) && (inc_y == 1)) */


/* init $vr8 and $vr9 to zero */
#ifdef DOUBLE
vldrepl.d $vr0, X, 0
#else
vldrepl.w $vr0, X, 0
#endif
#ifdef DSDOT
vfcvtl.d.s $vr0, $vr0
vfsub.d $vr8, $vr0, $vr0
vfsub.d $vr9, $vr0, $vr0
#else
VFSUB $vr8, $vr0, $vr0
VFSUB $vr9, $vr0, $vr0
#endif


#ifdef DOUBLE #ifdef DOUBLE
srai.d I, N, 3 srai.d I, N, 3
@@ -99,31 +85,31 @@ PROLOGUE
addi.w I, I, -1 addi.w I, I, -1
addi.d X, X, 64 addi.d X, X, 64
addi.d Y, Y, 64 addi.d Y, Y, 64
#ifdef DSDOT
#ifndef DOUBLE
vfcvtl.d.s $vr10, $vr0 vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4 vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0 vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4 vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr1 vfcvtl.d.s $vr10, $vr1
vfcvtl.d.s $vr11, $vr5 vfcvtl.d.s $vr11, $vr5
vfcvth.d.s $vr12, $vr1 vfcvth.d.s $vr12, $vr1
vfcvth.d.s $vr13, $vr5 vfcvth.d.s $vr13, $vr5
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr2 vfcvtl.d.s $vr10, $vr2
vfcvtl.d.s $vr11, $vr6 vfcvtl.d.s $vr11, $vr6
vfcvth.d.s $vr12, $vr2 vfcvth.d.s $vr12, $vr2
vfcvth.d.s $vr13, $vr6 vfcvth.d.s $vr13, $vr6
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
vfcvtl.d.s $vr10, $vr3 vfcvtl.d.s $vr10, $vr3
vfcvtl.d.s $vr11, $vr7 vfcvtl.d.s $vr11, $vr7
vfcvth.d.s $vr12, $vr3 vfcvth.d.s $vr12, $vr3
vfcvth.d.s $vr13, $vr7 vfcvth.d.s $vr13, $vr7
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
#else #else
VFMADD $vr8, $vr0, $vr4, $vr8 VFMADD $vr8, $vr0, $vr4, $vr8
VFMADD $vr9, $vr1, $vr5, $vr9 VFMADD $vr9, $vr1, $vr5, $vr9
@@ -149,13 +135,13 @@ PROLOGUE
addi.w I, I, -1 addi.w I, I, -1
addi.d X, X, 16 addi.d X, X, 16
addi.d Y, Y, 16 addi.d Y, Y, 16
#ifdef DSDOT
#ifndef DOUBLE
vfcvtl.d.s $vr10, $vr0 vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4 vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0 vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4 vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfmadd.d $vr8, $vr10, $vr11, $vr8
vfmadd.d $vr9, $vr12, $vr13, $vr9
#else #else
VFMADD $vr8, $vr0, $vr4, $vr8 VFMADD $vr8, $vr0, $vr4, $vr8
#endif #endif
@@ -163,23 +149,10 @@ PROLOGUE
.align 3 .align 3
.L14: .L14:
/* store dot in s1 $f8 */ /* store dot in s1 $f8 */
#ifdef DSDOT
vfadd.d $vr8, $vr8, $vr9 vfadd.d $vr8, $vr8, $vr9
fsub.s s2, s2, s2 /* set s2 to 0.0 */
fsub.d s2, s2, s2 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8 vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0 vfadd.d $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr9
SUB s2, s2, s2 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3 .align 3
.L15: .L15:
#ifdef DOUBLE #ifdef DOUBLE
@@ -193,7 +166,7 @@ PROLOGUE
/* DOUBLE: 1 ; FLOAT: 1~3 */ /* DOUBLE: 1 ; FLOAT: 1~3 */
LD a1, X, 0 LD a1, X, 0
LD b1, Y, 0 LD b1, Y, 0
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -236,7 +209,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -248,7 +221,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2 fmadd.d s2, b1, a1, s2
@@ -260,7 +233,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -272,7 +245,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2 fmadd.d s2, b1, a1, s2
@@ -284,7 +257,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -296,7 +269,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2 fmadd.d s2, b1, a1, s2
@@ -308,7 +281,7 @@ PROLOGUE
add.d X, X, INCX add.d X, X, INCX
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -321,7 +294,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
addi.d I, I, -1 addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2 fmadd.d s2, b1, a1, s2
@@ -342,7 +315,7 @@ PROLOGUE
LD b1, Y, 0 * SIZE LD b1, Y, 0 * SIZE
add.d Y, Y, INCY add.d Y, Y, INCY
addi.d I, I, -1 addi.d I, I, -1
#ifdef DSDOT
#ifndef DOUBLE
fcvt.d.s a1, a1 fcvt.d.s a1, a1
fcvt.d.s b1, b1 fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1 fmadd.d s1, b1, a1, s1
@@ -353,12 +326,13 @@ PROLOGUE
.align 3 .align 3


.L999: .L999:
#ifdef DSDOT
fadd.d $f0, s1, s2 fadd.d $f0, s1, s2
move $r4, $r17
#if defined(DOUBLE)
#elif defined(DSDOT)
#else #else
ADD $f0, s1, s2
fcvt.s.d $f0, $f0
#endif #endif
move $r4, $r17
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0


EPILOGUE EPILOGUE
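
Two things change in dot_lsx.S: the #ifdef DSDOT guards become #ifndef DOUBLE, so the single-precision kernel always widens the loaded floats to double before the multiply-accumulate (plain sdot converts the final sum back to float at .L999, dsdot returns it as double), and the widened halves of X are now paired with the matching halves of Y ($vr10 with $vr11, $vr12 with $vr13) instead of with each other. A rough C model of the accumulation pattern; the helper names are made up for the sketch:

    /* Single-precision inputs, double-precision accumulation. */
    static double dot_accumulate(long n, const float *x, const float *y) {
        double s1 = 0.0, s2 = 0.0;                     /* mirrors the two vector accumulators */
        long i = 0;
        for (; i + 1 < n; i += 2) {
            s1 += (double)x[i] * (double)y[i];         /* low halves paired together */
            s2 += (double)x[i + 1] * (double)y[i + 1]; /* high halves paired together */
        }
        for (; i < n; i++)
            s1 += (double)x[i] * (double)y[i];
        return s1 + s2;
    }

    static float  sdot (long n, const float *x, const float *y) { return (float)dot_accumulate(n, x, y); }
    static double dsdot(long n, const float *x, const float *y) { return dot_accumulate(n, x, y); }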

kernel/loongarch64/iamax_lsx.S  +134 -100

@@ -56,19 +56,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VI3 $vr8 #define VI3 $vr8
#define VI4 $vr19 #define VI4 $vr19
#define VT0 $vr23 #define VT0 $vr23
#define VZE $vr3
#define VT1 $vr4
#define VT2 $vr5
#define VC0 $vr6


PROLOGUE PROLOGUE
li.d i0, 0 li.d i0, 0
bge $r0, N, .L999 bge $r0, N, .L999
bge $r0, INCX, .L999 bge $r0, INCX, .L999
li.d TEMP, 1 li.d TEMP, 1
vldi VZE, 0
slli.d TEMP, TEMP, BASE_SHIFT slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20 bne INCX, TEMP, .L20
vld VM0, X, 0 vld VM0, X, 0
#ifdef DOUBLE #ifdef DOUBLE
vfsub.d VT1, VZE, VM0
addi.d i0, i0, 1 addi.d i0, i0, 1
srai.d I, N, 3 srai.d I, N, 3
vfmaxa.d VM0, VM0, VT1
bge $r0, I, .L11 bge $r0, I, .L11
slli.d i0, i0, 1 //2 slli.d i0, i0, 1 //2
vreplgr2vr.d VINC2, i0 vreplgr2vr.d VINC2, i0
@@ -79,12 +86,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d i0, i0, 1 addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1 vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3 addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
vinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1 addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
vinsgr2vr.d VI0, i0, 1
#else #else
vfsub.s VT1, VZE, VM0
addi.w i0, i0, 1 addi.w i0, i0, 1
srai.d I, N, 3 srai.d I, N, 3
vfmaxa.s VM0, VM0, VT1
bge $r0, I, .L21 bge $r0, I, .L21
slli.w i0, i0, 2 //4 slli.w i0, i0, 2 //4
vreplgr2vr.w VINC2, i0 vreplgr2vr.w VINC2, i0
@@ -115,39 +124,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vadd.d VI1, VI1, VINC4 vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, VINC2 vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1 //abs(x0) < abs(x1)
vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x2, VI1, VI2, VT0 //i

vld VX0, X, 4 * SIZE vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, VINC2 vadd.d VI1, VI2, VINC2
vld VX1, X, 6 * SIZE vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, VINC2 vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
vfmaxa.d VM1, VM0, x3
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, x2, VI0, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x4, VI1, VI2, VT0 //i
vfcmp.clt.d VC0, x1, x3
vbitsel.v x1, x1, x3, VC0 //abs(maxf)
vbitsel.v x2, x2, x4, VC0 //i
vfcmp.clt.d VT0, VM0, x1
addi.d I, I, -1 addi.d I, I, -1
addi.d X, X, 8 * SIZE addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else #else
vld VX0, X, 0 * SIZE vld VX0, X, 0 * SIZE
vadd.w VI1, VI1, VINC4 vadd.w VI1, VI1, VINC4
vld VX1, X, 4 * SIZE vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, VINC2 vadd.w VI2, VI1, VINC2
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vfsub.s VT1, VZE, VX0
vfsub.s VT2, VZE, VX1
vfmaxa.s VX0, VX0, VT1
vfmaxa.s VX1, VX1, VT2
vfcmp.clt.s VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0 //abs(maxf)
vbitsel.v x2, VI1, VI2, VT0 //i
addi.d I, I, -1 addi.d I, I, -1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vfcmp.clt.s VT0, VM0, x1
addi.d X, X, 8 * SIZE addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0

#endif #endif
blt $r0, I, .L10 blt $r0, I, .L10
.align 3 .align 3
@@ -158,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.d VI2, VI0, 1 vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0 vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1 vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
fcmp.ceq.d $fcc0, $f9, $f10
bceqz $fcc0, .L16 bceqz $fcc0, .L16
vfcmp.clt.d VT0, VI1, VI2 vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0 vbitsel.v VI0, VI2, VI1, VT0
@@ -172,28 +193,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.w x2, VM0, 1 vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2 vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3 vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC2, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC4, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC4, VINC2, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26 b .L26
#endif #endif
.align 3 .align 3


#ifdef DOUBLE #ifdef DOUBLE
.L16: .L16:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
vfcmp.clt.d VT0, x1, x2
vbitsel.v VI0, VI1, VI2, VT0
vbitsel.v VM0, x1, x2, VT0
.align 3 .align 3


.L17: .L17:
@@ -212,10 +220,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


.L13: .L13:
fld.d $f9, X, 0 fld.d $f9, X, 0
vfmaxa.d VM1, x1, VM0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
fsub.d $f10, $f3, $f9
vfmaxa.d x1, x1, x2
vfcmp.clt.d VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1 addi.d I, I, -1
addi.d i1, i1, 1 addi.d i1, i1, 1
addi.d X, X, SIZE addi.d X, X, SIZE
@@ -241,10 +250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d TEMP, TEMP, INCX add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1 vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2 slli.d i0, i0, 1 //2
vfsub.d VT1, VZE, VM0
vreplgr2vr.d VINC2, i0 vreplgr2vr.d VINC2, i0
slli.d i0, i0, 1 //4 slli.d i0, i0, 1 //4
vreplgr2vr.d VINC4, i0 vreplgr2vr.d VINC4, i0
addi.d i0, i0, -7 addi.d i0, i0, -7
vfmaxa.d VM0, VM0, VT1
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1 addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1 vinsgr2vr.d VI1, i0, 1
@@ -269,9 +280,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1 vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2 vadd.d VI2, VI1, VINC2
vfmaxa.d x1, VX0, VX1
vfcmp.ceq.d VT0, VX0, x1
vbitsel.v x2, VI2, VI1, VT0

vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE ld.d t1, X, 0 * SIZE
add.d X, X, INCX add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0 vinsgr2vr.d VX0, t1, 0
@@ -286,16 +302,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add.d X, X, INCX add.d X, X, INCX
vinsgr2vr.d VX1, t2, 1 vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, VINC2 vadd.d VI2, VI1, VINC2
vfmaxa.d x3, VX0, VX1
vfcmp.ceq.d VT0, VX0, x3
vbitsel.v x4, VI2, VI1, VT0
vfmaxa.d x3, x1, x3
vfcmp.ceq.d VT0, x1, x3
vbitsel.v x2, x4, x2, VT0
vfmaxa.d VM1, VM0, x3
vbitsel.v VM0, VM1, VM0, VT0
vfcmp.ceq.d VT0, VM0, VM1
vbitsel.v VI0, x2, VI0, VT0
vfsub.d VT1, VZE, VX0
vfsub.d VT2, VZE, VX1
vfmaxa.d VX0, VX0, VT1
vfmaxa.d VX1, VX1, VT2
vfcmp.clt.d VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
vfcmp.clt.d VC0, x1, x3
vbitsel.v x1, x1, x3, VC0
vbitsel.v x2, x2, x4, VC0
vfcmp.clt.d VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0

addi.d I, I, -1 addi.d I, I, -1
blt $r0, I, .L24 blt $r0, I, .L24
.align 3 .align 3
@@ -313,9 +333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.align 3 .align 3


.L26: .L26:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
vfcmp.clt.d VT0, x1, x2
vbitsel.v VI0, VI1, VI2, VT0
vbitsel.v VM0, x1, x2, VT0
.align 3 .align 3


.L27: .L27:
@@ -389,14 +409,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX1, t3, 2 vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3 vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, VINC2 vadd.w VI2, VI1, VINC2
vfmaxa.s VM1, VX0, VX1
vfcmp.ceq.s VT0, VX0, VM1
vbitsel.v VI2, VI2, VI1, VT0
vfmaxa.s VM1, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vfsub.s VT1, VZE, VX0
vfsub.s VT2, VZE, VX1
vfmaxa.s VX0, VX0, VT1
vfmaxa.s VX1, VX1, VT2
vfcmp.clt.s VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0 //i

addi.d I, I, -1 addi.d I, I, -1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI2, VI0, VT0
vfcmp.clt.s VT0, VM0, x1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
blt $r0, I, .L24 blt $r0, I, .L24
.align 3 .align 3


@@ -409,42 +433,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vreplvei.w x2, VM0, 1 vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2 vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3 vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC2, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC4, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC4, VINC2, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
.align 3 .align 3


.L26: .L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
fcmp.ceq.s $fcc0, $f9, $f10
bceqz $fcc0, .L31
vfcmp.clt.s VT0, VI1, VI2
vbitsel.v VI1, VI2, VI1, VT0
b .L32
.align 3 .align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.L31:
vfcmp.clt.s VT0, x1, x2
vbitsel.v VI1, VI1, VI2, VT0
vbitsel.v x1, x1, x2, VT0
.align 3 .align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.L32:
fcmp.ceq.s $fcc0, $f11, $f12
bceqz $fcc0, .L33
vfcmp.clt.s VT1, VI3, VI4
vbitsel.v VI3, VI4, VI3, VT1
b .L34
.align 3
.L33:
vfcmp.clt.s VT1, x3, x4
vbitsel.v x3, x3, x4, VT1
vbitsel.v VI3, VI3, VI4, VT1
.align 3
.L34:
fcmp.ceq.s $fcc0, $f9, $f11
bceqz $fcc0, .L35
vfcmp.clt.s VT0, VI1, VI3
vbitsel.v VI0, VI3, VI1, VT0
vxor.v VM0, x1, VZE
b .L29
.align 3
.L35:
vfcmp.clt.s VT0, x1, x3
vbitsel.v VM0, x1, x3, VT0
vbitsel.v VI0, VI1, VI3, VT0
.align 3 .align 3

.L29: .L29:
movfr2gr.s i0, $f20 movfr2gr.s i0, $f20
.align 3 .align 3
@@ -462,10 +489,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


.L22: .L22:
LD $f9, X, 0 LD $f9, X, 0
VFMAXA VM1, x1, VM0
VCMPEQ VT0, VM0, VM1
vbitsel.v VM0, VM1, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
#ifdef DOUBLE
fsub.d $f10, $f3, $f9
vfmaxa.d x1, x1, x2
vfcmp.clt.d VT0, VM0, x1
#else
fsub.s $f10, $f3, $f9
vfmaxa.s x1, x1, x2
vfcmp.clt.s VT0, VM0, x1
#endif
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, VI1, VT0
addi.d I, I, -1 addi.d I, I, -1
addi.d i1, i1, 1 addi.d i1, i1, 1
add.d X, X, INCX add.d X, X, INCX
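
The iamax_lsx.S rework stops selecting the winner with vfmaxa plus vfcmp.ceq; it now forms the absolute values explicitly (vfsub from the zero vector VZE followed by vfmaxa), compares candidates with a strict vfcmp.clt, and carries the running maximum and its index through the same vbitsel mask, so a candidate only replaces the current one when it is strictly larger. A scalar C sketch of that selection rule; iamax_ref is an illustrative name, not a kernel entry point:

    #include <math.h>

    /* 1-based index of the first element with the largest |x[i]|.
     * Updating only on a strict '<' makes the first occurrence win ties. */
    static long iamax_ref(long n, const float *x, long inc_x) {
        if (n <= 0 || inc_x <= 0) return 0;
        long best_i = 1;
        float best = fabsf(x[0]);
        for (long i = 1; i < n; i++) {
            float a = fabsf(x[i * inc_x]);
            if (best < a) {                 /* vfcmp.clt + vbitsel in the kernel */
                best = a;
                best_i = i + 1;
            }
        }
        return best_i;
    }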


kernel/loongarch64/rot_lsx.S  +113 -1292
File diff suppressed because it is too large


kernel/loongarch64/snrm2_lsx.S  +55 -26

@@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Don't change following FR unless you know the effects. */ /* Don't change following FR unless you know the effects. */
#define res1 $vr19 #define res1 $vr19
#define res2 $vr20 #define res2 $vr20
#define RCP $f2
#define VALPHA $vr3

// The snrm2 optimization cannot simply extend the data type from
// float to double and then sum the squares of the data: LAPACK tests
// have shown that this approach can still overflow.
// Instead, we find the maximum absolute value in the entire array
// and divide each element by this maximum before accumulating.
// This avoids overflow and does not require extending the data type.


PROLOGUE PROLOGUE


@@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N) LDINT N, 0(N)
LDINT INCX, 0(INCX) LDINT INCX, 0(INCX)
#endif #endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999 bge $r0, N, .L999
beq $r0, INCX, .L999 beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3 srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vld VX5, X, 4 * SIZE vld VX5, X, 4 * SIZE
addi.d I, I, -1 addi.d I, I, -1
addi.d X, X, 8 * SIZE addi.d X, X, 8 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfcvtl.d.s VX3, VX5
vfcvth.d.s VX4, VX5
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2

vfmul.s VX0, VX0, VALPHA
vfmul.s VX5, VX5, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10 blt $r0, I, .L10
b .L996 b .L996
.align 3 .align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3 vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0 ld.w t1, X, 0
add.d X, X, INCX add.d X, X, INCX
ld.w t2, X, 0 ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1 vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2 vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3 vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1 addi.d I, I, -1
blt $r0, I, .L21 blt $r0, I, .L21
b .L996
.align 3 .align 3


.L996: .L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3 .align 3


.L997: .L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L998: .L998:
fld.s $f15, X, 0 fld.s $f15, X, 0
addi.d I, I, -1 addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX add.d X, X, INCX
blt $r0, I, .L998 blt $r0, I, .L998
.align 3 .align 3


.L999: .L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17 move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0 jirl $r0, $r1, 0x0
.align 3 .align 3
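
The comment block added to snrm2_lsx.S above is the rationale for both nrm2 kernels in this PR: rather than widening to double, the kernel fetches the maximum absolute value from samax_k, scales every element by its reciprocal, sums squares in single precision, and multiplies the square root by the maximum again. A tiny standalone C demo of the scaling trick (not kernel code):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        float x = 3.0e38f;                   /* near FLT_MAX */
        printf("%g\n", x * x);               /* inf: squaring near-FLT_MAX values in float overflows */
        float amax = x;                      /* what samax_k would report for {x} */
        float v = x / amax;                  /* scaled element, here exactly 1.0f */
        printf("%g\n", amax * sqrtf(v * v)); /* 3e+38: the norm is recovered without overflow */
        return 0;
    }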




kernel/loongarch64/swap_lsx.S  +9 -56

@@ -348,62 +348,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
move XX, X move XX, X


.L222: .L222:
LD a1, X, 0
add.d X, X, INCX
LD a2, X, 0
add.d X, X, INCX
LD a3, X, 0
add.d X, X, INCX
LD a4, X, 0
add.d X, X, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD b3, Y, 0
ST a3, Y, 0
add.d Y, Y, INCY
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
LD a1, X, 0
add.d X, X, INCX
ST b1, XX, 0
add.d XX, XX, INCX
LD b1, Y, 0
ST a1, Y, 0
add.d Y, Y, INCY
LD a2, X, 0
add.d X, X, INCX
ST b2, XX, 0
add.d XX, XX, INCX
LD b2, Y, 0
ST a2, Y, 0
add.d Y, Y, INCY
LD a3, X, 0
add.d X, X, INCX
ST b3, XX, 0
add.d XX, XX, INCX
LD b3, Y, 0
ST a3, Y, 0
LD a4, X, 0
add.d X, X, INCX
ST b4, XX, 0
add.d XX, XX, INCX
LD b4, Y, 0
ST a4, Y, 0
add.d Y, Y, INCY
ST b1, XX, 0
add.d XX, XX, INCX
ST b2, XX, 0
add.d XX, XX, INCX
ST b3, XX, 0
add.d XX, XX, INCX
ST b4, XX, 0
add.d XX, XX, INCX
addi.d I, I, -1
.rept 8
LD $f12, X, 0
LD $f14, Y, 0
ST $f12, Y, 0
ST $f14, X, 0
add.d X, X, INCX
add.d Y, Y, INCY
.endr
addi.d I, I, -1
blt $r0, I, .L222 blt $r0, I, .L222
.align 3 .align 3
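
The swap_lsx.S change above replaces the long hand-unrolled strided sequence with a .rept 8 block that swaps one X/Y element pair per repetition through $f12/$f14, loading both values before storing either. The strided branch of swap is conceptually just the following C (illustrative, one element per iteration):

    /* Strided swap of two single-precision vectors. */
    static void swap_strided(long n, float *x, long inc_x, float *y, long inc_y) {
        for (long i = 0; i < n; i++) {
            float tx = *x, ty = *y;   /* LD $f12, X, 0 / LD $f14, Y, 0 */
            *y = tx;                  /* ST $f12, Y, 0 */
            *x = ty;                  /* ST $f14, X, 0 */
            x += inc_x;
            y += inc_y;
        }
    }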



