Browse Source

Loongarch64: fixed snrm2_lasx

tags/v0.3.30
pengxu 6 months ago
parent
commit
b471fa337b
1 changed file with 94 additions and 32 deletions
  1. +94
    -32
      kernel/loongarch64/snrm2_lasx.S

+ 94
- 32
kernel/loongarch64/snrm2_lasx.S View File

@@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15

/* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
#define VX5 $xr22
/* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
#define RCP $f2
#define VALPHA $xr3

// The optimization for snrm2 cannot simply involve
// extending the data type from float to double and
// then summing the squares of the data. LAPACK tests
// have shown that this approach can still lead to data overflow.
// Instead, we need to find the maximum absolute value in the entire
// array and divide each data element by this maximum value before
// performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

@@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w $vr3, $vr2, 0
xvpermi.d VALPHA, $xr3,0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
srai.d I, N, 4
bne INCX, TEMP, .L20
bge $r0, I, .L997
bge $r0, I, .L997
.align 3

.L10:
xvld VX0, X, 0
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
xvld VX0, X, 0
xvld VX5, X, 8 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d X, X, 16 * SIZE

xvfmul.s VX0, VX0, VALPHA
xvfmul.s VX5, VX5, VALPHA

xvfmadd.s res1, VX0, VX0, res1
xvfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
.align 3
b .L996
.align 3

.L20:
bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
xvfcvtl.d.s VX1, VX0
xvfcvth.d.s VX2, VX0
xvfmadd.d res1, VX1, VX1, res1
xvfmadd.d res2, VX2, VX2, res2
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvfmul.s VX0, VX0, VALPHA
xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f17
fadd.d $f19, $f19, $f18
xvfadd.s res1, res1, res2
xvpermi.d VX1, res1, 0x4e
xvfadd.s res1, res1, VX1
vreplvei.w $vr16, $vr19, 1
vreplvei.w $vr17, $vr19, 2
vreplvei.w $vr18, $vr19, 3
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
.align 3

.L997:
andi I, N, 7
andi I, N, 15
bge $r0, I, .L999
.align 3

.L998:
fld.s $f15, X, 0
add.d X, X, INCX
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
addi.d I, I, -1
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3

EPILOGUE

Loading…
Cancel
Save