Browse Source

LoongArch64: Fixed snrm2_lsx.S and cnrm2_lsx.S

When the data type is single-precision real or single-precision complex,
converting it to double precision does not prevent overflow (as exposed in LAPACK tests).
The only solution is to follow C's approach: find the maximum value in the
array and divide each element by that maximum to avoid this issue
tags/v0.3.30
gxw 7 months ago
parent
commit
2c4a5cc6e6
2 changed files with 106 additions and 54 deletions
  1. +51
    -28
      kernel/loongarch64/cnrm2_lsx.S
  2. +55
    -26
      kernel/loongarch64/snrm2_lsx.S

+ 51
- 28
kernel/loongarch64/cnrm2_lsx.S View File

@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VX4 $vr21
#define res1 $vr19
#define res2 $vr20
#define RCP $f2
#define VALPHA $vr3

PROLOGUE

@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif

vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl camax_k_LA264
#else
bl camax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.L10:
vld VX0, X, 0 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vld VX0, X, 4 * SIZE
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
addi.d I, I, -1
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX0, VX0, VALPHA
vfmul.s VX1, VX1, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX1, VX1, res2

addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2

addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3

.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3

.L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s a1, a1
fcvt.d.s a2, a2
fmadd.d res, a1, a1, res
fmadd.d res, a2, a2, res
fmul.s a1, a1, RCP
fmul.s a2, a2, RCP
fmadd.s res, a1, a1, res
fmadd.s res, a2, a2, res
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d res, res
fsqrt.s res, res
fmul.s $f0, res, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3



+ 55
- 26
kernel/loongarch64/snrm2_lsx.S View File

@@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Don't change following FR unless you know the effects. */
#define res1 $vr19
#define res2 $vr20
#define RCP $f2
#define VALPHA $vr3

// The optimization for snrm2 cannot simply involve
// extending the data type from float to double and
// then summing the squares of the data. LAPACK tests
// have shown that this approach can still lead to data overflow.
// Instead, we need to find the maximum absolute value in the entire
// array and divide each data element by this maximum value before
// performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

@@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vld VX5, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfcvtl.d.s VX3, VX5
vfcvth.d.s VX4, VX5
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2

vfmul.s VX0, VX0, VALPHA
vfmul.s VX5, VX5, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
b .L996
.align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3

.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3

.L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L998:
fld.s $f15, X, 0
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3



Loading…
Cancel
Save