
LoongArch64: Fixed snrm2_lsx.S and cnrm2_lsx.S

When the data type is single-precision real or single-precision complex,
converting it to double precision does not prevent overflow (as exposed by the LAPACK tests).
The only solution is to follow the C implementation's approach: find the maximum
absolute value in the array and divide each element by that maximum before
summing the squares, which avoids the overflow.
tags/v0.3.30
gxw 7 months ago
commit 2c4a5cc6e6
2 changed files with 106 additions and 54 deletions
  1. kernel/loongarch64/cnrm2_lsx.S  +51 -28
  2. kernel/loongarch64/snrm2_lsx.S  +55 -26
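
For reference, a minimal C sketch of the scaling approach described in the commit message. This is illustrative only, not the OpenBLAS kernel: the name nrm2_scaled, the unit stride, and the two-pass loop are assumptions; the assembly below obtains the maximum via samax_k/camax_k and multiplies by a precomputed reciprocal (frecip.s) using vector fmadd instead of dividing per element.

#include <math.h>
#include <stddef.h>

/* Illustrative only: compute ||x||_2 by scaling with the maximum absolute
 * value, as the commit message describes.  Assumes unit stride; the real
 * kernels handle INCX and vectorize the accumulation. */
static float nrm2_scaled(size_t n, const float *x)
{
    if (n == 0) return 0.0f;

    /* Pass 1: find amax = max |x[i]| (what samax_k/camax_k return). */
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;   /* all zeros: avoid dividing by zero,
                                        mirrors the fcmp.ceq.s early exit */

    /* Pass 2: accumulate sum of (x[i]/amax)^2; every term is <= 1,
     * so the accumulation cannot overflow in single precision. */
    float scale = 1.0f / amax;
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float s = x[i] * scale;
        sum += s * s;
    }

    /* Undo the scaling: ||x|| = amax * sqrt(sum), matching the final
     * fsqrt.s + fmul.s sequence at .L999 in the new kernels. */
    return amax * sqrtf(sum);
}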

kernel/loongarch64/cnrm2_lsx.S  +51 -28

@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VX4 $vr21
#define res1 $vr19
#define res2 $vr20
#define RCP $f2
#define VALPHA $vr3


PROLOGUE


@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif


vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999
addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl camax_k_LA264
#else
bl camax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


.L10:
vld VX0, X, 0 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vld VX0, X, 4 * SIZE
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
addi.d I, I, -1
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vfmul.s VX0, VX0, VALPHA
vfmul.s VX1, VX1, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX1, VX1, res2

addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
add.d X, X, INCX
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2

addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3


.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3


.L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s a1, a1
fcvt.d.s a2, a2
fmadd.d res, a1, a1, res
fmadd.d res, a2, a2, res
fmul.s a1, a1, RCP
fmul.s a2, a2, RCP
fmadd.s res, a1, a1, res
fmadd.s res, a2, a2, res
add.d X, X, INCX
blt $r0, I, .L998
.align 3


.L999:
fsqrt.d res, res
fsqrt.s res, res
fmul.s $f0, res, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3




kernel/loongarch64/snrm2_lsx.S  +55 -26

@@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* Don't change following FR unless you know the effects. */
#define res1 $vr19
#define res2 $vr20
#define RCP $f2
#define VALPHA $vr3

// The optimization for snrm2 cannot simply involve
// extending the data type from float to double and
// then summing the squares of the data. LAPACK tests
// have shown that this approach can still lead to data overflow.
// Instead, we need to find the maximum absolute value in the entire
// array and divide each data element by this maximum value before
// performing the calculation. This approach can avoid overflow (and does not require extending the data type).


PROLOGUE


@@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
vxor.v res1, res1, res1
vxor.v res2, res2, res2
bge $r0, N, .L999
beq $r0, INCX, .L999

addi.d $sp, $sp, -32
st.d $ra, $sp, 0
st.d N, $sp, 8
st.d X, $sp, 16
st.d INCX, $sp, 24
#ifdef DYNAMIC_ARCH
bl samax_k_LA264
#else
bl samax_k
#endif
ld.d $ra, $sp, 0
ld.d N, $sp, 8
ld.d X, $sp, 16
ld.d INCX, $sp, 24
addi.d $sp, $sp, 32

frecip.s RCP, $f0
vreplvei.w VALPHA, $vr2, 0
vxor.v res1, res1, res1
vxor.v res2, res2, res2
fcmp.ceq.s $fcc0, $f0, $f19
bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vld VX5, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfcvtl.d.s VX3, VX5
vfcvth.d.s VX4, VX5
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2

vfmul.s VX0, VX0, VALPHA
vfmul.s VX5, VX5, VALPHA

vfmadd.s res1, VX0, VX0, res1
vfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
b .L996
.align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX1, VX0
vfcvth.d.s VX2, VX0
vfmadd.d res1, VX1, VX1, res1
vfmadd.d res2, VX2, VX2, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res1, VX0, VX0, res1

ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vfcvtl.d.s VX3, VX0
vfcvth.d.s VX4, VX0
vfmadd.d res1, VX3, VX3, res1
vfmadd.d res2, VX4, VX4, res2
vfmul.s VX0, VX0, VALPHA
vfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
b .L996
.align 3


.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vfadd.s res1, res1, res2
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
.align 3


.L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.L998:
fld.s $f15, X, 0
addi.d I, I, -1
fcvt.d.s $f15, $f15
fmadd.d $f19, $f15, $f15, $f19
fmul.s $f15, $f15, RCP
fmadd.s $f19, $f15, $f15, $f19
add.d X, X, INCX
blt $r0, I, .L998
.align 3


.L999:
fsqrt.d $f19, $f19
fsqrt.s $f19, $f19
fmul.s $f0, $f19, $f0
move $r4, $r17
fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
.align 3



