When the data type is single-precision real or single-precision complex, converting it to double precision does not prevent overflow (as exposed by the LAPACK tests). The only solution is to follow the C implementation's approach: find the maximum absolute value in the array and divide each element by that maximum to avoid the overflow.
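For reference, here is a minimal C sketch of that scaling scheme (the function name snrm2_scaled, its signature, and the plain two-pass loops are illustrative assumptions, not the actual OpenBLAS C kernel): compute the maximum absolute value, accumulate the squares of the scaled elements, and multiply the square root of the sum by that maximum at the end. The assembly below does the same thing by calling samax_k/camax_k, taking the reciprocal with frecip.s, and rescaling the final result with fmul.s.

#include <math.h>
#include <stddef.h>

/* Illustrative scaled nrm2 for the real single-precision case
 * (hypothetical helper, not the actual OpenBLAS C kernel).
 * Assumes n elements with a positive stride incx. */
static float snrm2_scaled(size_t n, const float *x, size_t incx)
{
    if (n == 0 || incx == 0)
        return 0.0f;

    /* Pass 1: maximum absolute value, i.e. what samax_k/camax_k return. */
    float amax = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax)
            amax = a;
    }
    if (amax == 0.0f)          /* the kernels also bail out when the maximum is zero */
        return 0.0f;

    /* Pass 2: each scaled element has magnitude <= 1, so the sum of
     * squares is bounded by n and cannot overflow in float. */
    float rcp = 1.0f / amax;   /* the kernels compute this with frecip.s */
    float ssq = 0.0f;
    for (size_t i = 0; i < n; i++) {
        float s = x[i * incx] * rcp;
        ssq += s * s;
    }

    /* Undo the scaling: ||x|| = amax * sqrt(ssq). */
    return amax * sqrtf(ssq);
}

Because every scaled element is at most 1 in magnitude, the accumulation stays well inside the single-precision range, which is why the kernels no longer need the vfcvtl.d.s/vfcvth.d.s widening path removed in the hunks below.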
@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VX4 $vr21
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
 PROLOGUE
@@ -55,10 +57,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl camax_k_LA264
+#else
+    bl camax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, 1
     slli.d TEMP, TEMP, ZBASE_SHIFT
     slli.d INCX, INCX, ZBASE_SHIFT
@@ -69,16 +91,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L10:
     vld VX0, X, 0 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vld VX0, X, 4 * SIZE
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
     addi.d I, I, -1
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX1, VX1, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX1, VX1, res2
     addi.d X, X, 8 * SIZE
     blt $r0, I, .L10
     b .L996
@@ -99,10 +120,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
     ld.w t1, X, 0 * SIZE
     ld.w t2, X, 1 * SIZE
     add.d X, X, INCX
@@ -113,19 +133,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
     add.d X, X, INCX
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
     b .L996
     .align 3
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 .L997:
@@ -137,18 +160,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fld.s a1, X, 0 * SIZE
     fld.s a2, X, 1 * SIZE
     addi.d I, I, -1
-    fcvt.d.s a1, a1
-    fcvt.d.s a2, a2
-    fmadd.d res, a1, a1, res
-    fmadd.d res, a2, a2, res
+    fmul.s a1, a1, RCP
+    fmul.s a2, a2, RCP
+    fmadd.s res, a1, a1, res
+    fmadd.s res, a2, a2, res
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 .L999:
-    fsqrt.d res, res
+    fsqrt.s res, res
+    fmul.s $f0, res, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3
@@ -52,6 +52,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* Don't change following FR unless you know the effects. */
 #define res1 $vr19
 #define res2 $vr20
+#define RCP $f2
+#define VALPHA $vr3
+// The optimization for snrm2 cannot simply involve extending the
+// data type from float to double and then summing the squares of
+// the data. LAPACK tests have shown that this approach can still
+// lead to data overflow. Instead, we need to find the maximum
+// absolute value in the entire array and divide each data element
+// by this maximum value before performing the calculation. This
+// approach can avoid overflow and does not require extending the data type.
 PROLOGUE
@@ -59,10 +69,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDINT N, 0(N)
     LDINT INCX, 0(INCX)
 #endif
-    vxor.v res1, res1, res1
-    vxor.v res2, res2, res2
     bge $r0, N, .L999
     beq $r0, INCX, .L999
+    addi.d $sp, $sp, -32
+    st.d $ra, $sp, 0
+    st.d N, $sp, 8
+    st.d X, $sp, 16
+    st.d INCX, $sp, 24
+#ifdef DYNAMIC_ARCH
+    bl samax_k_LA264
+#else
+    bl samax_k
+#endif
+    ld.d $ra, $sp, 0
+    ld.d N, $sp, 8
+    ld.d X, $sp, 16
+    ld.d INCX, $sp, 24
+    addi.d $sp, $sp, 32
+    frecip.s RCP, $f0
+    vreplvei.w VALPHA, $vr2, 0
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    fcmp.ceq.s $fcc0, $f0, $f19
+    bcnez $fcc0, .L999
     li.d TEMP, SIZE
     slli.d INCX, INCX, BASE_SHIFT
     srai.d I, N, 3
@@ -75,14 +106,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vld VX5, X, 4 * SIZE
     addi.d I, I, -1
     addi.d X, X, 8 * SIZE
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfcvtl.d.s VX3, VX5
-    vfcvth.d.s VX4, VX5
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmul.s VX5, VX5, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
+    vfmadd.s res2, VX5, VX5, res2
     blt $r0, I, .L10
     b .L996
     .align 3
@@ -104,10 +133,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX1, VX0
-    vfcvth.d.s VX2, VX0
-    vfmadd.d res1, VX1, VX1, res1
-    vfmadd.d res2, VX2, VX2, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res1, VX0, VX0, res1
     ld.w t1, X, 0
     add.d X, X, INCX
     ld.w t2, X, 0
@@ -120,19 +148,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VX0, t2, 1
     vinsgr2vr.w VX0, t3, 2
     vinsgr2vr.w VX0, t4, 3
-    vfcvtl.d.s VX3, VX0
-    vfcvth.d.s VX4, VX0
-    vfmadd.d res1, VX3, VX3, res1
-    vfmadd.d res2, VX4, VX4, res2
+    vfmul.s VX0, VX0, VALPHA
+    vfmadd.s res2, VX0, VX0, res2
     addi.d I, I, -1
     blt $r0, I, .L21
-    b .L996
     .align 3
 .L996:
-    vfadd.d res1, res1, res2
-    vreplvei.d VX1, res1, 1
-    vfadd.d res1, VX1, res1
+    vfadd.s res1, res1, res2
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
     .align 3
 .L997:
@@ -143,16 +172,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L998:
     fld.s $f15, X, 0
     addi.d I, I, -1
-    fcvt.d.s $f15, $f15
-    fmadd.d $f19, $f15, $f15, $f19
+    fmul.s $f15, $f15, RCP
+    fmadd.s $f19, $f15, $f15, $f19
     add.d X, X, INCX
     blt $r0, I, .L998
     .align 3
 .L999:
-    fsqrt.d $f19, $f19
+    fsqrt.s $f19, $f19
+    fmul.s $f0, $f19, $f0
     move $r4, $r17
-    fcvt.s.d $f0, $f19
     jirl $r0, $r1, 0x0
     .align 3