| @@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define t2 $r13 | |||
| #define t3 $r14 | |||
| #define t4 $r15 | |||
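| /* Do not change the following FP/vector register assignments unless you understand the effects: | |||
|    later code refers to the same registers by raw name (e.g. $f15-$f19, $vr16-$vr19). */ | |||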
| #define VX0 $xr15 | |||
| #define VX1 $xr16 | |||
| #define VX2 $xr17 | |||
| #define VX3 $xr18 | |||
| #define VX4 $xr21 | |||
| #define VX5 $xr22 | |||
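| /* Do not change the following FP/vector register assignments unless you understand the effects: | |||
|    later code also refers to them by raw name (e.g. $f19/$vr19 for res1, $vr2 for RCP, $vr3/$xr3 for VALPHA). */ | |||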
| #define res1 $xr19 | |||
| #define res2 $xr20 | |||
| #define RCP $f2 | |||
| #define VALPHA $xr3 | |||
| // Optimizing snrm2 by simply widening the data from float to double | |||
| // and then summing the squares is not sufficient: LAPACK tests | |||
| // have shown that this approach can still lead to overflow. | |||
| // Instead, we find the maximum absolute value in the entire array, | |||
| // scale every element by the reciprocal of that maximum before | |||
| // squaring and summing, and multiply the final square root by the maximum again. | |||
| // This approach avoids overflow and does not require widening the data type. | |||
| PROLOGUE | |||
| @@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT N, 0(N) | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| addi.d $sp, $sp, -32 | |||
| st.d $ra, $sp, 0 | |||
| st.d N, $sp, 8 | |||
| st.d X, $sp, 16 | |||
| st.d INCX, $sp, 24 | |||
| #ifdef DYNAMIC_ARCH | |||
| bl samax_k_LA264 | |||
| #else | |||
| bl samax_k | |||
| #endif | |||
| ld.d $ra, $sp, 0 | |||
| ld.d N, $sp, 8 | |||
| ld.d X, $sp, 16 | |||
| ld.d INCX, $sp, 24 | |||
| addi.d $sp, $sp, 32 | |||
| frecip.s RCP, $f0 | |||
| vreplvei.w $vr3, $vr2, 0 | |||
| xvpermi.d VALPHA, $xr3,0x00 | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| fcmp.ceq.s $fcc0, $f0, $f19 | |||
| bcnez $fcc0, .L999 | |||
| li.d TEMP, SIZE | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 4 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L997 | |||
| bge $r0, I, .L997 | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| xvld VX0, X, 0 | |||
| xvld VX5, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmul.s VX5, VX5, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| xvfmadd.s res2, VX5, VX5, res2 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| b .L996 | |||
| .align 3 | |||
| .L20: | |||
| bge $r0, I, .L997 | |||
| @@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 0 | |||
| xvinsgr2vr.w VX0, t2, 1 | |||
| xvinsgr2vr.w VX0, t3, 2 | |||
| xvinsgr2vr.w VX0, t4, 3 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res2, VX0, VX0, res2 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| b .L996 | |||
| .align 3 | |||
| .L996: | |||
| xvfadd.d res1, res1, res2 | |||
| xvpickve.d VX1, res1, 1 | |||
| xvpickve.d VX2, res1, 2 | |||
| xvpickve.d VX3, res1, 3 | |||
| fadd.d $f19, $f19, $f16 | |||
| fadd.d $f19, $f19, $f17 | |||
| fadd.d $f19, $f19, $f18 | |||
| xvfadd.s res1, res1, res2 | |||
| xvpermi.d VX1, res1, 0x4e | |||
| xvfadd.s res1, res1, VX1 | |||
| vreplvei.w $vr16, $vr19, 1 | |||
| vreplvei.w $vr17, $vr19, 2 | |||
| vreplvei.w $vr18, $vr19, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| .align 3 | |||
| .L997: | |||
| andi I, N, 7 | |||
| andi I, N, 15 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L998: | |||
| fld.s $f15, X, 0 | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| fcvt.d.s $f15, $f15 | |||
| fmadd.d $f19, $f15, $f15, $f19 | |||
| addi.d I, I, -1 | |||
| fmul.s $f15, $f15, RCP | |||
| fmadd.s $f19, $f15, $f15, $f19 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| .L999: | |||
| fsqrt.d $f19, $f19 | |||
| fsqrt.s $f19, $f19 | |||
| fmul.s $f0, $f19, $f0 | |||
| move $r4, $r17 | |||
| fcvt.s.d $f0, $f19 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| EPILOGUE | |||