LoongArch64: Update symv
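The kernels below implement the BLAS SYMV operation y := alpha*A*x + y, where A is symmetric and only one triangle is stored. As a reading aid, here is a minimal C sketch of the classic lower-triangle reference loop for this operation; it is illustrative only, the function name and the positive-stride assumption are mine and not OpenBLAS API. The same temp1/temp2 structure shows up in the assembly comments below.

/* Reference sketch of symv (lower triangle), column-major A, assuming incx, incy > 0.
 * Names are hypothetical; this is not the OpenBLAS kernel interface. */
static void symv_lower_ref(int n, double alpha, const double *a, int lda,
                           const double *x, int incx, double *y, int incy)
{
    for (int j = 0; j < n; j++) {
        double temp1 = alpha * x[j * incx];   /* scales column j of A       */
        double temp2 = 0.0;                   /* accumulates dot(A(:,j), x) */
        y[j * incy] += temp1 * a[j + j * lda];
        for (int i = j + 1; i < n; i++) {
            y[i * incy] += temp1 * a[i + j * lda];  /* axpy part, vectorized in the asm */
            temp2 += a[i + j * lda] * x[i * incx];  /* dot part, vectorized in the asm  */
        }
        y[j * incy] += alpha * temp2;
    }
}

In the diffs, the new LOAD_Y_8 / LOAD_X_8 / STORE_Y_8 macros cover the inner i-loop eight elements at a time: T5 = INCY - 1 and T6 = INCX - 1 select between contiguous xvldx/xvstx vector accesses when the stride is 1 and element-wise fldx/fstx gathers and scatters otherwise.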
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
/* LSX vectors */
#define U0 $xr31
@@ -87,10 +90,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9
.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2
PROLOGUE
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
LDARG BUFFER, $sp, 0
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
xvldx U4, Y, T3
alsl.d T4, INCY, T3, 2
xvldx U8, Y, T4
.L01_Y_1:
.endm
.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2
add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
xvldx U4, X, T3
alsl.d T2, INCX, T3, 2
xvldx U8, X, T2
.L01_X_1:
.endm
.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U6, U4, 0xee
vextrins.d $vr5, $vr4, 0x01
vextrins.d $vr7, $vr6, 0x01
xvpermi.d U10, U8, 0xee
vextrins.d $vr9, $vr8, 0x01
vextrins.d $vr11, $vr10, 0x01
add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2
add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, T3
xvstx U8, Y, T4
.L01_Y_3:
.endm
PROLOGUE
addi.d $sp, $sp, -88
@@ -107,6 +213,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvldrepl.d VALPHA, $sp, 80
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999
.L01:
MTC a2, $r0 //temp2
xvxor.v U2, U2, U2
fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1
xvreplve0.d U3, U3
xvreplve0.d U2, U2
mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT
@@ -147,126 +254,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03
mul.d T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
addi.d T1, T1, 32
xvldx U14, AO1, T1
addi.d T1, T1, 32
addi.d T2, T1, 32
xvldx U14, AO1, T2
add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2
add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
LOAD_Y_8
xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8
xvpermi.d U6, U4, 0xee
vextrins.d $vr5, $vr4, 0x01
vextrins.d $vr7, $vr6, 0x01
xvpermi.d U10, U8, 0xee
vextrins.d $vr9, $vr8, 0x01
vextrins.d $vr11, $vr10, 0x01
add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2
add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
slli.d T2, INCY, 3
add.d IY, IY, T2
add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2
add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
xvand.v $xr12, $xr2, $xr2
xvfmadd.d U2, U1, U4, U2
xvfsub.d U2, U2, $xr12
xvfmadd.d U2, U14, U8, U2
STORE_Y_8
xvpermi.d U4, U2, 0x01
xvpermi.d U5, U2, 0x02
xvpermi.d U6, U2, 0x03
alsl.d IY, INCY, IY, 3
fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f5
fadd.d $f2, $f2, $f6
fadd.d $f2, $f2, $f12
LOAD_X_8
xvreplve0.d U2, U2
xvfmadd.d U2, U1, U4, U2
xvfmadd.d U2, U14, U8, U2
slli.d T2, INCX, 3
add.d IX, IX, T2
alsl.d IX, INCX, IX, 3
addi.d T1, T1, 64
addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02
//Acc U2
GACC xvf, d, U4, U2
fmov.d $f2, $f4
.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -437,4 +459,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0
EPILOGUE
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
/* LSX vectors */
#define U0 $xr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9
PROLOGUE
LDARG BUFFER, $sp, 0
addi.d $sp, $sp, -88
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
xvldrepl.d VALPHA, $sp, 80
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
sub.d M1, M, N
mul.d JY, M1, INCY
mul.d JX, M1, INCX
move J, M1
move AO1, A
beq J, M, .L999
.L01:
MTC $f2, $r0 //temp2
fldx.d $f6, X, JX
fmul.d $f3, ALPHA, $f6 //temp1
xvreplve0.d U3, U3
xvreplve0.d U2, U2
move IY, $r0
move IX, $r0
move II, $r0
move I, $r0
srai.d T0, J, 3
beq I, T0, .L03
mul.d T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
addi.d T1, T1, 32
xvldx U14, AO1, T1
addi.d T1, T1, 32
.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
@@ -167,20 +111,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8
xvpermi.d U6, U4, 0xee
xvpermi.q U8, U10, 0x02
b .L01_Y_1
.L01_Y_0:
xvldx U4, Y, IY
alsl.d T4, INCY, IY, 2
xvldx U8, Y, T4
.L01_Y_1:
.endm
.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U6, U4, 0xee
vextrins.d $vr5, $vr4, 0x01
vextrins.d $vr7, $vr6, 0x01
xvpermi.d U10, U8, 0xee
vextrins.d $vr9, $vr8, 0x01
vextrins.d $vr11, $vr10, 0x01
@@ -200,10 +150,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
slli.d T2, INCY, 3
add.d IY, IY, T2
b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, IY
xvstx U8, Y, T4
.L01_Y_3:
.endm
.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
@@ -223,36 +178,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02
vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_X_1
.L01_X_0:
xvldx U4, X, IX
alsl.d T2, INCX, IX, 2
xvldx U8, X, T2
.L01_X_1:
.endm
xvand.v $xr12, $xr2, $xr2
PROLOGUE
xvfmadd.d U2, U1, U4, U2
xvfsub.d U2, U2, $xr12
xvfmadd.d U2, U14, U8, U2
addi.d $sp, $sp, -88
xvpermi.d U4, U2, 0x01
xvpermi.d U5, U2, 0x02
xvpermi.d U6, U2, 0x03
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
fadd.d $f2, $f2, $f4
fadd.d $f2, $f2, $f5
fadd.d $f2, $f2, $f6
fadd.d $f2, $f2, $f12
xvldrepl.d VALPHA, $sp, 80
xvreplve0.d U2, U2
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
slli.d T2, INCX, 3
add.d IX, IX, T2
bge $r0, M, .L999
bge $r0, N, .L999
sub.d M1, M, N
mul.d JY, M1, INCY
mul.d JX, M1, INCX
move J, M1
move AO1, A
beq J, M, .L999
.L01:
xvxor.v U2, U2, U2
fldx.d $f6, X, JX
fmul.d $f3, ALPHA, $f6 //temp1
xvreplve0.d U3, U3
move IY, $r0
move IX, $r0
move II, $r0
move I, $r0
srai.d T0, J, 3
beq I, T0, .L03
mul.d T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
addi.d T2, T1, 32
xvldx U14, AO1, T2
LOAD_Y_8
xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8
STORE_Y_8
alsl.d IY, INCY, IY, 3
LOAD_X_8
xvfmadd.d U2, U1, U4, U2
xvfmadd.d U2, U14, U8, U2
alsl.d IX, INCX, IX, 3
addi.d T1, T1, 64
addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02
//Acc U2
GACC xvf, d, U4, U2
fmov.d $f2, $f4
.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -425,4 +446,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0
EPILOGUE
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
/* LSX vectors */
#define U0 $xr31
@@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9
PROLOGUE
LDARG BUFFER, $sp, 0
addi.d $sp, $sp, -88
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
xvldrepl.w VALPHA, $sp, 80
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move JY, $r0
move JX, $r0
move AO1, A
beq J, N, .L999
.L01:
MTC a2, $r0 //temp2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
xvreplve0.w U3, U3
xvreplve0.w U2, U2
mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY
move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT
sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03
mul.w T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.s $f4, Y, T2
add.d T2, T2, INCY
@@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
xvfmadd.s U4, U3, U1, U4
xvpermi.d U8, U4, 0xee
xvpermi.q U4, U8, 0x02
b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
xvldx U4, Y, T3
.L01_Y_1:
.endm
.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U8, U4, 0xee
vextrins.w $vr5, $vr4, 0x01
vextrins.w $vr6, $vr4, 0x02
vextrins.w $vr7, $vr4, 0x03
@@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2
slli.d T2, INCY, 3
add.d IY, IY, T2
b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, T3
.L01_Y_3:
.endm
.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.s $f4, X, T2
add.d T2, T2, INCX
@@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
xvldx U4, X, T3
.L01_X_1:
.endm
PROLOGUE
xvand.v $xr12, $xr2, $xr2
addi.d $sp, $sp, -88
xvfmadd.s U2, U1, U4, U2
xvfsub.s U2, U2, $xr12
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
xvpickve.w U4, U2, 0x01
xvpickve.w U5, U2, 0x02
xvpickve.w U6, U2, 0x03
xvpickve.w U7, U2, 0x04
xvpickve.w U8, U2, 0x05
xvpickve.w U9, U2, 0x06
xvpickve.w U10, U2, 0x07
xvldrepl.w VALPHA, $sp, 80
fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f7
fadd.s $f2, $f2, $f8
fadd.s $f2, $f2, $f9
fadd.s $f2, $f2, $f10
fadd.s $f2, $f2, $f12
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
xvreplve0.d U2, U2
bge $r0, M, .L999
bge $r0, N, .L999
move J, $r0
move JY, $r0
move JX, $r0
move AO1, A
slli.d T2, INCX, 3
add.d IX, IX, T2
beq J, N, .L999
.L01:
xvxor.v U2, U2, U2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
xvreplve0.w U3, U3
mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY
move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT
sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
beq I, T0, .L03
bge I, T0, .L03
mul.w T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
LOAD_Y_8
xvfmadd.s U4, U3, U1, U4
STORE_Y_8
alsl.d IY, INCY, IY, 3
LOAD_X_8
xvfmadd.s U2, U1, U4, U2
alsl.d IX, INCX, IX, 3
addi.d II, II, 32
addi.d T1, T1, 32
addi.d I, I, 1
blt I, T0, .L02
//Acc U2
GACC xvf, s, U4, U2
fmov.d $f2, $f4
.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0
EPILOGUE
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16
/* LSX vectors */
#define U0 $xr31
@@ -87,64 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9
PROLOGUE
LDARG BUFFER, $sp, 0
addi.d $sp, $sp, -88
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
xvldrepl.w VALPHA, $sp, 80
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
bge $r0, M, .L999
bge $r0, N, .L999
sub.d M1, M, N
mul.d JY, M1, INCY
mul.d JX, M1, INCX
move J, M1
move AO1, A
beq J, M, .L999
.L01:
MTC $f2, $r0 //temp2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
xvreplve0.w U3, U3
xvreplve0.w U2, U2
move IY, $r0
move IX, $r0
move II, $r0
move I, $r0
srai.d T0, J, 3
beq I, T0, .L03
mul.w T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.s $f4, Y, IY
add.d T2, IY, INCY
fldx.s $f5, Y, T2
@@ -168,10 +115,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
xvfmadd.s U4, U3, U1, U4
xvpermi.q U4, U8, 0x02
b .L01_Y_1
.L01_Y_0:
xvldx U4, Y, IY
.L01_Y_1:
.endm
.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U8, U4, 0xee
vextrins.w $vr5, $vr4, 0x01
vextrins.w $vr6, $vr4, 0x02
@@ -196,10 +148,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2
slli.d T2, INCY, 3
add.d IY, IY, T2
b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, IY
.L01_Y_3:
.endm
.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.s $f4, X, IX
add.d T2, IX, INCX
fldx.s $f5, X, T2
@@ -224,39 +180,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_X_1
.L01_X_0:
xvldx U4, X, IX
.L01_X_1:
.endm
PROLOGUE
xvand.v $xr12, $xr2, $xr2
addi.d $sp, $sp, -88
xvfmadd.s U2, U1, U4, U2
xvfsub.s U2, U2, $xr12
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80
xvpickve.w U4, U2, 0x01
xvpickve.w U5, U2, 0x02
xvpickve.w U6, U2, 0x03
xvpickve.w U7, U2, 0x04
xvpickve.w U8, U2, 0x05
xvpickve.w U9, U2, 0x06
xvpickve.w U10, U2, 0x07
xvldrepl.w VALPHA, $sp, 80
fadd.s $f2, $f2, $f4
fadd.s $f2, $f2, $f5
fadd.s $f2, $f2, $f6
fadd.s $f2, $f2, $f7
fadd.s $f2, $f2, $f8
fadd.s $f2, $f2, $f9
fadd.s $f2, $f2, $f10
fadd.s $f2, $f2, $f12
addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
xvreplve0.d U2, U2
bge $r0, M, .L999
bge $r0, N, .L999
sub.d M1, M, N
mul.d JY, M1, INCY
mul.d JX, M1, INCX
move J, M1
move AO1, A
beq J, M, .L999
.L01:
xvxor.v U2, U2, U2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
xvreplve0.w U3, U3
slli.d T2, INCX, 3
add.d IX, IX, T2
move IY, $r0
move IX, $r0
move II, $r0
move I, $r0
srai.d T0, J, 3
beq I, T0, .L03
mul.w T1, J, LDA
add.d T1, T1, II
.L02: /* /8 */
xvldx U1, AO1, T1
LOAD_Y_8
xvfmadd.s U4, U3, U1, U4
STORE_Y_8
alsl.d IY, INCY, IY, 3
LOAD_X_8
xvfmadd.s U2, U1, U4, U2
alsl.d IX, INCX, IX, 3
addi.d II, II, 32
addi.d T1, T1, 32
addi.d I, I, 1
blt I, T0, .L02
//Acc U2
GACC xvf, s, U4, U2
fmov.d $f2, $f4
.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -421,4 +429,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0
EPILOGUE