
Merge pull request #5061 from XiWeiGu/la64_update_symv

LoongArch64: Update symv
Martin Kroeker (via GitHub) · 8 months ago · commit c31f148c76 · tag: v0.3.29
4 changed files with 444 additions and 383 deletions:

1. kernel/loongarch64/dsymv_L_lasx.S  +127 -105
2. kernel/loongarch64/dsymv_U_lasx.S  +111 -90
3. kernel/loongarch64/ssymv_L_lasx.S  +110 -100
4. kernel/loongarch64/ssymv_U_lasx.S  +96 -88

kernel/loongarch64/dsymv_L_lasx.S  (+127 -105)

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $xr31
@@ -87,10 +90,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9

.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.d $f4, Y, T2
add.d T2, T2, INCY
fldx.d $f5, Y, T2
add.d T2, T2, INCY
fldx.d $f6, Y, T2
add.d T2, T2, INCY
fldx.d $f7, Y, T2

add.d T2, T2, INCY
fldx.d $f8, Y, T2
add.d T2, T2, INCY
fldx.d $f9, Y, T2
add.d T2, T2, INCY
fldx.d $f10, Y, T2
add.d T2, T2, INCY
fldx.d $f11, Y, T2

vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02

vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
xvldx U4, Y, T3
alsl.d T4, INCY, T3, 2
xvldx U8, Y, T4
.L01_Y_1:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.d $f4, X, T2
add.d T2, T2, INCX
fldx.d $f5, X, T2
add.d T2, T2, INCX
fldx.d $f6, X, T2
add.d T2, T2, INCX
fldx.d $f7, X, T2

add.d T2, T2, INCX
fldx.d $f8, X, T2
add.d T2, T2, INCX
fldx.d $f9, X, T2
add.d T2, T2, INCX
fldx.d $f10, X, T2
add.d T2, T2, INCX
fldx.d $f11, X, T2

vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02

vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
xvldx U4, X, T3
alsl.d T2, INCX, T3, 2
xvldx U8, X, T2
.L01_X_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U6, U4, 0xee
vextrins.d $vr5, $vr4, 0x01
vextrins.d $vr7, $vr6, 0x01

xvpermi.d U10, U8, 0xee
vextrins.d $vr9, $vr8, 0x01
vextrins.d $vr11, $vr10, 0x01

add.d T2, IY, INCY
fstx.d $f4, Y, T2
add.d T2, T2, INCY
fstx.d $f5, Y, T2
add.d T2, T2, INCY
fstx.d $f6, Y, T2
add.d T2, T2, INCY
fstx.d $f7, Y, T2

add.d T2, T2, INCY
fstx.d $f8, Y, T2
add.d T2, T2, INCY
fstx.d $f9, Y, T2
add.d T2, T2, INCY
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2
b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, T3
xvstx U8, Y, T4
.L01_Y_3:
.endm
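The three macros above make up one 8-element step of the column update in the .L02 loop below: LOAD_Y_8 pulls y into U4/U8, the first xvfmadd.d pair adds temp1 * A, STORE_Y_8 writes y back, then LOAD_X_8 reuses U4/U8 for x and the second xvfmadd.d pair accumulates temp2 into U2. For orientation, the scalar computation being vectorized is roughly the following sketch (dsymv_L_column and a_col are illustrative names, not kernel symbols):

    /* One column step of the lower-triangular symv, in scalar form. */
    static void dsymv_L_column(long m, long j, double alpha,
                               const double *a_col, /* column j of A */
                               const double *x, double *y)
    {
        double temp1 = alpha * x[j];
        double temp2 = 0.0;
        y[j] += temp1 * a_col[j];       /* diagonal, handled in .L01 */
        for (long i = j + 1; i < m; i++) {
            y[i]  += temp1 * a_col[i];  /* LOAD_Y_8, xvfmadd.d, STORE_Y_8 */
            temp2 += a_col[i] * x[i];   /* LOAD_X_8, xvfmadd.d into U2 */
        }
        y[j] += alpha * temp2;          /* applied after the GACC reduction */
    }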

PROLOGUE

addi.d $sp, $sp, -88

@@ -107,6 +213,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

xvldrepl.d VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq J, N, .L999

.L01:
xvxor.v U2, U2, U2
fldx.d a6, X, JX
fmul.d a3, ALPHA, a6 //temp1
xvreplve0.d U3, U3

mul.d T0, J, LDA
slli.d T1, J, BASE_SHIFT
@@ -147,126 +254,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
bge I, T0, .L03

mul.d T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
xvldx U1, AO1, T1
addi.d T2, T1, 32
xvldx U14, AO1, T2

LOAD_Y_8

xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

xvfmadd.d U2, U1, U4, U2
xvfmadd.d U2, U14, U8, U2

alsl.d IX, INCX, IX, 3

addi.d T1, T1, 64
addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

//Acc U2
GACC xvf, d, U4, U2
fmov.d $f2, $f4
.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -437,4 +459,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
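The other change visible in this file: the old loop body collapsed the temp2 accumulator U2 to a scalar on every iteration (xvpermi.d shuffles plus a chain of fadd.d) and re-broadcast it, while the updated loop keeps U2 as four partial sums and performs a single GACC reduction (a helper from the newly included loongarch64_asm.S) once the loop exits. A minimal sketch of that hoisting, assuming GACC amounts to a horizontal add of the vector lanes:

    /* Partial sums in the loop, one reduction at the end. */
    static double dot_hoisted(const double *a, const double *x, long n)
    {
        double acc[4] = {0.0, 0.0, 0.0, 0.0};  /* U2 */
        for (long i = 0; i + 4 <= n; i += 4)
            for (int k = 0; k < 4; k++)        /* one xvfmadd.d */
                acc[k] += a[i + k] * x[i + k];
        /* GACC xvf, d: fold the four lanes into one scalar */
        return acc[0] + acc[1] + acc[2] + acc[3];
    }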

kernel/loongarch64/dsymv_U_lasx.S  (+111 -90)

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $xr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9



.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.d $f4, Y, IY
add.d T2, IY, INCY
fldx.d $f5, Y, T2
@@ -167,20 +111,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02

vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_Y_1
.L01_Y_0:
xvldx U4, Y, IY
alsl.d T4, INCY, IY, 2
xvldx U8, Y, T4
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U6, U4, 0xee
vextrins.d $vr5, $vr4, 0x01
vextrins.d $vr7, $vr6, 0x01

xvpermi.d U10, U8, 0xee
vextrins.d $vr9, $vr8, 0x01
vextrins.d $vr11, $vr10, 0x01

@@ -200,10 +150,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.d $f10, Y, T2
add.d T2, T2, INCY
fstx.d $f11, Y, T2

b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, IY
xvstx U8, Y, T4
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.d $f4, X, IX
add.d T2, IX, INCX
fldx.d $f5, X, T2
@@ -223,36 +178,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

vextrins.d $vr4, $vr5, 0x10
vextrins.d $vr6, $vr7, 0x10
xvpermi.q U4, U6, 0x02

vextrins.d $vr8, $vr9, 0x10
vextrins.d $vr10, $vr11, 0x10
xvpermi.q U8, U10, 0x02
b .L01_X_1
.L01_X_0:
xvldx U4, X, IX
alsl.d T2, INCX, IX, 2
xvldx U8, X, T2
.L01_X_1:
.endm

PROLOGUE

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

xvldrepl.d VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
xvxor.v U2, U2, U2
fldx.d $f6, X, JX
fmul.d $f3, ALPHA, $f6 //temp1
xvreplve0.d U3, U3

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.d T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
xvldx U1, AO1, T1
addi.d T2, T1, 32
xvldx U14, AO1, T2

LOAD_Y_8

xvfmadd.d U4, U3, U1, U4
xvfmadd.d U8, U3, U14, U8

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

xvfmadd.d U2, U1, U4, U2
xvfmadd.d U2, U14, U8, U2

alsl.d IX, INCX, IX, 3

addi.d T1, T1, 64
addi.d II, II, 64
addi.d I, I, 1
blt I, T0, .L02

//Acc U2
GACC xvf, d, U4, U2
fmov.d $f2, $f4

.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -425,4 +446,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE

kernel/loongarch64/ssymv_L_lasx.S  (+110 -100)

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $xr31
@@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9



.macro LOAD_Y_8
beqz T5, .L01_Y_0
add.d T2, IY, INCY
fldx.s $f4, Y, T2
add.d T2, T2, INCY
@@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_Y_1
.L01_Y_0:
add.d T3, IY, INCY
xvldx U4, Y, T3
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U8, U4, 0xee
vextrins.w $vr5, $vr4, 0x01
vextrins.w $vr6, $vr4, 0x02
vextrins.w $vr7, $vr4, 0x03
@@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2

b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, T3
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
add.d T2, IX, INCX
fldx.s $f4, X, T2
add.d T2, T2, INCX
@@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_X_1
.L01_X_0:
add.d T3, IX, INCX
xvldx U4, X, T3
.L01_X_1:
.endm
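The single-precision macros mirror the dsymv ones, but eight floats fit in a single 256-bit LASX register, so the unit-stride path needs only one xvldx/xvstx on U4 where the double kernels use the U4/U8 pair. A quick check of the lane counts (illustrative):

    #include <stdio.h>

    int main(void)
    {
        /* a 256-bit xr register holds 32 bytes */
        printf("floats per xr:  %zu\n", 32 / sizeof(float));  /* 8 */
        printf("doubles per xr: %zu\n", 32 / sizeof(double)); /* 4 */
        return 0;
    }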

PROLOGUE

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

xvldrepl.w VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

move J, $r0
move JY, $r0
move JX, $r0
move AO1, A

beq J, N, .L999

.L01:
xvxor.v U2, U2, U2
fldx.s a6, X, JX
fmul.s a3, ALPHA, a6 //temp1
xvreplve0.w U3, U3

mul.w T0, J, LDA
slli.d T1, J, BASE_SHIFT
add.w T0, T0, T1
fldx.s a6, AO1, T0
fldx.s a4, Y, JY
fmadd.s a4, a3, a6, a4
fstx.s a4, Y, JY

move IY, JY
move IX, JX
addi.d II, J, 1
move I, II
slli.d II, II, BASE_SHIFT

sub.d T0, M, J
addi.d T0, T0, -1
srai.d T0, T0, 3
add.d T0, T0, J
addi.d T0, T0, 1
bge I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
xvldx U1, AO1, T1

LOAD_Y_8

xvfmadd.s U4, U3, U1, U4

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

xvfmadd.s U2, U1, U4, U2

alsl.d IX, INCX, IX, 3

addi.d II, II, 32
addi.d T1, T1, 32
addi.d I, I, 1
blt I, T0, .L02

//Acc U2
GACC xvf, s, U4, U2
fmov.d $f2, $f4

.L03: /* &4 */
sub.d T0, M, J
addi.d T0, T0, -1
@@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE

kernel/loongarch64/ssymv_U_lasx.S  (+96 -88)

@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Param */
#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define T2 $r28
#define T3 $r29
#define T4 $r30
#define T5 $r17
#define T6 $r16

/* LSX vectors */
#define U0 $xr31
@@ -87,64 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define a8 $f8
#define a9 $f9



.macro LOAD_Y_8
beqz T5, .L01_Y_0
fldx.s $f4, Y, IY
add.d T2, IY, INCY
fldx.s $f5, Y, T2
@@ -168,10 +115,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr9, 0x10
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_Y_1
.L01_Y_0:
xvldx U4, Y, IY
.L01_Y_1:
.endm

.macro STORE_Y_8
beqz T5, .L01_Y_2
xvpermi.d U8, U4, 0xee
vextrins.w $vr5, $vr4, 0x01
vextrins.w $vr6, $vr4, 0x02
@@ -196,10 +148,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fstx.s $f10, Y, T2
add.d T2, T2, INCY
fstx.s $f11, Y, T2

b .L01_Y_3
.L01_Y_2:
xvstx U4, Y, IY
.L01_Y_3:
.endm

.macro LOAD_X_8
beqz T6, .L01_X_0
fldx.s $f4, X, IX
add.d T2, IX, INCX
fldx.s $f5, X, T2
@@ -224,39 +180,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vextrins.w $vr8, $vr10, 0x20
vextrins.w $vr8, $vr11, 0x30
xvpermi.q U4, U8, 0x02
b .L01_X_1
.L01_X_0:
xvldx U4, X, IX
.L01_X_1:
.endm

PROLOGUE

addi.d $sp, $sp, -88

SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 32
SDARG $r27, $sp, 40
SDARG $r28, $sp, 48
SDARG $r29, $sp, 56
SDARG $r30, $sp, 64
SDARG $r31, $sp, 72
ST ALPHA, $sp, 80

xvldrepl.w VALPHA, $sp, 80

addi.d T5, INCY, -1
addi.d T6, INCX, -1
slli.d LDA, LDA, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
slli.d INCY, INCY, BASE_SHIFT

bge $r0, M, .L999
bge $r0, N, .L999

sub.d M1, M, N

mul.d JY, M1, INCY
mul.d JX, M1, INCX

move J, M1
move AO1, A

beq J, M, .L999

.L01:
xvxor.v U2, U2, U2
fldx.s $f6, X, JX
fmul.s $f3, ALPHA, $f6 //temp1
xvreplve0.w U3, U3

move IY, $r0
move IX, $r0
move II, $r0
move I, $r0

srai.d T0, J, 3
beq I, T0, .L03

mul.w T1, J, LDA
add.d T1, T1, II

.L02: /* /8 */
xvldx U1, AO1, T1

LOAD_Y_8

xvfmadd.s U4, U3, U1, U4

STORE_Y_8

alsl.d IY, INCY, IY, 3

LOAD_X_8

xvfmadd.s U2, U1, U4, U2

alsl.d IX, INCX, IX, 3

addi.d II, II, 32
addi.d T1, T1, 32
addi.d I, I, 1
blt I, T0, .L02

//Acc U2
GACC xvf, s, U4, U2
fmov.d $f2, $f4

.L03: /* &4 */
andi T0, J, 4
beq $r0, T0, .L04
@@ -421,4 +429,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d $sp, $sp, 88
jirl $r0, $r1, 0x0

EPILOGUE
