Signed-off-by: Hao Chen <chenhao@loongson.cn>tags/v0.3.26
@@ -59,10 +59,10 @@ SNRM2KERNEL = snrm2_lsx.S | |||||
DNRM2KERNEL = dnrm2_lsx.S | DNRM2KERNEL = dnrm2_lsx.S | ||||
DGEMMKERNEL = dgemm_kernel_8x4.S | DGEMMKERNEL = dgemm_kernel_8x4.S | ||||
# NOTE(review): this region looks like a rendered diff, so the next four lines
# are presumably the old (generic C) DGEMM packing routines - confirm.
DGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
# The same four variables assigned to the LSX assembly packing routines
# (presumably the new values introduced by the change - confirm).
DGEMMINCOPY = dgemm_ncopy_8_lsx.S | |||||
DGEMMITCOPY = dgemm_tcopy_8_lsx.S | |||||
DGEMMONCOPY = dgemm_ncopy_4_lsx.S | |||||
DGEMMOTCOPY = dgemm_tcopy_4_lsx.S | |||||
# Object file names for the four packing routines above.
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
@@ -0,0 +1,185 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2023, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "loongarch64_asm.S" | |||||
/* Function parameters */ | |||||
/* Incoming integer arguments, in the registers this kernel reads them from. */
#define M $r4 // param 1: m | |||||
#define N $r5 // param 2: n | |||||
#define SRC $r6 // param 3: src | |||||
#define LDA $r7 // param 4: lda | |||||
#define DST $r8 // param 5: dst | |||||
/* Loop counters: I is derived from M, J is derived from N. */
#define I $r9 | |||||
#define J $r10 | |||||
/* Source cursors, one per source line of the current group. Lines are TL bytes */
/* apart. This kernel only references S1..S4; S5..S8 are defined but unused. */
#define S1 $r12 | |||||
#define S2 $r13 | |||||
#define S3 $r14 | |||||
#define S4 $r15 | |||||
#define S5 $r16 | |||||
#define S6 $r17 | |||||
#define S7 $r18 | |||||
#define S8 $r19 | |||||
/* TD: running destination pointer. TS: start of the next unprocessed group. */
#define TD $r20 | |||||
#define TS $r21 | |||||
/* TL aliases the LDA register; the kernel overwrites it with lda << 3 (bytes). */
#define TL $r7 | |||||
/* T0 aliases the SRC register; it is written with 2 * TL only after SRC has */
/* been copied into TS. */
#define T0 $r6 | |||||
#define ZERO $r0 | |||||
/* Scalar FP registers for the one-element-at-a-time tail loops. */
/* This kernel only references F0..F3. */
#define F0 $f0 | |||||
#define F1 $f1 | |||||
#define F2 $f2 | |||||
#define F3 $f3 | |||||
#define F4 $f4 | |||||
#define F5 $f5 | |||||
#define F6 $f6 | |||||
#define F7 $f7 | |||||
/* LSX vectors */ | |||||
/* U*: vectors loaded from the source; D*: vectors produced by GINTERLACE and */
/* stored to the destination. This kernel only references U0..U3 and D0..D3. */
#define U0 $vr0 | |||||
#define U1 $vr1 | |||||
#define U2 $vr2 | |||||
#define U3 $vr3 | |||||
#define U4 $vr4 | |||||
#define U5 $vr5 | |||||
#define U6 $vr6 | |||||
#define U7 $vr7 | |||||
#define D0 $vr8 | |||||
#define D1 $vr9 | |||||
#define D2 $vr10 | |||||
#define D3 $vr11 | |||||
#define D4 $vr12 | |||||
#define D5 $vr13 | |||||
#define D6 $vr14 | |||||
#define D7 $vr15 | |||||
/*
 * DGEMM ncopy packing kernel, group width 4 (LSX).
 *
 * Inputs: M, N, SRC, LDA (in doubles), DST - see the register #defines.
 *
 * The source is consumed as N lines that are LDA doubles apart. Lines are
 * handled in groups of 4, then one group of 2 (if N & 2), then one single
 * line (if N & 1). For every group, element i of each line in the group is
 * written next to each other, so DST is filled strictly sequentially and
 * exactly M * N doubles are written.
 *
 * GLD / GINTERLACE / GST are helper macros from loongarch64_asm.S; they are
 * used here on the assumption that GINTERLACE v, d yields the low-half and
 * high-half 64-bit interleaves of its two inputs.
 *
 * Fix versus the previous revision: the single-line tail at .L_N1 now runs
 * only when N is odd. It used to run unconditionally, which for even N read
 * M doubles from one line past the matrix and wrote M doubles past the end
 * of the packed output.
 */
PROLOGUE
    move       TD,     DST
    move       TS,     SRC                 /* keep SRC: T0 reuses its register */
    slli.d     TL,     LDA,    0x03        /* TL = lda * 8 bytes */
    slli.d     T0,     TL,     0x01        /* T0 = 2 * TL */
    srai.d     J,      N,      0x02        /* J = N / 4 groups */
    beq        J,      ZERO,   .L_N2
.L_J1: /* J-- : one group of 4 source lines */
    move       S1,     TS
    add.d      S2,     TS,     TL
    srai.d     I,      M,      0x02        /* I = M / 4 */
    add.d      S3,     S2,     TL
    add.d      S4,     S2,     T0
    add.d      TS,     S3,     T0          /* TS += 4 * TL */
    addi.d     J,      J,      -1
    beq        I,      ZERO,   .L_I3
.L_I1: /* I-- : 4 elements of each of the 4 lines */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d     TD,     TD,     0x40
    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d     S1,     S1,     0x20
    addi.d     S2,     S2,     0x20
    addi.d     S3,     S3,     0x20
    addi.d     S4,     S4,     0x20
    addi.d     TD,     TD,     0x40
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_I1
.L_I3: /* remaining M % 4 elements of the 4-line group */
    andi       I,      M,      0x03
    beq        I,      ZERO,   .L_I0
.L_II1:
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fld.d      F2,     S3,     0x00
    fld.d      F3,     S4,     0x00
    fst.d      F0,     TD,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F1,     TD,     0x08
    addi.d     S2,     S2,     0x08
    fst.d      F2,     TD,     0x10
    addi.d     S3,     S3,     0x08
    fst.d      F3,     TD,     0x18
    addi.d     S4,     S4,     0x08
    addi.d     TD,     TD,     0x20
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_II1
.L_I0:
    blt        ZERO,   J,      .L_J1
.L_N2: /* optional group of 2 source lines */
    andi       J,      N,      0x02
    beq        ZERO,   J,      .L_N1
    move       S1,     TS
    add.d      S2,     TS,     TL
    srai.d     I,      M,      0x01        /* I = M / 2 */
    add.d      TS,     S2,     TL          /* TS += 2 * TL */
    beq        I,      ZERO,   .L_2I3
.L_2I1: /* I-- : 2 elements of each of the 2 lines */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00
    GINTERLACE v, d, D0, D1, U1, U0
    GST v, , D0, TD, 0x00, D1, TD, 0x10
    addi.d     S1,     S1,     0x10
    addi.d     S2,     S2,     0x10
    addi.d     TD,     TD,     0x20
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_2I1
.L_2I3: /* remaining M % 2 element of the 2-line group */
    andi       I,      M,      0x01
    beq        ZERO,   I,      .L_N1
.L_2II1: /* I-- */
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fst.d      F0,     TD,     0x00
    addi.d     I,      I,      -1
    fst.d      F1,     TD,     0x08
    addi.d     S1,     S1,     0x08
    addi.d     S2,     S2,     0x08
    addi.d     TD,     TD,     0x10
    blt        ZERO,   I,      .L_2II1
.L_N1: /* optional single trailing line: only present when N is odd */
    andi       J,      N,      0x01
    beq        ZERO,   J,      .L_N0
    move       S1,     TS
    beq        ZERO,   M,      .L_N0
.L_M1: /* straight copy of M doubles; M is consumed as the counter */
    fld.d      F0,     S1,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F0,     TD,     0x00
    addi.d     TD,     TD,     0x08
    addi.d     M,      M,      -1
    blt        ZERO,   M,      .L_M1
.L_N0:
    jirl       $r0,    $r1,    0x00
EPILOGUE
@@ -0,0 +1,283 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2023, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "loongarch64_asm.S" | |||||
/* Function parameters */ | |||||
/* Incoming integer arguments, in the registers this kernel reads them from. */
#define M $r4 // param 1: m | |||||
#define N $r5 // param 2: n | |||||
#define SRC $r6 // param 3: src | |||||
#define LDA $r7 // param 4: lda | |||||
#define DST $r8 // param 5: dst | |||||
/* Loop counters: I is derived from M, J is derived from N. */
#define I $r9 | |||||
#define J $r10 | |||||
/* Source cursors, one per source line of the current group (up to 8 lines, */
/* TL bytes apart). */
#define S1 $r12 | |||||
#define S2 $r13 | |||||
#define S3 $r14 | |||||
#define S4 $r15 | |||||
#define S5 $r16 | |||||
#define S6 $r17 | |||||
#define S7 $r18 | |||||
#define S8 $r19 | |||||
/* TD: running destination pointer. TS: start of the next unprocessed group. */
#define TD $r20 | |||||
#define TS $r21 | |||||
/* TL aliases the LDA register; the kernel overwrites it with lda << 3 (bytes). */
#define TL $r7 | |||||
/* T0 aliases the SRC register; it is written with 2 * TL only after SRC has */
/* been copied into TS. */
#define T0 $r6 | |||||
#define ZERO $r0 | |||||
/* Scalar FP registers for the one-element-at-a-time tail loops. */
#define F0 $f0 | |||||
#define F1 $f1 | |||||
#define F2 $f2 | |||||
#define F3 $f3 | |||||
#define F4 $f4 | |||||
#define F5 $f5 | |||||
#define F6 $f6 | |||||
#define F7 $f7 | |||||
/* LSX vectors */ | |||||
/* U*: vectors loaded from the source; D*: vectors produced by GINTERLACE and */
/* stored to the destination. */
#define U0 $vr0 | |||||
#define U1 $vr1 | |||||
#define U2 $vr2 | |||||
#define U3 $vr3 | |||||
#define U4 $vr4 | |||||
#define U5 $vr5 | |||||
#define U6 $vr6 | |||||
#define U7 $vr7 | |||||
#define D0 $vr8 | |||||
#define D1 $vr9 | |||||
#define D2 $vr10 | |||||
#define D3 $vr11 | |||||
#define D4 $vr12 | |||||
#define D5 $vr13 | |||||
#define D6 $vr14 | |||||
#define D7 $vr15 | |||||
/*
 * DGEMM ncopy packing kernel, group width 8 (LSX).
 *
 * Inputs: M, N, SRC, LDA (in doubles), DST - see the register #defines.
 *
 * The source is consumed as N lines that are LDA doubles apart. Lines are
 * handled in groups of 8, then one group of 4 (if N & 4), one group of 2
 * (if N & 2) and one single line (if N & 1). For every group, element i of
 * each line in the group is written next to each other, so DST is filled
 * strictly sequentially and exactly M * N doubles are written.
 *
 * push_if_used / pop_if_used, GLD, GINTERLACE and GST are helper macros from
 * loongarch64_asm.S; GINTERLACE v, d is assumed to yield the low-half and
 * high-half 64-bit interleaves of its two inputs.
 *
 * Fix versus the previous revision: the single-line tail at .L_N1 now runs
 * only when N is odd. It used to run unconditionally, which for even N read
 * M doubles from one line past the matrix and wrote M doubles past the end
 * of the packed output.
 */
PROLOGUE
    push_if_used 26, 32
    move       TD,     DST
    move       TS,     SRC                 /* keep SRC: T0 reuses its register */
    slli.d     TL,     LDA,    0x03        /* TL = lda * 8 bytes */
    slli.d     T0,     TL,     0x01        /* T0 = 2 * TL */
    srai.d     J,      N,      0x03        /* J = N / 8 groups */
    beq        J,      ZERO,   .L_N4
.L_J1: /* J-- : one group of 8 source lines */
    move       S1,     TS
    add.d      S2,     TS,     TL
    srai.d     I,      M,      0x03        /* I = M / 8 */
    add.d      S3,     S2,     TL
    addi.d     J,      J,      -1
    add.d      S4,     S3,     TL
    add.d      S5,     S3,     T0
    add.d      S6,     S4,     T0
    add.d      S7,     S5,     T0
    add.d      S8,     S6,     T0
    add.d      TS,     S7,     T0          /* TS += 8 * TL */
    beq        I,      ZERO,   .L_I7
.L_I1: /* I-- : 8 elements of each line, two elements per round */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \
             U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d     TD,     TD,     0x80
    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \
             U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d     TD,     TD,     0x80
    GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \
             U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d     TD,     TD,     0x80
    GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \
             U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d     TD,     TD,     0x80
    addi.d     S1,     S1,     0x40
    addi.d     S2,     S2,     0x40
    addi.d     S3,     S3,     0x40
    addi.d     S4,     S4,     0x40
    addi.d     S5,     S5,     0x40
    addi.d     S6,     S6,     0x40
    addi.d     S7,     S7,     0x40
    addi.d     S8,     S8,     0x40
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_I1
.L_I7: /* remaining M % 8 elements of the 8-line group */
    andi       I,      M,      0x07
    beq        I,      ZERO,   .L_I0
.L_II1: /* I-- */
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fld.d      F2,     S3,     0x00
    fld.d      F3,     S4,     0x00
    fld.d      F4,     S5,     0x00
    fld.d      F5,     S6,     0x00
    fld.d      F6,     S7,     0x00
    fld.d      F7,     S8,     0x00
    fst.d      F0,     TD,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F1,     TD,     0x08
    addi.d     S2,     S2,     0x08
    fst.d      F2,     TD,     0x10
    addi.d     S3,     S3,     0x08
    fst.d      F3,     TD,     0x18
    addi.d     S4,     S4,     0x08
    fst.d      F4,     TD,     0x20
    addi.d     S5,     S5,     0x08
    fst.d      F5,     TD,     0x28
    addi.d     S6,     S6,     0x08
    fst.d      F6,     TD,     0x30
    addi.d     S7,     S7,     0x08
    fst.d      F7,     TD,     0x38
    addi.d     S8,     S8,     0x08
    addi.d     TD,     TD,     0x40
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_II1
.L_I0:
    blt        ZERO,   J,      .L_J1
.L_N4: /* optional group of 4 source lines */
    andi       J,      N,      0x04
    beq        ZERO,   J,      .L_N2
    move       S1,     TS
    add.d      S2,     TS,     TL
    srai.d     I,      M,      0x02        /* I = M / 4 */
    add.d      S3,     S2,     TL
    add.d      S4,     S2,     T0
    add.d      TS,     S3,     T0          /* TS += 4 * TL */
    beq        I,      ZERO,   .L_I3
.L_4I1: /* I-- : 4 elements of each of the 4 lines */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d     TD,     TD,     0x40
    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d     S1,     S1,     0x20
    addi.d     S2,     S2,     0x20
    addi.d     S3,     S3,     0x20
    addi.d     S4,     S4,     0x20
    addi.d     TD,     TD,     0x40
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_4I1
.L_I3: /* remaining M % 4 elements of the 4-line group */
    andi       I,      M,      0x03
    beq        I,      ZERO,   .L_N2
.L_4II1:
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fld.d      F2,     S3,     0x00
    fld.d      F3,     S4,     0x00
    fst.d      F0,     TD,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F1,     TD,     0x08
    addi.d     S2,     S2,     0x08
    fst.d      F2,     TD,     0x10
    addi.d     S3,     S3,     0x08
    fst.d      F3,     TD,     0x18
    addi.d     S4,     S4,     0x08
    addi.d     TD,     TD,     0x20
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_4II1
.L_N2: /* optional group of 2 source lines */
    andi       J,      N,      0x02
    beq        ZERO,   J,      .L_N1
    move       S1,     TS
    add.d      S2,     TS,     TL
    srai.d     I,      M,      0x01        /* I = M / 2 */
    add.d      TS,     S2,     TL          /* TS += 2 * TL */
    beq        I,      ZERO,   .L_NI1
.L_2I1: /* I-- : 2 elements of each of the 2 lines */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00
    GINTERLACE v, d, D0, D1, U1, U0
    GST v, , D0, TD, 0x00, D1, TD, 0x10
    addi.d     S1,     S1,     0x10
    addi.d     S2,     S2,     0x10
    addi.d     TD,     TD,     0x20
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_2I1
.L_NI1: /* remaining M % 2 element of the 2-line group */
    andi       I,      M,      0x01
    beq        I,      ZERO,   .L_N1
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fst.d      F0,     TD,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F1,     TD,     0x08
    addi.d     S2,     S2,     0x08
    addi.d     TD,     TD,     0x10
.L_N1: /* optional single trailing line: only present when N is odd */
    andi       J,      N,      0x01
    beq        ZERO,   J,      .L_N0
    move       S1,     TS
    beq        ZERO,   M,      .L_N0
.L_M1: /* straight copy of M doubles; M is consumed as the counter */
    fld.d      F0,     S1,     0x00
    addi.d     S1,     S1,     0x08
    fst.d      F0,     TD,     0x00
    addi.d     TD,     TD,     0x08
    addi.d     M,      M,      -1
    blt        ZERO,   M,      .L_M1
.L_N0:
    pop_if_used 26, 32
    jirl       $r0,    $r1,    0x00
EPILOGUE
@@ -0,0 +1,280 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2023, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "loongarch64_asm.S" | |||||
/* Function parameters */ | |||||
/* Incoming integer arguments, in the registers this kernel reads them from. */
#define M $r4 // param 1: m | |||||
#define N $r5 // param 2: n | |||||
#define SRC $r6 // param 3: src | |||||
#define LDA $r7 // param 4: lda | |||||
#define DST $r8 // param 5: dst | |||||
/* Loop counters: I is derived from N, J is derived from M. */
#define I $r9 | |||||
#define J $r10 | |||||
/* S0: start of the next unprocessed group of source lines. */
/* S1..S4: cursors into the (up to 4) source lines of the current group. */
#define S0 $r11 | |||||
#define S1 $r12 | |||||
#define S2 $r13 | |||||
#define S3 $r14 | |||||
#define S4 $r15 | |||||
/* P0: destination for the next group in the 4-wide area; P1: cursor inside it. */
/* P2: destination area for the N & 2 tail; P3: destination area for the N & 1 tail. */
#define P0 $r16 | |||||
#define P1 $r17 | |||||
#define P2 $r18 | |||||
#define P3 $r19 | |||||
/* T0 / T1: temporaries for the P2 / P3 offsets; later T0 = 2 * TL and */
/* T1 = M << 5 (the step applied to P1 between 4-wide blocks). */
#define T0 $r20 | |||||
#define T1 $r23 | |||||
/* TL aliases the LDA register; the kernel overwrites it with lda << 3 (bytes). */
#define TL $r7 | |||||
#define ZERO $r0 | |||||
/* Scalar FP registers for the single-element tails. */
#define F0 $f0 | |||||
#define F1 $f1 | |||||
#define F2 $f2 | |||||
#define F3 $f3 | |||||
/* LSX vectors */ | |||||
/* Used as plain 16-byte copy buffers between vld and vst. */
#define U0 $vr0 | |||||
#define U1 $vr1 | |||||
#define U2 $vr2 | |||||
#define U3 $vr3 | |||||
#define U4 $vr4 | |||||
#define U5 $vr5 | |||||
#define U6 $vr6 | |||||
#define U7 $vr7 | |||||
/*
 * DGEMM tcopy packing kernel, block width 4 (LSX).
 *
 * Inputs: M, N, SRC, LDA (in doubles), DST - see the register #defines.
 *
 * The source is consumed as M lines that are LDA doubles apart, N doubles
 * from each. Lines are handled in groups of 4, then 2 (if M & 2), then 1
 * (if M & 1). The destination is split into three areas:
 *   P0 area, from DST                    : 4-wide pieces of every line;
 *                                          consecutive pieces of the same
 *                                          group are M * 4 doubles apart
 *   P2 area, from DST + M * (N & ~3) * 8 : 2-wide pieces (when N & 2)
 *   P3 area, from DST + M * (N & ~1) * 8 : 1-wide pieces (when N & 1)
 *
 * push_if_used / pop_if_used are helper macros from loongarch64_asm.S.
 *
 * Fix versus the previous revision: the N & 1 test of the 2-line group
 * (.L_2N1) used addi.d instead of andi, so I was N + 1 and the tail copy
 * ran even for even N, reading past the consumed columns and writing two
 * stray doubles at P3.
 */
PROLOGUE
    push_if_used 18, 8
    move       S0,     SRC
    move       P0,     DST
    /* P2 = DST + M * (N & ~3) * 8, P3 = DST + M * (N & ~1) * 8 */
    srai.d     T0,     N,      0x02
    slli.d     T0,     T0,     0x02
    srai.d     T1,     N,      0x01
    slli.d     T1,     T1,     0x01
    mul.d      T0,     M,      T0
    mul.d      T1,     M,      T1
    slli.d     T0,     T0,     0x03
    slli.d     T1,     T1,     0x03
    add.d      P2,     DST,    T0
    add.d      P3,     DST,    T1
    slli.d     TL,     LDA,    0x03        /* TL = lda * 8 bytes */
    srai.d     J,      M,      0x02        /* J = M / 4 groups */
    slli.d     T0,     TL,     0x01        /* T0 = 2 * TL */
    slli.d     T1,     M,      0x05        /* T1 = M * 4 * 8: P1 step per block */
    beq        ZERO,   J,      .L_M3
.L_J1: /* J-- : one group of 4 source lines */
    move       S1,     S0
    add.d      S2,     S0,     TL
    add.d      S3,     S1,     T0
    add.d      S4,     S2,     T0
    add.d      S0,     S3,     T0          /* S0 += 4 * TL */
    move       P1,     P0
    addi.d     P0,     P0,     0x80
    srai.d     I,      N,      0x02        /* I = N / 4 */
    addi.d     J,      J,      -1
    beq        ZERO,   I,      .L_N3
.L_I1: /* I-- : copy a 4 x 4 block */
    vld        U0,     S1,     0x00
    vld        U1,     S1,     0x10
    vld        U2,     S2,     0x00
    vld        U3,     S2,     0x10
    vld        U4,     S3,     0x00
    vld        U5,     S3,     0x10
    vld        U6,     S4,     0x00
    vld        U7,     S4,     0x10
    vst        U0,     P1,     0x00
    vst        U1,     P1,     0x10
    vst        U2,     P1,     0x20
    vst        U3,     P1,     0x30
    vst        U4,     P1,     0x40
    vst        U5,     P1,     0x50
    vst        U6,     P1,     0x60
    vst        U7,     P1,     0x70
    addi.d     S1,     S1,     0x20
    addi.d     S2,     S2,     0x20
    addi.d     S3,     S3,     0x20
    addi.d     S4,     S4,     0x20
    add.d      P1,     P1,     T1
    addi.d     I,      I,      -1
    blt        ZERO,   I,      .L_I1
.L_N3: /* 2-wide tail of the 4-line group */
    andi       I,      N,      0x02
    beq        ZERO,   I,      .L_N1
    vld        U0,     S1,     0x00
    vld        U1,     S2,     0x00
    vld        U2,     S3,     0x00
    vld        U3,     S4,     0x00
    vst        U0,     P2,     0x00
    vst        U1,     P2,     0x10
    vst        U2,     P2,     0x20
    vst        U3,     P2,     0x30
    addi.d     S1,     S1,     0x10
    addi.d     S2,     S2,     0x10
    addi.d     S3,     S3,     0x10
    addi.d     S4,     S4,     0x10
    addi.d     P2,     P2,     0x40
.L_N1: /* 1-wide tail of the 4-line group */
    andi       I,      N,      0x01
    beq        ZERO,   I,      .L_N0
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fld.d      F2,     S3,     0x00
    fld.d      F3,     S4,     0x00
    fst.d      F0,     P3,     0x00
    fst.d      F1,     P3,     0x08
    fst.d      F2,     P3,     0x10
    fst.d      F3,     P3,     0x18
    addi.d     S1,     S1,     0x08
    addi.d     S2,     S2,     0x08
    addi.d     S3,     S3,     0x08
    addi.d     S4,     S4,     0x08
    addi.d     P3,     P3,     0x20
.L_N0:
    blt        ZERO,   J,      .L_J1
.L_M3: /* optional group of 2 source lines */
    andi       J,      M,      0x02
    beq        ZERO,   J,      .L_M1
    move       S1,     S0
    add.d      S2,     S0,     TL
    add.d      S0,     S0,     T0          /* S0 += 2 * TL */
    move       P1,     P0
    addi.d     P0,     P0,     0x40
    srai.d     I,      N,      0x02        /* I = N / 4 */
    beq        ZERO,   I,      .L_2N3
.L_2I1: /* I-- : copy a 2 x 4 block */
    vld        U0,     S1,     0x00
    vld        U1,     S1,     0x10
    vld        U2,     S2,     0x00
    vld        U3,     S2,     0x10
    vst        U0,     P1,     0x00
    vst        U1,     P1,     0x10
    vst        U2,     P1,     0x20
    vst        U3,     P1,     0x30
    addi.d     S1,     S1,     0x20
    addi.d     S2,     S2,     0x20
    addi.d     I,      I,      -1
    add.d      P1,     P1,     T1
    blt        ZERO,   I,      .L_2I1
.L_2N3: /* 2-wide tail of the 2-line group */
    andi       I,      N,      0x02
    beq        ZERO,   I,      .L_2N1
    vld        U0,     S1,     0x00
    vld        U1,     S2,     0x00
    vst        U0,     P2,     0x00
    vst        U1,     P2,     0x10
    addi.d     S1,     S1,     0x10
    addi.d     S2,     S2,     0x10
    addi.d     P2,     P2,     0x20
.L_2N1: /* 1-wide tail of the 2-line group: test the low bit of N */
    andi       I,      N,      0x01
    beq        ZERO,   I,      .L_M1
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S2,     0x00
    fst.d      F0,     P3,     0x00
    fst.d      F1,     P3,     0x08
    addi.d     S1,     S1,     0x08
    addi.d     S2,     S2,     0x08
    addi.d     P3,     P3,     0x10
.L_M1: /* optional single trailing source line */
    andi       J,      M,      0x01
    beq        ZERO,   J,      .L_M0
    move       S1,     S0
    move       P1,     P0
    srai.d     I,      N,      0x02        /* I = N / 4 */
    beq        ZERO,   I,      .L_1N3
.L_1I1: /* I-- : copy a 1 x 4 block */
    vld        U0,     S1,     0x00
    vld        U1,     S1,     0x10
    vst        U0,     P1,     0x00
    vst        U1,     P1,     0x10
    addi.d     S1,     S1,     0x20
    addi.d     I,      I,      -1
    add.d      P1,     P1,     T1
    blt        ZERO,   I,      .L_1I1
.L_1N3: /* 2-wide tail of the single line */
    andi       I,      N,      0x02
    beq        I,      ZERO,   .L_1N1
    fld.d      F0,     S1,     0x00
    fld.d      F1,     S1,     0x08
    fst.d      F0,     P2,     0x00
    fst.d      F1,     P2,     0x08
    addi.d     S1,     S1,     0x10
    addi.d     P2,     P2,     0x10
.L_1N1: /* 1-wide tail of the single line */
    andi       I,      N,      0x01
    beq        I,      ZERO,   .L_M0
    fld.d      F0,     S1,     0x00
    fst.d      F0,     P3,     0x00
.L_M0:
    pop_if_used 18, 8
    jirl       $r0,    $r1,    0x00
EPILOGUE
@@ -0,0 +1,597 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2023, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#include "loongarch64_asm.S" | |||||
/* Function parameters */ | |||||
/* Incoming integer arguments, in the registers this kernel reads them from. */
#define M $r4 // param 1: m | |||||
#define N $r5 // param 2: n | |||||
#define SRC $r6 // param 3: src | |||||
#define LDA $r7 // param 4: lda | |||||
#define DST $r8 // param 5: dst | |||||
/* Loop counters: I is derived from N, J is derived from M. */
#define I $r9 | |||||
#define J $r10 | |||||
/* S0: start of the next unprocessed group of source lines. */
/* S1..S8: cursors into the (up to 8) source lines of the current group. */
#define S0 $r11 | |||||
#define S1 $r12 | |||||
#define S2 $r13 | |||||
#define S3 $r14 | |||||
#define S4 $r15 | |||||
#define S5 $r16 | |||||
#define S6 $r17 | |||||
#define S7 $r18 | |||||
#define S8 $r19 | |||||
/* P0: destination for the next group in the 8-wide area; P1: cursor inside it. */
/* P2 / P3 / P4: destination areas for the N & 4, N & 2 and N & 1 tails. */
/* NOTE(review): no use of P5 is visible in the part of the kernel reviewed */
/* here - confirm against the rest of the file. */
#define P0 $r20 | |||||
#define P1 $r23 | |||||
#define P2 $r24 | |||||
#define P3 $r25 | |||||
#define P4 $r26 | |||||
#define P5 $r27 | |||||
/* T0 / T1: temporaries for the tail-area offsets; later T0 = 2 * TL and */
/* T1 = M << 6 (the step applied to P1 between 8-wide blocks). */
#define T0 $r28 | |||||
#define T1 $r29 | |||||
/* TL aliases the LDA register; the kernel overwrites it with lda << 3 (bytes). */
#define TL $r7 | |||||
#define ZERO $r0 | |||||
/* Scalar FP registers for the single-element tails. */
#define F0 $f0 | |||||
#define F1 $f1 | |||||
#define F2 $f2 | |||||
#define F3 $f3 | |||||
#define F4 $f4 | |||||
#define F5 $f5 | |||||
#define F6 $f6 | |||||
#define F7 $f7 | |||||
/* LSX vectors ($vr registers), used as 16-byte copy buffers between vld and vst */ | |||||
#define U0 $vr0 | |||||
#define U1 $vr1 | |||||
#define U2 $vr2 | |||||
#define U3 $vr3 | |||||
#define U4 $vr4 | |||||
#define U5 $vr5 | |||||
#define U6 $vr6 | |||||
#define U7 $vr7 | |||||
PROLOGUE | |||||
push_if_used 24, 8 | |||||
move S0, SRC | |||||
move P0, DST | |||||
srai.d T0, N, 0x03 | |||||
srai.d T1, N, 0x02 | |||||
slli.d T0, T0, 0x03 | |||||
slli.d T1, T1, 0x02 | |||||
mul.d P2, M, T0 | |||||
mul.d P3, M, T1 | |||||
slli.d P2, P2, 0x03 | |||||
slli.d P3, P3, 0x03 | |||||
add.d P2, DST, P2 | |||||
add.d P3, DST, P3 | |||||
srai.d T0, N, 0x01 | |||||
slli.d T0, T0, 0x01 | |||||
mul.d P4, M, T0 | |||||
slli.d P4, P4, 0x03 | |||||
add.d P4, DST, P4 | |||||
slli.d TL, LDA, 0x03 | |||||
srai.d J, M, 0x03 | |||||
slli.d T0, TL, 0x01 | |||||
slli.d T1, M, 0x06 | |||||
beq ZERO, J, .L_M7 | |||||
.L_J1: /* J-- */ | |||||
move S1, S0 | |||||
add.d S2, S0, TL | |||||
add.d S3, S1, T0 | |||||
add.d S4, S2, T0 | |||||
add.d S5, S3, T0 | |||||
add.d S6, S4, T0 | |||||
add.d S7, S5, T0 | |||||
add.d S8, S6, T0 | |||||
add.d S0, S7, T0 | |||||
move P1, P0 | |||||
addi.d P0, P0, 0x200 | |||||
srai.d I, N, 0x03 | |||||
addi.d J, J, -1 | |||||
beq ZERO, I, .L_N7 | |||||
.L_I1: /* I-- */ | |||||
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S1, 0x20 | |||||
vld U3, S1, 0x30 | |||||
vld U4, S2, 0x00 | |||||
vld U5, S2, 0x10 | |||||
vld U6, S2, 0x20 | |||||
vld U7, S2, 0x30 | |||||
vst U0, P1, 0x00 | |||||
vst U1, P1, 0x10 | |||||
vst U2, P1, 0x20 | |||||
vst U3, P1, 0x30 | |||||
vst U4, P1, 0x40 | |||||
vst U5, P1, 0x50 | |||||
vst U6, P1, 0x60 | |||||
vst U7, P1, 0x70 | |||||
vld U0, S3, 0x00 | |||||
vld U1, S3, 0x10 | |||||
vld U2, S3, 0x20 | |||||
vld U3, S3, 0x30 | |||||
vld U4, S4, 0x00 | |||||
vld U5, S4, 0x10 | |||||
vld U6, S4, 0x20 | |||||
vld U7, S4, 0x30 | |||||
vst U0, P1, 0x80 | |||||
vst U1, P1, 0x90 | |||||
vst U2, P1, 0xa0 | |||||
vst U3, P1, 0xb0 | |||||
vst U4, P1, 0xc0 | |||||
vst U5, P1, 0xd0 | |||||
vst U6, P1, 0xe0 | |||||
vst U7, P1, 0xf0 | |||||
vld U0, S5, 0x00 | |||||
vld U1, S5, 0x10 | |||||
vld U2, S5, 0x20 | |||||
vld U3, S5, 0x30 | |||||
vld U4, S6, 0x00 | |||||
vld U5, S6, 0x10 | |||||
vld U6, S6, 0x20 | |||||
vld U7, S6, 0x30 | |||||
vst U0, P1, 0x100 | |||||
vst U1, P1, 0x110 | |||||
vst U2, P1, 0x120 | |||||
vst U3, P1, 0x130 | |||||
vst U4, P1, 0x140 | |||||
vst U5, P1, 0x150 | |||||
vst U6, P1, 0x160 | |||||
vst U7, P1, 0x170 | |||||
vld U0, S7, 0x00 | |||||
vld U1, S7, 0x10 | |||||
vld U2, S7, 0x20 | |||||
vld U3, S7, 0x30 | |||||
vld U4, S8, 0x00 | |||||
vld U5, S8, 0x10 | |||||
vld U6, S8, 0x20 | |||||
vld U7, S8, 0x30 | |||||
vst U0, P1, 0x180 | |||||
vst U1, P1, 0x190 | |||||
vst U2, P1, 0x1a0 | |||||
vst U3, P1, 0x1b0 | |||||
vst U4, P1, 0x1c0 | |||||
vst U5, P1, 0x1d0 | |||||
vst U6, P1, 0x1e0 | |||||
vst U7, P1, 0x1f0 | |||||
addi.d S1, S1, 0x40 | |||||
addi.d S2, S2, 0x40 | |||||
addi.d S3, S3, 0x40 | |||||
addi.d S4, S4, 0x40 | |||||
addi.d S5, S5, 0x40 | |||||
addi.d S6, S6, 0x40 | |||||
addi.d S7, S7, 0x40 | |||||
addi.d S8, S8, 0x40 | |||||
addi.d I, I, -1 | |||||
add.d P1, P1, T1 | |||||
blt ZERO, I, .L_I1 | |||||
.L_N7: | |||||
andi I, N, 0x04 | |||||
beq ZERO, I, .L_N3 | |||||
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S2, 0x00 | |||||
vld U3, S2, 0x10 | |||||
vld U4, S3, 0x00 | |||||
vld U5, S3, 0x10 | |||||
vld U6, S4, 0x00 | |||||
vld U7, S4, 0x10 | |||||
vst U0, P2, 0x00 | |||||
vst U1, P2, 0x10 | |||||
vst U2, P2, 0x20 | |||||
vst U3, P2, 0x30 | |||||
vst U4, P2, 0x40 | |||||
vst U5, P2, 0x50 | |||||
vst U6, P2, 0x60 | |||||
vst U7, P2, 0x70 | |||||
vld U0, S5, 0x00 | |||||
vld U1, S5, 0x10 | |||||
vld U2, S6, 0x00 | |||||
vld U3, S6, 0x10 | |||||
vld U4, S7, 0x00 | |||||
vld U5, S7, 0x10 | |||||
vld U6, S8, 0x00 | |||||
vld U7, S8, 0x10 | |||||
vst U0, P2, 0x80 | |||||
vst U1, P2, 0x90 | |||||
vst U2, P2, 0xa0 | |||||
vst U3, P2, 0xb0 | |||||
vst U4, P2, 0xc0 | |||||
vst U5, P2, 0xd0 | |||||
vst U6, P2, 0xe0 | |||||
vst U7, P2, 0xf0 | |||||
addi.d S1, S1, 0x20 | |||||
addi.d S2, S2, 0x20 | |||||
addi.d S3, S3, 0x20 | |||||
addi.d S4, S4, 0x20 | |||||
addi.d S5, S5, 0x20 | |||||
addi.d S6, S6, 0x20 | |||||
addi.d S7, S7, 0x20 | |||||
addi.d S8, S8, 0x20 | |||||
addi.d P2, P2, 0x100 | |||||
.L_N3: | |||||
andi I, N, 0x02 | |||||
beq ZERO, I, .L_N1 | |||||
vld U0, S1, 0x00 | |||||
vld U1, S2, 0x00 | |||||
vld U2, S3, 0x00 | |||||
vld U3, S4, 0x00 | |||||
vld U4, S5, 0x00 | |||||
vld U5, S6, 0x00 | |||||
vld U6, S7, 0x00 | |||||
vld U7, S8, 0x00 | |||||
vst U0, P3, 0x00 | |||||
vst U1, P3, 0x10 | |||||
vst U2, P3, 0x20 | |||||
vst U3, P3, 0x30 | |||||
vst U4, P3, 0x40 | |||||
vst U5, P3, 0x50 | |||||
vst U6, P3, 0x60 | |||||
vst U7, P3, 0x70 | |||||
addi.d S1, S1, 0x10 | |||||
addi.d S2, S2, 0x10 | |||||
addi.d S3, S3, 0x10 | |||||
addi.d S4, S4, 0x10 | |||||
addi.d S5, S5, 0x10 | |||||
addi.d S6, S6, 0x10 | |||||
addi.d S7, S7, 0x10 | |||||
addi.d S8, S8, 0x10 | |||||
addi.d P3, P3, 0x80 | |||||
.L_N1: | |||||
andi I, N, 0x01 | |||||
beq ZERO, I, .L_N0 | |||||
fld.d F0, S1, 0x00 | |||||
fld.d F1, S2, 0x00 | |||||
fld.d F2, S3, 0x00 | |||||
fld.d F3, S4, 0x00 | |||||
fld.d F4, S5, 0x00 | |||||
fld.d F5, S6, 0x00 | |||||
fld.d F6, S7, 0x00 | |||||
fld.d F7, S8, 0x00 | |||||
fst.d F0, P4, 0x00 | |||||
fst.d F1, P4, 0x08 | |||||
fst.d F2, P4, 0x10 | |||||
fst.d F3, P4, 0x18 | |||||
fst.d F4, P4, 0x20 | |||||
fst.d F5, P4, 0x28 | |||||
fst.d F6, P4, 0x30 | |||||
fst.d F7, P4, 0x38 | |||||
addi.d S1, S1, 0x08 | |||||
addi.d S2, S2, 0x08 | |||||
addi.d S3, S3, 0x08 | |||||
addi.d S4, S4, 0x08 | |||||
addi.d S5, S5, 0x08 | |||||
addi.d S6, S6, 0x08 | |||||
addi.d S7, S7, 0x08 | |||||
addi.d S8, S8, 0x08 | |||||
addi.d P4, P4, 0x40 | |||||
.L_N0: | |||||
blt ZERO, J, .L_J1 | |||||
.L_M7: | |||||
andi J, M, 0x04 | |||||
beq ZERO, J, .L_M3 | |||||
move S1, S0 | |||||
add.d S2, S0, TL | |||||
add.d S3, S1, T0 | |||||
add.d S4, S2, T0 | |||||
add.d S0, S3, T0 | |||||
move P1, P0 | |||||
addi.d P0, P0, 0x100 | |||||
srai.d I, N, 0x03 | |||||
beq ZERO, I, .L_4N7 | |||||
/* Main loop of the four-row block, executed I times. Each pass copies
   0x40 bytes from each of S1..S4 into a 0x100-byte group at P1:
   S1 -> +0x00, S2 -> +0x40, S3 -> +0x80, S4 -> +0xc0. */
.L_4I1: /* I-- */ | |||||
/* First half: rows S1 and S2. */
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S1, 0x20 | |||||
vld U3, S1, 0x30 | |||||
vld U4, S2, 0x00 | |||||
vld U5, S2, 0x10 | |||||
vld U6, S2, 0x20 | |||||
vld U7, S2, 0x30 | |||||
vst U0, P1, 0x00 | |||||
vst U1, P1, 0x10 | |||||
vst U2, P1, 0x20 | |||||
vst U3, P1, 0x30 | |||||
vst U4, P1, 0x40 | |||||
vst U5, P1, 0x50 | |||||
vst U6, P1, 0x60 | |||||
vst U7, P1, 0x70 | |||||
/* Second half: rows S3 and S4, reusing U0..U7 after they were stored. */
vld U0, S3, 0x00 | |||||
vld U1, S3, 0x10 | |||||
vld U2, S3, 0x20 | |||||
vld U3, S3, 0x30 | |||||
vld U4, S4, 0x00 | |||||
vld U5, S4, 0x10 | |||||
vld U6, S4, 0x20 | |||||
vld U7, S4, 0x30 | |||||
vst U0, P1, 0x80 | |||||
vst U1, P1, 0x90 | |||||
vst U2, P1, 0xa0 | |||||
vst U3, P1, 0xb0 | |||||
vst U4, P1, 0xc0 | |||||
vst U5, P1, 0xd0 | |||||
vst U6, P1, 0xe0 | |||||
vst U7, P1, 0xf0 | |||||
/* Step each row pointer past the 0x40 bytes consumed. */
addi.d S1, S1, 0x40 | |||||
addi.d S2, S2, 0x40 | |||||
addi.d S3, S3, 0x40 | |||||
addi.d S4, S4, 0x40 | |||||
addi.d I, I, -1 | |||||
/* P1 advances by T1 per pass; T1 is set outside this excerpt. */
add.d P1, P1, T1 | |||||
/* Repeat while 0 < I. */
blt ZERO, I, .L_4I1 | |||||
/* N & 4 tail of the four-row block: copy 0x20 bytes from each of S1..S4
   into a contiguous 0x80-byte group at P2. */
.L_4N7: | |||||
andi I, N, 0x04 | |||||
/* Bit 2 of N clear: skip to the N & 2 tail. */
beq ZERO, I, .L_4N3 | |||||
/* Two vector loads per row. */
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S2, 0x00 | |||||
vld U3, S2, 0x10 | |||||
vld U4, S3, 0x00 | |||||
vld U5, S3, 0x10 | |||||
vld U6, S4, 0x00 | |||||
vld U7, S4, 0x10 | |||||
/* Rows are packed in order S1, S2, S3, S4, 0x20 bytes each. */
vst U0, P2, 0x00 | |||||
vst U1, P2, 0x10 | |||||
vst U2, P2, 0x20 | |||||
vst U3, P2, 0x30 | |||||
vst U4, P2, 0x40 | |||||
vst U5, P2, 0x50 | |||||
vst U6, P2, 0x60 | |||||
vst U7, P2, 0x70 | |||||
/* Step each row pointer past the 0x20 bytes consumed. */
addi.d S1, S1, 0x20 | |||||
addi.d S2, S2, 0x20 | |||||
addi.d S3, S3, 0x20 | |||||
addi.d S4, S4, 0x20 | |||||
/* Advance P2 past the 0x80 bytes written above. */
addi.d P2, P2, 0x80 | |||||
/* N & 2 tail of the four-row block: copy 0x10 bytes from each of S1..S4
   into a contiguous 0x40-byte group at P3. */
.L_4N3: | |||||
andi I, N, 0x02 | |||||
/* Bit 1 of N clear: skip to the N & 1 tail. */
beq ZERO, I, .L_4N1 | |||||
/* One vector load per row. */
vld U0, S1, 0x00 | |||||
vld U1, S2, 0x00 | |||||
vld U2, S3, 0x00 | |||||
vld U3, S4, 0x00 | |||||
/* Rows are packed in order S1, S2, S3, S4, 0x10 bytes each. */
vst U0, P3, 0x00 | |||||
vst U1, P3, 0x10 | |||||
vst U2, P3, 0x20 | |||||
vst U3, P3, 0x30 | |||||
/* Step each row pointer past the 0x10 bytes consumed. */
addi.d S1, S1, 0x10 | |||||
addi.d S2, S2, 0x10 | |||||
addi.d S3, S3, 0x10 | |||||
addi.d S4, S4, 0x10 | |||||
/* Advance P3 past the 0x40 bytes written above. */
addi.d P3, P3, 0x40 | |||||
/* N & 1 tail of the four-row block: copy one 8-byte double from each of
   S1..S4 into a contiguous 0x20-byte group at P4. */
.L_4N1: | |||||
andi I, N, 0x01 | |||||
/* Bit 0 of N clear: the four-row block is done, go to the two-row block. */
beq ZERO, I, .L_M3 | |||||
/* One scalar load per row. */
fld.d F0, S1, 0x00 | |||||
fld.d F1, S2, 0x00 | |||||
fld.d F2, S3, 0x00 | |||||
fld.d F3, S4, 0x00 | |||||
/* Rows are packed in order S1, S2, S3, S4, 8 bytes each. */
fst.d F0, P4, 0x00 | |||||
fst.d F1, P4, 0x08 | |||||
fst.d F2, P4, 0x10 | |||||
fst.d F3, P4, 0x18 | |||||
/* Step each row pointer past the 8 bytes consumed. */
addi.d S1, S1, 0x08 | |||||
addi.d S2, S2, 0x08 | |||||
addi.d S3, S3, 0x08 | |||||
addi.d S4, S4, 0x08 | |||||
/* Advance P4 past the 0x20 bytes written above. */
addi.d P4, P4, 0x20 | |||||
/* Two-row remainder: runs only when bit 1 of M is set. Sets up two row
   pointers and the destination cursor P1, then computes the trip count of
   the 0x40-bytes-per-row main loop. */
.L_M3: | |||||
andi J, M, 0x02 | |||||
/* Bit 1 of M clear: skip to the single-row block. */
beq ZERO, J, .L_M1 | |||||
/* S1 = S0, S2 = S0 + TL, then S0 moves on by T0.
   NOTE(review): TL and T0 are set outside this excerpt; presumably one
   and two row strides in bytes - confirm. */
move S1, S0 | |||||
add.d S2, S0, TL | |||||
add.d S0, S0, T0 | |||||
/* P1 is this block's working cursor; P0 moves on by 0x80, the number of
   bytes one .L_2I1 iteration stores. */
move P1, P0 | |||||
addi.d P0, P0, 0x80 | |||||
/* I = N >> 3 (arithmetic shift); zero means no full main-loop pass. */
srai.d I, N, 0x03 | |||||
beq ZERO, I, .L_2N7 | |||||
/* Main loop of the two-row block, executed I times. Each pass copies 0x40
   bytes from S1 to P1 + 0x00 and 0x40 bytes from S2 to P1 + 0x40. */
.L_2I1: /* I-- */ | |||||
/* Four vector loads per row. */
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S1, 0x20 | |||||
vld U3, S1, 0x30 | |||||
vld U4, S2, 0x00 | |||||
vld U5, S2, 0x10 | |||||
vld U6, S2, 0x20 | |||||
vld U7, S2, 0x30 | |||||
/* Row S1 first, row S2 directly after it. */
vst U0, P1, 0x00 | |||||
vst U1, P1, 0x10 | |||||
vst U2, P1, 0x20 | |||||
vst U3, P1, 0x30 | |||||
vst U4, P1, 0x40 | |||||
vst U5, P1, 0x50 | |||||
vst U6, P1, 0x60 | |||||
vst U7, P1, 0x70 | |||||
/* Step each row pointer past the 0x40 bytes consumed. */
addi.d S1, S1, 0x40 | |||||
addi.d S2, S2, 0x40 | |||||
addi.d I, I, -1 | |||||
/* P1 advances by T1 per pass; T1 is set outside this excerpt. */
add.d P1, P1, T1 | |||||
/* Repeat while 0 < I. */
blt ZERO, I, .L_2I1 | |||||
/* N & 4 tail of the two-row block: copy 0x20 bytes from each of S1 and S2
   into a contiguous 0x40-byte group at P2. */
.L_2N7: | |||||
andi I, N, 0x04 | |||||
/* Bit 2 of N clear: skip to the N & 2 tail. */
beq ZERO, I, .L_2N3 | |||||
/* Two vector loads per row. */
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S2, 0x00 | |||||
vld U3, S2, 0x10 | |||||
/* Row S1 first, row S2 directly after it. */
vst U0, P2, 0x00 | |||||
vst U1, P2, 0x10 | |||||
vst U2, P2, 0x20 | |||||
vst U3, P2, 0x30 | |||||
/* Step each row pointer past the 0x20 bytes consumed. */
addi.d S1, S1, 0x20 | |||||
addi.d S2, S2, 0x20 | |||||
/* Advance P2 past the 0x40 bytes written above. */
addi.d P2, P2, 0x40 | |||||
/* N & 2 tail of the two-row block: copy 0x10 bytes from each of S1 and S2
   into a contiguous 0x20-byte group at P3. */
.L_2N3: | |||||
andi I, N, 0x02 | |||||
/* Bit 1 of N clear: skip to the N & 1 tail. */
beq ZERO, I, .L_2N1 | |||||
vld U0, S1, 0x00 | |||||
vld U1, S2, 0x00 | |||||
/* Row S1 first, row S2 directly after it. */
vst U0, P3, 0x00 | |||||
vst U1, P3, 0x10 | |||||
/* Step each row pointer past the 0x10 bytes consumed. */
addi.d S1, S1, 0x10 | |||||
addi.d S2, S2, 0x10 | |||||
/* Advance P3 past the 0x20 bytes written above. */
addi.d P3, P3, 0x20 | |||||
/* N & 1 tail of the two-row block: copy one 8-byte double from each of S1
   and S2 into a contiguous 0x10-byte group at P4. */
.L_2N1: | |||||
andi I, N, 0x01 | |||||
/* Bit 0 of N clear: the two-row block is done, go to the single-row block. */
beq ZERO, I, .L_M1 | |||||
fld.d F0, S1, 0x00 | |||||
fld.d F1, S2, 0x00 | |||||
/* Value from S1 first, value from S2 directly after it. */
fst.d F0, P4, 0x00 | |||||
fst.d F1, P4, 0x08 | |||||
/* Step each row pointer past the 8 bytes consumed. */
addi.d S1, S1, 0x08 | |||||
addi.d S2, S2, 0x08 | |||||
/* Advance P4 past the 0x10 bytes written above. */
addi.d P4, P4, 0x10 | |||||
/* Single-row remainder: runs only when bit 0 of M is set. Sets up the one
   row pointer and the destination cursor P1, then computes the trip count
   of the 0x40-byte main loop. */
.L_M1:
    andi      J,     M,     0x01
    /* Bit 0 of M clear: no rows left, go to the exit. */
    beq       ZERO,  J,     .L_M0

    /* Only S1 is read from here to the return at .L_M0, so no second row
       pointer (S2) is set up in this block. */
    move      S1,    S0

    /* P1 is this block's working cursor; P0 moves on by 0x40, the number
       of bytes one .L_1I1 iteration stores. */
    move      P1,    P0
    addi.d    P0,    P0,    0x40

    /* I = N >> 3 (arithmetic shift); zero means no full main-loop pass. */
    srai.d    I,     N,     0x03
    beq       ZERO,  I,     .L_1N7
/* Main loop of the single-row block, executed I times. Each pass copies
   0x40 bytes from S1 to P1. */
.L_1I1: /* I-- */ | |||||
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vld U2, S1, 0x20 | |||||
vld U3, S1, 0x30 | |||||
vst U0, P1, 0x00 | |||||
vst U1, P1, 0x10 | |||||
vst U2, P1, 0x20 | |||||
vst U3, P1, 0x30 | |||||
/* Step the row pointer past the 0x40 bytes consumed. */
addi.d S1, S1, 0x40 | |||||
addi.d I, I, -1 | |||||
/* P1 advances by T1 per pass; T1 is set outside this excerpt. */
add.d P1, P1, T1 | |||||
/* Repeat while 0 < I. */
blt ZERO, I, .L_1I1 | |||||
/* N & 4 tail of the single-row block: copy 0x20 bytes from S1 to P2. */
.L_1N7: | |||||
andi I, N, 0x04 | |||||
/* Bit 2 of N clear: skip to the N & 2 tail. */
beq ZERO, I, .L_1N3 | |||||
vld U0, S1, 0x00 | |||||
vld U1, S1, 0x10 | |||||
vst U0, P2, 0x00 | |||||
vst U1, P2, 0x10 | |||||
/* Step source and destination past the 0x20 bytes copied. */
addi.d S1, S1, 0x20 | |||||
addi.d P2, P2, 0x20 | |||||
/* N & 2 tail of the single-row block: copy 0x10 bytes from S1 to P3. */
.L_1N3: | |||||
andi I, N, 0x02 | |||||
/* Bit 1 of N clear: skip to the N & 1 tail. */
beq ZERO, I, .L_1N1 | |||||
vld U0, S1, 0x00 | |||||
vst U0, P3, 0x00 | |||||
/* Step source and destination past the 0x10 bytes copied. */
addi.d S1, S1, 0x10 | |||||
addi.d P3, P3, 0x10 | |||||
/* N & 1 tail of the single-row block: copy one 8-byte double from S1 to P4. */
.L_1N1: | |||||
andi I, N, 0x01 | |||||
/* Bit 0 of N clear: nothing left to copy, go to the exit. */
beq ZERO, I, .L_M0 | |||||
fld.d F0, S1, 0x00 | |||||
fst.d F0, P4, 0x00 | |||||
/* Step source and destination past the 8 bytes copied. */
addi.d S1, S1, 0x08 | |||||
addi.d P4, P4, 0x08 | |||||
/* Common exit: every "nothing left" branch of the single-row block lands
   here. */
.L_M0: | |||||
/* NOTE(review): pop_if_used is a macro defined outside this excerpt;
   presumably it restores registers saved by a matching prologue macro
   called with the same arguments - confirm. */
pop_if_used 24, 8 | |||||
/* Return: jump to the address held in $r1; the link value is written to
   $r0 and thereby discarded. */
jirl $r0, $r1, 0x00 | |||||
EPILOGUE |