- /*******************************************************************************
- Copyright (c) 2023, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *******************************************************************************/
- #define ASSEMBLER
-
- #include "common.h"
- #include "loongarch64_asm.S"
-
- /*********************************************************************
- * 2023/08/30 guxiwei
- * UTEST : OK
- * CTEST : OK
- * TEST  : OK
- *********************************************************************/
-
- /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
- * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
- */
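- /* Reference semantics (an illustrative sketch, not part of the build;
-  * the helper name sgemv_n_ref is hypothetical): this kernel computes
-  * y := y + alpha * A * x for a column-major m x n matrix A, which the
-  * macros below vectorize in blocks of 8 columns by 8 rows.
-  *
-  *   static void sgemv_n_ref(BLASLONG m, BLASLONG n, float alpha,
-  *                           const float *a, BLASLONG lda,
-  *                           const float *x, BLASLONG inc_x,
-  *                           float *y, BLASLONG inc_y)
-  *   {
-  *       for (BLASLONG j = 0; j < n; j++) {
-  *           float temp = alpha * x[j * inc_x];   // SLOAD_X_* pre-scales x
-  *           for (BLASLONG i = 0; i < m; i++)     // SGEMV_N_*x* accumulates
-  *               y[i * inc_y] += temp * a[i + j * lda];
-  *       }
-  *   }
-  */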
- #define M $r4
- #define N $r5
- #define ALPHA $f0
- #define A $r7
- #define LDA $r8
- #define X $r9
- #define INC_X $r10
- #define Y $r11
- #define INC_Y $r6
-
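- /* Scratch registers: J and I are the column (N) and row (M) loop
-  * counters, K tracks the row position inside the current column block,
-  * K_LDA is the byte adjustment that moves each PA* pointer from the end
-  * of its current column to the top of the next column block, M4 is M
-  * scaled to bytes, and PA0-PA7 walk eight consecutive columns of A. */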
- #define J $r12
- #define I $r13
- #define K $r14
- #define Y_ORG $r15
- #define OFFSET $r16
- #define K_LDA $r17
- #define M4 $r18
- #define T0 $r19
- #define PA0 $r20
- #define PA1 $r23
- #define PA2 $r24
- #define PA3 $r25
- #define PA4 $r26
- #define PA5 $r27
- #define PA6 $r28
- #define PA7 $r29
-
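- /* Vector registers: VALPHA holds alpha in every lane; X0-X7 hold the
-  * alpha-scaled, broadcast x values, Y0 the current 8-element y block,
-  * and A0-A7 8-element column slices of A. The $fN names below alias the
-  * low lane of the corresponding $xrN register and serve the scalar
-  * tail code. */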
- #define VALPHA $xr1
- #define X0 $xr2
- #define X1 $xr3
- #define X2 $xr4
- #define X3 $xr5
- #define X4 $xr6
- #define X5 $xr7
- #define X6 $xr8
- #define X7 $xr9
- #define Y0 $xr10
- #define A0 $xr11
- #define A1 $xr12
- #define A2 $xr13
- #define A3 $xr14
- #define A4 $xr15
- #define A5 $xr16
- #define A6 $xr17
- #define A7 $xr18
-
- #define X0_F $f2
- #define X1_F $f3
- #define X2_F $f4
- #define X3_F $f5
- #define X4_F $f6
- #define X5_F $f7
- #define X6_F $f8
- #define X7_F $f9
- #define Y0_F $f10
- #define A0_F $f11
- #define A1_F $f12
- #define A2_F $f13
- #define A3_F $f14
- #define A4_F $f15
- #define A5_F $f16
- #define A6_F $f17
- #define A7_F $f18
-
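- /* X loaders: broadcast the next 1/2/4/8 elements of x into full LASX
-  * registers and pre-scale them by alpha, so each Xk ends up holding
-  * alpha * x[k] in all 8 lanes. The *_GAP variants step through x by
-  * INC_X bytes instead of assuming contiguous storage. */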
- .macro SLOAD_X_8
- GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \
- X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
- X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
- .endm
-
- .macro SLOAD_X_8_GAP
- xvldrepl.w X0, X, 0x00
- PTR_ADD T0, X, INC_X
- xvldrepl.w X1, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X2, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X3, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X4, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X5, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X6, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X7, T0, 0x00
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
- X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
- .endm
-
- .macro SLOAD_X_4
- GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
- .endm
-
- .macro SLOAD_X_4_GAP
- xvldrepl.w X0, X, 0x00
- PTR_ADD T0, X, INC_X
- xvldrepl.w X1, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X2, T0, 0x00
- PTR_ADD T0, T0, INC_X
- xvldrepl.w X3, T0, 0x00
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
- .endm
-
- .macro SLOAD_X_2
- GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
- .endm
-
- .macro SLOAD_X_2_GAP
- xvldrepl.w X0, X, 0x00
- PTR_ADD T0, X, INC_X
- xvldrepl.w X1, T0, 0x00
- GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
- .endm
-
- .macro SLOAD_X_1
- GLDREPL xv, w, X0, X, 0x00
- GMUL xvf, s, X0, X0, VALPHA
- .endm
-
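- /* Y loaders: SLOAD_Y_8 fetches 8 contiguous y values with a single
-  * vector load, while SLOAD_Y_8_GAP gathers 8 strided values with
-  * scalar loads and packs them into Y0 lane by lane via GINSVE0. */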
- .macro SLOAD_Y_8
- GLD xv, , Y0, Y, 0
- .endm
-
- .macro SLOAD_Y_8_GAP
- fld.s Y0_F, Y, 0 /* element 0 */
- fldx.s A0_F, Y, INC_Y /* element 1 */
- PTR_ALSL T0, INC_Y, Y, 1 /* T0 = Y + 2 * INC_Y */
- fld.s A1_F, T0, 0 /* element 2 */
- fldx.s A2_F, T0, INC_Y /* element 3 */
- PTR_ALSL T0, INC_Y, Y, 2 /* T0 = Y + 4 * INC_Y */
- fld.s A3_F, T0, 0 /* element 4 */
- fldx.s A4_F, T0, INC_Y /* element 5 */
- PTR_ADD T0, T0, INC_Y
- PTR_ADD T0, T0, INC_Y /* T0 = Y + 6 * INC_Y */
- fld.s A5_F, T0, 0 /* element 6 */
- fldx.s A6_F, T0, INC_Y /* element 7 */
- GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \
- Y0, A4, 5, Y0, A5, 6, Y0, A6, 7
- .endm
-
- .macro SLOAD_Y_1
- GLD f, s, Y0_F, Y, 0
- .endm
-
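- /* Compute kernels: SGEMV_N_8xC loads an 8-row slice from each of C
-  * column pointers (post-incrementing them by 0x20, i.e. 8 floats) and
-  * accumulates Y0 += Ac * Xc with chained vector fused multiply-adds;
-  * the 1xC variants do the same one row at a time for the M % 8 tail.
-  * Roughly: for (c = 0; c < C; c++) y[i:i+8] += a_col[c][i:i+8] * (alpha * x[c]). */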
- .macro SGEMV_N_8x8
- GLD_INC xv, , 0x20, \
- A0, PA0, 0, A1, PA1, 0, \
- A2, PA2, 0, A3, PA3, 0, \
- A4, PA4, 0, A5, PA5, 0, \
- A6, PA6, 0, A7, PA7, 0
- GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
- Y0, A2, X2, Y0, Y0, A3, X3, Y0, \
- Y0, A4, X4, Y0, Y0, A5, X5, Y0, \
- Y0, A6, X6, Y0, Y0, A7, X7, Y0
- .endm
-
- .macro SGEMV_N_1x8
- GLD_INC f, s, 0x04, \
- A0_F, PA0, 0, A1_F, PA1, 0, \
- A2_F, PA2, 0, A3_F, PA3, 0, \
- A4_F, PA4, 0, A5_F, PA5, 0, \
- A6_F, PA6, 0, A7_F, PA7, 0
- GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
- Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \
- Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \
- Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F
- .endm
-
- .macro SGEMV_N_8x4
- GLD_INC xv, , 0x20, \
- A0, PA0, 0, A1, PA1, 0, \
- A2, PA2, 0, A3, PA3, 0
- GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
- Y0, A2, X2, Y0, Y0, A3, X3, Y0
- .endm
-
- .macro SGEMV_N_1x4
- GLD_INC f, s, 0x04, \
- A0_F, PA0, 0, A1_F, PA1, 0, \
- A2_F, PA2, 0, A3_F, PA3, 0
- GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
- Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F
- .endm
-
- .macro SGEMV_N_8x2
- GLD_INC xv, , 0x20, \
- A0, PA0, 0, A1, PA1, 0
- GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0
- .endm
-
- .macro SGEMV_N_1x2
- GLD_INC f, s, 0x04, \
- A0_F, PA0, 0, A1_F, PA1, 0
- GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F
- .endm
-
- .macro SGEMV_N_1x1
- GLD_INC f, s, 0x04, A0_F, PA0, 0
- GMADD f, s, Y0_F, A0_F, X0_F, Y0_F
- .endm
-
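- /* Y stores: a single vector store when inc_y == 1, otherwise an
-  * element-by-element xvstelm.w scatter along the stride. */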
- .macro SSTORE_Y_8
- GST xv, , Y0, Y, 0
- .endm
-
- .macro SSTORE_Y_8_GAP
- xvstelm.w Y0, Y, 0, 0
- PTR_ADD T0, Y, INC_Y
- xvstelm.w Y0, T0, 0, 1
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 2
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 3
-
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 4
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 5
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 6
- PTR_ADD T0, T0, INC_Y
- xvstelm.w Y0, T0, 0, 7
- .endm
-
- .macro SSTORE_Y_1
- GST f, s, Y0_F, Y, 0
- .endm
-
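- /* Driver: the column loop over N is blocked 8/4/2/1 and the row loop
-  * over M is blocked 8/1. \XW only namespaces the labels; the X_* and
-  * Y_* arguments select the contiguous or _GAP load/store variants, so
-  * one body serves every (inc_x, inc_y) combination. Note that Y_4 is
-  * accepted for interface symmetry but never referenced in the body. */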
- .macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
- PTR_SRLI J, N, 3
- beqz J, .L_\XW\()_N_7
- PTR_SLLI K_LDA, LDA, 3
- PTR_SUB K_LDA, K_LDA, M4
- .L_\XW\()_N_L8:
- SLOAD_\X_8
- xor K, K, K
- move Y, Y_ORG
- PTR_SRLI I, M, 3
- beqz I, .L_\XW\()_M_7
- .align 5
- .L_\XW\()_M_L8:
- SLOAD_\Y_8
- SGEMV_N_8x8
- SSTORE_\Y_8
- PTR_ADDI I, I, -1
- PTR_ALSL Y, INC_Y, Y, 3
- PTR_ADDI K, K, 8
- bnez I, .L_\XW\()_M_L8
- .L_\XW\()_M_7:
- andi I, M, 7
- beqz I, .L_\XW\()_M_END
- .align 5
- .L_\XW\()_M_L1:
- SLOAD_\Y_1
- SGEMV_N_1x8
- SSTORE_\Y_1
- PTR_ADDI I, I, -1
- PTR_ADD Y, Y, INC_Y
- PTR_ADDI K, K, 1
- bnez I, .L_\XW\()_M_L1
- .L_\XW\()_M_END:
- PTR_ADDI J, J, -1
- #if __loongarch_grlen == 64
- GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
- PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
- #elif __loongarch_grlen == 32
- GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
- PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
- #else
- GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
- PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
- #endif
- PTR_ALSL X, INC_X, X, 3
- bnez J, .L_\XW\()_N_L8
- .L_\XW\()_N_7:
- andi J, N, 4
- beqz J, .L_\XW\()_N_3
- SLOAD_\X_4
- xor K, K, K
- move Y, Y_ORG
-
- PTR_SRLI I, M, 3
- beqz I, .L_\XW\()_N_4_M_7
- .align 5
- .L_\XW\()_N_4_M_L8:
- SLOAD_\Y_8
- SGEMV_N_8x4
- SSTORE_\Y_8
- PTR_ADDI I, I, -1
- PTR_ADDI K, K, 8
- PTR_ALSL Y, INC_Y, Y, 3
- bnez I, .L_\XW\()_N_4_M_L8
- .L_\XW\()_N_4_M_7:
- andi I, M, 7
- beqz I, .L_\XW\()_N_4_M_END
- .align 5
- .L_\XW\()_N_4_M_L1:
- SLOAD_\Y_1
- SGEMV_N_1x4
- SSTORE_\Y_1
- PTR_ADDI I, I, -1
- PTR_ADD Y, Y, INC_Y
- PTR_ADDI K, K, 1
- bnez I, .L_\XW\()_N_4_M_L1
- .L_\XW\()_N_4_M_END:
- PTR_SLLI K_LDA, LDA, 2
- PTR_SUB K_LDA, K_LDA, M4
- #if __loongarch_grlen == 64
- GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
- #elif __loongarch_grlen == 32
- GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
- #else
- GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
- #endif
- PTR_ALSL X, INC_X, X, 2
- .L_\XW\()_N_3:
- andi J, N, 2
- beqz J, .L_\XW\()_N_1
- SLOAD_\X_2
- xor K, K, K
- move Y, Y_ORG
- PTR_SRLI I, M, 3
- beqz I, .L_\XW\()_N_2_M_7
- .align 5
- .L_\XW\()_N_2_M_L8:
- SLOAD_\Y_8
- SGEMV_N_8x2
- SSTORE_\Y_8
- PTR_ADDI I, I, -1
- PTR_ADDI K, K, 8
- PTR_ALSL Y, INC_Y, Y, 3
- bnez I, .L_\XW\()_N_2_M_L8
- .L_\XW\()_N_2_M_7:
- andi I, M, 7
- beqz I, .L_\XW\()_N_2_M_END
- .align 5
- .L_\XW\()_N_2_M_L1:
- SLOAD_\Y_1
- SGEMV_N_1x2
- SSTORE_\Y_1
- PTR_ADDI I, I, -1
- PTR_ADD Y, Y, INC_Y
- PTR_ADDI K, K, 1
- bnez I, .L_\XW\()_N_2_M_L1
- .L_\XW\()_N_2_M_END:
- PTR_SLLI K_LDA, LDA, 1
- PTR_SUB K_LDA, K_LDA, M4
- PTR_ADD PA0, PA0, K_LDA
- PTR_ADD PA1, PA1, K_LDA
- PTR_ALSL X, INC_X, X, 1
- .L_\XW\()_N_1:
- andi J, N, 1
- beqz J, .L_END
- SLOAD_\X_1
- xor K, K, K
- move Y, Y_ORG
- move I, M
- beqz I, .L_END
- .align 5
- .L_\XW\()_N_1_M_L1:
- SLOAD_\Y_1
- SGEMV_N_1x1
- SSTORE_\Y_1
- PTR_ADDI I, I, -1
- PTR_ADD Y, Y, INC_Y
- PTR_ADDI K, K, 1
- bnez I, .L_\XW\()_N_1_M_L1
- b .L_END
- .endm
-
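- /* Entry point: reads inc_y from the stack, converts lda/inc_x/inc_y/m
-  * from element counts to byte offsets, splats alpha into VALPHA, points
-  * PA0-PA7 at the first eight columns of A, then builds the 2-bit index
-  * ((inc_x != 1) << 1) | (inc_y != 1) and dispatches through
-  * .L_GAP_TABLE to the matching SGEMV_N_LASX instantiation. */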
- PROLOGUE
- PTR_LD INC_Y, $sp, 0 /* inc_y is passed on the stack */
- push_if_used 17 + 7, 19
- PTR_ADDI K, $r0, 0x01
- PTR_SUB I, INC_X, K
- PTR_SUB J, INC_Y, K
- maskeqz I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
- maskeqz J, K, J /* if (inc_y == 1) J = 0; else J = 1; */
- PTR_ALSL I, I, J, 1 /* I = (I << 1) | J, the gap-table index */
- GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 /* element counts to byte offsets */
- xvreplve0.w VALPHA, $xr0 /* splat alpha into all 8 lanes */
- move Y_ORG, Y
- move PA0, A
- #if __loongarch_grlen == 64
- GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
- PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
- #elif __loongarch_grlen == 32
- GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
- PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
- #else
- GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
- PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
- #endif
- la.local T0, .L_GAP_TABLE
- PTR_ALSL I, I, T0, 1 /* I = .L_GAP_TABLE + index * 2 */
- ld.h K, I, 0 /* K = signed half-word offset of the target */
- PTR_ADD T0, T0, K
- jirl $r0, T0, 0 /* indirect jump, no link */
- .L_GAP_TABLE:
- .hword .L_GAP_0_0 - .L_GAP_TABLE
- .hword .L_GAP_0_1 - .L_GAP_TABLE
- .hword .L_GAP_1_0 - .L_GAP_TABLE
- .hword .L_GAP_1_1 - .L_GAP_TABLE
- .L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
- SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
- .L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
- SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
- .L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
- SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
- .L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
- SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
- .L_END:
- pop_if_used 17 + 7, 19
- jirl $r0, $r1, 0x0
- EPILOGUE