You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_ncopy_4_lsx.S 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. /* Function parameters */
  31. #define M $r4 // param 1: m
  32. #define N $r5 // param 2: n
  33. #define SRC $r6 // param 3: src
  34. #define LDA $r7 // param 4: lda
  35. #define DST $r8 // param 5: dst
  36. #define I $r9
  37. #define J $r10
  38. #define S1 $r12
  39. #define S2 $r13
  40. #define S3 $r14
  41. #define S4 $r15
  42. #define S5 $r16
  43. #define S6 $r17
  44. #define S7 $r18
  45. #define S8 $r19
  46. #define TD $r20
  47. #define TS $r21
  48. #define TL $r7
  49. #define T0 $r6
  50. #define ZERO $r0
  51. #define F0 $f0
  52. #define F1 $f1
  53. #define F2 $f2
  54. #define F3 $f3
  55. #define F4 $f4
  56. #define F5 $f5
  57. #define F6 $f6
  58. #define F7 $f7
  59. /* LSX vectors */
  60. #define U0 $vr0
  61. #define U1 $vr1
  62. #define U2 $vr2
  63. #define U3 $vr3
  64. #define U4 $vr4
  65. #define U5 $vr5
  66. #define U6 $vr6
  67. #define U7 $vr7
  68. #define D0 $vr8
  69. #define D1 $vr9
  70. #define D2 $vr10
  71. #define D3 $vr11
  72. #define D4 $vr12
  73. #define D5 $vr13
  74. #define D6 $vr14
  75. #define D7 $vr15
  76. PROLOGUE
  77. move TD, DST
  78. move TS, SRC
  79. slli.d TL, LDA, 0x03
  80. slli.d T0, TL, 0x01
  81. srai.d J, N, 0x02
  82. beq J, ZERO, .L_N2
  83. .L_J1: /* J-- */
  84. move S1, TS
  85. add.d S2, TS, TL
  86. srai.d I, M, 0x02
  87. add.d S3, S2, TL
  88. add.d S4, S2, T0
  89. add.d TS, S3, T0
  90. addi.d J, J, -1
  91. beq I, ZERO, .L_I3
  92. .L_I1: /* I-- */
  93. GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
  94. GINTERLACE v, d, D0, D2, U1, U0
  95. GINTERLACE v, d, D1, D3, U3, U2
  96. GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
  97. addi.d TD, TD, 0x40
  98. GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
  99. GINTERLACE v, d, D0, D2, U1, U0
  100. GINTERLACE v, d, D1, D3, U3, U2
  101. GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
  102. addi.d S1, S1, 0x20
  103. addi.d S2, S2, 0x20
  104. addi.d S3, S3, 0x20
  105. addi.d S4, S4, 0x20
  106. addi.d TD, TD, 0x40
  107. addi.d I, I, -1
  108. blt ZERO, I, .L_I1
  109. .L_I3:
  110. andi I, M, 0x03
  111. beq I, ZERO, .L_I0
  112. .L_II1:
  113. fld.d F0, S1, 0x00
  114. fld.d F1, S2, 0x00
  115. fld.d F2, S3, 0x00
  116. fld.d F3, S4, 0x00
  117. fst.d F0, TD, 0x00
  118. addi.d S1, S1, 0x08
  119. fst.d F1, TD, 0x08
  120. addi.d S2, S2, 0x08
  121. fst.d F2, TD, 0x10
  122. addi.d S3, S3, 0x08
  123. fst.d F3, TD, 0x18
  124. addi.d S4, S4, 0x08
  125. addi.d TD, TD, 0x20
  126. addi.d I, I, -1
  127. blt ZERO, I, .L_II1
  128. .L_I0:
  129. blt ZERO, J, .L_J1
  130. .L_N2:
  131. andi J, N, 0x02
  132. beq ZERO, J, .L_N1
  133. move S1, TS
  134. add.d S2, TS, TL
  135. srai.d I, M, 0x01
  136. add.d TS, S2, TL
  137. beq I, ZERO, .L_2I3
  138. .L_2I1: /* I-- */
  139. GLD v, , U0, S1, 0x00, U1, S2, 0x00
  140. GINTERLACE v, d, D0, D1, U1, U0
  141. GST v, , D0, TD, 0x00, D1, TD, 0x10
  142. addi.d S1, S1, 0x10
  143. addi.d S2, S2, 0x10
  144. addi.d TD, TD, 0x20
  145. addi.d I, I, -1
  146. blt ZERO, I, .L_2I1
  147. .L_2I3:
  148. andi I, M, 0x01
  149. beq ZERO, I, .L_N1
  150. .L_2II1: /* I-- */
  151. fld.d F0, S1, 0x00
  152. fld.d F1, S2, 0x00
  153. fst.d F0, TD, 0x00
  154. addi.d I, I, -1
  155. fst.d F1, TD, 0x08
  156. addi.d S1, S1, 0x08
  157. addi.d S2, S2, 0x08
  158. addi.d TD, TD, 0x10
  159. blt ZERO, I, .L_2II1
  160. .L_N1:
  161. move S1, TS
  162. beq ZERO, M, .L_N0
  163. .L_M1:
  164. fld.d F0, S1, 0x00
  165. addi.d S1, S1, 0x08
  166. fst.d F0, TD, 0x00
  167. addi.d TD, TD, 0x08
  168. addi.d M, M, -1
  169. blt ZERO, M, .L_M1
  170. .L_N0:
  171. jirl $r0, $r1, 0x00
  172. EPILOGUE