You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_tcopy_8_lsx.S 7.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER /* defined before the include; presumably selects the assembler-side view of common.h - confirm */
  28. #include "common.h"
  29. /* Incoming arguments, in argument-register order */
  30. #define M $r4 // arg 1 (m): number of lda-strided lines to copy
  31. #define N $r5 // arg 2 (n): number of contiguous 8-byte elements per line
  32. #define SRC $r6 // arg 3 (src): base address of the source data
  33. #define LDA $r7 // arg 4 (lda): distance between lines, in elements; scaled to bytes in place (see TL)
  34. #define DST $r8 // arg 5 (dst): base address of the packed destination buffer
  35. #define I $r9 // inner loop counter (pairs of lines); also scratch for the n&k / m&1 bit tests
  36. #define J $r10 // outer loop counter: remaining 8-element panels
  37. #define S1 $r12 // read pointer for the first line of the current pair
  38. #define S2 $r13 // read pointer for the second line of the current pair (S1 + TL)
  39. #define S3 $r14 // S3-S8 are not referenced by the copy routine in this file
  40. #define S4 $r15
  41. #define S5 $r16
  42. #define S6 $r17
  43. #define S7 $r18
  44. #define S8 $r19
  45. #define TD $r20 // write cursor into DST
  46. #define TS $r11 // start of the current panel in SRC
  47. #define TL $r7 // same register as LDA: holds the line stride in bytes once LDA has been shifted
  48. #define T0 $r23 // 2 * TL, the step for a pair of lines; $r23 is spilled and reloaded by the routine
  49. #define ZERO $r0 // hard-wired zero register
  50. #define F0 $f0 // F0-F3 carry single floats in the n&1 tail; F4-F7 are not referenced
  51. #define F1 $f1
  52. #define F2 $f2
  53. #define F3 $f3
  54. #define F4 $f4
  55. #define F5 $f5
  56. #define F6 $f6
  57. #define F7 $f7
  58. /* LSX vectors (128-bit $vr registers, moved 16 bytes at a time by vld/vst) */
  59. #define U0 $vr0 // U0-U7: staging registers between the loads and the stores
  60. #define U1 $vr1
  61. #define U2 $vr2
  62. #define U3 $vr3
  63. #define U4 $vr4
  64. #define U5 $vr5
  65. #define U6 $vr6
  66. #define U7 $vr7
  67. #define D0 $vr8 // D0-D7 are not referenced by the copy routine in this file
  68. #define D1 $vr9
  69. #define D2 $vr10
  70. #define D3 $vr11
  71. #define D4 $vr12
  72. #define D5 $vr13
  73. #define D6 $vr14
  74. #define D7 $vr15
  75. PROLOGUE /* Packs M lda-strided lines of N contiguous 8-byte elements from SRC into DST. N is split into panels 8, 4, 2 and 1 element wide; within a panel the lines are written back to back, two lines per loop pass plus one odd line. */
  76. addi.d $sp, $sp, -16 // reserve a 16-byte frame so $sp stays 16-byte aligned; only the low 8 bytes are used
  77. SDARG $r23, $sp, 0 // spill $r23 (T0): it is callee-saved in the LoongArch calling convention
  78. move TS, SRC // TS = start of the current panel in the source
  79. move TD, DST // TD = write cursor, only ever moves forward
  80. slli.d TL, LDA, 0x02 // TL = lda * 4 (TL and LDA are the same register)
  81. slli.d TL, TL, 0x01 // TL = lda * 8 = distance between two lines in bytes
  82. srai.d J, N, 0x03 // J = n / 8 = number of full 8-element panels
  83. beq J, ZERO, .L_N1 // no full panel: go straight to the n&4 tail
  84. .L_J1: /* one 8-element (0x40-byte) panel per pass, J passes */
  85. move S1, TS // S1 = first line of the panel
  86. slli.d T0, TL, 0x01 // T0 = 2 * TL, the step for a pair of lines
  87. add.d S2, TS, TL // S2 = second line of the panel
  88. addi.d TS, TS, 0x40 // next panel starts 8 elements further along the line
  89. srai.d I, M, 0x01 // I = m / 2 = number of line pairs
  90. beq ZERO, I, .L_J1M1 // fewer than two lines: only the odd-line check remains
  91. .L_J1I1: /* copy two lines x 0x40 bytes, I passes */
  92. vld U0, S1, 0x00 // 0x40 bytes of line S1 -> U0..U3
  93. vld U1, S1, 0x10
  94. vld U2, S1, 0x20
  95. vld U3, S1, 0x30
  96. vld U4, S2, 0x00 // 0x40 bytes of line S2 -> U4..U7
  97. vld U5, S2, 0x10
  98. vld U6, S2, 0x20
  99. vld U7, S2, 0x30
  100. vst U0, TD, 0x00 // line S1 lands first in the destination ...
  101. vst U1, TD, 0x10
  102. vst U2, TD, 0x20
  103. vst U3, TD, 0x30
  104. vst U4, TD, 0x40 // ... immediately followed by line S2
  105. vst U5, TD, 0x50
  106. vst U6, TD, 0x60
  107. vst U7, TD, 0x70
  108. add.d S1, S1, T0 // both read pointers skip ahead two lines
  109. add.d S2, S2, T0
  110. addi.d TD, TD, 0x80 // 2 lines * 0x40 bytes written
  111. addi.d I, I, -1
  112. blt ZERO, I, .L_J1I1 // loop while pairs remain (I > 0)
  113. .L_J1M1: /* odd line of the 8-element panel, if (m & 1) */
  114. andi I, M, 0x01
  115. beq ZERO, I, .L_J0 // m is even: nothing left in this panel
  116. vld U0, S1, 0x00 // S1 already points at the last line
  117. vld U1, S1, 0x10
  118. vld U2, S1, 0x20
  119. vld U3, S1, 0x30
  120. vst U0, TD, 0x00
  121. vst U1, TD, 0x10
  122. vst U2, TD, 0x20
  123. vst U3, TD, 0x30
  124. addi.d TD, TD, 0x40 // 1 line * 0x40 bytes written
  125. .L_J0: /* end of one 8-element panel */
  126. addi.d J, J, -1
  127. blt ZERO, J, .L_J1 // loop while panels remain (J > 0)
  128. .L_N1: /* 4-element (0x20-byte) panel, if (n & 4) */
  129. andi I, N, 0x04
  130. beq ZERO, I, .L_N2 // bit clear: skip to the n&2 tail
  131. move S1, TS // S1 = first line of the panel
  132. slli.d T0, TL, 0x01 // T0 = 2 * TL (recomputed; this path can be reached without running .L_J1)
  133. add.d S2, TS, TL // S2 = second line of the panel
  134. addi.d TS, TS, 0x20 // next panel starts 4 elements further along the line
  135. srai.d I, M, 0x01 // I = m / 2 = number of line pairs
  136. beq ZERO, I, .L_N1M1
  137. .L_N1I1: /* copy two lines x 0x20 bytes, I passes */
  138. vld U0, S1, 0x00 // 0x20 bytes of line S1 -> U0, U1
  139. vld U1, S1, 0x10
  140. vld U2, S2, 0x00 // 0x20 bytes of line S2 -> U2, U3
  141. vld U3, S2, 0x10
  142. vst U0, TD, 0x00
  143. vst U1, TD, 0x10
  144. vst U2, TD, 0x20
  145. vst U3, TD, 0x30
  146. add.d S1, S1, T0 // both read pointers skip ahead two lines
  147. add.d S2, S2, T0
  148. addi.d TD, TD, 0x40 // 2 lines * 0x20 bytes written
  149. addi.d I, I, -1
  150. blt ZERO, I, .L_N1I1 // loop while pairs remain (I > 0)
  151. .L_N1M1: /* odd line of the 4-element panel, if (m & 1) */
  152. andi I, M, 0x01
  153. beq ZERO, I, .L_N2
  154. vld U0, S1, 0x00
  155. vld U1, S1, 0x10
  156. vst U0, TD, 0x00
  157. vst U1, TD, 0x10
  158. addi.d TD, TD, 0x20 // 1 line * 0x20 bytes written
  159. .L_N2: /* 2-element (0x10-byte) panel, if (n & 2) */
  160. andi I, N, 0x02
  161. beq ZERO, I, .L_N3 // bit clear: skip to the n&1 tail
  162. move S1, TS // S1 = first line of the panel
  163. slli.d T0, TL, 0x01 // T0 = 2 * TL
  164. add.d S2, TS, TL // S2 = second line of the panel
  165. addi.d TS, TS, 0x10 // next panel starts 2 elements further along the line
  166. srai.d I, M, 0x01 // I = m / 2 = number of line pairs
  167. beq ZERO, I, .L_N2M1
  168. .L_N2I1: /* copy two lines x 0x10 bytes, I passes */
  169. vld U0, S1, 0x00 // one 16-byte vector per line
  170. vld U1, S2, 0x00
  171. vst U0, TD, 0x00
  172. vst U1, TD, 0x10
  173. add.d S1, S1, T0 // both read pointers skip ahead two lines
  174. add.d S2, S2, T0
  175. addi.d TD, TD, 0x20 // 2 lines * 0x10 bytes written
  176. addi.d I, I, -1
  177. blt ZERO, I, .L_N2I1 // loop while pairs remain (I > 0)
  178. .L_N2M1: /* odd line of the 2-element panel, if (m & 1) */
  179. andi I, M, 0x01
  180. beq ZERO, I, .L_N3
  181. vld U0, S1, 0x00
  182. vst U0, TD, 0x00
  183. addi.d TD, TD, 0x10 // 1 line * 0x10 bytes written
  184. .L_N3: /* final single element (8 bytes) per line, if (n & 1) */
  185. andi I, N, 0x01
  186. beq ZERO, I, .L_N0 // bit clear: nothing left to copy
  187. move S1, TS // S1 = first line of the panel (TS is not advanced: this is the last panel)
  188. slli.d T0, TL, 0x01 // T0 = 2 * TL
  189. add.d S2, TS, TL // S2 = second line of the panel
  190. srai.d I, M, 0x01 // I = m / 2 = number of line pairs
  191. beq ZERO, I, .L_N3M1
  192. .L_N3I1: /* copy two lines x 8 bytes, I passes; scalar float moves because 8 bytes is half a vector */
  193. fld.s F0, S1, 0x00 // the two 4-byte halves of the element on line S1
  194. fld.s F1, S1, 0x04
  195. fld.s F2, S2, 0x00 // the two 4-byte halves of the element on line S2
  196. fld.s F3, S2, 0x04
  197. fst.s F0, TD, 0x00
  198. fst.s F1, TD, 0x04
  199. fst.s F2, TD, 0x08
  200. fst.s F3, TD, 0x0c
  201. add.d S1, S1, T0 // both read pointers skip ahead two lines
  202. add.d S2, S2, T0
  203. addi.d TD, TD, 0x10 // 2 lines * 8 bytes written
  204. addi.d I, I, -1
  205. blt ZERO, I, .L_N3I1 // loop while pairs remain (I > 0)
  206. .L_N3M1: /* odd line of the 1-element panel, if (m & 1) */
  207. andi I, M, 0x01
  208. beq ZERO, I, .L_N0
  209. fld.s F0, S1, 0x00
  210. fld.s F1, S1, 0x04
  211. fst.s F0, TD, 0x00
  212. fst.s F1, TD, 0x04
  213. .L_N0: /* common exit */
  214. LDARG $r23, $sp, 0 // reload the caller's $r23
  215. addi.d $sp, $sp, 16 // release the 16-byte frame reserved on entry
  216. jirl $r0, $r1, 0x00 // return: jump to the address in $r1 without linking
  217. EPILOGUE