You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_tcopy_4_lasx.S 8.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* ------------------------------------------------------------------ */
  /* Incoming arguments (integer argument registers $r4..$r8)           */
  /* ------------------------------------------------------------------ */
  #define M      $r4     // arg 1: m
  #define N      $r5     // arg 2: n
  #define SRC    $r6     // arg 3: src
  #define LDA    $r7     // arg 4: lda
  #define DST    $r8     // arg 5: dst

  /* ------------------------------------------------------------------ */
  /* Integer working registers                                          */
  /* ------------------------------------------------------------------ */
  #define I      $r9     // inner loop counter / bit-test scratch
  #define J      $r10    // outer loop counter
  #define S1     $r12    // source row pointer 1 (aoffset1)
  #define S2     $r13    // source row pointer 2
  #define S3     $r14    // source row pointer 3
  #define S4     $r15    // source row pointer 4
  #define TD     $r16    // running destination pointer (boffset)
  #define TS     $r17    // running source pointer (aoffset)
  #define TL     $r18    // byte distance between source rows
  #define T0     $r19    // general temporary
  #define S8     $r20    // destination write pointer (boffset1)
  #define S9     $r23    // destination write pointer (boffset2); $r23 is
                         // saved and restored on the stack by the routine
  #define S10    $r11    // destination write pointer (boffset3)
  #define ZERO   $r0     // compared against as zero and used as the 0 source

  /* ------------------------------------------------------------------ */
  /* Scalar floating-point registers (32-bit loads/stores in the tails) */
  /* ------------------------------------------------------------------ */
  #define F0     $f0
  #define F1     $f1
  #define F2     $f2
  #define F3     $f3
  #define F4     $f4
  #define F5     $f5
  #define F6     $f6
  #define F7     $f7

  /* ------------------------------------------------------------------ */
  /* LASX vector registers                                              */
  /* ------------------------------------------------------------------ */
  #define U0     $xr0
  #define U1     $xr1
  #define U2     $xr2
  #define U3     $xr3
  #define U4     $xr4
  #define U5     $xr5
  #define U6     $xr6
  #define U7     $xr7
  #define U8     $xr8
  #define U9     $xr9
  #define U10    $xr10
  #define U11    $xr11
  #define U12    $xr12
  #define U13    $xr13
  #define U14    $xr14
  #define U15    $xr15
  /*
   * Panel copy of an M x N block of 8-byte elements (two 32-bit floats
   * each; presumably the real/imaginary parts of a single-precision
   * complex value, going by the cgemm file name).
   *
   * In:   M   = number of source rows
   *       N   = number of elements per source row
   *       SRC = source base address
   *       LDA = distance between consecutive source rows, in elements
   *       DST = destination base address
   *
   * Source rows are consumed 4 at a time, then 2, then 1 (M >> 2, M & 2,
   * M & 1). Within a row group, elements are consumed 4 at a time, then
   * 2, then 1 (N >> 2, N & 2, N & 1), and each width writes to its own
   * destination region:
   *
   *   width 4 -> starts at DST. S8 starts at TD and steps by M * 32
   *              bytes per 4-element group. TD advances by 32 bytes per
   *              source row consumed.
   *   width 2 -> S9,  starting at DST + 8 * M * (N & ~3), sequential.
   *   width 1 -> S10, starting at DST + 8 * M * (N & ~1), sequential.
   *
   * Register use: $r9-$r20 and $f0-$f7 / $xr0-$xr3 / $vr0-$vr3 are
   * overwritten. $r23 (S9) is saved on entry and restored on exit.
   */
  PROLOGUE

      /* Spill $r23 (S9). A 16-byte frame keeps $sp on a 16-byte boundary. */
      addi.d     $sp,   $sp,   -16
      SDARG      $r23,  $sp,   0

      move       TS,    SRC               // aoffset
      move       TD,    DST               // boffset
      slli.d     TL,    LDA,   0x03       // row stride in bytes = lda * 8

      /* boffset2 = DST + 8 * M * (N & ~3).  mul.d keeps the full 64-bit
         product so the offset stays correct for large M * N. */
      ori        T0,    ZERO,  0x03
      andn       T0,    N,     T0
      mul.d      T0,    M,     T0
      slli.d     T0,    T0,    0x03
      add.d      S9,    DST,   T0

      /* boffset3 = DST + 8 * M * (N & ~1) */
      ori        T0,    ZERO,  0x01
      andn       T0,    N,     T0
      mul.d      T0,    M,     T0
      slli.d     T0,    T0,    0x03
      add.d      S10,   DST,   T0

      /* From here on T0 is loop-invariant: the byte step of boffset1
         between consecutive 4-element groups (M * 32). */
      slli.d     T0,    M,     0x05

      srai.d     J,     M,     0x02       // j = m >> 2
      beq        J,     ZERO,  .L_M1

  /* ---------------- four source rows per iteration ---------------- */
  .L_J1:
      move       S1,    TS                // aoffset1
      add.d      S2,    S1,    TL
      add.d      S3,    S2,    TL
      add.d      S4,    S3,    TL
      add.d      TS,    S4,    TL         // aoffset += 4 rows

      move       S8,    TD                // boffset1
      addi.d     TD,    TD,    0x80       // boffset += 4 rows * 32 bytes

      srai.d     I,     N,     0x02       // i = n >> 2
      beq        ZERO,  I,     .L_JN1

  .L_JI1: /* 4 elements (32 bytes) from each of the 4 rows */
      xvld       U0,    S1,    0x00
      xvld       U1,    S2,    0x00
      xvld       U2,    S3,    0x00
      xvld       U3,    S4,    0x00

      xvst       U0,    S8,    0x00
      xvst       U1,    S8,    0x20
      xvst       U2,    S8,    0x40
      xvst       U3,    S8,    0x60

      addi.d     S1,    S1,    0x20
      addi.d     S2,    S2,    0x20
      addi.d     S3,    S3,    0x20
      addi.d     S4,    S4,    0x20
      add.d      S8,    S8,    T0         // next 4-element group

      addi.d     I,     I,     -1
      blt        ZERO,  I,     .L_JI1

  .L_JN1: /* if (n & 2): 2 elements (16 bytes) from each of the 4 rows */
      andi       I,     N,     0x02
      beq        ZERO,  I,     .L_JN2

      vld        $vr0,  S1,    0x00
      vld        $vr1,  S2,    0x00
      vld        $vr2,  S3,    0x00
      vld        $vr3,  S4,    0x00

      vst        $vr0,  S9,    0x00
      vst        $vr1,  S9,    0x10
      vst        $vr2,  S9,    0x20
      vst        $vr3,  S9,    0x30

      /* Row pointers must advance: the n & 1 tail below reads them. */
      addi.d     S1,    S1,    0x10
      addi.d     S2,    S2,    0x10
      addi.d     S3,    S3,    0x10
      addi.d     S4,    S4,    0x10
      addi.d     S9,    S9,    0x40

  .L_JN2: /* if (n & 1): 1 element (8 bytes) from each of the 4 rows */
      andi       I,     N,     0x01
      beq        ZERO,  I,     .L_J0

      fld.s      F0,    S1,    0x00
      fld.s      F1,    S1,    0x04
      fld.s      F2,    S2,    0x00
      fld.s      F3,    S2,    0x04
      fld.s      F4,    S3,    0x00
      fld.s      F5,    S3,    0x04
      fld.s      F6,    S4,    0x00
      fld.s      F7,    S4,    0x04

      fst.s      F0,    S10,   0x00
      fst.s      F1,    S10,   0x04
      fst.s      F2,    S10,   0x08
      fst.s      F3,    S10,   0x0c
      fst.s      F4,    S10,   0x10
      fst.s      F5,    S10,   0x14
      fst.s      F6,    S10,   0x18
      fst.s      F7,    S10,   0x1c

      addi.d     S10,   S10,   0x20

  .L_J0:
      addi.d     J,     J,     -1
      blt        ZERO,  J,     .L_J1

  /* ---------------- if (m & 2): two source rows ---------------- */
  .L_M1:
      andi       I,     M,     0x02
      beq        ZERO,  I,     .L_M2

      move       S1,    TS                // aoffset1
      add.d      S2,    S1,    TL
      add.d      TS,    S2,    TL         // aoffset += 2 rows

      move       S8,    TD                // boffset1
      addi.d     TD,    TD,    0x40       // boffset += 2 rows * 32 bytes

      srai.d     I,     N,     0x02
      beq        ZERO,  I,     .L_M1N1

  .L_M1I1: /* 4 elements from each of the 2 rows */
      xvld       U0,    S1,    0x00
      xvld       U1,    S2,    0x00

      xvst       U0,    S8,    0x00
      xvst       U1,    S8,    0x20

      addi.d     S1,    S1,    0x20
      addi.d     S2,    S2,    0x20
      add.d      S8,    S8,    T0

      addi.d     I,     I,     -1
      blt        ZERO,  I,     .L_M1I1

  .L_M1N1: /* if (n & 2) */
      andi       I,     N,     0x02
      beq        ZERO,  I,     .L_M1N2

      vld        $vr0,  S1,    0x00
      vld        $vr1,  S2,    0x00

      vst        $vr0,  S9,    0x00
      vst        $vr1,  S9,    0x10

      addi.d     S1,    S1,    0x10
      addi.d     S2,    S2,    0x10
      addi.d     S9,    S9,    0x20

  .L_M1N2: /* if (n & 1) */
      andi       I,     N,     0x01
      beq        ZERO,  I,     .L_M2

      fld.s      F0,    S1,    0x00
      fld.s      F1,    S1,    0x04
      fld.s      F2,    S2,    0x00
      fld.s      F3,    S2,    0x04

      fst.s      F0,    S10,   0x00
      fst.s      F1,    S10,   0x04
      fst.s      F2,    S10,   0x08
      fst.s      F3,    S10,   0x0c

      addi.d     S10,   S10,   0x10

  /* ---------------- if (m & 1): last source row ---------------- */
  .L_M2:
      andi       I,     M,     0x01
      beq        ZERO,  I,     .L_M0

      move       S1,    TS                // aoffset1
      move       S8,    TD                // boffset1

      srai.d     I,     N,     0x02
      beq        ZERO,  I,     .L_M2N1

  .L_M2I1: /* 4 elements from the single row */
      xvld       U0,    S1,    0x00
      xvst       U0,    S8,    0x00

      addi.d     S1,    S1,    0x20
      add.d      S8,    S8,    T0

      addi.d     I,     I,     -1
      blt        ZERO,  I,     .L_M2I1

  .L_M2N1: /* if (n & 2) */
      andi       I,     N,     0x02
      beq        ZERO,  I,     .L_M2N2

      vld        $vr0,  S1,    0x00
      vst        $vr0,  S9,    0x00

      addi.d     S1,    S1,    0x10       // the n & 1 tail reads S1

  .L_M2N2: /* if (n & 1) */
      andi       I,     N,     0x01
      beq        ZERO,  I,     .L_M0

      fld.s      F0,    S1,    0x00
      fld.s      F1,    S1,    0x04

      fst.s      F0,    S10,   0x00
      fst.s      F1,    S10,   0x04

  .L_M0:
      /* Restore $r23 and release the frame before returning. */
      LDARG      $r23,  $sp,   0
      addi.d     $sp,   $sp,   16
      jirl       $r0,   $r1,   0x00

  EPILOGUE