You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_ncopy_4_lasx.S 8.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Symbolic register names used by the copy routine.
   * All values are LoongArch register names; "$rN" are general-purpose
   * registers, "$fN" scalar floating-point registers and "$xrN" 256-bit
   * LASX vector registers.
   */

  /* ---- Incoming arguments ------------------------------------------- */
  #define M       $r4     /* argument 1: m                               */
  #define N       $r5     /* argument 2: n                               */
  #define SRC     $r6     /* argument 3: src                             */
  #define LDA     $r7     /* argument 4: lda                             */
  #define DST     $r8     /* argument 5: dst                             */

  /* ---- Loop counters ------------------------------------------------ */
  #define I       $r9     /* inner counter / scratch for mask tests      */
  #define J       $r10    /* outer counter                               */

  /* ---- Source column cursors ---------------------------------------- */
  /* (S5-S7 are not referenced by the copy routine that follows.)        */
  #define S1      $r12
  #define S2      $r13
  #define S3      $r14
  #define S4      $r15
  #define S5      $r16
  #define S6      $r17
  #define S7      $r18

  /* ---- Running pointers and strides --------------------------------- */
  #define TD      $r20    /* running destination pointer                 */
  #define TS      $r11    /* running source pointer                      */
  #define TL      $r19    /* lda scaled to a byte stride                 */
  #define T0      $r23    /* scratch; $r23 is a callee-saved register    */
                          /* in the LoongArch calling convention, so any */
                          /* code that writes it must save/restore it    */
  #define ZERO    $r0     /* hard-wired zero register                    */

  /* ---- Scalar floating-point temporaries ---------------------------- */
  #define F0      $f0
  #define F1      $f1
  #define F2      $f2
  #define F3      $f3
  #define F4      $f4
  #define F5      $f5
  #define F6      $f6
  #define F7      $f7

  /* ---- LASX vectors: raw loads -------------------------------------- */
  #define U0      $xr0
  #define U1      $xr1
  #define U2      $xr2
  #define U3      $xr3
  #define U4      $xr4
  #define U5      $xr5
  #define U6      $xr6
  #define U7      $xr7

  /* ---- LASX vectors: shuffle results -------------------------------- */
  #define D0      $xr8
  #define D1      $xr9
  #define D2      $xr10
  #define D3      $xr11
  #define D4      $xr12
  #define D5      $xr13
  #define D6      $xr14
  #define D7      $xr15
  #define D8      $xr16
  /*
   * Packing ("ncopy") routine, 4 columns at a time, LASX version.
   *
   * In:   M   ($r4) = number of rows
   *       N   ($r5) = number of columns
   *       SRC ($r6) = source matrix; column j starts at SRC + j * LDA * 8
   *       LDA ($r7) = column stride, counted in 8-byte elements
   *       DST ($r8) = destination buffer, written sequentially
   * Out:  nothing is placed in a return register by this routine.
   *
   * Every element is 8 bytes and is moved as two 32-bit floats.
   *
   * Layout written to DST:
   *   - for each full group of 4 columns, for each row i:
   *         c0[i] c1[i] c2[i] c3[i]
   *   - then, if (n & 2), for each row i:   c0[i] c1[i]
   *   - then, if (n & 1), the last column copied as-is.
   * Rows are handled 4 at a time, followed by a 2-row and a 1-row tail.
   *
   * Leaf routine.  It writes only temporaries ($r9-$r15, $r19, $r20,
   * $f0-$f7, $xr0-$xr5, $xr8-$xr11), so no stack frame is created and no
   * callee-saved register has to be preserved.  The "advance TS by k
   * columns" step is derived from the column cursors that were just set
   * up (last cursor + TL), so no extra scratch register is required.
   *
   * In the lane comments below the numbers are 1-based float indices:
   * column k (k = 0..3) contributes floats 8k+1 .. 8k+8 (= 4 elements).
   */
  PROLOGUE

        move        TD, DST                 // TD: write cursor in dst
        move        TS, SRC                 // TS: first column not yet packed
        slli.d      TL, LDA, 0x03           // TL = lda * 8 (bytes per column)

        srai.d      J, N, 0x02              // J = n / 4 column groups
        beq         J, ZERO, .L_N0

  /* ------------------------------------------------------------------ */
  /* Groups of 4 columns                                                  */
  /* ------------------------------------------------------------------ */
  .L_J1:
        move        S1, TS                  // S1..S4: the four column cursors
        add.d       S2, S1, TL
        add.d       S3, S2, TL
        add.d       S4, S3, TL
        add.d       TS, S4, TL              // TS += 4 columns (S4 is TS + 3*TL)

        srai.d      I, M, 0x02              // I = m / 4 row blocks
        beq         I, ZERO, .L_I3

  .L_I1:    /* 4 rows x 4 columns: transpose at element (64-bit) granularity */
        xvld        U0, S1, 0x00            // 1  2  3  4  5  6  7  8
        xvld        U1, S2, 0x00            // 9  10 11 12 13 14 15 16
        xvld        U2, S3, 0x00            // 17 18 19 20 21 22 23 24
        xvld        U3, S4, 0x00            // 25 26 27 28 29 30 31 32

        xvand.v     D0, U0, U0              // x & x == x: plain register copies,
        xvand.v     D1, U1, U1              // because the shuffles below use
        xvand.v     D2, U2, U2              // their destination as an input
        xvand.v     D3, U3, U3

        xvshuf4i.d  D0, U1, 0x88            // 1  2  9  10 5  6  13 14
        xvshuf4i.d  D2, U3, 0x88            // 17 18 25 26 21 22 29 30
        xvshuf4i.d  D1, U0, 0x77            // 3  4  11 12 7  8  15 16
        xvshuf4i.d  D3, U2, 0x77            // 19 20 27 28 23 24 31 32

        xvand.v     U4, D0, D0              // copies again; D0/D1 are still
        xvand.v     U5, D1, D1              // needed for the high halves

        xvpermi.q   U4, D2, 0x02            // 1  2  9  10 17 18 25 26  (row 0)
        xvpermi.q   U5, D3, 0x02            // 3  4  11 12 19 20 27 28  (row 1)
        xvpermi.q   D2, D0, 0x31            // 5  6  13 14 21 22 29 30  (row 2)
        xvpermi.q   D3, D1, 0x31            // 7  8  15 16 23 24 31 32  (row 3)

        xvst        U4, TD, 0x00
        xvst        U5, TD, 0x20
        xvst        D2, TD, 0x40
        xvst        D3, TD, 0x60

        addi.d      S1, S1, 0x20            // 4 elements consumed per column
        addi.d      S2, S2, 0x20
        addi.d      S3, S3, 0x20
        addi.d      S4, S4, 0x20
        addi.d      TD, TD, 0x80            // 16 elements written

        addi.d      I, I, -1
        blt         ZERO, I, .L_I1

  .L_I3:    /* if (m & 2): 2 rows x 4 columns, 128-bit LSX */
        andi        I, M, 0x02
        beq         I, ZERO, .L_II20

        vld         $vr0, S1, 0x00          // c0[i], c0[i+1]
        vld         $vr1, S2, 0x00          // c1[i], c1[i+1]
        vld         $vr2, S3, 0x00          // c2[i], c2[i+1]
        vld         $vr3, S4, 0x00          // c3[i], c3[i+1]

        vand.v      $vr8,  $vr1, $vr1       // copies of the odd columns; they
        vand.v      $vr9,  $vr1, $vr1       // supply the upper 64 bits of each
        vand.v      $vr10, $vr3, $vr3       // vpermi.w result
        vand.v      $vr11, $vr3, $vr3

        vpermi.w    $vr8,  $vr0, 0x44       // c0[i]   c1[i]
        vpermi.w    $vr10, $vr2, 0x44       // c2[i]   c3[i]
        vpermi.w    $vr9,  $vr0, 0xee       // c0[i+1] c1[i+1]
        vpermi.w    $vr11, $vr2, 0xee       // c2[i+1] c3[i+1]

        vst         $vr8,  TD, 0x00
        vst         $vr10, TD, 0x10
        vst         $vr9,  TD, 0x20
        vst         $vr11, TD, 0x30

        addi.d      S1, S1, 0x10
        addi.d      S2, S2, 0x10
        addi.d      S3, S3, 0x10
        addi.d      S4, S4, 0x10
        addi.d      TD, TD, 0x40

  .L_II20:  /* if (m & 1): last row of the 4 columns, scalar */
        andi        I, M, 0x01
        beq         I, ZERO, .L_J0

        fld.s       F0, S1, 0x00
        fld.s       F1, S1, 0x04
        fld.s       F2, S2, 0x00
        fld.s       F3, S2, 0x04
        fld.s       F4, S3, 0x00
        fld.s       F5, S3, 0x04
        fld.s       F6, S4, 0x00
        fld.s       F7, S4, 0x04

        fst.s       F0, TD, 0x00
        fst.s       F1, TD, 0x04
        fst.s       F2, TD, 0x08
        fst.s       F3, TD, 0x0c
        fst.s       F4, TD, 0x10
        fst.s       F5, TD, 0x14
        fst.s       F6, TD, 0x18
        fst.s       F7, TD, 0x1c

        addi.d      TD, TD, 0x20            // column cursors are dead from here

  .L_J0:
        addi.d      J, J, -1
        blt         ZERO, J, .L_J1

  /* ------------------------------------------------------------------ */
  /* if (n & 2): one group of 2 columns                                   */
  /* ------------------------------------------------------------------ */
  .L_N0:
        andi        I, N, 0x02
        beq         ZERO, I, .L_N20

        move        S1, TS
        add.d       S2, S1, TL
        add.d       TS, S2, TL              // TS += 2 columns (S2 is TS + TL)

        srai.d      I, M, 0x02
        beq         ZERO, I, .L_N10

  .L_N11:   /* 4 rows x 2 columns */
        xvld        U0, S1, 0x00            // 1  2  3  4  5  6  7  8
        xvld        U1, S2, 0x00            // 9  10 11 12 13 14 15 16

        xvand.v     D0, U0, U0
        xvand.v     D1, U1, U1

        xvshuf4i.d  D0, U1, 0x88            // 1  2  9  10 5  6  13 14
        xvshuf4i.d  D1, U0, 0x77            // 3  4  11 12 7  8  15 16

        xvand.v     U4, D0, D0
        xvpermi.q   U4, D1, 0x02            // 1  2  9  10 3  4  11 12  (rows 0,1)
        xvpermi.q   D1, D0, 0x31            // 5  6  13 14 7  8  15 16  (rows 2,3)

        xvst        U4, TD, 0x00
        xvst        D1, TD, 0x20

        addi.d      S1, S1, 0x20
        addi.d      S2, S2, 0x20
        addi.d      TD, TD, 0x40

        addi.d      I, I, -1
        blt         ZERO, I, .L_N11

  .L_N10:   /* if (m & 2): 2 rows x 2 columns */
        andi        I, M, 0x02
        beq         I, ZERO, .L_N130

        vld         $vr0, S1, 0x00          // c0[i], c0[i+1]
        vld         $vr1, S2, 0x00          // c1[i], c1[i+1]

        vand.v      $vr8, $vr1, $vr1
        vpermi.w    $vr8, $vr0, 0x44        // c0[i]   c1[i]
        vpermi.w    $vr1, $vr0, 0xee        // c0[i+1] c1[i+1]

        vst         $vr8, TD, 0x00
        vst         $vr1, TD, 0x10

        addi.d      S1, S1, 0x10
        addi.d      S2, S2, 0x10
        addi.d      TD, TD, 0x20

  .L_N130:  /* if (m & 1): last row of the 2 columns */
        andi        I, M, 0x01
        beq         I, ZERO, .L_N20

        fld.s       F0, S1, 0x00
        fld.s       F1, S1, 0x04
        fld.s       F2, S2, 0x00
        fld.s       F3, S2, 0x04

        fst.s       F0, TD, 0x00
        fst.s       F1, TD, 0x04
        fst.s       F2, TD, 0x08
        fst.s       F3, TD, 0x0c

        addi.d      TD, TD, 0x10

  /* ------------------------------------------------------------------ */
  /* if (n & 1): final single column, straight copy                       */
  /* ------------------------------------------------------------------ */
  .L_N20:
        andi        I, N, 0x01
        beq         I, ZERO, .L_N00

        move        S1, TS

        srai.d      I, M, 0x02
        beq         I, ZERO, .L_N30

  .L_N21:   /* 4 rows at a time */
        xvld        U0, S1, 0x00
        xvst        U0, TD, 0x00

        addi.d      S1, S1, 0x20
        addi.d      TD, TD, 0x20

        addi.d      I, I, -1
        blt         ZERO, I, .L_N21

  .L_N30:   /* if (m & 2) */
        andi        I, M, 0x02
        beq         I, ZERO, .L_N330

        vld         $vr0, S1, 0x00
        vst         $vr0, TD, 0x00

        addi.d      S1, S1, 0x10
        addi.d      TD, TD, 0x10

  .L_N330:  /* if (m & 1) */
        andi        I, M, 0x01
        beq         I, ZERO, .L_N00

        fld.s       F0, S1, 0x00
        fld.s       F1, S1, 0x04

        fst.s       F0, TD, 0x00
        fst.s       F1, TD, 0x04

  .L_N00:
        jirl        $r0, $r1, 0x00          // return to caller ($r1 = $ra)

  EPILOGUE