You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_ncopy_4_lsx.S 8.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  /* ------------------------------------------------------------------
   * Register aliases used by the packing routine in this file.
   * ------------------------------------------------------------------ */

  /* Incoming arguments */
  #define M       $r4     /* param 1: m   */
  #define N       $r5     /* param 2: n   */
  #define SRC     $r6     /* param 3: src */
  #define LDA     $r7     /* param 4: lda */
  #define DST     $r8     /* param 5: dst */

  /* Loop counters; I also holds the (m & k) / (n & k) remainder tests */
  #define I       $r9
  #define J       $r10

  /* Running read pointers, one per source vector of the current group */
  #define S1      $r12
  #define S2      $r13
  #define S3      $r14
  #define S4      $r15
  /* S5-S7 are not referenced by the routine visible in this file */
  #define S5      $r16
  #define S6      $r17
  #define S7      $r18

  /* Bookkeeping */
  #define TS      $r11    /* start of the next group in the source        */
  #define TD      $r20    /* running write pointer into the destination   */
  #define TL      $r19    /* byte distance between adjacent source vectors */
  #define T0      $r23    /* scratch; saved and restored by the routine   */
  #define ZERO    $r0

  /* Scalar FP registers for the single-element (m & 1) tails */
  #define F0      $f0
  #define F1      $f1
  #define F2      $f2
  #define F3      $f3
  #define F4      $f4
  #define F5      $f5
  #define F6      $f6
  #define F7      $f7

  /* LSX vectors: U* receive raw loads from the source */
  #define U0      $vr0
  #define U1      $vr1
  #define U2      $vr2
  #define U3      $vr3
  #define U4      $vr4
  #define U5      $vr5
  #define U6      $vr6
  #define U7      $vr7

  /* LSX vectors: D* hold the interleaved data that is stored to TD
   * (D8 is not referenced by the routine visible in this file) */
  #define D0      $vr8
  #define D1      $vr9
  #define D2      $vr10
  #define D3      $vr11
  #define D4      $vr12
  #define D5      $vr13
  #define D6      $vr14
  #define D7      $vr15
  #define D8      $vr16
  /*
   * Pack an m x n block of 8-byte elements from SRC into the contiguous
   * buffer DST.
   *
   * In:    M ($r4) = m, N ($r5) = n, SRC ($r6), LDA ($r7), DST ($r8).
   *        Adjacent source vectors are LDA elements (LDA << 3 bytes) apart.
   *        NOTE(review): presumably these vectors are the columns of a
   *        column-major single-precision complex matrix (each element is
   *        moved as two .s floats in the scalar tails) - confirm with caller.
   * Out:   m * n elements written to DST; no value is returned in a register.
   * Uses:  I, J, S1-S4, TS, TD, TL, T0, F0-F7, U0-U7, D0-D7.
   *        T0 ($r23) is spilled to the stack and restored before returning.
   *
   * Layout produced: source vectors are consumed in groups of 4, then a
   * group of 2 if (n & 2), then a single vector if (n & 1).  Inside a group
   * the output is index-major: for every index i, element i of each vector
   * of the group is stored back to back.  Indices are handled 4 at a time,
   * followed by the (m & 2) and (m & 1) tails.
   *
   * Shuffle idiom: "vand.v Dx, Ub, Ub" copies Ub into Dx, then
   *   vpermi.w Dx, Ua, 0x44  ->  Dx = { Ua.low64,  Ub.low64  }
   *   vpermi.w Dx, Ua, 0xee  ->  Dx = { Ua.high64, Ub.high64 }
   * i.e. one element from vector a followed by the same-index element of b.
   */
  PROLOGUE

      /* $r23 (T0) is callee-saved in the LoongArch calling convention, so it
       * is spilled here.  A 16-byte frame is used so that $sp stays 16-byte
       * aligned, as the psABI expects, for the whole routine. */
      addi.d      $sp,  $sp,  -16
      SDARG       $r23, $sp,  0

      move        TD,   DST                 /* write cursor              */
      move        TS,   SRC                 /* start of current group    */
      slli.d      TL,   LDA,  0x03          /* stride in bytes = lda * 8 */

      srai.d      J,    N,    0x02          /* J = n / 4 full groups     */
      beq         J,    ZERO, .L_N0

  .L_J1:  /* ---- group of 4 source vectors; repeated J times ---- */
      move        S1,   TS
      add.d       S2,   S1,   TL
      add.d       S3,   S2,   TL
      add.d       S4,   S3,   TL
      slli.d      T0,   TL,   0x02
      add.d       TS,   TS,   T0            /* TS += 4 * stride          */

      srai.d      I,    M,    0x02          /* I = m / 4                 */
      beq         I,    ZERO, .L_I3

  .L_I1:  /* 4 elements from each of the 4 vectors -> 128 bytes out */
      vld         U0,   S1,   0x00
      vld         U1,   S1,   0x10
      vld         U2,   S2,   0x00
      vld         U3,   S2,   0x10
      vld         U4,   S3,   0x00
      vld         U5,   S3,   0x10
      vld         U6,   S4,   0x00
      vld         U7,   S4,   0x10

      /* seed each destination with the "b" operand of its pair */
      vand.v      D0,   U2,   U2
      vand.v      D1,   U3,   U3
      vand.v      D2,   U2,   U2
      vand.v      D3,   U3,   U3
      vand.v      D4,   U6,   U6
      vand.v      D5,   U7,   U7
      vand.v      D6,   U6,   U6
      vand.v      D7,   U7,   U7

      vpermi.w    D0,   U0,   0x44          /* idx 0: S1, S2 */
      vpermi.w    D4,   U4,   0x44          /* idx 0: S3, S4 */
      vpermi.w    D2,   U0,   0xee          /* idx 1: S1, S2 */
      vpermi.w    D6,   U4,   0xee          /* idx 1: S3, S4 */
      vpermi.w    D1,   U1,   0x44          /* idx 2: S1, S2 */
      vpermi.w    D5,   U5,   0x44          /* idx 2: S3, S4 */
      vpermi.w    D3,   U1,   0xee          /* idx 3: S1, S2 */
      vpermi.w    D7,   U5,   0xee          /* idx 3: S3, S4 */

      vst         D0,   TD,   0x00
      vst         D4,   TD,   0x10
      vst         D2,   TD,   0x20
      vst         D6,   TD,   0x30
      vst         D1,   TD,   0x40
      vst         D5,   TD,   0x50
      vst         D3,   TD,   0x60
      vst         D7,   TD,   0x70

      addi.d      S1,   S1,   0x20          /* 4 elements consumed per vector */
      addi.d      S2,   S2,   0x20
      addi.d      S3,   S3,   0x20
      addi.d      S4,   S4,   0x20
      addi.d      TD,   TD,   0x80
      addi.d      I,    I,    -1
      blt         ZERO, I,    .L_I1

  .L_I3:  /* if (m & 2): 2 elements from each of the 4 vectors */
      andi        I,    M,    0x02
      beq         I,    ZERO, .L_II20

      vld         U0,   S1,   0x00
      vld         U1,   S2,   0x00
      vld         U2,   S3,   0x00
      vld         U3,   S4,   0x00

      vand.v      D0,   U1,   U1
      vand.v      D1,   U1,   U1
      vand.v      D2,   U3,   U3
      vand.v      D3,   U3,   U3

      vpermi.w    D0,   U0,   0x44          /* idx 0: S1, S2 */
      vpermi.w    D2,   U2,   0x44          /* idx 0: S3, S4 */
      vpermi.w    D1,   U0,   0xee          /* idx 1: S1, S2 */
      vpermi.w    D3,   U2,   0xee          /* idx 1: S3, S4 */

      vst         D0,   TD,   0x00
      vst         D2,   TD,   0x10
      vst         D1,   TD,   0x20
      vst         D3,   TD,   0x30

      addi.d      S1,   S1,   0x10
      addi.d      S2,   S2,   0x10
      addi.d      S3,   S3,   0x10
      addi.d      S4,   S4,   0x10
      addi.d      TD,   TD,   0x40

  .L_II20:  /* if (m & 1): last element of each of the 4 vectors */
      andi        I,    M,    0x01
      beq         I,    ZERO, .L_J0

      fld.s       F0,   S1,   0x00
      fld.s       F1,   S1,   0x04
      fld.s       F2,   S2,   0x00
      fld.s       F3,   S2,   0x04
      fld.s       F4,   S3,   0x00
      fld.s       F5,   S3,   0x04
      fld.s       F6,   S4,   0x00
      fld.s       F7,   S4,   0x04

      fst.s       F0,   TD,   0x00
      fst.s       F1,   TD,   0x04
      fst.s       F2,   TD,   0x08
      fst.s       F3,   TD,   0x0c
      fst.s       F4,   TD,   0x10
      fst.s       F5,   TD,   0x14
      fst.s       F6,   TD,   0x18
      fst.s       F7,   TD,   0x1c

      addi.d      TD,   TD,   0x20

  .L_J0:  /* next group of 4 */
      addi.d      J,    J,    -1
      blt         ZERO, J,    .L_J1

  .L_N0:  /* ---- if (n & 2): one group of 2 source vectors ---- */
      andi        I,    N,    0x02
      beq         ZERO, I,    .L_N20

      move        S1,   TS
      add.d       S2,   S1,   TL
      slli.d      T0,   TL,   0x01
      add.d       TS,   TS,   T0            /* TS += 2 * stride */

      srai.d      I,    M,    0x02
      beq         ZERO, I,    .L_N10

  .L_N11:  /* 4 elements from each of the 2 vectors -> 64 bytes out */
      vld         U0,   S1,   0x00
      vld         U1,   S1,   0x10
      vld         U2,   S2,   0x00
      vld         U3,   S2,   0x10

      vand.v      D0,   U2,   U2
      vand.v      D1,   U3,   U3
      vand.v      D2,   U2,   U2
      vand.v      D3,   U3,   U3

      vpermi.w    D0,   U0,   0x44          /* idx 0 */
      vpermi.w    D2,   U0,   0xee          /* idx 1 */
      vpermi.w    D1,   U1,   0x44          /* idx 2 */
      vpermi.w    D3,   U1,   0xee          /* idx 3 */

      vst         D0,   TD,   0x00
      vst         D2,   TD,   0x10
      vst         D1,   TD,   0x20
      vst         D3,   TD,   0x30

      addi.d      S1,   S1,   0x20
      addi.d      S2,   S2,   0x20
      addi.d      TD,   TD,   0x40
      addi.d      I,    I,    -1
      blt         ZERO, I,    .L_N11

  .L_N10:  /* if (m & 2): 2 elements from each of the 2 vectors */
      andi        I,    M,    0x02
      beq         I,    ZERO, .L_N130

      vld         U0,   S1,   0x00
      vld         U1,   S2,   0x00

      vand.v      D0,   U1,   U1
      vpermi.w    D0,   U0,   0x44          /* idx 0 */
      vpermi.w    U1,   U0,   0xee          /* idx 1, built in place in U1 */

      vst         D0,   TD,   0x00
      vst         U1,   TD,   0x10

      addi.d      S1,   S1,   0x10
      addi.d      S2,   S2,   0x10
      addi.d      TD,   TD,   0x20

  .L_N130:  /* if (m & 1): last element of each of the 2 vectors */
      andi        I,    M,    0x01
      beq         I,    ZERO, .L_N20

      fld.s       F0,   S1,   0x00
      fld.s       F1,   S1,   0x04
      fld.s       F2,   S2,   0x00
      fld.s       F3,   S2,   0x04

      fst.s       F0,   TD,   0x00
      fst.s       F1,   TD,   0x04
      fst.s       F2,   TD,   0x08
      fst.s       F3,   TD,   0x0c

      addi.d      TD,   TD,   0x10

  .L_N20:  /* ---- if (n & 1): final single source vector, plain copy ---- */
      andi        I,    N,    0x01
      beq         I,    ZERO, .L_N00

      move        S1,   TS
      srai.d      I,    M,    0x02
      beq         I,    ZERO, .L_N30

  .L_N21:  /* 4 elements (32 bytes) per iteration */
      vld         U0,   S1,   0x00
      vld         U1,   S1,   0x10
      vst         U0,   TD,   0x00
      vst         U1,   TD,   0x10

      addi.d      S1,   S1,   0x20
      addi.d      TD,   TD,   0x20
      addi.d      I,    I,    -1
      blt         ZERO, I,    .L_N21

  .L_N30:  /* if (m & 2): 2 elements (16 bytes) */
      andi        I,    M,    0x02
      beq         I,    ZERO, .L_N330

      vld         U0,   S1,   0x00
      vst         U0,   TD,   0x00

      addi.d      S1,   S1,   0x10
      addi.d      TD,   TD,   0x10

  .L_N330:  /* if (m & 1): final element (8 bytes) */
      andi        I,    M,    0x01
      beq         I,    ZERO, .L_N00

      fld.s       F0,   S1,   0x00
      fld.s       F1,   S1,   0x04
      fst.s       F0,   TD,   0x00
      fst.s       F1,   TD,   0x04

  .L_N00:  /* restore $r23, release the 16-byte frame, return */
      LDARG       $r23, $sp,  0
      addi.d      $sp,  $sp,  16
      jirl        $r0,  $r1,  0x00

  EPILOGUE