You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

ssymv_U_lasx.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. /* Param */
  31. #define M $r4 /* upper bound of the column index J */
  32. #define N $r5 /* number of trailing columns processed: J starts at M - N */
  33. #define A $r6 /* matrix base address (copied to AO1) */
  34. #define LDA $r7 /* column stride; scaled to bytes in the prologue */
  35. #define X $r8 /* x vector base address */
  36. #define INCX $r9 /* x stride; scaled to bytes in the prologue */
  37. #define Y $r10 /* y vector base address */
  38. #define INCY $r11 /* y stride; scaled to bytes in the prologue */
  39. #define BUFFER $r16 /* NOTE(review): same register as T6 and not referenced in this listing */
  40. #define ALPHA $f0 /* scalar multiplier */
  41. #define JY $r18 /* byte offset of element J in Y */
  42. #define JX $r31 /* byte offset of element J in X */
  43. #define T0 $r19 /* scratch: block count and remainder-bit tests */
  44. #define T1 $r20 /* scratch: byte offsets */
  45. #define M1 $r12 /* M - N, the first column index */
  46. #define AO4 $r13 /* not referenced in this listing */
  47. #define I $r14 /* counter of 8-element blocks */
  48. #define J $r15 /* current column index */
  49. #define AO1 $r23 /* matrix base used for every matrix load */
  50. #define AO2 $r24 /* not referenced in this listing */
  51. #define IX $r25 /* running byte offset into X */
  52. #define IY $r26 /* running byte offset into Y */
  53. #define II $r27 /* running byte offset within the current column */
  54. #define T2 $r28 /* scratch (also clobbered by the LOAD/STORE macros) */
  55. #define T3 $r29 /* scratch */
  56. #define T4 $r30 /* scratch */
  57. #define T5 $r17 /* INCY - 1 in element units; zero selects the contiguous Y path */
  58. #define T6 $r16 /* INCX - 1 in element units; zero selects the contiguous X path */
  59. /* LASX vector registers ($xr) */
  60. #define U0 $xr31
  61. #define U1 $xr1
  62. #define U2 $xr2
  63. #define U3 $xr3
  64. #define U4 $xr4
  65. #define U5 $xr5
  66. #define U6 $xr6
  67. #define U7 $xr7
  68. #define U8 $xr8
  69. #define U9 $xr9
  70. #define U10 $xr10
  71. #define U11 $xr11
  72. #define U12 $xr12
  73. #define U13 $xr13
  74. #define U14 $xr14
  75. #define U15 $xr15
  76. #define U16 $xr16
  77. #define VALPHA $xr17 /* alpha replicated into every word lane by the prologue */
  78. #define a2 $f2 /* a2..a9: FP aliases; the code in this listing names $fN directly */
  79. #define a3 $f3
  80. #define a4 $f4
  81. #define a5 $f5
  82. #define a6 $f6
  83. #define a7 $f7
  84. #define a8 $f8
  85. #define a9 $f9
  86. .macro LOAD_Y_8 /* U4 = eight Y floats starting at byte offset IY; label names are fixed, so expand only once per file */
  87. beqz T5, .L01_Y_0 /* T5 == 0 (INCY - 1 == 0): take the contiguous path */
  88. fldx.s $f4, Y, IY /* strided path: gather eight scalars into $f4..$f11, stepping T2 by INCY bytes */
  89. add.d T2, IY, INCY
  90. fldx.s $f5, Y, T2
  91. add.d T2, T2, INCY
  92. fldx.s $f6, Y, T2
  93. add.d T2, T2, INCY
  94. fldx.s $f7, Y, T2
  95. add.d T2, T2, INCY
  96. fldx.s $f8, Y, T2
  97. add.d T2, T2, INCY
  98. fldx.s $f9, Y, T2
  99. add.d T2, T2, INCY
  100. fldx.s $f10, Y, T2
  101. add.d T2, T2, INCY
  102. fldx.s $f11, Y, T2
  103. vextrins.w $vr4, $vr5, 0x10 /* $vr4 lanes 1..3 <- lane 0 of $vr5..$vr7 */
  104. vextrins.w $vr4, $vr6, 0x20
  105. vextrins.w $vr4, $vr7, 0x30
  106. vextrins.w $vr8, $vr9, 0x10 /* $vr8 lanes 1..3 <- lane 0 of $vr9..$vr11 */
  107. vextrins.w $vr8, $vr10, 0x20
  108. vextrins.w $vr8, $vr11, 0x30
  109. xvpermi.q U4, U8, 0x02 /* U4 keeps its low quad; its high quad becomes the low quad of U8 */
  110. b .L01_Y_1
  111. .L01_Y_0:
  112. xvldx U4, Y, IY /* contiguous path: a single vector load */
  113. .L01_Y_1:
  114. .endm
  115. .macro STORE_Y_8 /* store the eight floats in U4 to Y starting at byte offset IY; label names are fixed, so expand only once per file */
  116. beqz T5, .L01_Y_2 /* T5 == 0 (INCY - 1 == 0): take the contiguous path */
  117. xvpermi.d U8, U4, 0xee /* strided path: low quad of U8 <- high quad of U4 */
  118. vextrins.w $vr5, $vr4, 0x01 /* lane 0 of $vr5..$vr7 <- lanes 1..3 of $vr4 */
  119. vextrins.w $vr6, $vr4, 0x02
  120. vextrins.w $vr7, $vr4, 0x03
  121. vextrins.w $vr9, $vr8, 0x01 /* lane 0 of $vr9..$vr11 <- lanes 1..3 of $vr8 */
  122. vextrins.w $vr10, $vr8, 0x02
  123. vextrins.w $vr11, $vr8, 0x03
  124. fstx.s $f4, Y, IY /* scatter $f4..$f11, stepping T2 by INCY bytes */
  125. add.d T2, IY, INCY
  126. fstx.s $f5, Y, T2
  127. add.d T2, T2, INCY
  128. fstx.s $f6, Y, T2
  129. add.d T2, T2, INCY
  130. fstx.s $f7, Y, T2
  131. add.d T2, T2, INCY
  132. fstx.s $f8, Y, T2
  133. add.d T2, T2, INCY
  134. fstx.s $f9, Y, T2
  135. add.d T2, T2, INCY
  136. fstx.s $f10, Y, T2
  137. add.d T2, T2, INCY
  138. fstx.s $f11, Y, T2
  139. b .L01_Y_3
  140. .L01_Y_2:
  141. xvstx U4, Y, IY /* contiguous path: a single vector store */
  142. .L01_Y_3:
  143. .endm
  144. .macro LOAD_X_8 /* U4 = eight X floats starting at byte offset IX; label names are fixed, so expand only once per file */
  145. beqz T6, .L01_X_0 /* T6 == 0 (INCX - 1 == 0): take the contiguous path */
  146. fldx.s $f4, X, IX /* strided path: gather eight scalars into $f4..$f11, stepping T2 by INCX bytes */
  147. add.d T2, IX, INCX
  148. fldx.s $f5, X, T2
  149. add.d T2, T2, INCX
  150. fldx.s $f6, X, T2
  151. add.d T2, T2, INCX
  152. fldx.s $f7, X, T2
  153. add.d T2, T2, INCX
  154. fldx.s $f8, X, T2
  155. add.d T2, T2, INCX
  156. fldx.s $f9, X, T2
  157. add.d T2, T2, INCX
  158. fldx.s $f10, X, T2
  159. add.d T2, T2, INCX
  160. fldx.s $f11, X, T2
  161. vextrins.w $vr4, $vr5, 0x10 /* $vr4 lanes 1..3 <- lane 0 of $vr5..$vr7 */
  162. vextrins.w $vr4, $vr6, 0x20
  163. vextrins.w $vr4, $vr7, 0x30
  164. vextrins.w $vr8, $vr9, 0x10 /* $vr8 lanes 1..3 <- lane 0 of $vr9..$vr11 */
  165. vextrins.w $vr8, $vr10, 0x20
  166. vextrins.w $vr8, $vr11, 0x30
  167. xvpermi.q U4, U8, 0x02 /* U4 keeps its low quad; its high quad becomes the low quad of U8 */
  168. b .L01_X_1
  169. .L01_X_0:
  170. xvldx U4, X, IX /* contiguous path: a single vector load */
  171. .L01_X_1:
  172. .endm
  173. PROLOGUE /* for each column J in [M-N, M): temp1 = alpha*x[J]; y[i] += temp1*a[i] and temp2 += a[i]*x[i] for i < J; y[J] += temp1*a[J] + alpha*temp2 */
  174. addi.d $sp, $sp, -88 /* NOTE(review): 88 is not a multiple of 16 and slot 24 is never used; confirm against the psABI stack rules */
  175. SDARG $r23, $sp, 0 /* save $r23..$r31; they are restored at .L999 */
  176. SDARG $r24, $sp, 8
  177. SDARG $r25, $sp, 16
  178. SDARG $r26, $sp, 32
  179. SDARG $r27, $sp, 40
  180. SDARG $r28, $sp, 48
  181. SDARG $r29, $sp, 56
  182. SDARG $r30, $sp, 64
  183. SDARG $r31, $sp, 72
  184. ST ALPHA, $sp, 80 /* spill alpha so it can be replicated from memory */
  185. xvldrepl.w VALPHA, $sp, 80
  186. addi.d T5, INCY, -1 /* T5/T6 == 0 flag unit stride; computed before the strides are scaled to bytes */
  187. addi.d T6, INCX, -1
  188. slli.d LDA, LDA, BASE_SHIFT /* convert LDA, INCX, INCY from elements to bytes */
  189. slli.d INCX, INCX, BASE_SHIFT
  190. slli.d INCY, INCY, BASE_SHIFT
  191. bge $r0, M, .L999 /* nothing to do when M <= 0 or N <= 0 */
  192. bge $r0, N, .L999
  193. sub.d M1, M, N /* first column handled is M - N */
  194. mul.d JY, M1, INCY /* byte offsets of y[J] and x[J] for the first column */
  195. mul.d JX, M1, INCX
  196. move J, M1
  197. move AO1, A
  198. beq J, M, .L999
  199. .L01: /* per-column loop */
  200. xvxor.v U2, U2, U2 /* temp2 accumulator = 0 */
  201. fldx.s $f6, X, JX
  202. fmul.s $f3, ALPHA, $f6 /* temp1 = alpha * x[J] */
  203. xvreplve0.w U3, U3 /* replicate temp1 into every lane of U3 */
  204. move IY, $r0
  205. move IX, $r0
  206. move II, $r0
  207. move I, $r0
  208. srai.d T0, J, 3 /* T0 = number of full 8-element blocks in rows [0, J) */
  209. beq I, T0, .L03
  210. mul.d T1, J, LDA /* 64-bit product: byte offset of column J (a 32-bit mul.w wraps for offsets of 2 GiB or more) */
  211. add.d T1, T1, II
  212. .L02: /* /8 */
  213. xvldx U1, AO1, T1 /* U1 = eight elements of column J */
  214. LOAD_Y_8
  215. xvfmadd.s U4, U3, U1, U4 /* y block += temp1 * a block */
  216. STORE_Y_8
  217. alsl.d IY, INCY, IY, 3 /* IY += 8 * INCY */
  218. LOAD_X_8
  219. xvfmadd.s U2, U1, U4, U2 /* U4 now holds the x block: temp2 lanes += a * x */
  220. alsl.d IX, INCX, IX, 3 /* IX += 8 * INCX */
  221. addi.d II, II, 32
  222. addi.d T1, T1, 32
  223. addi.d I, I, 1
  224. blt I, T0, .L02
  225. /* Acc U2: GACC comes from loongarch64_asm.S; presumably it sums the lanes of U2 into U4 (confirm there) */
  226. GACC xvf, s, U4, U2
  227. xvreplve0.d U2, U4 /* copy the low 64 bits of U4 across U2 so $f2 carries the running temp2 */
  228. .L03: /* &4 */
  229. andi T0, J, 4
  230. beq $r0, T0, .L04
  231. mul.d T1, J, LDA /* 64-bit byte offset of column J */
  232. add.d T1, T1, II
  233. vldx $vr1, AO1, T1 /* $vr1 = four elements of column J */
  234. move T1, IY /* T1..T4 = byte offsets of four consecutive y elements */
  235. add.d T2, T1, INCY
  236. add.d T3, T2, INCY
  237. add.d T4, T3, INCY
  238. fldx.s $f4, Y, T1
  239. fldx.s $f5, Y, T2
  240. fldx.s $f6, Y, T3
  241. fldx.s $f7, Y, T4
  242. vextrins.w $vr4, $vr5, 0x10 /* pack the four y scalars into $vr4 */
  243. vextrins.w $vr4, $vr6, 0x20
  244. vextrins.w $vr4, $vr7, 0x30
  245. vfmadd.s $vr4, $vr3, $vr1, $vr4 /* y += temp1 * a */
  246. vextrins.w $vr5, $vr4, 0x01 /* unpack lanes 1..3 for the scalar stores */
  247. vextrins.w $vr6, $vr4, 0x02
  248. vextrins.w $vr7, $vr4, 0x03
  249. fstx.s $f4, Y, T1
  250. fstx.s $f5, Y, T2
  251. fstx.s $f6, Y, T3
  252. fstx.s $f7, Y, T4
  253. slli.d T1, INCY, 2
  254. add.d IY, IY, T1 /* IY += 4 * INCY */
  255. move T1, IX /* T1..T4 = byte offsets of four consecutive x elements */
  256. add.d T2, T1, INCX
  257. add.d T3, T2, INCX
  258. add.d T4, T3, INCX
  259. fldx.s $f4, X, T1
  260. fldx.s $f5, X, T2
  261. fldx.s $f6, X, T3
  262. fldx.s $f7, X, T4
  263. vextrins.w $vr4, $vr5, 0x10 /* pack the four x scalars into $vr4 */
  264. vextrins.w $vr4, $vr6, 0x20
  265. vextrins.w $vr4, $vr7, 0x30
  266. vand.v $vr12, $vr2, $vr2 /* $vr12 = copy of the accumulator before this step */
  267. vfmadd.s $vr2, $vr1, $vr4, $vr2
  268. vfsub.s $vr2, $vr2, $vr12 /* remove the old accumulator, leaving this step's a*x contribution per lane */
  269. vextrins.w $vr5, $vr2, 0x01
  270. vextrins.w $vr6, $vr2, 0x02
  271. vextrins.w $vr7, $vr2, 0x03
  272. fadd.s $f2, $f2, $f5 /* fold lanes 1..3 and the saved accumulator ($f12) into scalar $f2 */
  273. fadd.s $f2, $f2, $f6
  274. fadd.s $f2, $f2, $f7
  275. fadd.s $f2, $f2, $f12
  276. xvreplve0.d U2, U2
  277. slli.d T2, INCX, 2
  278. add.d IX, IX, T2 /* IX += 4 * INCX */
  279. addi.d II, II, 16
  280. .L04: /* &2 */
  281. andi T0, J, 2
  282. beq $r0, T0, .L05
  283. mul.d T1, J, LDA /* 64-bit byte offset of column J */
  284. add.d T1, T1, II
  285. addi.d T2, T1, 4
  286. fldx.s $f4, AO1, T1 /* two consecutive elements of column J */
  287. fldx.s $f5, AO1, T2
  288. move T1, IY
  289. add.d T2, T1, INCY
  290. fldx.s $f6, Y, T1
  291. fldx.s $f7, Y, T2
  292. fmadd.s $f6, $f3, $f4, $f6 /* y += temp1 * a */
  293. fmadd.s $f7, $f3, $f5, $f7
  294. fstx.s $f6, Y, T1
  295. fstx.s $f7, Y, T2
  296. slli.d T1, INCY, 1
  297. add.d IY, IY, T1 /* IY += 2 * INCY */
  298. move T1, IX
  299. add.d T2, T1, INCX
  300. fldx.s $f6, X, T1
  301. fldx.s $f7, X, T2
  302. fmadd.s $f2, $f4, $f6, $f2 /* temp2 += a * x */
  303. fmadd.s $f2, $f5, $f7, $f2
  304. slli.d T2, INCX, 1
  305. add.d IX, IX, T2 /* IX += 2 * INCX */
  306. addi.d II, II, 8
  307. .L05: /* &1 */
  308. andi T0, J, 1
  309. beq $r0, T0, .L06
  310. mul.d T1, J, LDA /* 64-bit byte offset of column J */
  311. add.d T1, T1, II
  312. fldx.s $f4, AO1, T1
  313. fldx.s $f6, Y, IY
  314. fmadd.s $f6, $f3, $f4, $f6 /* y += temp1 * a */
  315. fstx.s $f6, Y, IY
  316. add.d IY, IY, INCY
  317. fldx.s $f6, X, IX
  318. fmadd.s $f2, $f4, $f6, $f2 /* temp2 += a * x */
  319. add.d IX, IX, INCX
  320. addi.d II, II, 4
  321. .L06: /* diagonal element of column J */
  322. mul.d T1, J, LDA /* 64-bit byte offset of column J */
  323. slli.d T2, J, BASE_SHIFT
  324. add.d T1, T1, T2 /* T1 = byte offset of element J within column J */
  325. fldx.s $f6, Y, JY
  326. fldx.s $f4, AO1, T1
  327. fmadd.s $f6, $f3, $f4, $f6 /* y[J] += temp1 * a[J] */
  328. fmul.s $f7, ALPHA, $f2
  329. fadd.s $f6, $f6, $f7 /* y[J] += alpha * temp2 */
  330. fstx.s $f6, Y, JY
  331. add.d JX, JX, INCX /* advance to the next column */
  332. add.d JY, JY, INCY
  333. addi.d J, J, 1
  334. blt J, M, .L01
  335. .L999: /* common exit: restore saved registers and release the frame */
  336. LDARG $r23, $sp, 0
  337. LDARG $r24, $sp, 8
  338. LDARG $r25, $sp, 16
  339. LDARG $r26, $sp, 32
  340. LDARG $r27, $sp, 40
  341. LDARG $r28, $sp, 48
  342. LDARG $r29, $sp, 56
  343. LDARG $r30, $sp, 64
  344. LDARG $r31, $sp, 72
  345. addi.d $sp, $sp, 88
  346. jirl $r0, $r1, 0x0 /* return to the caller */
  347. EPILOGUE