You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

sgemm_tcopy_16_lasx.S 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  30. /*********************************************************************
  31. * 2023/08/23 guxiwei
  32. * UTEST : OK
  33. * CTEST : OK
  34. * TEST : OK
  35. *********************************************************************/
  // Register aliases used by the copy kernel that follows.
  // M and N are counts of 4-byte elements, SRC and DST are byte addresses, and
  // LDA is the source row stride in elements (it is scaled to bytes in place,
  // see TL further down).
  36. /* Function parameters */
  37. #define M $r4 // param 1: m
  38. #define N $r5 // param 2: n
  39. #define SRC $r6 // param 3: src
  40. #define LDA $r7 // param 4: lda
  41. #define DST $r8 // param 5: dst
  // Loop counters: J counts groups of source rows, I counts chunks within a row
  // (and doubles as the scratch result of the "andi ..., N, bit" tail tests).
  42. #define I $r9
  43. #define J $r10
  // Source cursors: S0 is the base of the next row group; S1..S8 walk up to
  // eight consecutive rows of the group currently being copied.
  44. #define S0 $r11
  45. #define S1 $r12
  46. #define S2 $r13
  47. #define S3 $r14
  48. #define S4 $r15
  49. #define S5 $r16
  50. #define S6 $r17
  51. #define S7 $r18
  52. #define S8 $r19
  // Destination cursors: P0/P1 address the area of 16-element-wide panels;
  // P2, P3, P4 and P5 address the areas that receive the 8-, 4-, 2- and
  // 1-element remainders of each row, respectively.
  53. #define P0 $r20
  54. #define P1 $r23
  55. #define P2 $r24
  56. #define P3 $r25
  57. #define P4 $r26
  58. #define P5 $r27
  // Scratch registers. During setup they hold rounded-down copies of N; from
  // the loops onward T0 = 2 * TL (byte stride of two rows) and T1 = M * 64
  // (byte distance between consecutive 16-wide panels).
  59. #define T0 $r28
  60. #define T1 $r29
  // TL deliberately shares $r7 with LDA: the kernel overwrites LDA with
  // LDA * 4, i.e. the row stride in bytes.
  61. #define TL $r7
  62. #define ZERO $r0
  63. /* LASX vectors */
  // Only $xr0..$xr7 get names; the kernel also refers to $vr0..$vr7 and
  // $f0..$f7 directly for the narrower tail copies.
  64. #define U0 $xr0
  65. #define U1 $xr1
  66. #define U2 $xr2
  67. #define U3 $xr3
  68. #define U4 $xr4
  69. #define U5 $xr5
  70. #define U6 $xr6
  71. #define U7 $xr7
  72. // Loops outline
  73. //.L_M8 <-------------------
  74. //| .L_N16: |
  75. //| .L_N15: |
  76. //| .L_N8: |
  77. //| .L_N7: | Main Loop
  78. //| .L_N4: |
  79. //| .L_N3: |
  80. //| .L_N2: |
  81. //| .L_N1: |
  82. //| .L_N0: ---------------
  83. //.L_M7
  84. //.L_M4
  85. //| .L_M4_N16:
  86. //| .L_M4_N15:
  87. //| .L_M4_N8:
  88. //| .L_M4_N7:
  89. //| .L_M4_N4:
  90. //| .L_M4_N3:
  91. //| .L_M4_N2:
  92. //| .L_M4_N1:
  93. //.L_M3
  94. //.L_M2
  95. //| .L_M2_N16:
  96. //| .L_M2_N15:
  97. //| .L_M2_N8:
  98. //| .L_M2_N7:
  99. //| .L_M2_N4:
  100. //| .L_M2_N3:
  101. //| .L_M2_N2:
  102. //| .L_M2_N1:
  103. //.L_M1
  104. //| .L_M1_N16:
  105. //| .L_M1_N15:
  106. //| .L_M1_N8:
  107. //| .L_M1_N7:
  108. //| .L_M1_N4:
  109. //| .L_M1_N3:
  110. //| .L_M1_N2:
  111. //| .L_M1_N1:
  112. //.L_M0
  // Kernel entry. Initialises the source/destination cursors, the start
  // address of every remainder area inside DST, and the byte strides used by
  // the row-group loops.
  // PROLOGUE, push_if_used and the PTR_* operations are macros defined outside
  // this view; the notes here assume PTR_SRAI/SLLI/MUL/ADD are plain
  // pointer-width shift/multiply/add operations, as their names suggest.
  113. PROLOGUE
  // NOTE(review): presumably spills the callee-saved registers clobbered by
  // this kernel ($r23..$r29 hold P1..P5, T0, T1) — confirm against the macro.
  114. push_if_used 7, 0
  115. move S0, SRC
  116. move P0, DST
  // T0 = N rounded down to a multiple of 16; T1 = N rounded down to a multiple of 8.
  117. PTR_SRAI T0, N, 0x04
  118. PTR_SRAI T1, N, 0x03
  119. PTR_SLLI T0, T0, 0x04
  120. PTR_SLLI T1, T1, 0x03
  // P2 = DST + M * T0 * 4: first byte past all 16-wide panels (8-wide area).
  // P3 = DST + M * T1 * 4: start of the 4-wide area.
  121. PTR_MUL P2, M, T0
  122. PTR_MUL P3, M, T1
  123. PTR_SLLI P2, P2, 0x02
  124. PTR_SLLI P3, P3, 0x02
  125. PTR_ADD P2, DST, P2
  126. PTR_ADD P3, DST, P3
  // T0 = N rounded down to a multiple of 4; T1 = N rounded down to a multiple of 2.
  127. PTR_SRAI T0, N, 0x02
  128. PTR_SRAI T1, N, 0x01
  129. PTR_SLLI T0, T0, 0x02
  130. PTR_SLLI T1, T1, 0x01
  // P4 = DST + M * T0 * 4: start of the 2-wide area.
  // P5 = DST + M * T1 * 4: start of the 1-wide area.
  131. PTR_MUL P4, M, T0
  132. PTR_MUL P5, M, T1
  133. PTR_SLLI P4, P4, 0x02
  134. PTR_SLLI P5, P5, 0x02
  135. PTR_ADD P4, DST, P4
  136. PTR_ADD P5, DST, P5
  // TL = LDA * 4 (row stride in bytes; overwrites LDA, same register).
  // J  = M / 8   (number of full 8-row groups).
  // T0 = 2 * TL  (byte stride of two rows).
  // T1 = M * 64  (M rows * 16 elements * 4 bytes: distance between panels).
  137. PTR_SLLI TL, LDA, 0x02
  138. PTR_SRAI J, M, 0x03
  139. PTR_SLLI T0, TL, 0x01
  140. PTR_SLLI T1, M, 0x06
  // Fewer than eight rows: skip the main loop and go straight to the row tails.
  141. beq ZERO, J, .L_M7
  142. .align 5
  // Main loop: one iteration copies eight consecutive source rows. J holds the
  // number of 8-row groups still to do (decremented below, tested at .L_N0).
  // GLD/GST used in the tails are macros defined outside this view; presumably
  // each (register, base, offset) triple expands to one load/store of the
  // width selected by the first two macro arguments — confirm in the header.
  143. .L_M8:
  // S1..S8 = S0 + {0..7} * TL (odd/even rows are chained through T0 = 2 * TL);
  // S0 then moves on by eight rows for the next group.
  144. move S1, S0
  145. PTR_ADD S2, S0, TL
  146. PTR_ADD S3, S1, T0
  147. PTR_ADD S4, S2, T0
  148. PTR_ADD S5, S3, T0
  149. PTR_ADD S6, S4, T0
  150. PTR_ADD S7, S5, T0
  151. PTR_ADD S8, S6, T0
  152. PTR_ADD S0, S7, T0
  // P1 = this group's tile in the first 16-wide panel; P0 advances by
  // 0x200 bytes (8 rows * 16 elements * 4 bytes) for the next group.
  153. move P1, P0
  154. PTR_ADDI P0, P0, 0x200
  // I = number of full 16-element chunks per row.
  155. PTR_SRAI I, N, 0x04
  156. PTR_ADDI J, J, -1
  157. beq ZERO, I, .L_N15
  // 16-element chunk: 0x40 bytes from each of the eight rows are written as one
  // contiguous 0x200-byte tile at P1, row after row. Loads and stores are
  // interleaved two rows at a time, reusing U0..U7.
  158. .L_N16:
  159. xvld U0, S1, 0x00
  160. xvld U1, S1, 0x20
  161. xvld U2, S2, 0x00
  162. xvld U3, S2, 0x20
  163. xvst U0, P1, 0x00
  164. xvst U1, P1, 0x20
  165. xvst U2, P1, 0x40
  166. xvst U3, P1, 0x60
  167. xvld U4, S3, 0x00
  168. xvld U5, S3, 0x20
  169. xvld U6, S4, 0x00
  170. xvld U7, S4, 0x20
  171. xvst U4, P1, 0x80
  172. xvst U5, P1, 0xA0
  173. xvst U6, P1, 0xC0
  174. xvst U7, P1, 0xE0
  175. xvld U0, S5, 0x00
  176. xvld U1, S5, 0x20
  177. xvld U2, S6, 0x00
  178. xvld U3, S6, 0x20
  179. xvst U0, P1, 0x100
  180. xvst U1, P1, 0x120
  181. xvst U2, P1, 0x140
  182. xvst U3, P1, 0x160
  183. xvld U4, S7, 0x00
  184. xvld U5, S7, 0x20
  185. xvld U6, S8, 0x00
  186. xvld U7, S8, 0x20
  187. xvst U4, P1, 0x180
  188. xvst U5, P1, 0x1A0
  189. xvst U6, P1, 0x1C0
  190. xvst U7, P1, 0x1E0
  // Step every row cursor past the 16 elements just copied.
  191. PTR_ADDI S1, S1, 0x40
  192. PTR_ADDI S2, S2, 0x40
  193. PTR_ADDI S3, S3, 0x40
  194. PTR_ADDI S4, S4, 0x40
  195. PTR_ADDI S5, S5, 0x40
  196. PTR_ADDI S6, S6, 0x40
  197. PTR_ADDI S7, S7, 0x40
  198. PTR_ADDI S8, S8, 0x40
  // P1 jumps by T1 = M * 64 bytes to the same row group's tile in the next panel.
  199. PTR_ADDI I, I, -1
  200. PTR_ADD P1, P1, T1
  201. blt ZERO, I, .L_N16
  // Tail of 8 elements per row, taken when bit 3 of N is set.
  202. .L_N15:
  203. andi I, N, 0x08
  204. beq ZERO, I, .L_N7
  // 0x20 bytes from each row -> 0x100 contiguous bytes at P2.
  205. .L_N8:
  206. xvld U0, S1, 0x00
  207. xvld U1, S2, 0x00
  208. xvld U2, S3, 0x00
  209. xvld U3, S4, 0x00
  210. xvld U4, S5, 0x00
  211. xvld U5, S6, 0x00
  212. xvld U6, S7, 0x00
  213. xvld U7, S8, 0x00
  214. GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \
  215. U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0
  216. PTR_ADDI S1, S1, 0x20
  217. PTR_ADDI S2, S2, 0x20
  218. PTR_ADDI S3, S3, 0x20
  219. PTR_ADDI S4, S4, 0x20
  220. PTR_ADDI S5, S5, 0x20
  221. PTR_ADDI S6, S6, 0x20
  222. PTR_ADDI S7, S7, 0x20
  223. PTR_ADDI S8, S8, 0x20
  224. PTR_ADDI P2, P2, 0x100
  // Tail of 4 elements per row, taken when bit 2 of N is set.
  225. .L_N7:
  226. andi I, N, 0x04
  227. beq ZERO, I, .L_N3
  // 0x10 bytes from each row -> 0x80 contiguous bytes at P3.
  228. .L_N4:
  229. GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
  230. $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
  231. GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \
  232. $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
  233. PTR_ADDI S1, S1, 0x10
  234. PTR_ADDI S2, S2, 0x10
  235. PTR_ADDI S3, S3, 0x10
  236. PTR_ADDI S4, S4, 0x10
  237. PTR_ADDI S5, S5, 0x10
  238. PTR_ADDI S6, S6, 0x10
  239. PTR_ADDI S7, S7, 0x10
  240. PTR_ADDI S8, S8, 0x10
  241. PTR_ADDI P3, P3, 0x80
  // Tail of 2 elements per row, taken when bit 1 of N is set.
  242. .L_N3:
  243. andi I, N, 0x02
  244. beq ZERO, I, .L_N1
  // 8 bytes from each row (two elements moved as one 64-bit value)
  // -> 0x40 contiguous bytes at P4.
  245. .L_N2:
  246. GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
  247. $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
  248. GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \
  249. $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
  250. PTR_ADDI S1, S1, 0x08
  251. PTR_ADDI S2, S2, 0x08
  252. PTR_ADDI S3, S3, 0x08
  253. PTR_ADDI S4, S4, 0x08
  254. PTR_ADDI S5, S5, 0x08
  255. PTR_ADDI S6, S6, 0x08
  256. PTR_ADDI S7, S7, 0x08
  257. PTR_ADDI S8, S8, 0x08
  258. PTR_ADDI P4, P4, 0x40
  // Tail of a single element per row, taken when bit 0 of N is set.
  259. .L_N1:
  260. andi I, N, 0x01
  261. beq ZERO, I, .L_N0
  // 4 bytes from each row -> 0x20 contiguous bytes at P5.
  262. GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
  263. $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
  264. GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \
  265. $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
  // NOTE(review): the S1..S8 increments below look unused — every later path
  // reloads the row cursors from S0 before reading them. The P5 advance is
  // needed because P5 carries over to the next row group.
  266. PTR_ADDI S1, S1, 0x04
  267. PTR_ADDI S2, S2, 0x04
  268. PTR_ADDI S3, S3, 0x04
  269. PTR_ADDI S4, S4, 0x04
  270. PTR_ADDI S5, S5, 0x04
  271. PTR_ADDI S6, S6, 0x04
  272. PTR_ADDI S7, S7, 0x04
  273. PTR_ADDI S8, S8, 0x04
  274. PTR_ADDI P5, P5, 0x20
  // Loop while 8-row groups remain; otherwise fall through to the row tails.
  275. .L_N0:
  276. blt ZERO, J, .L_M8
  // Row tail of four rows, taken when bit 2 of M is set. Same scheme as the
  // 8-row loop, but with four row cursors and tiles half the size.
  // GLD/GST are macros defined outside this view; presumably each
  // (register, base, offset) triple expands to one load/store — confirm.
  277. .L_M7:
  278. andi J, M, 0x04
  279. beq ZERO, J, .L_M3
  280. .L_M4:
  // S1..S4 = S0 + {0..3} * TL; S0 then moves on by four rows.
  281. move S1, S0
  282. PTR_ADD S2, S0, TL
  283. PTR_ADD S3, S1, T0
  284. PTR_ADD S4, S2, T0
  285. PTR_ADD S0, S3, T0
  // P1 = this group's tile in the first 16-wide panel; P0 advances by
  // 0x100 bytes (4 rows * 16 elements * 4 bytes).
  286. move P1, P0
  287. PTR_ADDI P0, P0, 0x100
  // I = number of full 16-element chunks per row.
  288. PTR_SRAI I, N, 0x04
  289. beq ZERO, I, .L_M4_N15
  290. .align 5
  // 16-element chunk: 0x40 bytes from each of the four rows -> 0x100
  // contiguous bytes at P1.
  291. .L_M4_N16:
  292. xvld U0, S1, 0x00
  293. xvld U1, S1, 0x20
  294. xvld U2, S2, 0x00
  295. xvld U3, S2, 0x20
  296. xvst U0, P1, 0x00
  297. xvst U1, P1, 0x20
  298. xvst U2, P1, 0x40
  299. xvst U3, P1, 0x60
  300. xvld U4, S3, 0x00
  301. xvld U5, S3, 0x20
  302. xvld U6, S4, 0x00
  303. xvld U7, S4, 0x20
  304. xvst U4, P1, 0x80
  305. xvst U5, P1, 0xA0
  306. xvst U6, P1, 0xC0
  307. xvst U7, P1, 0xE0
  308. PTR_ADDI S1, S1, 0x40
  309. PTR_ADDI S2, S2, 0x40
  310. PTR_ADDI S3, S3, 0x40
  311. PTR_ADDI S4, S4, 0x40
  // P1 jumps by T1 (set to M * 64 bytes at entry) to the next 16-wide panel.
  312. PTR_ADDI I, I, -1
  313. PTR_ADD P1, P1, T1
  314. blt ZERO, I, .L_M4_N16
  // Tail of 8 elements per row (bit 3 of N): 0x20 bytes per row -> 0x80 bytes at P2.
  315. .L_M4_N15:
  316. andi I, N, 0x08
  317. beq ZERO, I, .L_M4_N7
  318. .L_M4_N8:
  319. xvld U0, S1, 0x00
  320. xvld U1, S2, 0x00
  321. xvld U2, S3, 0x00
  322. xvld U3, S4, 0x00
  323. GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60
  324. PTR_ADDI S1, S1, 0x20
  325. PTR_ADDI S2, S2, 0x20
  326. PTR_ADDI S3, S3, 0x20
  327. PTR_ADDI S4, S4, 0x20
  328. PTR_ADDI P2, P2, 0x80
  // Tail of 4 elements per row (bit 2 of N): 0x10 bytes per row -> 0x40 bytes at P3.
  329. .L_M4_N7:
  330. andi I, N, 0x04
  331. beq ZERO, I, .L_M4_N3
  332. .L_M4_N4:
  333. GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
  334. GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
  335. PTR_ADDI S1, S1, 0x10
  336. PTR_ADDI S2, S2, 0x10
  337. PTR_ADDI S3, S3, 0x10
  338. PTR_ADDI S4, S4, 0x10
  339. PTR_ADDI P3, P3, 0x40
  // Tail of 2 elements per row (bit 1 of N): 8 bytes per row -> 0x20 bytes at P4.
  340. .L_M4_N3:
  341. andi I, N, 0x02
  342. beq ZERO, I, .L_M4_N1
  343. .L_M4_N2:
  344. GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
  345. GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
  346. PTR_ADDI S1, S1, 0x08
  347. PTR_ADDI S2, S2, 0x08
  348. PTR_ADDI S3, S3, 0x08
  349. PTR_ADDI S4, S4, 0x08
  350. PTR_ADDI P4, P4, 0x20
  // Tail of one element per row (bit 0 of N): 4 bytes per row -> 0x10 bytes at P5.
  351. .L_M4_N1:
  352. andi I, N, 0x01
  353. beq ZERO, I, .L_M3
  354. GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
  355. GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
  // NOTE(review): the S1..S4 increments below look unused — the following
  // sections reload the row cursors from S0. P5 carries over and must advance.
  356. PTR_ADDI S1, S1, 0x04
  357. PTR_ADDI S2, S2, 0x04
  358. PTR_ADDI S3, S3, 0x04
  359. PTR_ADDI S4, S4, 0x04
  360. PTR_ADDI P5, P5, 0x10
  // Row tail of two rows, taken when bit 1 of M is set. Same scheme as the
  // wider row groups, with two row cursors.
  // GLD/GST are macros defined outside this view; presumably each
  // (register, base, offset) triple expands to one load/store — confirm.
  361. .L_M3:
  362. andi J, M, 0x02
  363. beq ZERO, J, .L_M1
  364. .L_M2:
  // S1 = S0, S2 = S0 + TL; S0 then moves on by two rows (T0 = 2 * TL).
  365. move S1, S0
  366. PTR_ADD S2, S0, TL
  367. PTR_ADD S0, S0, T0
  // P1 = this group's tile in the first 16-wide panel; P0 advances by
  // 0x80 bytes (2 rows * 16 elements * 4 bytes).
  368. move P1, P0
  369. PTR_ADDI P0, P0, 0x80
  // I = number of full 16-element chunks per row.
  370. PTR_SRAI I, N, 0x04
  371. beq ZERO, I, .L_M2_N15
  372. .align 5
  // 16-element chunk: 0x40 bytes from each of the two rows -> 0x80 contiguous
  // bytes at P1.
  373. .L_M2_N16:
  374. xvld U0, S1, 0x00
  375. xvld U1, S1, 0x20
  376. xvld U2, S2, 0x00
  377. xvld U3, S2, 0x20
  378. xvst U0, P1, 0x00
  379. xvst U1, P1, 0x20
  380. xvst U2, P1, 0x40
  381. xvst U3, P1, 0x60
  382. PTR_ADDI S1, S1, 0x40
  383. PTR_ADDI S2, S2, 0x40
  // P1 jumps by T1 (set to M * 64 bytes at entry) to the next 16-wide panel.
  384. PTR_ADDI I, I, -1
  385. PTR_ADD P1, P1, T1
  386. blt ZERO, I, .L_M2_N16
  // Tail of 8 elements per row (bit 3 of N): 0x20 bytes per row -> 0x40 bytes at P2.
  387. .L_M2_N15:
  388. andi I, N, 0x08
  389. beq ZERO, I, .L_M2_N7
  390. .L_M2_N8:
  391. xvld U0, S1, 0x00
  392. xvld U1, S2, 0x00
  393. GST xv, , U0, P2, 0x00, U1, P2, 0x20
  394. PTR_ADDI S1, S1, 0x20
  395. PTR_ADDI S2, S2, 0x20
  396. PTR_ADDI P2, P2, 0x40
  // Tail of 4 elements per row (bit 2 of N): 0x10 bytes per row -> 0x20 bytes at P3.
  397. .L_M2_N7:
  398. andi I, N, 0x04
  399. beq ZERO, I, .L_M2_N3
  400. .L_M2_N4:
  401. GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
  402. GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10
  403. PTR_ADDI S1, S1, 0x10
  404. PTR_ADDI S2, S2, 0x10
  405. PTR_ADDI P3, P3, 0x20
  // Tail of 2 elements per row (bit 1 of N): 8 bytes per row -> 0x10 bytes at P4.
  406. .L_M2_N3:
  407. andi I, N, 0x02
  408. beq ZERO, I, .L_M2_N1
  409. .L_M2_N2:
  410. GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
  411. GST f, d, $f0, P4, 0x00, $f1, P4, 0x08
  412. PTR_ADDI S1, S1, 0x08
  413. PTR_ADDI S2, S2, 0x08
  414. PTR_ADDI P4, P4, 0x10
  // Tail of one element per row (bit 0 of N): 4 bytes per row -> 8 bytes at P5.
  415. .L_M2_N1:
  416. andi I, N, 0x01
  417. beq ZERO, I, .L_M1
  418. GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
  419. GST f, s, $f0, P5, 0x00, $f1, P5, 0x04
  // NOTE(review): the S1/S2 increments below look unused — the next section
  // reloads them from S0. P5 carries over and must advance.
  420. PTR_ADDI S1, S1, 0x04
  421. PTR_ADDI S2, S2, 0x04
  422. PTR_ADDI P5, P5, 0x08
  // Final single row, taken when bit 0 of M is set. Only S1 is read as a
  // source cursor in this section.
  // GLD/GST are macros defined outside this view; presumably each
  // (register, base, offset) triple expands to one load/store — confirm.
  423. .L_M1:
  424. andi J, M, 0x01
  425. beq ZERO, J, .L_M0
  426. move S1, S0
  // NOTE(review): S2 is computed here but no instruction in this section reads it.
  427. PTR_ADD S2, S0, TL
  // P1 = this row's slot in the first 16-wide panel (16 elements = 0x40 bytes).
  // NOTE(review): the P0 advance is not read again before the return.
  428. move P1, P0
  429. PTR_ADDI P0, P0, 0x40
  // I = number of full 16-element chunks in the row.
  430. PTR_SRAI I, N, 0x04
  431. beq ZERO, I, .L_M1_N15
  432. .align 5
  // 16-element chunk: 0x40 bytes from the row -> 0x40 bytes at P1.
  433. .L_M1_N16:
  434. xvld U0, S1, 0x00
  435. xvld U1, S1, 0x20
  436. xvst U0, P1, 0x00
  437. xvst U1, P1, 0x20
  438. PTR_ADDI S1, S1, 0x40
  // P1 jumps by T1 (set to M * 64 bytes at entry) to the next 16-wide panel.
  439. PTR_ADDI I, I, -1
  440. PTR_ADD P1, P1, T1
  441. blt ZERO, I, .L_M1_N16
  // Tail of 8 elements (bit 3 of N): 0x20 bytes -> P2.
  442. .L_M1_N15:
  443. andi I, N, 0x08
  444. beq ZERO, I, .L_M1_N7
  445. .L_M1_N8:
  446. xvld U0, S1, 0x00
  447. GST xv, , U0, P2, 0x00
  448. PTR_ADDI S1, S1, 0x20
  449. PTR_ADDI P2, P2, 0x20
  // Tail of 4 elements (bit 2 of N): 0x10 bytes -> P3.
  450. .L_M1_N7:
  451. andi I, N, 0x04
  452. beq ZERO, I, .L_M1_N3
  453. .L_M1_N4:
  454. GLD v, , $vr0, S1, 0x00
  455. GST v, , $vr0, P3, 0x00
  456. PTR_ADDI S1, S1, 0x10
  457. PTR_ADDI P3, P3, 0x10
  // Tail of 2 elements (bit 1 of N): 8 bytes -> P4.
  458. .L_M1_N3:
  459. andi I, N, 0x02
  460. beq ZERO, I, .L_M1_N1
  461. .L_M1_N2:
  462. GLD f, d, $f0, S1, 0x00
  463. GST f, d, $f0, P4, 0x00
  464. PTR_ADDI S1, S1, 0x08
  465. PTR_ADDI P4, P4, 0x08
  // Tail of one element (bit 0 of N): 4 bytes -> P5.
  466. .L_M1_N1:
  467. andi I, N, 0x01
  468. beq ZERO, I, .L_M0
  469. GLD f, s, $f0, S1, 0x00
  470. GST f, s, $f0, P5, 0x00
  // NOTE(review): these last two increments are not read before the return.
  471. PTR_ADDI S1, S1, 0x04
  472. PTR_ADDI P5, P5, 0x04
  // Common exit: every section branches or falls through to here when done.
  473. .L_M0:
  // NOTE(review): presumably restores what "push_if_used 7, 0" saved at entry;
  // the arguments must stay in sync with that call — confirm against the macro.
  474. pop_if_used 7, 0
  // Return: jump to the address held in $r1, discarding the link value ($r0).
  475. jirl $r0, $r1, 0x00
  476. EPILOGUE