
dtrsm_kernel_RT_16x4_lasx.S

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/09/26 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
* FLOAT *c, BLASLONG ldc, BLASLONG offset)
*/
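/* Orientation note (editorial summary of this file): this is the DTRSM kernel
 * for the RT case (triangular matrix on the right-hand side, transposed) with
 * 16x4 register blocking on LoongArch LASX. A 16x4 tile of C is held entirely
 * in U0-U15 ($xr0-$xr15, four doubles per 256-bit register), while entries of
 * the packed triangular block are broadcast into the D registers
 * ($xr16-$xr31). Narrower edge tiles (8/4/2/1 rows, 2/1 columns) are handled
 * by the reduced macros further below.
 */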
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#define OFFSET $r11 // param 9: offset
/* Loop control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix addresses */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define T0 $r25
#define T1 $r26
#define T2 $r27
#define KK $r28
#define AA $r29
#define CC $r30
#define BB $r31
#undef ZERO
#define ZERO $r0
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
#include "dtrsm_kernel_macro.S"
.macro ldrepl_macro stride:req, index:req, more:vararg
// Load Ux (x = 0...15)
GLDREPL xv, d, $xr\index, B0, \index * 8 - \stride * 8
.ifnb \more
ldrepl_macro \stride, \more
.endif
.endm
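// For reference, "ldrepl_macro 15, 17, 18" expands to
//   GLDREPL xv, d, $xr17, B0, 0x10
//   GLDREPL xv, d, $xr18, B0, 0x18
// i.e. each listed register index x gets the double at B0 + (x - stride) * 8
// broadcast across $xr{x}; the recursion then consumes the rest of \more.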
.macro nmsub_macro reg:req, start0:req, start1:req, more:vararg
// Gx -= reg * Ux
xvfnmsub.d $xr\start0, \reg, $xr\start1, $xr\start0
.ifnb \more
nmsub_macro \reg, \more
.endif
.endm
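// For reference, "nmsub_macro D1, 0, 4, 1, 5" expands to
//   xvfnmsub.d $xr0, D1, $xr4, $xr0
//   xvfnmsub.d $xr1, D1, $xr5, $xr1
// i.e. for every (start0, start1) pair in the list, $xr{start0} -= \reg * $xr{start1}.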
.macro A_st_macro N:req, stride:req, start:req, more:vararg
// Store Gx(x = 16...31)
.if \N == 4
xvst $xr\start, A0, \start * 0x20 - \stride * 0x20
.elseif \N == 2
vst $vr\start, A0, \start * 0x10 - \stride * 0x10
.elseif \N == 1
fst.d $f\start, A0, \start * 0x08 - \stride * 0x08
.endif
.ifnb \more
A_st_macro \N, \stride, \more
.endif
.endm
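// For reference, "A_st_macro 4, 0, 0, 1" stores $xr0 at A0 + 0x00 and $xr1 at
// A0 + 0x20. The first argument selects the store width (4 doubles -> xvst,
// 2 -> vst, 1 -> fst.d); \stride is subtracted from each register index when
// forming the offset (every call site below passes 0, so register x simply
// lands at A0 + x * width).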
.macro dsolve_16x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
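// Concretely: ldrepl_macro 16, 16 sets D0 = b[0], and ldrepl_macro 15, 17, 18
// sets D1 = b[2], D2 = b[3]. The diagonal entries b[0] and b[3] are assumed to
// be stored pre-inverted by the trsm pack routines, so the solve below only
// multiplies. Backward substitution over the two columns of the 16x2 tile:
//   col1 (U4-U7) *= b[3]
//   col0 (U0-U3) -= b[2] * col1
//   col0 (U0-U3) *= b[0]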
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
.endm
.macro dsolve_8x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20
.endm
.macro dsolve_4x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00
.endm
.macro dsolve_2x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 2, 0, 0, 1
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
.endm
.macro dsolve_1x2
// We are going to process matrix B with a size of 2x2,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//2 3
// Sequentially extract data from B in row order
ldrepl_macro 16, 16
ldrepl_macro 15, 17, 18
GMUL xvf, d, U1, D2, U1
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 1, 0, 0, 1
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
.endm
.macro dsolve_16x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
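// Concretely: D0 = b[0]; D1 = b[4]; D2 = b[5]; D3 = b[8]; D4 = b[9];
// D5 = b[10]; D6 = b[12]; D7 = b[13]; D8 = b[14]; D9 = b[15]. The diagonal
// entries b[0], b[5], b[10], b[15] are again assumed to be pre-inverted.
// Backward substitution over the four columns of the 16x4 tile, with the
// broadcasts interleaved into the arithmetic to cover load latency:
//   col3 (U12-U15) *= b[15]
//   col2 (U8-U11)  -= b[14] * col3;                              col2 *= b[10]
//   col1 (U4-U7)   -= b[13] * col3 + b[9] * col2;                col1 *= b[5]
//   col0 (U0-U3)   -= b[12] * col3 + b[8] * col2 + b[4] * col1;  col0 *= b[0]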
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 8, 12, 9, 13, 10, 14, 11, 15
ldrepl_macro 13, 17, 18
GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
ldrepl_macro 16, 16
nmsub_macro D7, 4, 12, 5, 13, 6, 14, 7, 15
nmsub_macro D4, 4, 8, 5, 9, 6, 10, 7, 11
GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
nmsub_macro D6, 0, 12, 1, 13, 2, 14, 3, 15
nmsub_macro D3, 0, 8, 1, 9, 2, 10, 3, 11
nmsub_macro D1, 0, 4, 1, 5, 2, 6, 3, 7
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60, \
U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60, \
U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
.endm
.macro dsolve_8x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U6, D9, U6, U7, D9, U7
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 4, 6, 5, 7
ldrepl_macro 13, 17, 18
GMUL xvf, d, U4, D5, U4, U5, D5, U5
ldrepl_macro 16, 16
nmsub_macro D7, 2, 6, 3, 7
nmsub_macro D4, 2, 4, 3, 5
GMUL xvf, d, U2, D2, U2, U3, D2, U3
nmsub_macro D6, 0, 6, 1, 7
nmsub_macro D3, 0, 4, 1, 5
nmsub_macro D1, 0, 2, 1, 3
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 4, 0, 0, 1, 2, 3, 4, 5, 6, 7
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
U2, C1, 0x00, U3, C1, 0x20, \
U4, C2, 0x00, U5, C2, 0x20, \
U6, C3, 0x00, U7, C3, 0x20
.endm
.macro dsolve_4x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
.endm
.macro dsolve_2x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 2, 0, 0, 1, 2, 3
// Store C
GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
.endm
.macro dsolve_1x4
// We are going to process matrix B with a size of 4x4,
// using only the upper triangular portion. The memory layout of
// matrix B is as follows:
//0
//4 5
//8 9 10
//12 13 14 15
// Sequentially extract data from B in row order
ldrepl_macro 10, 22, 23, 24, 25
GMUL xvf, d, U3, D9, U3
ldrepl_macro 11, 19, 20, 21
nmsub_macro D8, 2, 3
ldrepl_macro 13, 17, 18
GMUL xvf, d, U2, D5, U2
ldrepl_macro 16, 16
nmsub_macro D7, 1, 3
nmsub_macro D4, 1, 2
GMUL xvf, d, U1, D2, U1
nmsub_macro D6, 0, 3
nmsub_macro D3, 0, 2
nmsub_macro D1, 0, 1
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 1, 0, 0, 1, 2, 3
// Store C
GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00,
.endm
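// The dgemm_dsolve_MxN macros below all follow one pattern (an editorial
// description, not a change in behaviour): A0/B0 are saved in T1/T2; if
// L = K - KK is positive, the matching dgemm_MxN macro from
// dtrsm_kernel_macro.S is run to produce the updated C tile in the U
// registers, otherwise the tile is loaded straight from C. A0 and B0 are then
// stepped back from the saved pointers to the block being solved, and the
// solve writes its result to both the packed A buffer and C. The Nx1 variants
// inline the one-column solve instead of calling a dsolve macro.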
.macro dgemm_dsolve_16x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x1_load
dgemm_16x1
b .L_dsolve_16x1
.L_dsolve_16x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
PTR_ADDI A0, T1, -16 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
// Store A
A_st_macro 4, 0, 0, 1, 2, 3
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
.endm
.macro dgemm_dsolve_8x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x1_load
dgemm_8x1
b .L_dsolve_8x1
.L_dsolve_8x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
PTR_ADDI A0, T1, -8 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0, U1, D0, U1
// Store A
A_st_macro 4, 0, 0, 1
// Store C
GST xv, , U0, C0, 0x00, U1, C0, 0x20
.endm
.macro dgemm_dsolve_4x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x1_load
dgemm_4x1
b .L_dsolve_4x1
.L_dsolve_4x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
PTR_ADDI A0, T1, -4 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 4, 0, 0
// Store C
GST xv, , U0, C0, 0x00
.endm
.macro dgemm_dsolve_2x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x1_load
dgemm_2x1
b .L_dsolve_2x1
.L_dsolve_2x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
PTR_ADDI A0, T1, -2 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 2, 0, 0
// Store C
GST v, , $vr0, C0, 0x00
.endm
.macro dgemm_dsolve_1x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x1_load
dgemm_1x1
b .L_dsolve_1x1
.L_dsolve_1x1_load:
// Load C
fld.d $f0, C0, 0x00
.L_dsolve_1x1:
PTR_ADDI A0, T1, -1 * 8
PTR_ADDI B0, T2, -1 * 8
ldrepl_macro 16, 16
GMUL xvf, d, U0, D0, U0
// Store A
A_st_macro 1, 0, 0
// Store C
GST f, d, $f0, C0, 0x00
.endm
.macro dgemm_dsolve_16x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x2_load
dgemm_16x2
b .L_dsolve_16x2
.L_dsolve_16x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
.L_dsolve_16x2:
PTR_ADDI A0, T1, -(16 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_16x2
.endm
.macro dgemm_dsolve_8x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x2_load
dgemm_8x2
b .L_dsolve_8x2
.L_dsolve_8x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
.L_dsolve_8x2:
PTR_ADDI A0, T1, -(8 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_8x2
.endm
.macro dgemm_dsolve_4x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x2_load
dgemm_4x2
b .L_dsolve_4x2
.L_dsolve_4x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_4x2:
PTR_ADDI A0, T1, -(4 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_4x2
.endm
.macro dgemm_dsolve_2x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x2_load
dgemm_2x2
b .L_dsolve_2x2
.L_dsolve_2x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_2x2:
PTR_ADDI A0, T1, -(2 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_2x2
.endm
.macro dgemm_dsolve_1x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x2_load
dgemm_1x2
xvpackod.d U1, U0, U0
b .L_dsolve_1x2
.L_dsolve_1x2_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
.L_dsolve_1x2:
PTR_ADDI A0, T1, -(1 * 2) * 8
PTR_ADDI B0, T2, -(2 * 2) * 8
dsolve_1x2
.endm
.macro dgemm_dsolve_16x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x4_load
dgemm_16x4
b .L_dsolve_16x4
.L_dsolve_16x4_load:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
/********************** solver ******************/
.L_dsolve_16x4:
PTR_ADDI A0, T1, -(16 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_16x4
.endm
.macro dgemm_dsolve_8x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x4_load
dgemm_8x4
b .L_dsolve_8x4
.L_dsolve_8x4_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
/********* solver *********/
.L_dsolve_8x4:
PTR_ADDI A0, T1, -(8 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_8x4
.endm
.macro dgemm_dsolve_4x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x4_load
dgemm_4x4
b .L_dsolve_4x4
.L_dsolve_4x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/************** solver *****************/
.L_dsolve_4x4:
PTR_ADDI A0, T1, -(4 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_4x4
.endm
.macro dgemm_dsolve_2x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x4_load
dgemm_2x4
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_2x4
.L_dsolve_2x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/********************** solver ******************/
.L_dsolve_2x4:
PTR_ADDI A0, T1, -(2 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_2x4
.endm
.macro dgemm_dsolve_1x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x4_load
dgemm_1x4
xvpackod.d U1, U0, U0
xvpermi.q U2, U0, 0x01
xvpermi.q U3, U1, 0x01
b .L_dsolve_1x4
.L_dsolve_1x4_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
.L_dsolve_1x4:
PTR_ADDI A0, T1, -(1 * 4) * 8
PTR_ADDI B0, T2, -(4 * 4) * 8
dsolve_1x4
.endm
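// Driver. For the RT case the panels are walked backwards: C and B are first
// advanced past the end of their n columns and KK starts at N - OFFSET.
// Columns are then consumed right to left, first a single trailing column
// (N & 1), then a pair (N & 2), then the remaining N >> 2 groups of four,
// with KK decremented by 1, 2 and 4 respectively. Within each column block
// the M loop handles 16-row tiles first, then the 8/4/2/1-row remainders.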
PROLOGUE
push_if_used 9, 8
PTR_SLLI LDC, LDC, 3
PTR_SUB KK, N, OFFSET
PTR_MUL T0, N, LDC
PTR_MUL T1, N, K
PTR_ADD C, C, T0 // c += n * ldc
PTR_SLLI T1, T1, 3
PTR_ADD B, B, T1
andi J, N, 1
beqz J, .L_N2
.L_N1:
move AA, A
PTR_SUB C, C, LDC // c -= ldc
PTR_SLLI T0, K, 3
PTR_SLLI T1, KK, 3
PTR_SUB B, B, T0 // b -= k
PTR_ADD BB, B, T1 // bb = b + kk
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N1_M15
.align 4
.L_N1_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x1
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N1_I1
.L_N1_M15:
andi I, M, 8
beqz I, .L_N1_M7
.L_N1_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x1
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N1_M7:
andi I, M, 4
beqz I, .L_N1_M3
.L_N1_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x1
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N1_M3:
andi I, M, 2
beqz I, .L_N1_M1
.L_N1_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x1
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N1_M1:
andi I, M, 1
beqz I, .L_N1_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x1
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N1_M0:
PTR_ADDI KK, KK, -1
.L_N2:
andi J, N, 2
beq ZERO, J, .L_N4
move AA, A
PTR_SLLI T0, LDC, 1
PTR_SLLI T1, K, 4
PTR_SLLI T2, KK, 4
PTR_SUB B, B, T1
PTR_SUB C, C, T0
PTR_ADD BB, B, T2
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_N2_M15
.align 4
.L_N2_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x2
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_N2_I1
.L_N2_M15:
andi I, M, 8
beqz I, .L_N2_M7
.L_N2_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x2
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_N2_M7:
andi I, M, 4
beqz I, .L_N2_M3
.L_N2_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x2
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_N2_M3:
andi I, M, 2
beqz I, .L_N2_M1
.L_N2_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x2
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_N2_M1:
andi I, M, 1
beqz I, .L_N2_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x2
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_N2_M0:
PTR_ADDI KK, KK, -2
.L_N4:
PTR_SRAI J, N, 2 /* J = bn >> 2 */
beq ZERO, J, .L_N0
.align 5
.L_J1:
PTR_ADDI J, J, -1
move AA, A
PTR_SLLI T0, LDC, 2
PTR_SLLI T1, K, 5
PTR_SLLI T2, KK, 5
PTR_SUB B, B, T1
PTR_SUB C, C, T0
PTR_ADD BB, B, T2
move CC, C
PTR_SRAI I, M, 4 // M >> 4
beqz I, .L_M15
.align 4
.L_I1:
PTR_SLLI T1, KK, 7
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 16 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_16x4
PTR_ADDI I, I, -1
PTR_SLLI T0, K, 7
PTR_ADDI CC, CC, 0x80 // cc += 16
PTR_ADD AA, AA, T0 // aa += 16 * k
bnez I, .L_I1
.L_M15:
andi I, M, 8
beqz I, .L_M7
.L_M8:
PTR_SLLI T1, KK, 6
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 8 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_8x4
PTR_SLLI T0, K, 6
PTR_ADDI CC, CC, 0x40 // cc += 8
PTR_ADD AA, AA, T0 // aa += 8 * k
.L_M7:
andi I, M, 4
beqz I, .L_M3
.L_M4:
PTR_SLLI T1, KK, 5
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 4 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_4x4
PTR_SLLI T0, K, 5
PTR_ADDI CC, CC, 0x20 // cc += 4
PTR_ADD AA, AA, T0 // aa += 4 * k
.L_M3:
andi I, M, 2
beqz I, .L_M1
.L_M2:
PTR_SLLI T1, KK, 4
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + 2 * kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_2x4
PTR_SLLI T0, K, 4
PTR_ADDI CC, CC, 0x10 // cc += 2
PTR_ADD AA, AA, T0 // aa += 2 * k
.L_M1:
andi I, M, 1
beqz I, .L_M0
PTR_SLLI T1, KK, 3
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_ADD A0, AA, T1 // a0 = aa + kk
move B0, BB
PTR_SUB L, K, KK // L = K - KK
dgemm_dsolve_1x4
PTR_SLLI T0, K, 3
PTR_ADDI CC, CC, 0x08 // cc += 1
PTR_ADD AA, AA, T0 // aa += 1 * k
.L_M0:
PTR_ADDI KK, KK, -4
bnez J, .L_J1
.L_N0:
pop_if_used 9, 8
jirl $r0, $r1, 0x0
EPILOGUE