You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_small_kernel_nt_lasx.S 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. /***************************************************************************
  2. Copyright (c) 2024 The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  /*
   * Register aliases for the DGEMM small-matrix NT kernel (LoongArch64, LASX).
   *
   * Build variants: when B0 is defined the kernel never reads C and takes no
   * beta; when B0 is not defined, beta arrives in $f1 and is broadcast into
   * VBETA ($xr1, whose lane 0 is $f1).
   */

// ---- Incoming arguments ---------------------------------------------------
#define M       $a0
#define N       $a1
#define K       $a2
#define A       $a3
#define LDA     $a4
#define ALPHA   $f0
#define B       $a5
#define LDB     $a6
#define C       $a7
// LDC does not arrive in an argument register; the entry code loads it
// from the stack into $t0.
#define LDC     $t0
// beta only exists in the variant that reads C, i.e. when B0 is NOT defined
// (same guard as VBETA below).
#ifndef B0
#define BETA    $f1
#endif

#undef  ZERO
#define ZERO    $r0

// ---- Loop counters --------------------------------------------------------
// All M* aliases share $t1 and all N* aliases share $t2: only one row-tile
// width and one column-tile width is live at any time.
#define M16     $t1
#define M8      $t1
#define M4      $t1
#define M2      $t1
#define M1      $t1
#define N4      $t2
#define N2      $t2
#define N1      $t2
#define K_LDB   $t3     // K * LDB (bytes), used to rewind X0 after a K loop
#define A0      $t4     // running pointer into A
#define X0      $t5     // running pointer into B
#define C0      $t6     // output column pointers C0..C3
#define C1      $t7
#define C2      $t8
#define C3      $s0
#define K1      $s1     // inner-loop countdown copy of K

// ---- Vector registers -----------------------------------------------------
#define VALPHA  $xr0    // alpha broadcast to every lane
#ifndef B0
#define VBETA   $xr1    // beta broadcast to every lane
#endif

// Accumulators
#define D0      $xr2
#define D1      $xr3
#define D2      $xr4
#define D3      $xr5
#define D4      $xr6
#define D5      $xr7
#define D6      $xr8
#define D7      $xr9
#define D8      $xr10
#define D9      $xr11
#define D10     $xr12
#define D11     $xr13
#define D12     $xr14
#define D13     $xr15
#define D14     $xr16
#define D15     $xr17

// Vectors loaded from A (and from C on the beta path)
#define S0      $xr18
#define S1      $xr19
#define S2      $xr20
#define S3      $xr21

// Broadcast elements of B
#define Z0      $xr22
#define Z1      $xr23
#define Z2      $xr24
#define Z3      $xr25

// 128-bit views of D0..D3, used for 2-row stores
#define V0      $vr2
#define V1      $vr3
#define V2      $vr4
#define V3      $vr5

// Scalar (64-bit) views of D0..D3, used for 1-row stores
#define F0      $f2
#define F1      $f3
#define F2      $f4
#define F3      $f5
  /*
   * DGEMM_SMALL_KERNEL_NT_TAIL_LD rows, base
   *
   * Private helper for DGEMM_SMALL_KERNEL_NT_TAIL: loads exactly `rows`
   * doubles (4, 2 or 1) from `base` into the low lanes of S0.
   *
   * The narrow forms write the 128-bit / 64-bit views of S0 ($xr18), so the
   * remaining lanes of S0 are not meaningful afterwards.  That is fine here
   * because the tail only ever stores the low `rows` lanes of its
   * accumulators.  Loading a full 256-bit vector for the 2- and 1-row tails
   * would read up to 24 bytes beyond the end of A (or C) on the last column.
   *
   * NOTE(review): relies on GLD expanding to <pre>ld[.<suf>] in the same way
   * GST is used for the narrow stores below -- confirm against
   * loongarch64_asm.S.
   */
.macro DGEMM_SMALL_KERNEL_NT_TAIL_LD rows, base
.if \rows == 4
    GLD xv, , S0, \base, 0x00
.elseif \rows == 2
    GLD v, , $vr18, \base, 0x00         // $vr18 = low 128 bits of S0
.elseif \rows == 1
    GLD f, d, $f18, \base, 0x00         // $f18 = low 64 bits of S0
.endif
.endm

/*
 * DGEMM_SMALL_KERNEL_NT_TAIL M
 *
 * Handles one remaining row tile of height M (4, 2 or 1) across all N
 * columns.  For each column n of the tile it accumulates
 *     sum over k of A[m + k*LDA] * B[n + k*LDB]
 * scales the sum by alpha, adds beta * C when B0 is not defined, and stores
 * M doubles to C[m + n*LDC].  Columns are processed four at a time, then a
 * pair, then a single column.
 *
 * On exit A and C are advanced past the tile for M == 4 and M == 2; the
 * 1-row tail leaves them untouched.
 * Clobbers: N4/N2/N1, A0, X0, C0-C3, K1, D0-D3, S0, Z0-Z3.
 */
.macro DGEMM_SMALL_KERNEL_NT_TAIL M
    PTR_SRAI    N4, N, 2                // N4 = N >> 2
    move        A0, A
    move        X0, B
    move        C0, C
    PTR_ADD     C1, C0, LDC
    PTR_ADD     C2, C1, LDC
    PTR_ADD     C3, C2, LDC
    beqz        N4, .L_M\M\()_N3

    // ---- four columns per pass --------------------------------------------
.L_M\M\()_N4:
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
    move        K1, K
    PTR_ADDI    N4, N4, -1
    bge         ZERO, K, .L_M\M\()_N4_END       // K <= 0: skip accumulation
.L_M\M\()_N4_K1:
    PTR_ADDI    K1, K1, -1
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, A0
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
    GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1, D2, S0, Z2, D2, D3, S0, Z3, D3
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M\M\()_N4_K1
.L_M\M\()_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#ifndef B0
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C0
    GMADD xvf, d, D0, S0, VBETA, D0
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C1
    GMADD xvf, d, D1, S0, VBETA, D1
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C2
    GMADD xvf, d, D2, S0, VBETA, D2
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C3
    GMADD xvf, d, D3, S0, VBETA, D3
#endif
    // Store width matches the tile height.
.if \M == 4
    GST xv, , D0, C0, 0x00, D1, C1, 0x00, D2, C2, 0x00, D3, C3, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00, V1, C1, 0x00, V2, C2, 0x00, V3, C3, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00, F1, C1, 0x00, F2, C2, 0x00, F3, C3, 0x00
.endif
    // Advance each column pointer by four columns.
    PTR_ALSL    C0, LDC, C0, 2
    PTR_ALSL    C1, LDC, C1, 2
    PTR_ALSL    C2, LDC, C2, 2
    PTR_ALSL    C3, LDC, C3, 2
    // Rewind X0 to the start of the K range, then step four doubles along B.
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x20
    move        A0, A
    bnez        N4, .L_M\M\()_N4

    // ---- two columns --------------------------------------------------------
.L_M\M\()_N3:
    andi        N2, N, 0x02
    beqz        N2, .L_M\M\()_N1
.L_M\M\()_N2:
    GXOR xv, v, D0, D0, D0, D1, D1, D1
    move        K1, K
    bge         ZERO, K, .L_M\M\()_N2_END
.L_M\M\()_N2_K1:
    PTR_ADDI    K1, K1, -1
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, A0
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
    GMADD xvf, d, D0, S0, Z0, D0, D1, S0, Z1, D1
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M\M\()_N2_K1
.L_M\M\()_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
#ifndef B0
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C0
    GMADD xvf, d, D0, S0, VBETA, D0
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C1
    GMADD xvf, d, D1, S0, VBETA, D1
#endif
.if \M == 4
    GST xv, , D0, C0, 0x00, D1, C1, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00, V1, C1, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00, F1, C1, 0x00
.endif
    // Advance by two columns so C0 addresses the final odd column, if any.
    PTR_ALSL    C0, LDC, C0, 1
    PTR_ALSL    C1, LDC, C1, 1
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x10
    move        A0, A

    // ---- last single column -------------------------------------------------
.L_M\M\()_N1:
    andi        N1, N, 0x01
    beqz        N1, .L_M\M\()_END
    GXOR xv, v, D0, D0, D0
    move        K1, K
    bge         ZERO, K, .L_M\M\()_N1_END
.L_M\M\()_N1_K1:
    PTR_ADDI    K1, K1, -1
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, A0
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M\M\()_N1_K1
.L_M\M\()_N1_END:
    GMUL xvf, d, D0, D0, VALPHA
#ifndef B0
    DGEMM_SMALL_KERNEL_NT_TAIL_LD \M, C0
    GMADD xvf, d, D0, S0, VBETA, D0
#endif
.if \M == 4
    GST xv, , D0, C0, 0x00
.elseif \M == 2
    GST v, , V0, C0, 0x00
.elseif \M == 1
    GST f, d, F0, C0, 0x00
.endif

    // ---- step A and C past this row tile ------------------------------------
.L_M\M\()_END:
.if \M == 4
    PTR_ADDI    A, A, 0x20
    PTR_ADDI    C, C, 0x20
.elseif \M == 2
    PTR_ADDI    A, A, 0x10
    PTR_ADDI    C, C, 0x10
.elseif \M == 1
    // Nothing to advance for the 1-row tail.
.endif
.endm
  /*
   * DGEMM small-matrix kernel, NT variant (LoongArch64, LASX).
   *
   * For every (m, n) with 0 <= m < M and 0 <= n < N this computes
   *     acc     = sum over k in [0, K) of A[m + k*LDA] * B[n + k*LDB]
   *     C[m + n*LDC] = alpha * acc                        (B0 defined)
   *     C[m + n*LDC] = alpha * acc + beta * C[m + n*LDC]  (B0 not defined)
   * with LDA/LDB/LDC given in elements and converted to bytes on entry.
   *
   * In:  M, N, K, A, LDA, B, LDB, C in $a0-$a7; alpha in $f0; beta in $f1
   *      when B0 is not defined; LDC read from the stack at 0($sp).
   *
   * Rows are tiled 16 (looped), then 8, 4, 2, 1 (at most once each, picked by
   * the bits of M).  Inside each row tile the columns are tiled 4 (looped),
   * then 2, then 1.  When K <= 0 the accumulation loops are skipped and the
   * zeroed accumulators are still scaled and stored.
   */
    PROLOGUE

    PTR_LD      LDC, $sp, 0             // must be read before the stack frame is adjusted
    // NOTE(review): presumably saves the callee-saved registers used below
    // ($s0/$s1 and the FP registers aliased by $xr24/$xr25) -- confirm
    // against the macro definition in loongarch64_asm.S.
    push_if_used 2, 2

    xvreplve0.d VALPHA, VALPHA          // broadcast lane 0 (alpha) to all lanes
#ifndef B0
    xvreplve0.d VBETA, VBETA            // broadcast lane 0 (beta) to all lanes
#endif

    // Leading dimensions: elements -> bytes (8 bytes per double).
    PTR_SLLI    LDA, LDA, 3
    PTR_SLLI    LDB, LDB, 3
    PTR_SLLI    LDC, LDC, 3
    PTR_MUL     K_LDB, K, LDB           // bytes X0 advances over one full K loop

    PTR_SRAI    M16, M, 4               // M16 = M >> 4
    beqz        M16, .L_M15

    // ======================= 16-row tiles (looped) ===========================
.L_M16:
    PTR_SRAI    N4, N, 2                // N4 = N >> 2
    move        A0, A
    move        X0, B
    move        C0, C
    PTR_ADD     C1, C0, LDC
    PTR_ADD     C2, C1, LDC
    PTR_ADD     C3, C2, LDC
    beqz        N4, .L_M16_N3

    // ---- 16 x 4 -------------------------------------------------------------
.L_M16_N4:
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7, \
                D8, D8, D8, D9, D9, D9, D10, D10, D10, D11, D11, D11, \
                D12, D12, D12, D13, D13, D13, D14, D14, D14, D15, D15, D15
    move        K1, K
    PTR_ADDI    N4, N4, -1
    bge         ZERO, K, .L_M16_N4_END  // K <= 0: skip accumulation
.L_M16_N4_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
                  D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7, \
                  D8, S0, Z2, D8, D9, S1, Z2, D9, D10, S2, Z2, D10, D11, S3, Z2, D11, \
                  D12, S0, Z3, D12, D13, S1, Z3, D13, D14, S2, Z3, D14, D15, S3, Z3, D15
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M16_N4_K1
.L_M16_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \
                 D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
                 D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
    GLD xv, , S0, C2, 0x00, S1, C2, 0x20, S2, C2, 0x40, S3, C2, 0x60
    GMADD xvf, d, D8, S0, VBETA, D8, D9, S1, VBETA, D9, D10, S2, VBETA, D10, D11, S3, VBETA, D11
    GLD xv, , S0, C3, 0x00, S1, C3, 0x20, S2, C3, 0x40, S3, C3, 0x60
    GMADD xvf, d, D12, S0, VBETA, D12, D13, S1, VBETA, D13, D14, S2, VBETA, D14, D15, S3, VBETA, D15
#endif
    GST xv, , D12, C3, 0x00, D13, C3, 0x20, D14, C3, 0x40, D15, C3, 0x60, \
              D8, C2, 0x00, D9, C2, 0x20, D10, C2, 0x40, D11, C2, 0x60, \
              D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \
              D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
    // Advance each column pointer by four columns.
    PTR_ALSL    C0, LDC, C0, 2
    PTR_ALSL    C1, LDC, C1, 2
    PTR_ALSL    C2, LDC, C2, 2
    PTR_ALSL    C3, LDC, C3, 2
    // Rewind X0 over the K loop, then step four doubles along B.
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x20
    move        A0, A
    bnez        N4, .L_M16_N4

    // ---- 16 x 2 -------------------------------------------------------------
.L_M16_N3:
    andi        N2, N, 0x02
    beqz        N2, .L_M16_N1
.L_M16_N2:
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
    move        K1, K
    bge         ZERO, K, .L_M16_N2_END
.L_M16_N2_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3, \
                  D4, S0, Z1, D4, D5, S1, Z1, D5, D6, S2, Z1, D6, D7, S3, Z1, D7
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M16_N2_K1
.L_M16_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20, S2, C1, 0x40, S3, C1, 0x60
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5, D6, S2, VBETA, D6, D7, S3, VBETA, D7
#endif
    GST xv, , D4, C1, 0x00, D5, C1, 0x20, D6, C1, 0x40, D7, C1, 0x60, \
              D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
    // Advance by two columns so C0 addresses the final odd column, if any.
    PTR_ALSL    C0, LDC, C0, 1
    PTR_ALSL    C1, LDC, C1, 1
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x10
    move        A0, A

    // ---- 16 x 1 -------------------------------------------------------------
.L_M16_N1:
    andi        N1, N, 0x01
    beqz        N1, .L_M16_END
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
    move        K1, K
    bge         ZERO, K, .L_M16_N1_END
.L_M16_N1_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20, S2, A0, 0x40, S3, A0, 0x60
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, D2, S2, Z0, D2, D3, S3, Z0, D3
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M16_N1_K1
.L_M16_N1_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20, S2, C0, 0x40, S3, C0, 0x60
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1, D2, S2, VBETA, D2, D3, S3, VBETA, D3
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C0, 0x40, D3, C0, 0x60
    // No pointer fix-ups here: A0, X0 and C0 are re-initialised at the top
    // of whichever row tile runs next.

.L_M16_END:
    PTR_ADDI    M16, M16, -1
    PTR_ADDI    A, A, 0x80              // 16 doubles down the rows of A
    PTR_ADDI    C, C, 0x80              // 16 doubles down the rows of C
    bnez        M16, .L_M16

    // ======================= 8-row tile (at most once) =======================
.L_M15:
    andi        M8, M, 0x08
    beqz        M8, .L_M7
.L_M8:
    PTR_SRAI    N4, N, 2                // N4 = N >> 2
    move        A0, A
    move        X0, B
    move        C0, C
    PTR_ADD     C1, C0, LDC
    PTR_ADD     C2, C1, LDC
    PTR_ADD     C3, C2, LDC
    beqz        N4, .L_M8_N3

    // ---- 8 x 4 --------------------------------------------------------------
.L_M8_N4:
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3, \
                D4, D4, D4, D5, D5, D5, D6, D6, D6, D7, D7, D7
    move        K1, K
    PTR_ADDI    N4, N4, -1
    bge         ZERO, K, .L_M8_N4_END
.L_M8_N4_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
                  D2, S0, Z1, D2, D3, S1, Z1, D3, \
                  D4, S0, Z2, D4, D5, S1, Z2, D5, \
                  D6, S0, Z3, D6, D7, S1, Z3, D7
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M8_N4_K1
.L_M8_N4_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \
                 D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
    GLD xv, , S0, C2, 0x00, S1, C2, 0x20
    GMADD xvf, d, D4, S0, VBETA, D4, D5, S1, VBETA, D5
    GLD xv, , S0, C3, 0x00, S1, C3, 0x20
    GMADD xvf, d, D6, S0, VBETA, D6, D7, S1, VBETA, D7
#endif
    GST xv, , D4, C2, 0x00, D5, C2, 0x20, D6, C3, 0x00, D7, C3, 0x20, \
              D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
    // Advance each column pointer by four columns.
    PTR_ALSL    C0, LDC, C0, 2
    PTR_ALSL    C1, LDC, C1, 2
    PTR_ALSL    C2, LDC, C2, 2
    PTR_ALSL    C3, LDC, C3, 2
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x20
    move        A0, A
    bnez        N4, .L_M8_N4

    // ---- 8 x 2 --------------------------------------------------------------
.L_M8_N3:
    andi        N2, N, 0x02
    beqz        N2, .L_M8_N1
.L_M8_N2:
    GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
    move        K1, K
    bge         ZERO, K, .L_M8_N2_END
.L_M8_N2_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1, \
                  D2, S0, Z1, D2, D3, S1, Z1, D3
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M8_N2_K1
.L_M8_N2_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
    GLD xv, , S0, C1, 0x00, S1, C1, 0x20
    GMADD xvf, d, D2, S0, VBETA, D2, D3, S1, VBETA, D3
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20, D2, C1, 0x00, D3, C1, 0x20
    // Advance by two columns so C0 addresses the final odd column, if any.
    PTR_ALSL    C0, LDC, C0, 1
    PTR_ALSL    C1, LDC, C1, 1
    PTR_SUB     X0, X0, K_LDB
    PTR_ADDI    X0, X0, 0x10
    move        A0, A

    // ---- 8 x 1 --------------------------------------------------------------
.L_M8_N1:
    andi        N1, N, 0x01
    beqz        N1, .L_M8_END
    GXOR xv, v, D0, D0, D0, D1, D1, D1
    move        K1, K
    bge         ZERO, K, .L_M8_N1_END
.L_M8_N1_K1:
    PTR_ADDI    K1, K1, -1
    GLD xv, , S0, A0, 0x00, S1, A0, 0x20
    GLDREPL xv, d, Z0, X0, 0x00
    GMADD xvf, d, D0, S0, Z0, D0, D1, S1, Z0, D1
    PTR_ADD     X0, X0, LDB
    PTR_ADD     A0, A0, LDA
    bnez        K1, .L_M8_N1_K1
.L_M8_N1_END:
    GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
#ifndef B0
    GLD xv, , S0, C0, 0x00, S1, C0, 0x20
    GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
#endif
    GST xv, , D0, C0, 0x00, D1, C0, 0x20

.L_M8_END:
    PTR_ADDI    A, A, 0x40              // 8 doubles down the rows of A
    PTR_ADDI    C, C, 0x40              // 8 doubles down the rows of C

    // ======================= 4 / 2 / 1-row tails =============================
.L_M7:
    andi        M4, M, 0x04
    beqz        M4, .L_M3
.L_M4:
    DGEMM_SMALL_KERNEL_NT_TAIL 4
.L_M3:
    andi        M2, M, 0x02
    beqz        M2, .L_M1
.L_M2:
    DGEMM_SMALL_KERNEL_NT_TAIL 2
.L_M1:
    andi        M1, M, 0x01
    beqz        M1, .L_M0
    DGEMM_SMALL_KERNEL_NT_TAIL 1

.L_M0:
    pop_if_used 2, 2
    jirl        $r0, $r1, 0x0           // return to the address in $r1
    EPILOGUE