You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dgemm_small_kernel_tt_lasx.S 18 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. /***************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. #include "loongarch64_asm.S"
  /*
   * Register aliases for the LASX double-precision small-matrix GEMM kernel.
   *
   * Memory layout as walked by the kernel body (all strides are converted
   * from elements to bytes in the prologue):
   *   A : M rows, LDA apart, each holding K contiguous doubles
   *   B : K rows, LDB apart, each holding N contiguous doubles
   *   C : N columns, LDC apart, each holding M contiguous doubles
   *
   * Several aliases intentionally share one register because their live
   * ranges never overlap (M4/M2/M1 and N4/N2/N1).
   */

  /* ---- Incoming arguments ---- */
  #define M       $a0
  #define N       $a1
  #define K       $a2
  #define A       $a3
  #define LDA     $a4
  #define ALPHA   $f0     // scalar alpha, lane 0 of VALPHA
  #define B       $a5
  #define LDB     $a6
  #define C       $a7
  #define LDC     $t0     // fetched from the stack by the prologue
  /*
   * beta only takes part in the build without B0: VBETA and every section
   * that reads C are guarded by "#ifndef B0", so the scalar alias uses the
   * same guard. BETA itself is not referenced by the kernel body.
   */
  #ifndef B0
  #define BETA    $f1     // scalar beta, lane 0 of VBETA
  #endif

  /* Hard-wired zero register; drop any earlier definition of ZERO first. */
  #undef  ZERO
  #define ZERO    $r0

  /* ---- Loop counters ---- */
  #define M4      $t1     // remaining 4-row blocks
  #define M2      $t1     // M & 2
  #define M1      $t1     // M & 1
  #define N4      $t2     // remaining 4-column tiles
  #define N2      $t2     // N & 2
  #define N1      $t2     // N & 1
  #define K1      $s4     // inner k counter

  /* ---- Address registers ---- */
  #define K_LDB   $t3     // K * LDB in bytes, used to rewind X0
  #define A0      $t4     // current position in A row 0 of the block
  #define A1      $t6     // ... row 1
  #define A2      $t7     // ... row 2
  #define A3      $t8     // ... row 3
  #define X0      $t5     // current position in B (row k of the tile)
  #define B1      $s5     // X0 + 1 * LDB, only live in the k-by-4 loop
  #define B2      $s6     // X0 + 2 * LDB
  #define B3      $s7     // X0 + 3 * LDB
  #define C0      $s0     // column 0 of the current C tile
  #define C1      $s1     // column 1
  #define C2      $s2     // column 2
  #define C3      $s3     // column 3

  /* ---- 256-bit vector registers ---- */
  #define VALPHA  $xr0    // alpha broadcast to all lanes
  #ifndef B0
  #define VBETA   $xr1    // beta broadcast to all lanes
  #endif
  /* Accumulators, one per C column of the tile. */
  #define D0      $xr2
  #define D1      $xr3
  #define D2      $xr4
  #define D3      $xr5
  /* Scratch vectors for A and B operands. */
  #define T0      $xr6
  #define T1      $xr7
  #define T2      $xr8
  #define T3      $xr9
  #define Y0      $xr10
  #define Y1      $xr11
  #define Y2      $xr12
  #define Y3      $xr13
  #define G0      $xr14
  #define G1      $xr15
  #define G2      $xr16
  #define G3      $xr17
  #define S0      $xr18
  #define S1      $xr19
  #define S2      $xr20
  #define S3      $xr21
  #define Z0      $xr22
  #define Z1      $xr23
  #define Z2      $xr24
  #define Z3      $xr25

  /*
   * Narrow views with the same register numbers as D0..D3, used to store
   * only the low two doubles (V*) or the low double (F*) of an accumulator.
   */
  #define V0      $vr2
  #define V1      $vr3
  #define V2      $vr4
  #define V3      $vr5
  #define F0      $f2
  #define F1      $f3
  #define F2      $f4
  #define F3      $f5
  /*
   * Kernel entry.
   *
   * LDC is read from the caller stack at offset 0, so it must be fetched
   * before push_if_used touches $sp.
   * NOTE(review): push_if_used 8, 2 presumably saves eight static integer
   * registers and two floating-point registers, which would match the use
   * of $s0..$s7 and $xr24/$xr25 here; confirm against loongarch64_asm.S.
   */
      PROLOGUE
      PTR_LD          LDC,    $sp,    0
      push_if_used    8, 2

      /* Turn the three leading dimensions into byte strides (8 bytes each). */
      PTR_SLLI        LDC,    LDC,    3
      PTR_SLLI        LDB,    LDB,    3
      PTR_SLLI        LDA,    LDA,    3
      /* Bytes covered by K rows of B; used to rewind X0 after a k sweep. */
      PTR_MUL         K_LDB,  K,      LDB

      /* Broadcast the scalar factors from lane 0 to every lane. */
  #ifndef B0
      xvreplve0.d     VBETA,  VBETA
  #endif
      xvreplve0.d     VALPHA, VALPHA

      /* Number of full 4-row blocks; skip straight to the remainder if none. */
      PTR_SRAI        M4,     M,      2
      beqz            M4,     .L_M3
  /*
   * 4-row block loop.
   *
   * Each pass takes four rows of A (A0..A3) and sweeps all N columns in
   * tiles of 4, then 2, then 1. For every tile the accumulators D0..D3 hold
   * one C column each (four doubles = the four rows), and the result is
   *     C = alpha * sum_k(A * B)              when B0 is defined
   *     C = alpha * sum_k(A * B) + beta * C   otherwise
   *
   * The loops that advance k by one read a single double per A row. They
   * use 8-byte broadcast loads (GLDREPL) so nothing past the end of a row
   * is touched; a full 32-byte vector load there would read up to 24 bytes
   * beyond the last element of the row.
   */
  .L_M4:
      PTR_SRAI    N4,     N,      2           // number of 4-column tiles
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA
      PTR_ADD     A2,     A1,     LDA
      PTR_ADD     A3,     A2,     LDA
      move        X0,     B
      move        C0,     C
      PTR_ADD     C1,     C0,     LDC
      PTR_ADD     C2,     C1,     LDC
      PTR_ADD     C3,     C2,     LDC
      beqz        N4,     .L_M4_N3

  /* ---- 4 rows x 4 columns ---- */
  .L_M4_N4:
      GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
      move        K1,     K
      PTR_ADDI    N4,     N4,     -1
      bge         ZERO,   K,      .L_M4_N4_END    // K <= 0: nothing to sum
      PTR_SRAI    K1,     K1,     2               // k steps of four
      beq         ZERO,   K1,     .L_M4_N4_K3
      PTR_ADD     B1,     X0,     LDB
      PTR_ADD     B2,     B1,     LDB
      PTR_ADD     B3,     B2,     LDB

  /* Four k values per iteration. */
  .L_M4_N4_K4:
      PTR_ADDI    K1,     K1,     -1
      /* 4x4 block of A: Tn = row n, k..k+3. Transposed, Sn = column k+n. */
      GLD xv, , T0, A0, 0x00, T1, A1, 0x00, T2, A2, 0x00, T3, A3, 0x00
      GTRANSPOSE4x4_D T0, T1, T2, T3, S0, S1, S2, S3, Z0, Z1
      /* Broadcast B: Z* = column 0, T* = column 1, Y* = column 2, G* = column 3;
       * index 0..3 selects the B row k..k+3. */
      GLDREPL xv, d, Z0, X0, 0x00, Z1, B1, 0x00, Z2, B2, 0x00, Z3, B3, 0x00
      GLDREPL xv, d, T0, X0, 0x08, T1, B1, 0x08, T2, B2, 0x08, T3, B3, 0x08
      GLDREPL xv, d, Y0, X0, 0x10, Y1, B1, 0x10, Y2, B2, 0x10, Y3, B3, 0x10
      GLDREPL xv, d, G0, X0, 0x18, G1, B1, 0x18, G2, B2, 0x18, G3, B3, 0x18
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, T0, D1, \
                    D2, S0, Y0, D2, \
                    D3, S0, G0, D3
      GMADD xvf, d, D0, S1, Z1, D0, \
                    D1, S1, T1, D1, \
                    D2, S1, Y1, D2, \
                    D3, S1, G1, D3
      GMADD xvf, d, D0, S2, Z2, D0, \
                    D1, S2, T2, D1, \
                    D2, S2, Y2, D2, \
                    D3, S2, G2, D3
      GMADD xvf, d, D0, S3, Z3, D0, \
                    D1, S3, T3, D1, \
                    D2, S3, Y3, D2, \
                    D3, S3, G3, D3
      /* Advance four rows of B and four doubles along each A row. */
      PTR_ALSL    X0,     LDB,    X0,     2
      PTR_ALSL    B1,     LDB,    B1,     2
      PTR_ALSL    B2,     LDB,    B2,     2
      PTR_ALSL    B3,     LDB,    B3,     2
      PTR_ADDI    A0,     A0,     0x20
      PTR_ADDI    A1,     A1,     0x20
      PTR_ADDI    A2,     A2,     0x20
      PTR_ADDI    A3,     A3,     0x20
      bnez        K1,     .L_M4_N4_K4

  /* Remaining K & 3 values, one per iteration. */
  .L_M4_N4_K3:
      andi        K1,     K,      3
      beqz        K1,     .L_M4_N4_END
  .L_M4_N4_K1:
      PTR_ADDI    K1,     K1,     -1
      /* One double per A row, gathered into S0 = {A0[k], A1[k], A2[k], A3[k]}. */
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00
      GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1, \
                    D2, S0, Z2, D2, \
                    D3, S0, Z3, D3
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      PTR_ADDI    A2,     A2,     0x08
      PTR_ADDI    A3,     A3,     0x08
      bnez        K1,     .L_M4_N4_K1

  .L_M4_N4_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
  #ifndef B0
      GLD xv, , S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0, \
                    D1, S1, VBETA, D1, \
                    D2, S2, VBETA, D2, \
                    D3, S3, VBETA, D3
  #endif
      GST xv, , D3, C3, 0x00, \
                D2, C2, 0x00, \
                D1, C1, 0x00, \
                D0, C0, 0x00
      /* Next tile: four columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     2
      PTR_ALSL    C1,     LDC,    C1,     2
      PTR_ALSL    C2,     LDC,    C2,     2
      PTR_ALSL    C3,     LDC,    C3,     2
      PTR_SUB     X0,     X0,     K_LDB           // back to B row 0 ...
      PTR_ADDI    X0,     X0,     0x20            // ... four columns on
      /* Rewind the A rows to k = 0. */
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA
      PTR_ADD     A2,     A1,     LDA
      PTR_ADD     A3,     A2,     LDA
      bnez        N4,     .L_M4_N4

  /* ---- 4 rows x 2 columns (N & 2) ---- */
  .L_M4_N3:
      andi        N2,     N,      0x02
      beqz        N2,     .L_M4_N1
  .L_M4_N2:
      GXOR xv, v, D0, D0, D0, D1, D1, D1
      move        K1,     K
      bge         ZERO,   K,      .L_M4_N2_END
  .L_M4_N2_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00
      GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      PTR_ADDI    A2,     A2,     0x08
      PTR_ADDI    A3,     A3,     0x08
      bnez        K1,     .L_M4_N2_K1
  .L_M4_N2_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
  #ifndef B0
      GLD xv, , S0, C0, 0x00, S1, C1, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  #endif
      GST xv, , D1, C1, 0x00, \
                D0, C0, 0x00
      /* Two columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     1
      PTR_ALSL    C1,     LDC,    C1,     1
      PTR_SUB     X0,     X0,     K_LDB
      PTR_ADDI    X0,     X0,     0x10
      /* Rewind the A rows for the single-column tile. */
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA
      PTR_ADD     A2,     A1,     LDA
      PTR_ADD     A3,     A2,     LDA

  /* ---- 4 rows x 1 column (N & 1) ---- */
  .L_M4_N1:
      andi        N1,     N,      0x01
      beqz        N1,     .L_M4_END
      GXOR xv, v, D0, D0, D0
      move        K1,     K
      bge         ZERO,   K,      .L_M4_N1_END
  .L_M4_N1_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00, S2, A2, 0x00, S3, A3, 0x00
      GINSVE0 xv, d, S0, S1, 1, S0, S2, 2, S0, S3, 3
      GLDREPL xv, d, Z0, X0, 0x00
      GMADD xvf, d, D0, S0, Z0, D0
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      PTR_ADDI    A2,     A2,     0x08
      PTR_ADDI    A3,     A3,     0x08
      bnez        K1,     .L_M4_N1_K1
  .L_M4_N1_END:
      GMUL xvf, d, D0, D0, VALPHA
  #ifndef B0
      GLD xv, , S0, C0, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0
  #endif
      GST xv, , D0, C0, 0x00
      /* No pointer fix-ups here: A0..A3, X0 and C0..C3 are all reloaded
       * from A, B and C before their next use. */

  .L_M4_END:
      PTR_ADDI    M4,     M4,     -1
      PTR_ALSL    A,      LDA,    A,      2       // A += 4 rows
      PTR_ADDI    C,      C,      0x20            // C += 4 doubles
      bnez        M4,     .L_M4
  /*
   * 2-row remainder (M & 2).
   *
   * Same tiling over N as the 4-row block, but only lanes 0 and 1 of each
   * accumulator are meaningful and only those 16 bytes are stored (through
   * the V0..V3 views of D0..D3).
   *
   * Every load is limited to data that belongs to the tile: A is read one
   * double per row and, in the build without B0, C is read as two doubles
   * per column. Each pair is assembled from 8-byte broadcast loads and a
   * lane insert, so no bytes beyond the valid elements are accessed.
   * Lanes 2 and 3 carry don't-care values and never reach memory.
   */
  .L_M3:
      andi        M2,     M,      0x02
      beqz        M2,     .L_M1
  .L_M2:
      PTR_SRAI    N4,     N,      2           // number of 4-column tiles
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA
      move        X0,     B
      move        C0,     C
      PTR_ADD     C1,     C0,     LDC
      PTR_ADD     C2,     C1,     LDC
      PTR_ADD     C3,     C2,     LDC
      beqz        N4,     .L_M2_N3

  /* ---- 2 rows x 4 columns ---- */
  .L_M2_N4:
      GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
      move        K1,     K
      PTR_ADDI    N4,     N4,     -1
      bge         ZERO,   K,      .L_M2_N4_END    // K <= 0: nothing to sum
  .L_M2_N4_K1:
      PTR_ADDI    K1,     K1,     -1
      /* S0 lanes 0/1 = A0[k], A1[k]. */
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00
      GINSVE0 xv, d, S0, S1, 1
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1, \
                    D2, S0, Z2, D2, \
                    D3, S0, Z3, D3
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      bnez        K1,     .L_M2_N4_K1
  .L_M2_N4_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
  #ifndef B0
      /* Sn lanes 0/1 = the two existing C values of column n. */
      GLDREPL xv, d, S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00
      GLDREPL xv, d, Z0, C0, 0x08, Z1, C1, 0x08, Z2, C2, 0x08, Z3, C3, 0x08
      GINSVE0 xv, d, S0, Z0, 1, S1, Z1, 1, S2, Z2, 1, S3, Z3, 1
      GMADD xvf, d, D0, S0, VBETA, D0, \
                    D1, S1, VBETA, D1, \
                    D2, S2, VBETA, D2, \
                    D3, S3, VBETA, D3
  #endif
      GST v, , V3, C3, 0x00, \
               V2, C2, 0x00, \
               V1, C1, 0x00, \
               V0, C0, 0x00
      /* Next tile: four columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     2
      PTR_ALSL    C1,     LDC,    C1,     2
      PTR_ALSL    C2,     LDC,    C2,     2
      PTR_ALSL    C3,     LDC,    C3,     2
      PTR_SUB     X0,     X0,     K_LDB
      PTR_ADDI    X0,     X0,     0x20
      /* Rewind the A rows to k = 0. */
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA
      bnez        N4,     .L_M2_N4

  /* ---- 2 rows x 2 columns (N & 2) ---- */
  .L_M2_N3:
      andi        N2,     N,      0x02
      beqz        N2,     .L_M2_N1
  .L_M2_N2:
      GXOR xv, v, D0, D0, D0, D1, D1, D1
      move        K1,     K
      bge         ZERO,   K,      .L_M2_N2_END
  .L_M2_N2_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00
      GINSVE0 xv, d, S0, S1, 1
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      bnez        K1,     .L_M2_N2_K1
  .L_M2_N2_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
  #ifndef B0
      GLDREPL xv, d, S0, C0, 0x00, S1, C1, 0x00
      GLDREPL xv, d, Z0, C0, 0x08, Z1, C1, 0x08
      GINSVE0 xv, d, S0, Z0, 1, S1, Z1, 1
      GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  #endif
      GST v, , V1, C1, 0x00, \
               V0, C0, 0x00
      /* Two columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     1
      PTR_ALSL    C1,     LDC,    C1,     1
      PTR_SUB     X0,     X0,     K_LDB
      PTR_ADDI    X0,     X0,     0x10
      /* Rewind the A rows for the single-column tile. */
      move        A0,     A
      PTR_ADD     A1,     A0,     LDA

  /* ---- 2 rows x 1 column (N & 1) ---- */
  .L_M2_N1:
      andi        N1,     N,      0x01
      beqz        N1,     .L_M2_END
      GXOR xv, v, D0, D0, D0
      move        K1,     K
      bge         ZERO,   K,      .L_M2_N1_END
  .L_M2_N1_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00, S1, A1, 0x00
      GINSVE0 xv, d, S0, S1, 1
      GLDREPL xv, d, Z0, X0, 0x00
      GMADD xvf, d, D0, S0, Z0, D0
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      PTR_ADDI    A1,     A1,     0x08
      bnez        K1,     .L_M2_N1_K1
  .L_M2_N1_END:
      GMUL xvf, d, D0, D0, VALPHA
  #ifndef B0
      GLDREPL xv, d, S0, C0, 0x00, Z0, C0, 0x08
      GINSVE0 xv, d, S0, Z0, 1
      GMADD xvf, d, D0, S0, VBETA, D0
  #endif
      GST v, , V0, C0, 0x00
      /* No pointer fix-ups here: A0, X0 and C0..C3 are reloaded from A, B
       * and C before their next use. */

  .L_M2_END:
      PTR_ALSL    A,      LDA,    A,      1       // A += 2 rows
      PTR_ADDI    C,      C,      0x10            // C += 2 doubles
  /*
   * 1-row remainder (M & 1).
   *
   * Only lane 0 of each accumulator is meaningful and only that double is
   * stored (through the F0..F3 views of D0..D3). A and, in the build
   * without B0, C are therefore read with 8-byte broadcast loads, so no
   * bytes beyond the single valid element are accessed.
   */
  .L_M1:
      andi        M1,     M,      0x01
      beqz        M1,     .L_M0
      PTR_SRAI    N4,     N,      2           // number of 4-column tiles
      move        A0,     A
      move        X0,     B
      move        C0,     C
      PTR_ADD     C1,     C0,     LDC
      PTR_ADD     C2,     C1,     LDC
      PTR_ADD     C3,     C2,     LDC
      beqz        N4,     .L_M1_N3

  /* ---- 1 row x 4 columns ---- */
  .L_M1_N4:
      GXOR xv, v, D0, D0, D0, D1, D1, D1, D2, D2, D2, D3, D3, D3
      move        K1,     K
      PTR_ADDI    N4,     N4,     -1
      bge         ZERO,   K,      .L_M1_N4_END    // K <= 0: nothing to sum
  .L_M1_N4_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08, Z2, X0, 0x10, Z3, X0, 0x18
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1, \
                    D2, S0, Z2, D2, \
                    D3, S0, Z3, D3
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      bnez        K1,     .L_M1_N4_K1
  .L_M1_N4_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA
  #ifndef B0
      GLDREPL xv, d, S0, C0, 0x00, S1, C1, 0x00, S2, C2, 0x00, S3, C3, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0, \
                    D1, S1, VBETA, D1, \
                    D2, S2, VBETA, D2, \
                    D3, S3, VBETA, D3
  #endif
      GST f, d, F3, C3, 0x00, \
                F2, C2, 0x00, \
                F1, C1, 0x00, \
                F0, C0, 0x00
      /* Next tile: four columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     2
      PTR_ALSL    C1,     LDC,    C1,     2
      PTR_ALSL    C2,     LDC,    C2,     2
      PTR_ALSL    C3,     LDC,    C3,     2
      PTR_SUB     X0,     X0,     K_LDB
      PTR_ADDI    X0,     X0,     0x20
      /* Rewind the A row to k = 0. */
      move        A0,     A
      bnez        N4,     .L_M1_N4

  /* ---- 1 row x 2 columns (N & 2) ---- */
  .L_M1_N3:
      andi        N2,     N,      0x02
      beqz        N2,     .L_M1_N1
  .L_M1_N2:
      GXOR xv, v, D0, D0, D0, D1, D1, D1
      move        K1,     K
      bge         ZERO,   K,      .L_M1_N2_END
  .L_M1_N2_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00
      GLDREPL xv, d, Z0, X0, 0x00, Z1, X0, 0x08
      GMADD xvf, d, D0, S0, Z0, D0, \
                    D1, S0, Z1, D1
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      bnez        K1,     .L_M1_N2_K1
  .L_M1_N2_END:
      GMUL xvf, d, D0, D0, VALPHA, D1, D1, VALPHA
  #ifndef B0
      GLDREPL xv, d, S0, C0, 0x00, S1, C1, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0, D1, S1, VBETA, D1
  #endif
      GST f, d, F1, C1, 0x00, \
                F0, C0, 0x00
      /* Two columns further in C and in B. */
      PTR_ALSL    C0,     LDC,    C0,     1
      PTR_ALSL    C1,     LDC,    C1,     1
      PTR_SUB     X0,     X0,     K_LDB
      PTR_ADDI    X0,     X0,     0x10
      /* Rewind the A row for the single-column tile. */
      move        A0,     A

  /* ---- 1 row x 1 column (N & 1) ---- */
  .L_M1_N1:
      andi        N1,     N,      0x01
      beqz        N1,     .L_M0
      GXOR xv, v, D0, D0, D0
      move        K1,     K
      bge         ZERO,   K,      .L_M1_N1_END
  .L_M1_N1_K1:
      PTR_ADDI    K1,     K1,     -1
      GLDREPL xv, d, S0, A0, 0x00
      GLDREPL xv, d, Z0, X0, 0x00
      GMADD xvf, d, D0, S0, Z0, D0
      PTR_ADD     X0,     X0,     LDB
      PTR_ADDI    A0,     A0,     0x08
      bnez        K1,     .L_M1_N1_K1
  .L_M1_N1_END:
      GMUL xvf, d, D0, D0, VALPHA
  #ifndef B0
      GLDREPL xv, d, S0, C0, 0x00
      GMADD xvf, d, D0, S0, VBETA, D0
  #endif
      GST f, d, F0, C0, 0x00
      /* Last tile of the kernel: control falls through to the exit label,
       * so no pointer needs to be advanced or rewound. */
  /*
   * Common exit: undo the register saves made at entry (same 8, 2 counts
   * as the matching push_if_used) and return to the caller.
   */
  .L_M0:
      pop_if_used     8, 2
      jirl            $zero,  $ra,    0       // return; link register discarded
      EPILOGUE