You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_2x2_lsx.S 23 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Function parameters (integer/pointer arguments in $r4-$r11, alpha in $f0/$f1) */
  30. #define M $r4 // param 1: bm
  31. #define N $r5 // param 2: bn
  32. #define K $r6 // param 3: bk
  33. #define ALPHA_R $f0 // param 4: alphar
  34. #define ALPHA_I $f1 // param 5: alphai
  35. #define A $r7 // param 6: ba
  36. #define B $r8 // param 7: bb
  37. #define C $r9 // param 8: bc
  38. #define LDC $r10 // param 9: ldc
  39. #if defined (TRMMKERNEL)
  40. #define OFFSET $r11 // param 10: offset
  41. #endif
  42. #define OFF $r26 // working TRMM offset (set from OFFSET or zeroed in the prologue)
  43. #define I $r12 // row-tile loop counter (also scratch when advancing C)
  44. #define J $r13 // column-tile loop counter
  45. #define L $r14 // k loop counter (also scratch when advancing B)
  46. #define TL $r15 // k trip count for the current tile (also scratch)
  47. #define A0 $r16 // read pointer into A (ptrba)
  48. #define B0 $r17 // read pointer into B (ptrbb)
  49. #define C0 $r18 // write pointer, first column of the current C tile
  50. #define C1 $r19 // write pointer, second column of the current C tile
  51. #define C2 $r20 // scratch: k/4 and k&3 trip counts in the vector loops
  52. #define C3 $r23 // scratch: byte offsets in the TRMM pointer arithmetic
  53. #define T0 $r24 // loop bound (bn/2, bm/2, bm&1 or bn&1 depending on the loop)
  54. #define T1 $r25 // loop bound bm&1 in the single-column pass
  55. #define a1 $f2 // a1-a8: scalar temporaries for A elements and C updates
  56. #define a2 $f3
  57. #define a3 $f4
  58. #define a4 $f5
  59. #define a5 $f6
  60. #define a6 $f7
  61. #define a7 $f8
  62. #define a8 $f9
  63. #define b1 $f10 // b1-b6: scalar temporaries for B elements and C updates
  64. #define b2 $f11
  65. #define b3 $f12
  66. #define b4 $f13
  67. #define b5 $f14
  68. #define b6 $f15
  69. #define b7 $f16 // b7, b8: not referenced in this routine
  70. #define b8 $f17
  71. #define c11 $f18 // c11/c12: real/imaginary accumulator of the first scalar result
  72. #define c12 $f19
  73. #define c21 $f20 // c21/c22: real/imaginary accumulator of the second scalar result
  74. #define c22 $f21
  75. #define c31 $f22 // c31-c42: not referenced in this routine
  76. #define c32 $f23
  77. #define c41 $f24
  78. #define c42 $f25
  79. /* LSX vector registers ($vr*); each is used as four single-precision lanes here */
  80. #define U0 $vr30 // accumulator: real parts of the 2x2 tile (res0 2 4 6)
  81. #define U1 $vr31 // accumulator: imaginary parts of the 2x2 tile (res1 3 5 7)
  82. #define U2 $vr2 // U2-U7: not referenced in this routine
  83. #define U3 $vr3
  84. #define U4 $vr4
  85. #define U5 $vr5
  86. #define U6 $vr6
  87. #define U7 $vr7
  88. #define U8 $vr8 // U8/U9: C tile rows as loaded from / stored to C0 and C1
  89. #define U9 $vr9
  90. #define U10 $vr10 // U10/U11: C tile de-interleaved into real / imaginary lanes
  91. #define U11 $vr11
  92. #define U12 $vr12 // U12-U15: not referenced in this routine
  93. #define U13 $vr13
  94. #define U14 $vr14
  95. #define U15 $vr15
  96. #define D0 $vr16 // D0/D1: raw loads from A0 / B0
  97. #define D1 $vr17
  98. #define D2 $vr18 // D2/D3: real / imaginary lanes shuffled out of D0
  99. #define D3 $vr19
  100. #define D4 $vr20 // D4/D5: real / imaginary lanes shuffled out of D1
  101. #define D5 $vr21
  102. #define D6 $vr22 // D6-D11: not referenced in this routine
  103. #define D7 $vr23
  104. #define D8 $vr24
  105. #define D9 $vr25
  106. #define D10 $vr26
  107. #define D11 $vr27
  108. #define VALPHAR $vr28 // alpha real part replicated to every lane (vldrepl.w in the prologue)
  109. #define VALPHAI $vr29 // alpha imaginary part replicated to every lane
  /*
   * Sign selection for the four partial products of a complex multiply.
   * In the kernels below the macros are applied as:
   *   (V)MADD1: real += re(a) * re(b)      (V)MADD2: imag += im(a) * re(b)
   *   (V)MADD3: real += im(a) * im(b)      (V)MADD4: imag += re(a) * im(b)
   * Each group maps a product to an add form (MADD / VFMADD) or a subtract
   * form (NMSUB / VNMSUB); those macros are defined outside this file.
   * NOTE(review): the NN..CC flags presumably encode transpose/conjugate
   * variants of A and B -- confirm against the build system.
   */
  110. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  111. #define VMADD1 VFMADD
  112. #define VMADD2 VFMADD
  113. #define VMADD3 VNMSUB
  114. #define VMADD4 VFMADD
  115. #define MADD1 MADD
  116. #define MADD2 MADD
  117. #define MADD3 NMSUB
  118. #define MADD4 MADD
  119. #endif
  120. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  121. #define VMADD1 VFMADD
  122. #define VMADD2 VFMADD
  123. #define VMADD3 VFMADD
  124. #define VMADD4 VNMSUB
  125. #define MADD1 MADD
  126. #define MADD2 MADD
  127. #define MADD3 MADD
  128. #define MADD4 NMSUB
  129. #endif
  130. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  131. #define VMADD1 VFMADD
  132. #define VMADD2 VNMSUB
  133. #define VMADD3 VFMADD
  134. #define VMADD4 VFMADD
  135. #define MADD1 MADD
  136. #define MADD2 NMSUB
  137. #define MADD3 MADD
  138. #define MADD4 MADD
  139. #endif
  140. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  141. #define VMADD1 VFMADD
  142. #define VMADD2 VNMSUB
  143. #define VMADD3 VNMSUB
  144. #define VMADD4 VNMSUB
  145. #define MADD1 MADD
  146. #define MADD2 NMSUB
  147. #define MADD3 NMSUB
  148. #define MADD4 NMSUB
  149. #endif
  /*
   * Entry: reserve a 128-byte frame, save the registers this routine treats as
   * preserved, replicate alpha into vector registers, then initialise OFF, LDC
   * and the column-pair loop.
   * NOTE(review): ST is a macro defined outside this file. If it expands to a
   * single-precision store in this build, only the low 32 bits of $f24-$f31 are
   * saved, whereas the LoongArch psABI lists $fs0-$fs7 as callee-saved 64-bit
   * registers and this routine writes $vr28-$vr31. Confirm, and consider
   * fst.d here with a matching fld.d on exit.
   */
  150. PROLOGUE
  151. addi.d $sp, $sp, -128 // frame: 0..39 integer saves, 40..111 FP saves, 112/120 alpha
  152. SDARG $r23, $sp, 0
  153. SDARG $r24, $sp, 8
  154. SDARG $r25, $sp, 16
  155. SDARG $r26, $sp, 24
  156. SDARG $r27, $sp, 32 // $r27 is saved/restored but not otherwise referenced here
  157. ST $f23, $sp, 40
  158. ST $f24, $sp, 48
  159. ST $f25, $sp, 56
  160. ST $f26, $sp, 64
  161. ST $f27, $sp, 72
  162. ST $f28, $sp, 80
  163. ST $f29, $sp, 88
  164. ST $f30, $sp, 96
  165. ST $f31, $sp, 104
  166. ST ALPHA_R,$sp, 112 // spill alpha so it can be reloaded as a replicated vector
  167. ST ALPHA_I,$sp, 120
  168. vldrepl.w VALPHAR, $sp, 112 // every 32-bit lane = alpha real part
  169. vldrepl.w VALPHAI, $sp, 120 // every 32-bit lane = alpha imaginary part
  170. #if defined (TRMMKERNEL) && !defined(LEFT)
  171. sub.d OFF, $r0, OFFSET // OFF = -OFFSET
  172. #else
  173. xor OFF, OFF, OFF // OFF = 0
  174. #endif
  175. slli.d LDC, LDC, 2 // LDC *= 4; .L10 doubles it again for the byte stride between C columns
  176. move J, $r0
  177. srai.d T0, N, 1 // T0 = bn / 2 column pairs
  178. beq J, T0, .L19 // no column pairs: go to the odd-column pass
  /* Column-pair loop head: point C0/C1 at two adjacent C columns, then set up each 2x2 tile. */
  179. .L10: /* for(j=0; j<bn/2; j+=1) */
  180. move C0, C
  181. slli.d TL, LDC, 1 // TL = byte stride between C columns (LDC was already scaled by 4)
  182. add.d C1, C0, TL // C1 = next column
  183. move A0, A //ptrba
  184. #if defined(TRMMKERNEL) && defined(LEFT)
  185. move OFF, OFFSET // restart the offset for every column pair
  186. #endif
  187. move I, $r0
  188. srai.d T0, M, 1 // T0 = bm / 2 row pairs
  189. beq I, T0, .L150 // no row pairs: go to the odd-row tile
  190. .L11: /* for(i=0; i<bm/2; i+=1) */
  191. move B0, B //ptrbb
  192. move TL, K /* TL = bk */
  193. #if defined(TRMMKERNEL)
  194. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  195. move B0, B //ptrbb
  196. #else
  197. slli.d C3, OFF, 0x04 // skip OFF k-steps; each step is 0x10 bytes of A and of B here
  198. add.d A0, A0, C3
  199. add.d B0, B, C3
  200. #endif
  201. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  202. sub.d TL, K, OFF //temp
  203. #elif defined(LEFT)
  204. addi.d TL, OFF, 2 // both remaining branches use OFF + 2 (tile is 2 wide in both dimensions)
  205. #else
  206. addi.d TL, OFF, 2
  207. #endif
  208. #endif // #if defined(TRMMKERNEL)
  209. vxor.v U0, U0, U0 // clear real-part accumulator
  210. vxor.v U1, U1, U1 // clear imaginary-part accumulator
  211. move L, $r0 //cycle param k
  212. srai.d C2, TL, 2 // C2 = TL / 4 iterations of the 4x unrolled loop
  213. beq L, C2, .L130 // none: go straight to the remainder loop
  214. blt C2, L, .L130 // also skip when the count is negative
  /*
   * 2x2 tile, k loop unrolled four times. Each step loads 0x10 bytes from A0 and
   * from B0, splits them into real lanes (D2, D4) and imaginary lanes (D3, D5),
   * and accumulates real results in U0 and imaginary results in U1.
   */
  215. .L12: /* for(k=0; k<bk/4; k+=1) */
  216. vld D0, A0, 0x00 //a0 a1 a2 a3
  217. vld D1, B0, 0x00 //b0 b1 b2 b3
  218. vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
  219. vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
  220. vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
  221. vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
  222. VMADD1 U0, D2, D4, U0 //res0 2 4 6
  223. VMADD2 U1, D3, D4, U1 //res1 3 5 7
  224. VMADD3 U0, D3, D5, U0
  225. VMADD4 U1, D2, D5, U1
  226. vld D0, A0, 0x10 //a0 a1 a2 a3 (second k step)
  227. vld D1, B0, 0x10 //b0 b1 b2 b3
  228. vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
  229. vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
  230. vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
  231. vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
  232. VMADD1 U0, D2, D4, U0 //res0 2 4 6
  233. VMADD2 U1, D3, D4, U1 //res1 3 5 7
  234. VMADD3 U0, D3, D5, U0
  235. VMADD4 U1, D2, D5, U1
  236. vld D0, A0, 0x20 //a0 a1 a2 a3 (third k step)
  237. vld D1, B0, 0x20 //b0 b1 b2 b3
  238. vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
  239. vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
  240. vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
  241. vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
  242. VMADD1 U0, D2, D4, U0 //res0 2 4 6
  243. VMADD2 U1, D3, D4, U1 //res1 3 5 7
  244. VMADD3 U0, D3, D5, U0
  245. VMADD4 U1, D2, D5, U1
  246. vld D0, A0, 0x30 //a0 a1 a2 a3 (fourth k step)
  247. vld D1, B0, 0x30 //b0 b1 b2 b3
  248. vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
  249. vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
  250. vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
  251. vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
  252. VMADD1 U0, D2, D4, U0 //res0 2 4 6
  253. VMADD2 U1, D3, D4, U1 //res1 3 5 7
  254. VMADD3 U0, D3, D5, U0
  255. VMADD4 U1, D2, D5, U1
  256. addi.d A0, A0, 0x40 // four k steps consumed from A
  257. addi.d B0, B0, 0x40 // four k steps consumed from B
  258. addi.d L, L, 1
  259. blt L, C2, .L12
  /* 2x2 tile, k remainder: same step as the unrolled loop, one k at a time for TL & 3 steps. */
  260. .L130:
  261. move L, $r0
  262. andi C2, TL, 3 // C2 = leftover k steps
  263. beq L, C2, .L14 // none left: write the tile back
  264. .L13: /* for(k=0; k<(bk&3); k+=1) */
  265. vld D0, A0, 0x00 //a0 a1 a2 a3
  266. vld D1, B0, 0x00 //b0 b1 b2 b3
  267. vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
  268. vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
  269. vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
  270. vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
  271. VMADD1 U0, D2, D4, U0 //res0 2 4 6
  272. VMADD2 U1, D3, D4, U1 //res1 3 5 7
  273. VMADD3 U0, D3, D5, U0
  274. VMADD4 U1, D2, D5, U1
  275. addi.d A0, A0, 0x10 // one k step of A
  276. addi.d B0, B0, 0x10 // one k step of B
  277. addi.d L, L, 1
  278. blt L, C2, .L13
  /*
   * 2x2 tile write-back. U0 holds the accumulated real parts (res0 2 4 6) and
   * U1 the imaginary parts (res1 3 5 7). The tile is scaled by the complex
   * alpha (VALPHAR/VALPHAI), re-interleaved into real/imag pairs and stored to
   * the two C columns C0 and C1; then the tile pointers advance.
   */
  279. .L14:
  280. #if defined(TRMMKERNEL)
  /*
   * TRMM path: C = alpha * acc. C is not read here. The vld/vpackev/vpackod/
   * vpermi sequence that used to precede these instructions only produced
   * values in U8-U11 that were overwritten (by vfmul.s and vilvl.w/vilvh.w)
   * before any use, so it has been removed.
   */
  287. vfmul.s U10, U0, VALPHAR // re  = acc_re * alpha_r
  288. vfmul.s U11, U1, VALPHAR // im  = acc_im * alpha_r
  289. VNMSUB U10, U1, VALPHAI, U10 // re -= acc_im * alpha_i
  290. VFMADD U11, U0, VALPHAI, U11 // im += acc_re * alpha_i
  291. vilvl.w U8, U11, U10 //0 1 2 3
  292. vilvh.w U9, U11, U10 //4 5 6 7
  293. vst U8, C0, 0x00
  294. vst U9, C1, 0x00
  295. #else
  /* GEMM path: C += alpha * acc, so the existing C tile is loaded and de-interleaved first. */
  296. vld U8, C0, 0x00 //0 1 2 3
  297. vld U9, C1, 0x00 //4 5 6 7
  298. vpackev.w U10, U9, U8 //0 4 2 6
  299. vpermi.w U10, U10, 0xd8 //0 2 4 6
  300. vpackod.w U11, U9, U8 //1 5 3 7
  301. vpermi.w U11, U11, 0xd8 //1 3 5 7
  302. VFMADD U10, U0, VALPHAR, U10 // re += acc_re * alpha_r
  303. VFMADD U11, U1, VALPHAR, U11 // im += acc_im * alpha_r
  304. VNMSUB U10, U1, VALPHAI, U10 // re -= acc_im * alpha_i
  305. VFMADD U11, U0, VALPHAI, U11 // im += acc_re * alpha_i
  306. vilvl.w U8, U11, U10 //0 1 2 3
  307. vilvh.w U9, U11, U10 //4 5 6 7
  308. vst U8, C0, 0x00
  309. vst U9, C1, 0x00
  310. #endif
  311. #if defined(TRMMKERNEL)
  312. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  313. sub.d TL, K, OFF
  314. #ifdef LEFT
  315. addi.d TL, TL, -2
  316. #else
  317. addi.d TL, TL, -2
  318. #endif
  319. slli.d C3, TL, 0x04 // skip the k steps this tile did not consume (0x10 bytes each)
  320. add.d A0, A0, C3
  321. add.d B0, B0, C3
  322. #endif
  323. #ifdef LEFT
  324. addi.d OFF, OFF, 2 // two more rows handled
  325. #endif
  326. #endif // #if defined(TRMMKERNEL)
  327. addi.d C0, C0, 0x10 // advance both column pointers by the 0x10 bytes just stored
  328. addi.d C1, C1, 0x10
  329. addi.d I, I, 1
  330. blt I, T0, .L11
  /* Odd row of a column pair (bm & 1): scalar 1x2 tile, one A element against two B elements per k. */
  331. .L150:
  332. move I, $r0
  333. andi T0, M, 1 // T0 = 1 when a single row is left over
  334. beq I, T0, .L18
  335. .L15: /* for(i=0; i<(bm&1); i+=1) */
  336. move B0, B //ptrbb
  337. move TL, K /* TL = bk */
  338. #if defined(TRMMKERNEL)
  339. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  340. move B0, B //ptrbb
  341. #else
  342. slli.d C3, OFF, 0x03 // A advances 0x08 bytes per k step in this tile
  343. add.d A0, A0, C3
  344. slli.d C3, OFF, 0x04 // B advances 0x10 bytes per k step in this tile
  345. add.d B0, B, C3
  346. #endif
  347. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  348. sub.d TL, K, OFF
  349. #elif defined(LEFT)
  350. addi.d TL, OFF, 1 // LEFT: tile is 1 row tall
  351. #else
  352. addi.d TL, OFF, 2 // otherwise: tile is 2 columns wide
  353. #endif
  354. #endif // #if defined(TRMMKERNEL)
  355. MTC c11, $r0 // clear the four scalar accumulators
  356. MTC c12, $r0
  357. MTC c21, $r0
  358. MTC c22, $r0
  359. move L, $r0 //cycle param k
  360. beq L, TL, .L17 // skip the loop when TL == 0 ...
  361. blt TL, L, .L17 // ... or negative
  362. .L16: /* for (k=0; k<bk; k+=1) */
  363. LD a1, A0, 0x00 //load0
  364. LD b1, B0, 0x00 //load1
  365. MADD1 c11, a1, b1, c11 //res0
  366. LD a2, A0, 0x04 //load2
  367. MADD2 c12, a2, b1, c12 //res1
  368. LD b2, B0, 0x04 //load3
  369. MADD3 c11, a2, b2, c11
  370. MADD4 c12, a1, b2, c12
  371. LD b3, B0, 0x08 //load4
  372. MADD1 c21, a1, b3, c21 //res2
  373. MADD2 c22, a2, b3, c22 //res3
  374. LD b4, B0, 0x0c //load5
  375. MADD3 c21, a2, b4, c21
  376. MADD4 c22, a1, b4, c22
  377. addi.d A0, A0, 0x08
  378. addi.d B0, B0, 0x10
  379. addi.d L, L, 1
  380. blt L, TL, .L16
  /*
   * 1x2 tile write-back: (c11, c12) goes to C0 and (c21, c22) to C1, each scaled
   * by the complex alpha. The TRMM path overwrites C; the other path adds to it.
   */
  381. .L17:
  382. #if defined(TRMMKERNEL)
  383. MUL a5, c11, ALPHA_R
  384. MUL a6, c12, ALPHA_I
  385. SUB a5, a5, a6 // re = c11*alpha_r - c12*alpha_i
  386. ST a5, C0, 0x00
  387. MUL a5, c12, ALPHA_R
  388. MUL a6, c11, ALPHA_I
  389. ADD a6, a5, a6 // im = c12*alpha_r + c11*alpha_i
  390. ST a6, C0, 0x04
  391. MUL b5, c21, ALPHA_R
  392. MUL b6, c22, ALPHA_I
  393. SUB b5, b5, b6 // re = c21*alpha_r - c22*alpha_i
  394. ST b5, C1, 0x00
  395. MUL b5, c22, ALPHA_R
  396. MUL b6, c21, ALPHA_I
  397. ADD b6, b5, b6 // im = c22*alpha_r + c21*alpha_i
  398. ST b6, C1, 0x04
  399. #else
  400. LD a5, C0, 0x00 //C0[0]
  401. LD a6, C0, 0x04 //C0[1]
  402. LD b5, C1, 0x00 //C1[0]
  403. LD b6, C1, 0x04 //C1[1]
  404. MADD a5, c11, ALPHA_R, a5
  405. MADD a6, c12, ALPHA_R, a6
  406. NMSUB a5, c12, ALPHA_I, a5 // real part: subtract the imag*imag product
  407. MADD a6, c11, ALPHA_I, a6
  408. ST a5, C0, 0x00
  409. ST a6, C0, 0x04
  410. MADD b5, c21, ALPHA_R, b5
  411. MADD b6, c22, ALPHA_R, b6
  412. NMSUB b5, c22, ALPHA_I, b5 // real part: subtract the imag*imag product
  413. MADD b6, c21, ALPHA_I, b6
  414. ST b5, C1, 0x00
  415. ST b6, C1, 0x04
  416. #endif
  417. #if defined(TRMMKERNEL)
  418. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  419. sub.d TL, K, OFF
  420. #ifdef LEFT
  421. addi.d TL, TL, -1
  422. #else
  423. addi.d TL, TL, -2
  424. #endif
  425. slli.d C3, TL, 0x03 // skip unconsumed k steps: 0x08 bytes each in A
  426. add.d A0, A0, C3
  427. slli.d C3, TL, 0x04 // and 0x10 bytes each in B
  428. add.d B0, B0, C3
  429. #endif
  430. #ifdef LEFT
  431. addi.d OFF, OFF, 1 // one more row handled
  432. #endif
  433. #endif // #if defined(TRMMKERNEL)
  434. addi.d C0, C0, 0x08 // advance by the 0x08 bytes just stored per column
  435. addi.d C1, C1, 0x08
  436. addi.d I, I, 1
  437. blt I, T0, .L15
  /* End of one column pair: advance B and C past the two columns just processed. */
  438. .L18:
  439. #if defined(TRMMKERNEL) && !defined(LEFT)
  440. addi.d OFF, OFF, 2 // two more columns handled
  441. #endif
  442. slli.d L, K, 0x04 // K * 0x10 bytes: B advanced 0x10 bytes per k step in this pass
  443. add.d B, B, L
  444. slli.d I, LDC, 0x02 // twice the column stride used for C1 at .L10
  445. add.d C, C, I
  446. addi.d J, J, 1
  447. srai.d T0, N, 1 // T0 was reused by the row loops, so recompute bn / 2
  448. blt J, T0, .L10
  /* Odd column (bn & 1): one C column. Row pairs use a scalar 2x1 tile, two A elements against one B element per k. */
  449. .L19:
  450. move J, $r0
  451. andi T0, N, 1 // T0 = 1 when a single column is left over
  452. beq J, T0, .L30
  453. .L20: /* for (j=0; j<(bn&1); j+=1) */
  454. #if defined(TRMMKERNEL) && defined(LEFT)
  455. move OFF, OFFSET
  456. #endif
  457. move C0, C
  458. move A0, A //ptrba
  459. move I, $r0
  460. srai.d T0, M, 1 // T0 = bm / 2 row pairs
  461. beq I, T0, .L24
  462. .L21: /* for (i=0; i<bm/2; i+=1) */
  463. move B0, B //ptrbb
  464. move TL, K /* TL = bk */
  465. #if defined(TRMMKERNEL)
  466. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  467. move B0, B //ptrbb
  468. #else
  469. slli.d C3, OFF, 0x04 // A advances 0x10 bytes per k step in this tile
  470. add.d A0, A0, C3
  471. slli.d C3, OFF, 0x03 // B advances 0x08 bytes per k step in this tile
  472. add.d B0, B, C3
  473. #endif
  474. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  475. sub.d TL, K, OFF
  476. #elif defined(LEFT)
  477. addi.d TL, OFF, 2 // LEFT: tile is 2 rows tall
  478. #else
  479. addi.d TL, OFF, 1 // otherwise: tile is 1 column wide
  480. #endif
  481. #endif // #if defined(TRMMKERNEL)
  482. MTC c11, $r0 // clear the four scalar accumulators
  483. MTC c12, $r0
  484. MTC c21, $r0
  485. MTC c22, $r0
  486. move L, $r0 //cycle param k
  487. beq L, TL, .L23 // skip the loop when TL == 0 ...
  488. blt TL, L, .L23 // ... or negative
  489. .L22: /* for (k=0; k<bk; k+=1) */
  490. LD a1, A0, 0x00 //load0
  491. LD b1, B0, 0x00 //load1
  492. MADD1 c11, a1, b1, c11 //res0
  493. LD a2, A0, 0x04 //load2
  494. MADD2 c12, a2, b1, c12 //res1
  495. LD b2, B0, 0x04 //load3
  496. MADD3 c11, a2, b2, c11
  497. MADD4 c12, a1, b2, c12
  498. LD a3, A0, 0x08 //load4
  499. MADD1 c21, a3, b1, c21 //res2
  500. LD a4, A0, 0x0c //load5
  501. MADD2 c22, a4, b1, c22 //res3
  502. MADD3 c21, a4, b2, c21
  503. MADD4 c22, a3, b2, c22
  504. addi.d A0, A0, 0x10
  505. addi.d B0, B0, 0x08
  506. addi.d L, L, 1
  507. blt L, TL, .L22
  /*
   * 2x1 tile write-back: (c11, c12) and (c21, c22) go to four consecutive floats
   * at C0, each pair scaled by the complex alpha. The TRMM path overwrites C;
   * the other path adds to it.
   */
  508. .L23:
  509. #if defined(TRMMKERNEL)
  510. MUL a5, c11, ALPHA_R
  511. MUL a6, c12, ALPHA_I
  512. SUB a5, a5, a6 // re = c11*alpha_r - c12*alpha_i
  513. ST a5, C0, 0x00
  514. MUL a5, c12, ALPHA_R
  515. MUL a6, c11, ALPHA_I
  516. ADD a6, a5, a6 // im = c12*alpha_r + c11*alpha_i
  517. ST a6, C0, 0x04
  518. MUL a7, c21, ALPHA_R
  519. MUL a8, c22, ALPHA_I
  520. SUB a7, a7, a8 // re = c21*alpha_r - c22*alpha_i
  521. ST a7, C0, 0x08
  522. MUL a7, c22, ALPHA_R
  523. MUL a8, c21, ALPHA_I
  524. ADD a8, a7, a8 // im = c22*alpha_r + c21*alpha_i
  525. ST a8, C0, 0x0c
  526. #else
  527. LD a5, C0, 0x00 //C0[0]
  528. LD a6, C0, 0x04 //C0[1]
  529. LD a7, C0, 0x08 //C0[2]
  530. LD a8, C0, 0x0c //C0[3]
  531. MADD a5, c11, ALPHA_R, a5
  532. MADD a6, c12, ALPHA_R, a6
  533. NMSUB a5, c12, ALPHA_I, a5 // real part: subtract the imag*imag product
  534. MADD a6, c11, ALPHA_I, a6
  535. MADD a7, c21, ALPHA_R, a7
  536. MADD a8, c22, ALPHA_R, a8
  537. NMSUB a7, c22, ALPHA_I, a7 // real part: subtract the imag*imag product
  538. MADD a8, c21, ALPHA_I, a8
  539. ST a5, C0, 0x00
  540. ST a6, C0, 0x04
  541. ST a7, C0, 0x08
  542. ST a8, C0, 0x0c
  543. #endif
  544. #if defined(TRMMKERNEL)
  545. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  546. sub.d TL, K, OFF
  547. #ifdef LEFT
  548. addi.d TL, TL, -2
  549. #else
  550. addi.d TL, TL, -1
  551. #endif
  552. slli.d C3, TL, 0x04 // skip unconsumed k steps: 0x10 bytes each in A
  553. add.d A0, A0, C3
  554. slli.d C3, TL, 0x03 // and 0x08 bytes each in B
  555. add.d B0, B0, C3
  556. #endif
  557. #ifdef LEFT
  558. addi.d OFF, OFF, 2 // two more rows handled
  559. #endif
  560. #endif // #if defined(TRMMKERNEL)
  561. addi.d C0, C0, 0x10 // advance by the 0x10 bytes just stored
  562. addi.d I, I, 1
  563. blt I, T0, .L21
  /* Odd row of the odd column: scalar 1x1 tile, one A element against one B element per k. */
  564. .L24:
  565. move I, $r0
  566. andi T1, M, 1 //bm&1
  567. beq I, T1, .L28
  568. .L25: /* for (i=0; i<(bm&1); i+=1) */
  569. move B0, B //ptrbb
  570. move TL, K /* TL = bk */
  571. #if defined(TRMMKERNEL)
  572. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  573. move B0, B //ptrbb
  574. #else
  575. slli.d C3, OFF, 0x03 // A and B both advance 0x08 bytes per k step in this tile
  576. add.d A0, A0, C3
  577. add.d B0, B, C3
  578. #endif
  579. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  580. sub.d TL, K, OFF
  581. #elif defined(LEFT)
  582. addi.d TL, OFF, 1 // both remaining branches use OFF + 1 (tile is 1x1)
  583. #else
  584. addi.d TL, OFF, 1
  585. #endif
  586. #endif // #if defined(TRMMKERNEL)
  587. MTC c11, $r0 // clear the real/imaginary accumulators
  588. MTC c12, $r0
  589. move L, $r0 //cycle param k
  590. beq L, TL, .L27 // skip the loop when TL == 0 ...
  591. blt TL, L, .L27 // ... or negative
  592. .L26: /* for (k=0; k<bk; k+=1) */
  593. LD a1, A0, 0x00 //load0
  594. LD b1, B0, 0x00 //load1
  595. MADD1 c11, a1, b1, c11 //res0
  596. LD a2, A0, 0x04 //load2
  597. MADD2 c12, a2, b1, c12 //res1
  598. LD b2, B0, 0x04 //load3
  599. MADD3 c11, a2, b2, c11
  600. MADD4 c12, a1, b2, c12
  601. addi.d A0, A0, 0x08
  602. addi.d B0, B0, 0x08
  603. addi.d L, L, 1
  604. blt L, TL, .L26
  /*
   * 1x1 tile write-back: (c11, c12) scaled by the complex alpha goes to C0
   * (overwrite in the TRMM path, accumulate otherwise); .L28 then advances
   * B and C past the single column.
   */
  605. .L27:
  606. #if defined(TRMMKERNEL)
  607. MUL a5, c11, ALPHA_R
  608. MUL a6, c12, ALPHA_I
  609. SUB a5, a5, a6 // re = c11*alpha_r - c12*alpha_i
  610. ST a5, C0, 0x00
  611. MUL a5, c12, ALPHA_R
  612. MUL a6, c11, ALPHA_I
  613. ADD a6, a5, a6 // im = c12*alpha_r + c11*alpha_i
  614. ST a6, C0, 0x04
  615. #else
  616. LD a5, C0, 0x00 //C0[0]
  617. LD a6, C0, 0x04 //C0[1]
  618. MADD a5, c11, ALPHA_R, a5
  619. MADD a6, c12, ALPHA_R, a6
  620. NMSUB a5, c12, ALPHA_I, a5 // real part: subtract the imag*imag product
  621. MADD a6, c11, ALPHA_I, a6
  622. ST a5, C0, 0x00
  623. ST a6, C0, 0x04
  624. #endif
  625. #if defined(TRMMKERNEL)
  626. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  627. sub.d TL, K, OFF
  628. #ifdef LEFT
  629. addi.d TL, TL, -1
  630. #else
  631. addi.d TL, TL, -1
  632. #endif
  633. slli.d C3, TL, 0x03 // skip unconsumed k steps: 0x08 bytes each in A and B
  634. add.d A0, A0, C3
  635. add.d B0, B0, C3
  636. #endif
  637. #ifdef LEFT
  638. addi.d OFF, OFF, 1 // one more row handled
  639. #endif
  640. #endif // #if defined(TRMMKERNEL)
  641. addi.d C0, C0, 0x08 // advance by the 0x08 bytes just stored
  642. addi.d I, I, 1
  643. blt I, T1, .L25
  644. .L28:
  645. slli.d L, K, 3 // K * 0x08 bytes: B advanced 0x08 bytes per k step in this pass
  646. add.d B, B, L
  647. slli.d I, LDC, 1 // one column stride (LDC was scaled by 4 in the prologue)
  648. add.d C, C, I
  649. addi.d J, J, 1
  650. andi T0, N, 1 // T0 was reused by the row loop, so recompute bn & 1
  651. blt J, T0, .L20
  /*
   * Exit: reload the registers saved at entry from the 128-byte frame, release
   * the frame and return.
   * NOTE(review): LD is a macro defined outside this file. If it expands to a
   * single-precision load in this build, only the low 32 bits of $f24-$f31 are
   * restored, whereas the LoongArch psABI lists $fs0-$fs7 as callee-saved 64-bit
   * registers. Confirm, and consider fld.d here with a matching fst.d at entry.
   */
  652. .L30:
  653. LDARG $r23, $sp, 0
  654. LDARG $r24, $sp, 8
  655. LDARG $r25, $sp, 16
  656. LDARG $r26, $sp, 24
  657. LDARG $r27, $sp, 32
  658. LD $f23, $sp, 40
  659. LD $f24, $sp, 48
  660. LD $f25, $sp, 56
  661. LD $f26, $sp, 64
  662. LD $f27, $sp, 72
  663. LD $f28, $sp, 80
  664. LD $f29, $sp, 88
  665. LD $f30, $sp, 96
  666. LD $f31, $sp, 104
  667. addi.d $sp, $sp, 128 // release the frame reserved at entry
  668. jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking (return)
  669. EPILOGUE