You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_2x2_lasx.S 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
  /*
   * Register aliases for the kernel below (LoongArch64, GAS syntax, run
   * through the C preprocessor).  The list-number residue that preceded
   * every directive has been dropped so the lines are valid again; no
   * alias was renamed or re-assigned.
   */
#define ASSEMBLER
#include "common.h"

/* ---- Function parameters (incoming argument registers) ---- */
#define M          $r4     // param 1: bm
#define N          $r5     // param 2: bn
#define K          $r6     // param 3: bk
#define ALPHA_R    $f0     // param 4: alphar
#define ALPHA_I    $f1     // param 5: alphai
#define A          $r7     // param 6: ba
#define B          $r8     // param 7: bb
#define C          $r9     // param 8: bc
#define LDC        $r10    // param 9: ldc
#if defined (TRMMKERNEL)
#define OFFSET     $r11    // param 10: offset
#endif

/* ---- Integer working registers ---- */
#define OFF        $r26    // running TRMM offset (zero in the plain GEMM build)
#define I          $r12    // row-tile counter
#define J          $r13    // column-panel counter
#define L          $r14    // k-loop counter; also scratch for pointer bumps
#define TL         $r15    // k trip count for the current tile; also scratch
#define A0         $r16    // walking pointer into A
#define B0         $r17    // walking pointer into B
#define C0         $r18    // first output column of the current panel
#define C1         $r19    // second output column of the current panel
#define C2         $r20    // trip count of the vector k-loop and of its tail
#define C3         $r23    // scratch for TRMM byte offsets
#define T0         $r24    // loop bound
#define T1         $r25    // loop bound (bm & 1 in the last-column path)
/* $r23-$r26 (C3, T0, T1, OFF) are spilled by the kernel's prologue. */

/* ---- Scalar floating-point operands ---- */
#define a1         $f2
#define a2         $f3
#define a3         $f4
#define a4         $f5
#define a5         $f6
#define a6         $f7
#define a7         $f8
#define a8         $f9
#define b1         $f10
#define b2         $f11
#define b3         $f12
#define b4         $f13
#define b5         $f14
#define b6         $f15
#define b7         $f16
#define b8         $f17

/* ---- Scalar accumulators (cX1 = real part, cX2 = imaginary part) ---- */
#define c11        $f18
#define c12        $f19
#define c21        $f20
#define c22        $f21
#define c31        $f22
#define c32        $f23
#define c41        $f24
#define c42        $f25

/* LASX vectors */
/* U0/U1 are the vector accumulators (real / imaginary parts). */
#define U0         $xr30
#define U1         $xr31
#define U2         $xr2
#define U3         $xr3
#define U4         $xr4
#define U5         $xr5
#define U6         $xr6
#define U7         $xr7
#define U8         $xr8
#define U9         $xr9
#define U10        $xr10
#define U11        $xr11
#define U12        $xr12
#define U13        $xr13
#define U14        $xr14
#define U15        $xr15

/* D0-D3 hold raw A/B loads, D4-D11 their shuffled real/imaginary lanes. */
#define D0         $xr16
#define D1         $xr17
#define D2         $xr18
#define D3         $xr19
#define D4         $xr20
#define D5         $xr21
#define D6         $xr22
#define D7         $xr23
#define D8         $xr24
#define D9         $xr25
#define D10        $xr26
#define D11        $xr27

/* alpha broadcast to every 32-bit lane */
#define VALPHAR    $xr28
#define VALPHAI    $xr29
  /*
   * Sign selection for the four partial products of a complex multiply.
   * In the kernel the macros are applied as
   *
   *     MADD1:  acc_re (+/-)= a_re * b_re
   *     MADD2:  acc_im (+/-)= a_im * b_re
   *     MADD3:  acc_re (+/-)= a_im * b_im
   *     MADD4:  acc_im (+/-)= a_re * b_im
   *
   * with one family per register width: XVMADDn (256-bit LASX), VMADDn
   * (128-bit LSX) and MADDn (scalar).  The XVFADDn family carries the same
   * sign pattern as add/subtract.  Exactly one of the four groups below is
   * expected to be active; the variant symbols come from the build.
   * XVFMADD/XVNMSUB/VFMADD/VNMSUB/MADD/NMSUB themselves are not defined in
   * this file (presumably common.h).
   *
   * The list-number residue that preceded every directive has been dropped
   * so the lines are valid preprocessor input again; no mapping was changed.
   */

/* a * b : re = ar*br - ai*bi, im = ai*br + ar*bi */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XVMADD1    XVFMADD
#define XVMADD2    XVFMADD
#define XVMADD3    XVNMSUB
#define XVMADD4    XVFMADD

#define VMADD1     VFMADD
#define VMADD2     VFMADD
#define VMADD3     VNMSUB
#define VMADD4     VFMADD

#define XVFADD1    XVFADD
#define XVFADD2    XVFADD
#define XVFADD3    XVFSUB
#define XVFADD4    XVFADD

#define MADD1      MADD
#define MADD2      MADD
#define MADD3      NMSUB
#define MADD4      MADD
#endif

/* a * conj(b) : re = ar*br + ai*bi, im = ai*br - ar*bi */
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define XVMADD1    XVFMADD
#define XVMADD2    XVFMADD
#define XVMADD3    XVFMADD
#define XVMADD4    XVNMSUB

#define VMADD1     VFMADD
#define VMADD2     VFMADD
#define VMADD3     VFMADD
#define VMADD4     VNMSUB

#define XVFADD1    XVFADD
#define XVFADD2    XVFADD
#define XVFADD3    XVFADD
#define XVFADD4    XVFSUB

#define MADD1      MADD
#define MADD2      MADD
#define MADD3      MADD
#define MADD4      NMSUB
#endif

/* conj(a) * b : re = ar*br + ai*bi, im = -ai*br + ar*bi */
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define XVMADD1    XVFMADD
#define XVMADD2    XVNMSUB
#define XVMADD3    XVFMADD
#define XVMADD4    XVFMADD

#define VMADD1     VFMADD
#define VMADD2     VNMSUB
#define VMADD3     VFMADD
#define VMADD4     VFMADD

#define XVFADD1    XVFADD
#define XVFADD2    XVFSUB
#define XVFADD3    XVFADD
#define XVFADD4    XVFADD

#define MADD1      MADD
#define MADD2      NMSUB
#define MADD3      MADD
#define MADD4      MADD
#endif

/* conj(a) * conj(b) : re = ar*br - ai*bi, im = -ai*br - ar*bi */
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define XVMADD1    XVFMADD
#define XVMADD2    XVNMSUB
#define XVMADD3    XVNMSUB
#define XVMADD4    XVNMSUB

#define VMADD1     VFMADD
#define VMADD2     VNMSUB
#define VMADD3     VNMSUB
#define VMADD4     VNMSUB

#define XVFADD1    XVFADD
#define XVFADD2    XVFSUB
#define XVFADD3    XVFSUB
#define XVFADD4    XVFSUB

#define MADD1      MADD
#define MADD2      NMSUB
#define MADD3      NMSUB
#define MADD4      NMSUB
#endif
  /*
   * Complex single-precision GEMM / TRMM micro-kernel with a 2x2 register
   * tile (LoongArch64, LASX).  The symbol name and section are supplied by
   * the PROLOGUE/EPILOGUE macros, which are not defined in this file.
   *
   * In:   M, N, K            tile counts bm, bn, bk        ($r4, $r5, $r6)
   *       ALPHA_R, ALPHA_I   complex scale factor          ($f0, $f1)
   *       A, B               input panels ba, bb           ($r7, $r8)
   *       C, LDC             output block and its ldc      ($r9, $r10)
   *       OFFSET             TRMMKERNEL builds only        ($r11)
   * Out:  GEMM build:  C += alpha * acc
   *       TRMM build:  C  = alpha * acc   (C is never read)
   *       where acc is the sum over k of the sign-adjusted products selected
   *       by the MADDn / VMADDn / XVMADDn macros.
   *
   * Layout as used by the code: every element is a (re, im) pair of 4-byte
   * floats; one k-step consumes 2 elements of A and of B in the 2-wide
   * tiles and 1 element in the remainder tiles.
   *
   * Loop nest:
   *   .L10  column panels of 2        .L20  last single column (bn & 1)
   *     .L11  2x2 tiles                 .L21  2x1 tiles
   *       .L12  k unrolled by 4 (LASX)    .L22  scalar k loop
   *       .L13  k remainder (LSX)
   *     .L15  1x2 tile (bm & 1)         .L25  1x1 tile (bm & 1)
   *       .L16  scalar k loop             .L26  scalar k loop
   *
   * Stack: 128-byte frame; 0..39 integer spills, 40..111 FP spills,
   *        112 / 120 scratch for broadcasting alpha.
   * Clobbers: $r12-$r20, $f2-$f22, $xr2-$xr22 lanes touched by the aliases,
   *        plus the spilled registers restored at .L30.
   */
    PROLOGUE

    addi.d      $sp, $sp, -128
    SDARG       $r23, $sp, 0
    SDARG       $r24, $sp, 8
    SDARG       $r25, $sp, 16
    SDARG       $r26, $sp, 24
    SDARG       $r27, $sp, 32
    /*
     * $f24-$f31 are callee-saved in their low 64 bits, and the xvld/xvxor
     * writes to $xr24-$xr31 below destroy them.  They are therefore spilled
     * with an explicit 64-bit store: the element-width ST macro is used by
     * this kernel for 4-byte floats (see the xvldrepl.w of the alpha spill),
     * so it would presumably keep only the low 32 bits.
     */
    fst.d       $f23, $sp, 40
    fst.d       $f24, $sp, 48
    fst.d       $f25, $sp, 56
    fst.d       $f26, $sp, 64
    fst.d       $f27, $sp, 72
    fst.d       $f28, $sp, 80
    fst.d       $f29, $sp, 88
    fst.d       $f30, $sp, 96
    fst.d       $f31, $sp, 104
    /* Bounce alpha through memory to broadcast it to every 32-bit lane. */
    ST          ALPHA_R, $sp, 112
    ST          ALPHA_I, $sp, 120
    xvldrepl.w  VALPHAR, $sp, 112
    xvldrepl.w  VALPHAI, $sp, 120

#if defined (TRMMKERNEL) && !defined(LEFT)
    sub.d       OFF, $r0, OFFSET
#else
    move        OFF, $r0
#endif

    /* LDC = ldc * 4, so LDC << 1 is the byte stride of one output column. */
    slli.d      LDC, LDC, 2

    move        J, $r0
    srai.d      T0, N, 1
    beq         J, T0, .L19

.L10:  /* for(j=0; j<bn/2; j+=1) */
    move        C0, C
    slli.d      TL, LDC, 1
    add.d       C1, C0, TL          // C1 = next column
    move        A0, A               //ptrba

#if defined(TRMMKERNEL) && defined(LEFT)
    move        OFF, OFFSET
#endif

    move        I, $r0
    srai.d      T0, M, 1
    beq         I, T0, .L150

.L11:  /* for(i=0; i<bm/2; i+=1) */
    move        B0, B               //ptrbb
    move        TL, K               /* TL = bk */
#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    move        B0, B               //ptrbb
#else
    slli.d      C3, OFF, 0x04       // skip OFF k-steps, 16 bytes each
    add.d       A0, A0, C3
    add.d       B0, B, C3
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d       TL, K, OFF          //temp
#elif defined(LEFT)
    addi.d      TL, OFF, 2
#else
    addi.d      TL, OFF, 2
#endif

#endif  // #if defined(TRMMKERNEL)

    xvxor.v     U0, U0, U0          // real accumulators = 0
    xvxor.v     U1, U1, U1          // imaginary accumulators = 0

    move        L, $r0              //cycle param k
    srai.d      C2, TL, 2
    bge         L, C2, .L130        // no full group of 4 k-steps

    /*
     * Four k-steps per iteration.  Each 256-bit operand carries two k-steps,
     * one per 128-bit half.  "xvpermi.q X, X, 0x01" exchanges the halves of
     * the accumulator, and the multiply-add is then issued again, so each
     * half ends up having received the products of both k-steps exactly
     * once.  Both halves therefore stay equal, and the code after the loop
     * only reads the low half ($vr30 / $vr31).
     */
.L12:  /* for(k=0; k<bk/4; k+=1) */
    xvld        D0, A0, 0x00        //a 0-7
    xvld        D1, A0, 0x20        //a 8-15
    xvld        D2, B0, 0x00        //b 0-7
    xvld        D3, B0, 0x20        //b 8-15

    xvand.v     D4, D0, D0
    xvpermi.q   D4, D1, 0x02        //a 0 1 2 3 8 9 10 11
    xvand.v     D5, D4, D4
    xvshuf4i.w  D4, D4, 0x88        //a 0 2 0 2 8 10 8 10     (real parts)
    xvshuf4i.w  D5, D5, 0xdd        //a 1 3 1 3 9 11 9 11     (imaginary parts)

    xvand.v     D6, D1, D1
    xvpermi.q   D6, D0, 0x31        //a 4 5 6 7 12 13 14 15
    xvand.v     D7, D6, D6
    xvshuf4i.w  D6, D6, 0x88        //a 4 6 4 6 12 14 12 14
    xvshuf4i.w  D7, D7, 0xdd        //a 5 7 5 7 13 15 13 15

    xvand.v     D8, D2, D2
    xvpermi.q   D8, D3, 0x02        //b 0 1 2 3 8 9 10 11
    xvand.v     D9, D8, D8
    xvshuf4i.w  D8, D8, 0xa0        //b 0 0 2 2 8 8 10 10     (real parts)
    xvshuf4i.w  D9, D9, 0xf5        //b 1 1 3 3 9 9 11 11     (imaginary parts)

    xvand.v     D10, D3, D3
    xvpermi.q   D10, D2, 0x31       //b 4 5 6 7 12 13 14 15
    xvand.v     D11, D10, D10
    xvshuf4i.w  D10, D10, 0xa0      //b 4 4 6 6 12 12 14 14
    xvshuf4i.w  D11, D11, 0xf5      //b 5 5 7 7 13 13 15 15

    XVMADD1     U0, D4, D8, U0      //res0 2 4 6 0 2 4 6
    XVMADD2     U1, D5, D8, U1      //res1 3 5 7 1 3 5 7

    xvpermi.q   U0, U0, 0x01
    xvpermi.q   U1, U1, 0x01
    XVMADD1     U0, D4, D8, U0
    XVMADD2     U1, D5, D8, U1

    XVMADD3     U0, D5, D9, U0
    XVMADD4     U1, D4, D9, U1

    xvpermi.q   U0, U0, 0x01
    xvpermi.q   U1, U1, 0x01
    XVMADD3     U0, D5, D9, U0
    XVMADD4     U1, D4, D9, U1

    XVMADD1     U0, D6, D10, U0     //res0 2 4 6 0 2 4 6
    XVMADD2     U1, D7, D10, U1     //res1 3 5 7 1 3 5 7

    xvpermi.q   U0, U0, 0x01
    xvpermi.q   U1, U1, 0x01
    XVMADD1     U0, D6, D10, U0
    XVMADD2     U1, D7, D10, U1

    XVMADD3     U0, D7, D11, U0
    XVMADD4     U1, D6, D11, U1

    xvpermi.q   U0, U0, 0x01
    xvpermi.q   U1, U1, 0x01
    XVMADD3     U0, D7, D11, U0
    XVMADD4     U1, D6, D11, U1

    addi.d      A0, A0, 0x40
    addi.d      B0, B0, 0x40

    addi.d      L, L, 1
    blt         L, C2, .L12

.L130:
    move        L, $r0
    andi        C2, TL, 3
    beq         L, C2, .L14

.L13:  /* for(k=0; k<(bk&3); k+=1) */
    vld         $vr16, A0, 0x00     //a0 a1 a2 a3
    vld         $vr17, B0, 0x00     //b0 b1 b2 b3

    vshuf4i.w   $vr20, $vr17, 0xa0  //b0 b0 b2 b2
    vshuf4i.w   $vr21, $vr17, 0xf5  //b1 b1 b3 b3

    vshuf4i.w   $vr18, $vr16, 0x88  //a0 a2 a0 a2
    vshuf4i.w   $vr19, $vr16, 0xdd  //a1 a3 a1 a3

    VMADD1      $vr30, $vr18, $vr20, $vr30  //res0 2 4 6
    VMADD2      $vr31, $vr19, $vr20, $vr31  //res1 3 5 7
    VMADD3      $vr30, $vr19, $vr21, $vr30
    VMADD4      $vr31, $vr18, $vr21, $vr31

    addi.d      A0, A0, 0x10
    addi.d      B0, B0, 0x10

    addi.d      L, L, 1
    blt         L, C2, .L13

.L14:
    /* $vr30 = real parts, $vr31 = imaginary parts of the 2x2 tile. */
#if defined(TRMMKERNEL)
    /* TRMM overwrites C, so C is not loaded here. */
    vfmul.s     $vr10, $vr30, $vr28
    vfmul.s     $vr11, $vr31, $vr28
    VNMSUB      $vr10, $vr31, $vr29, $vr10
    VFMADD      $vr11, $vr30, $vr29, $vr11

    vilvl.w     $vr8, $vr11, $vr10  //0 1 2 3
    vilvh.w     $vr9, $vr11, $vr10  //4 5 6 7

    vst         $vr8, C0, 0x00
    vst         $vr9, C1, 0x00
#else
    vld         $vr8, C0, 0x00      //0 1 2 3
    vld         $vr9, C1, 0x00      //4 5 6 7

    vpackev.w   $vr10, $vr9, $vr8   //0 4 2 6
    vpermi.w    $vr10, $vr10, 0xd8  //0 2 4 6

    vpackod.w   $vr11, $vr9, $vr8   //1 5 3 7
    vpermi.w    $vr11, $vr11, 0xd8  //1 3 5 7

    VFMADD      $vr10, $vr30, $vr28, $vr10
    VFMADD      $vr11, $vr31, $vr28, $vr11
    VNMSUB      $vr10, $vr31, $vr29, $vr10
    VFMADD      $vr11, $vr30, $vr29, $vr11

    vilvl.w     $vr8, $vr11, $vr10  //0 1 2 3
    vilvh.w     $vr9, $vr11, $vr10  //4 5 6 7

    vst         $vr8, C0, 0x00
    vst         $vr9, C1, 0x00
#endif

#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* Skip the k-steps this tile did not consume. */
    sub.d       TL, K, OFF
#ifdef LEFT
    addi.d      TL, TL, -2
#else
    addi.d      TL, TL, -2
#endif
    slli.d      C3, TL, 0x04
    add.d       A0, A0, C3
    add.d       B0, B0, C3
#endif

#ifdef LEFT
    addi.d      OFF, OFF, 2
#endif

#endif  // #if defined(TRMMKERNEL)

    addi.d      C0, C0, 0x10
    addi.d      C1, C1, 0x10

    addi.d      I, I, 1
    blt         I, T0, .L11

.L150:
    move        I, $r0
    andi        T0, M, 1
    beq         I, T0, .L18

.L15:  /* for(i=0; i<(bm&1); i+=1) */
    move        B0, B               //ptrbb
    move        TL, K               /* TL = bk */
#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
    move        B0, B               //ptrbb
#else
    slli.d      C3, OFF, 0x03       // A: 8 bytes per k-step
    add.d       A0, A0, C3
    slli.d      C3, OFF, 0x04       // B: 16 bytes per k-step
    add.d       B0, B, C3
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d       TL, K, OFF
#elif defined(LEFT)
    addi.d      TL, OFF, 1
#else
    addi.d      TL, OFF, 2
#endif

#endif  // #if defined(TRMMKERNEL)

    MTC         c11, $r0
    MTC         c12, $r0
    MTC         c21, $r0
    MTC         c22, $r0

    move        L, $r0              //cycle param k
    bge         L, TL, .L17         // nothing to accumulate

.L16:  /* for (k=0; k<bk; k+=1) */
    LD          a1, A0, 0x00        //load0
    LD          b1, B0, 0x00        //load1
    MADD1       c11, a1, b1, c11    //res0
    LD          a2, A0, 0x04        //load2
    MADD2       c12, a2, b1, c12    //res1
    LD          b2, B0, 0x04        //load3
    MADD3       c11, a2, b2, c11
    MADD4       c12, a1, b2, c12
    LD          b3, B0, 0x08        //load4
    MADD1       c21, a1, b3, c21    //res2
    MADD2       c22, a2, b3, c22    //res3
    LD          b4, B0, 0x0c        //load5
    MADD3       c21, a2, b4, c21
    MADD4       c22, a1, b4, c22

    addi.d      A0, A0, 0x08
    addi.d      B0, B0, 0x10

    addi.d      L, L, 1
    blt         L, TL, .L16

.L17:
#if defined(TRMMKERNEL)
    MUL         a5, c11, ALPHA_R
    MUL         a6, c12, ALPHA_I
    SUB         a5, a5, a6
    ST          a5, C0, 0x00

    MUL         a5, c12, ALPHA_R
    MUL         a6, c11, ALPHA_I
    ADD         a6, a5, a6
    ST          a6, C0, 0x04

    MUL         b5, c21, ALPHA_R
    MUL         b6, c22, ALPHA_I
    SUB         b5, b5, b6
    ST          b5, C1, 0x00

    MUL         b5, c22, ALPHA_R
    MUL         b6, c21, ALPHA_I
    ADD         b6, b5, b6
    ST          b6, C1, 0x04
#else
    LD          a5, C0, 0x00        //C0[0]
    LD          a6, C0, 0x04        //C0[1]
    LD          b5, C1, 0x00        //C1[0]
    LD          b6, C1, 0x04        //C1[1]

    MADD        a5, c11, ALPHA_R, a5
    MADD        a6, c12, ALPHA_R, a6
    NMSUB       a5, c12, ALPHA_I, a5
    MADD        a6, c11, ALPHA_I, a6

    ST          a5, C0, 0x00
    ST          a6, C0, 0x04

    MADD        b5, c21, ALPHA_R, b5
    MADD        b6, c22, ALPHA_R, b6
    NMSUB       b5, c22, ALPHA_I, b5
    MADD        b6, c21, ALPHA_I, b6

    ST          b5, C1, 0x00
    ST          b6, C1, 0x04
#endif

#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d       TL, K, OFF
#ifdef LEFT
    addi.d      TL, TL, -1
#else
    addi.d      TL, TL, -2
#endif
    slli.d      C3, TL, 0x03
    add.d       A0, A0, C3
    slli.d      C3, TL, 0x04
    add.d       B0, B0, C3
#endif

#ifdef LEFT
    addi.d      OFF, OFF, 1
#endif

#endif  // #if defined(TRMMKERNEL)

    addi.d      C0, C0, 0x08
    addi.d      C1, C1, 0x08

    addi.d      I, I, 1
    blt         I, T0, .L15

.L18:
#if defined(TRMMKERNEL) && !defined(LEFT)
    addi.d      OFF, OFF, 2
#endif

    slli.d      L, K, 0x04          // B += bk * 16 bytes (two elements per k-step)
    add.d       B, B, L

    slli.d      I, LDC, 0x02        // C += two columns
    add.d       C, C, I

    addi.d      J, J, 1
    srai.d      T0, N, 1
    blt         J, T0, .L10

.L19:
    move        J, $r0
    andi        T0, N, 1
    beq         J, T0, .L30

.L20:  /* for (j=0; j<(bn&1); j+=1) */
#if defined(TRMMKERNEL) && defined(LEFT)
    move        OFF, OFFSET
#endif

    move        C0, C
    move        A0, A               //ptrba

    move        I, $r0
    srai.d      T0, M, 1
    beq         I, T0, .L24

.L21:  /* for (i=0; i<bm/2; i+=1) */
    move        B0, B               //ptrbb
    move        TL, K               /* TL = bk */
#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
    move        B0, B               //ptrbb
#else
    slli.d      C3, OFF, 0x04       // A: 16 bytes per k-step
    add.d       A0, A0, C3
    slli.d      C3, OFF, 0x03       // B: 8 bytes per k-step
    add.d       B0, B, C3
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d       TL, K, OFF
#elif defined(LEFT)
    addi.d      TL, OFF, 2
#else
    addi.d      TL, OFF, 1
#endif

#endif  // #if defined(TRMMKERNEL)

    MTC         c11, $r0
    MTC         c12, $r0
    MTC         c21, $r0
    MTC         c22, $r0

    move        L, $r0              //cycle param k
    bge         L, TL, .L23         // nothing to accumulate

.L22:  /* for (k=0; k<bk; k+=1) */
    LD          a1, A0, 0x00        //load0
    LD          b1, B0, 0x00        //load1
    MADD1       c11, a1, b1, c11    //res0
    LD          a2, A0, 0x04        //load2
    MADD2       c12, a2, b1, c12    //res1
    LD          b2, B0, 0x04        //load3
    MADD3       c11, a2, b2, c11
    MADD4       c12, a1, b2, c12
    LD          a3, A0, 0x08        //load4
    MADD1       c21, a3, b1, c21    //res2
    LD          a4, A0, 0x0c        //load5
    MADD2       c22, a4, b1, c22    //res3
    MADD3       c21, a4, b2, c21
    MADD4       c22, a3, b2, c22

    addi.d      A0, A0, 0x10
    addi.d      B0, B0, 0x08

    addi.d      L, L, 1
    blt         L, TL, .L22

.L23:
#if defined(TRMMKERNEL)
    MUL         a5, c11, ALPHA_R
    MUL         a6, c12, ALPHA_I
    SUB         a5, a5, a6
    ST          a5, C0, 0x00

    MUL         a5, c12, ALPHA_R
    MUL         a6, c11, ALPHA_I
    ADD         a6, a5, a6
    ST          a6, C0, 0x04

    MUL         a7, c21, ALPHA_R
    MUL         a8, c22, ALPHA_I
    SUB         a7, a7, a8
    ST          a7, C0, 0x08

    MUL         a7, c22, ALPHA_R
    MUL         a8, c21, ALPHA_I
    ADD         a8, a7, a8
    ST          a8, C0, 0x0c
#else
    LD          a5, C0, 0x00        //C0[0]
    LD          a6, C0, 0x04        //C0[1]
    LD          a7, C0, 0x08        //C0[2]
    LD          a8, C0, 0x0c        //C0[3]

    MADD        a5, c11, ALPHA_R, a5
    MADD        a6, c12, ALPHA_R, a6
    NMSUB       a5, c12, ALPHA_I, a5
    MADD        a6, c11, ALPHA_I, a6

    MADD        a7, c21, ALPHA_R, a7
    MADD        a8, c22, ALPHA_R, a8
    NMSUB       a7, c22, ALPHA_I, a7
    MADD        a8, c21, ALPHA_I, a8

    ST          a5, C0, 0x00
    ST          a6, C0, 0x04
    ST          a7, C0, 0x08
    ST          a8, C0, 0x0c
#endif

#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d       TL, K, OFF
#ifdef LEFT
    addi.d      TL, TL, -2
#else
    addi.d      TL, TL, -1
#endif
    slli.d      C3, TL, 0x04
    add.d       A0, A0, C3
    slli.d      C3, TL, 0x03
    add.d       B0, B0, C3
#endif

#ifdef LEFT
    addi.d      OFF, OFF, 2
#endif

#endif  // #if defined(TRMMKERNEL)

    addi.d      C0, C0, 0x10

    addi.d      I, I, 1
    blt         I, T0, .L21

.L24:
    move        I, $r0
    andi        T1, M, 1            //bm&1
    beq         I, T1, .L28

.L25:  /* for (i=0; i<(bm&1); i+=1) */
    move        B0, B               //ptrbb
    move        TL, K               /* TL = bk */
#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
    move        B0, B               //ptrbb
#else
    slli.d      C3, OFF, 0x03       // 8 bytes per k-step for both A and B
    add.d       A0, A0, C3
    add.d       B0, B, C3
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d       TL, K, OFF
#elif defined(LEFT)
    addi.d      TL, OFF, 1
#else
    addi.d      TL, OFF, 1
#endif

#endif  // #if defined(TRMMKERNEL)

    MTC         c11, $r0
    MTC         c12, $r0

    move        L, $r0              //cycle param k
    bge         L, TL, .L27         // nothing to accumulate

.L26:  /* for (k=0; k<bk; k+=1) */
    LD          a1, A0, 0x00        //load0
    LD          b1, B0, 0x00        //load1
    MADD1       c11, a1, b1, c11    //res0
    LD          a2, A0, 0x04        //load2
    MADD2       c12, a2, b1, c12    //res1
    LD          b2, B0, 0x04        //load3
    MADD3       c11, a2, b2, c11
    MADD4       c12, a1, b2, c12

    addi.d      A0, A0, 0x08
    addi.d      B0, B0, 0x08

    addi.d      L, L, 1
    blt         L, TL, .L26

.L27:
#if defined(TRMMKERNEL)
    MUL         a5, c11, ALPHA_R
    MUL         a6, c12, ALPHA_I
    SUB         a5, a5, a6
    ST          a5, C0, 0x00

    MUL         a5, c12, ALPHA_R
    MUL         a6, c11, ALPHA_I
    ADD         a6, a5, a6
    ST          a6, C0, 0x04
#else
    LD          a5, C0, 0x00        //C0[0]
    LD          a6, C0, 0x04        //C0[1]

    MADD        a5, c11, ALPHA_R, a5
    MADD        a6, c12, ALPHA_R, a6
    NMSUB       a5, c12, ALPHA_I, a5
    MADD        a6, c11, ALPHA_I, a6

    ST          a5, C0, 0x00
    ST          a6, C0, 0x04
#endif

#if defined(TRMMKERNEL)

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d       TL, K, OFF
#ifdef LEFT
    addi.d      TL, TL, -1
#else
    addi.d      TL, TL, -1
#endif
    slli.d      C3, TL, 0x03
    add.d       A0, A0, C3
    add.d       B0, B0, C3
#endif

#ifdef LEFT
    addi.d      OFF, OFF, 1
#endif

#endif  // #if defined(TRMMKERNEL)

    addi.d      C0, C0, 0x08

    addi.d      I, I, 1
    blt         I, T1, .L25

.L28:
    slli.d      L, K, 3             // B += bk * 8 bytes (one element per k-step)
    add.d       B, B, L

    slli.d      I, LDC, 1           // C += one column
    add.d       C, C, I

    addi.d      J, J, 1
    andi        T0, N, 1
    blt         J, T0, .L20

.L30:
    LDARG       $r23, $sp, 0
    LDARG       $r24, $sp, 8
    LDARG       $r25, $sp, 16
    LDARG       $r26, $sp, 24
    LDARG       $r27, $sp, 32
    /* 64-bit reloads, matching the fst.d spills in the prologue. */
    fld.d       $f23, $sp, 40
    fld.d       $f24, $sp, 48
    fld.d       $f25, $sp, 56
    fld.d       $f26, $sp, 64
    fld.d       $f27, $sp, 72
    fld.d       $f28, $sp, 80
    fld.d       $f29, $sp, 88
    fld.d       $f30, $sp, 96
    fld.d       $f31, $sp, 104
    addi.d      $sp, $sp, 128

    jirl        $r0, $r1, 0x0

    EPILOGUE