You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

zgemm_kernel_2x2.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848
  1. /*******************************************************************************
  2. Copyright (c) 2023, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Function parameters (register assignment as used by this kernel) */
  30. #define M $r4 // param 1: bm
  31. #define N $r5 // param 2: bn
  32. #define K $r6 // param 3: bk
  33. #define ALPHA_R $f0 // param 4: alphar
  34. #define ALPHA_I $f1 // param 5: alphai
  35. #define A $r7 // param 6: ba
  36. #define B $r8 // param 7: bb
  37. #define C $r9 // param 8: bc
  38. #define LDC $r10 // param 9: ldc
  39. #if defined (TRMMKERNEL)
  40. #define OFFSET $r11 // param 10: offset
  41. #endif
  42. #define OFF $r26 // running TRMM offset; zeroed in the prologue when not (TRMMKERNEL && !LEFT)
  43. #define I $r12 // loop counter over tiles in the bm direction
  44. #define J $r13 // loop counter over tiles in the bn direction
  45. #define L $r14 // loop counter over k; also scratch for the B advance after a column loop
  46. #define TL $r15 // k trip count of the current tile; also scratch for 2*LDC in .L10
  47. #define A0 $r16 // running pointer into A (ptrba)
  48. #define B0 $r17 // running pointer into B (ptrbb)
  49. #define C0 $r18 // pointer to the current output position in the first column
  50. #define C1 $r19 // pointer to the current output position in the second column
  51. #define C2 $r20 // trip count of the unrolled and remainder k loops of the 2x2 tile
  52. #define C3 $r23 // scratch for byte offsets in the TRMM pointer adjustments
  53. #define T0 $r24 // loop bound for the I and J loops
  54. #define T1 $r25 // loop bound (bm & 1) of the .L25 loop
  55. #define a1 $f2 // a1-a8: values loaded from A0; a5-a8 double as temporaries when writing C0
  56. #define a2 $f3
  57. #define a3 $f4
  58. #define a4 $f5
  59. #define a5 $f6
  60. #define a6 $f7
  61. #define a7 $f8
  62. #define a8 $f9
  63. #define b1 $f10 // b1-b8: values loaded from B0; b5-b8 double as temporaries when writing C1
  64. #define b2 $f11
  65. #define b3 $f12
  66. #define b4 $f13
  67. #define b5 $f14
  68. #define b6 $f15
  69. #define b7 $f16
  70. #define b8 $f17
  71. #define c11 $f18 // cX1/cX2: accumulator pair per output element (stored at offsets +0 / +8; presumably real / imaginary part - confirm)
  72. #define c12 $f19
  73. #define c21 $f20
  74. #define c22 $f21
  75. #define c31 $f22
  76. #define c32 $f23
  77. #define c41 $f24
  78. #define c42 $f25
  79. #define c51 $f26 // c51-c62 are not referenced by the code visible in this file
  80. #define c52 $f27
  81. #define c61 $f28
  82. #define c62 $f29
  83. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) // MADD1..MADD4 are applied in order to the products a1*b1, a2*b1, a2*b2, a1*b2; here only MADD3 is NMSUB
  84. #define MADD1 MADD
  85. #define MADD2 MADD
  86. #define MADD3 NMSUB
  87. #define MADD4 MADD
  88. #endif
  89. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) // only MADD4 (a1*b2 term) is NMSUB
  90. #define MADD1 MADD
  91. #define MADD2 MADD
  92. #define MADD3 MADD
  93. #define MADD4 NMSUB
  94. #endif
  95. #if defined(RN) || defined(RT) || defined(CN) || defined(CT) // only MADD2 (a2*b1 term) is NMSUB
  96. #define MADD1 MADD
  97. #define MADD2 NMSUB
  98. #define MADD3 MADD
  99. #define MADD4 MADD
  100. #endif
  101. #if defined(RR) || defined(RC) || defined(CR) || defined(CC) // MADD2, MADD3 and MADD4 are NMSUB (MADD/NMSUB come from outside this file; presumably d = c + a*b and d = c - a*b - confirm)
  102. #define MADD1 MADD
  103. #define MADD2 NMSUB
  104. #define MADD3 NMSUB
  105. #define MADD4 NMSUB
  106. #endif
  107. PROLOGUE // function entry macro, defined outside this file
  108. addi.d $sp, $sp, -88 // reserve 11 save slots of 8 bytes; NOTE(review): 88 is not a multiple of 16 - confirm against the ABI stack-alignment rule
  109. SDARG $r23, $sp, 0 // save $r23 (aliased as C3)
  110. SDARG $r24, $sp, 8 // save $r24 (T0)
  111. SDARG $r25, $sp, 16 // save $r25 (T1)
  112. SDARG $r26, $sp, 24 // save $r26 (OFF)
  113. ST $f23, $sp, 32 // save $f23 (c32); NOTE(review): confirm whether $f23 is callee-saved in the target ABI
  114. ST $f24, $sp, 40 // save $f24 (c41)
  115. ST $f25, $sp, 48 // save $f25 (c42)
  116. ST $f26, $sp, 56 // save $f26 (c51)
  117. ST $f27, $sp, 64 // save $f27 (c52)
  118. ST $f28, $sp, 72 // save $f28 (c61)
  119. ST $f29, $sp, 80 // save $f29 (c62)
  120. #if defined (TRMMKERNEL) && !defined(LEFT)
  121. sub.d OFF, $r0, OFFSET // OFF = -OFFSET
  122. #else
  123. xor OFF, OFF, OFF // OFF = 0
  124. #endif
  125. slli.d LDC, LDC, BASE_SHIFT // LDC <<= BASE_SHIFT (BASE_SHIFT is defined outside this file; presumably log2 of the real element size - confirm)
  126. move J, $r0 // j = 0
  127. srai.d T0, N, 1 // T0 = bn >> 1 (number of column pairs)
  128. beq J, T0, .L19 // no column pair: go straight to the bn & 1 part
  129. .L10: /* for(j=0; j<bn/2; j+=1): one pass per pair of output columns */
  130. move C0, C // C0 = start of the first column of the pair
  131. slli.d TL, LDC, 1 // TL used as scratch: 2 * (shifted LDC)
  132. add.d C1, C0, TL // C1 = C0 + 2 * (shifted LDC): the other column of the pair
  133. move A0, A //ptrba
  134. #if defined(TRMMKERNEL) && defined(LEFT)
  135. move OFF, OFFSET // restart the offset for every column pair
  136. #endif
  137. move I, $r0 // i = 0
  138. srai.d T0, M, 1 // T0 = bm >> 1 (number of row pairs)
  139. beq I, T0, .L150 // no row pair: go to the bm & 1 part
  140. .L11: /* for(i=0; i<bm/2; i+=1): set up one 2x2 output tile */
  141. move B0, B //ptrbb
  142. move TL, K /* TL = bk */
  143. #if defined(TRMMKERNEL)
  144. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  145. move B0, B //ptrbb (repeats the assignment made just before the #if)
  146. #else
  147. slli.d C3, OFF, 0x05 // C3 = OFF * 0x20 bytes (one k-step of this tile consumes 0x20 bytes of A and of B)
  148. add.d A0, A0, C3 // start A OFF k-steps further in
  149. add.d B0, B, C3 // start B OFF k-steps further in
  150. #endif
  151. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  152. sub.d TL, K, OFF //temp: TL = bk - OFF
  153. #elif defined(LEFT)
  154. addi.d TL, OFF, 2 // TL = OFF + 2
  155. #else
  156. addi.d TL, OFF, 2 // TL = OFF + 2 (same value as the LEFT arm)
  157. #endif
  158. #endif // #if defined(TRMMKERNEL)
  159. MTC c11, $r0 // c11 = 0 (MTC is defined outside this file; presumably an integer-to-FP register move)
  160. MOV c12, c11 // copy the zero into the other seven accumulators
  161. MOV c21, c11
  162. MOV c22, c11
  163. MOV c31, c11
  164. MOV c32, c11
  165. MOV c41, c11
  166. MOV c42, c11
  167. move L, $r0 //cycle param k
  168. srai.d C2, TL, 2 // C2 = TL >> 2: passes of the 4x-unrolled loop
  169. beq L, C2, .L130 // zero passes: skip the unrolled loop
  170. blt C2, L, .L130 // negative count: skip it as well
  171. .L12: /* for(k=0; k<TL/4; k+=1): four k-steps per pass (TL is bk unless adjusted for TRMM) */
  172. LD a1, A0, 0x00 //load0 (k-step 0: A0/B0 offsets 0x00-0x18)
  173. LD b1, B0, 0x00 //load1
  174. MADD1 c11, a1, b1, c11 //res0
  175. LD a2, A0, 0x08 //load2
  176. MADD2 c12, a2, b1, c12 //res1
  177. LD b2, B0, 0x08 //load3
  178. MADD3 c11, a2, b2, c11
  179. MADD4 c12, a1, b2, c12
  180. LD a3, A0, 0x10 //load4
  181. MADD1 c21, a3, b1, c21 //res2
  182. LD a4, A0, 0x18 //load5
  183. MADD2 c22, a4, b1, c22 //res3
  184. MADD3 c21, a4, b2, c21
  185. MADD4 c22, a3, b2, c22
  186. LD b3, B0, 0x10 //load6
  187. MADD1 c31, a1, b3, c31 //res4
  188. MADD2 c32, a2, b3, c32 //res5
  189. LD b4, B0, 0x18 //load7
  190. MADD3 c31, a2, b4, c31
  191. MADD4 c32, a1, b4, c32
  192. MADD1 c41, a3, b3, c41 //res6
  193. MADD2 c42, a4, b3, c42 //res7
  194. MADD3 c41, a4, b4, c41
  195. MADD4 c42, a3, b4, c42
  196. LD a5, A0, 0x20 //load8 (k-step 1: offsets 0x20-0x38, using a5-a8 / b5-b8)
  197. LD b5, B0, 0x20 //load9
  198. MADD1 c11, a5, b5, c11
  199. LD a6, A0, 0x28 //load10
  200. MADD2 c12, a6, b5, c12
  201. LD b6, B0, 0x28 //load11
  202. MADD3 c11, a6, b6, c11
  203. MADD4 c12, a5, b6, c12
  204. LD a7, A0, 0x30 //load12
  205. MADD1 c21, a7, b5, c21
  206. LD a8, A0, 0x38 //load13
  207. MADD2 c22, a8, b5, c22
  208. MADD3 c21, a8, b6, c21
  209. MADD4 c22, a7, b6, c22
  210. LD b7, B0, 0x30 //load14
  211. MADD1 c31, a5, b7, c31
  212. MADD2 c32, a6, b7, c32
  213. LD b8, B0, 0x38 //load15
  214. MADD3 c31, a6, b8, c31
  215. MADD4 c32, a5, b8, c32
  216. MADD1 c41, a7, b7, c41
  217. MADD2 c42, a8, b7, c42
  218. MADD3 c41, a8, b8, c41
  219. MADD4 c42, a7, b8, c42
  220. LD a1, A0, 0x40 //load0 (k-step 2: offsets 0x40-0x58)
  221. LD b1, B0, 0x40 //load1
  222. MADD1 c11, a1, b1, c11 //res0
  223. LD a2, A0, 0x48 //load2
  224. MADD2 c12, a2, b1, c12 //res1
  225. LD b2, B0, 0x48 //load3
  226. MADD3 c11, a2, b2, c11
  227. MADD4 c12, a1, b2, c12
  228. LD a3, A0, 0x50 //load4
  229. MADD1 c21, a3, b1, c21 //res2
  230. LD a4, A0, 0x58 //load5
  231. MADD2 c22, a4, b1, c22 //res3
  232. MADD3 c21, a4, b2, c21
  233. MADD4 c22, a3, b2, c22
  234. LD b3, B0, 0x50 //load6
  235. MADD1 c31, a1, b3, c31 //res4
  236. MADD2 c32, a2, b3, c32 //res5
  237. LD b4, B0, 0x58 //load7
  238. MADD3 c31, a2, b4, c31
  239. MADD4 c32, a1, b4, c32
  240. MADD1 c41, a3, b3, c41 //res6
  241. MADD2 c42, a4, b3, c42 //res7
  242. MADD3 c41, a4, b4, c41
  243. MADD4 c42, a3, b4, c42
  244. LD a5, A0, 0x60 //load8 (k-step 3: offsets 0x60-0x78)
  245. LD b5, B0, 0x60 //load9
  246. MADD1 c11, a5, b5, c11
  247. LD a6, A0, 0x68 //load10
  248. MADD2 c12, a6, b5, c12
  249. LD b6, B0, 0x68 //load11
  250. MADD3 c11, a6, b6, c11
  251. MADD4 c12, a5, b6, c12
  252. LD a7, A0, 0x70 //load12
  253. MADD1 c21, a7, b5, c21
  254. LD a8, A0, 0x78 //load13
  255. MADD2 c22, a8, b5, c22
  256. MADD3 c21, a8, b6, c21
  257. MADD4 c22, a7, b6, c22
  258. LD b7, B0, 0x70 //load14
  259. MADD1 c31, a5, b7, c31
  260. MADD2 c32, a6, b7, c32
  261. LD b8, B0, 0x78 //load15
  262. MADD3 c31, a6, b8, c31
  263. MADD4 c32, a5, b8, c32
  264. MADD1 c41, a7, b7, c41
  265. MADD2 c42, a8, b7, c42
  266. MADD3 c41, a8, b8, c41
  267. MADD4 c42, a7, b8, c42
  268. addi.d A0, A0, 0x80 // A0 += 4 k-steps * 0x20 bytes
  269. addi.d B0, B0, 0x80 // B0 += 4 k-steps * 0x20 bytes
  270. addi.d L, L, 1 // next pass
  271. blt L, C2, .L12 // loop while L < TL >> 2
  272. .L130: // k remainder of the 2x2 tile
  273. move L, $r0 // restart the k counter
  274. andi C2, TL, 3 // C2 = TL & 3: k-steps left over by the unrolled loop
  275. beq L, C2, .L14 // nothing left: go to the write-back
  276. .L13: /* for(k=0; k<(TL&3); k+=1): one k-step per pass */
  277. LD a1, A0, 0x00 //load0
  278. LD b1, B0, 0x00 //load1
  279. MADD1 c11, a1, b1, c11 //res0
  280. LD a2, A0, 0x08 //load2
  281. MADD2 c12, a2, b1, c12 //res1
  282. LD b2, B0, 0x08 //load3
  283. MADD3 c11, a2, b2, c11
  284. MADD4 c12, a1, b2, c12
  285. LD a3, A0, 0x10 //load4
  286. MADD1 c21, a3, b1, c21 //res2
  287. LD a4, A0, 0x18 //load5
  288. MADD2 c22, a4, b1, c22 //res3
  289. MADD3 c21, a4, b2, c21
  290. MADD4 c22, a3, b2, c22
  291. LD b3, B0, 0x10 //load6
  292. MADD1 c31, a1, b3, c31 //res4
  293. MADD2 c32, a2, b3, c32 //res5
  294. LD b4, B0, 0x18 //load7
  295. MADD3 c31, a2, b4, c31
  296. MADD4 c32, a1, b4, c32
  297. MADD1 c41, a3, b3, c41 //res6
  298. MADD2 c42, a4, b3, c42 //res7
  299. MADD3 c41, a4, b4, c41
  300. MADD4 c42, a3, b4, c42
  301. addi.d A0, A0, 0x20 // A0 += one k-step (0x20 bytes)
  302. addi.d B0, B0, 0x20 // B0 += one k-step (0x20 bytes)
  303. addi.d L, L, 1
  304. blt L, C2, .L13 // loop while L < TL & 3
  305. .L14: // write back the 2x2 tile: c11..c22 go to C0, c31..c42 go to C1
  306. #if defined(TRMMKERNEL)
  307. MUL a5, c11, ALPHA_R // TRMM build: C is overwritten with alpha * accumulator (no load of C)
  308. MUL a6, c12, ALPHA_I
  309. SUB a5, a5, a6
  310. ST a5, C0, 0x00 // C0[0] = c11*ALPHA_R - c12*ALPHA_I
  311. MUL a5, c12, ALPHA_R
  312. MUL a6, c11, ALPHA_I
  313. ADD a6, a5, a6
  314. ST a6, C0, 0x08 // C0[1] = c12*ALPHA_R + c11*ALPHA_I
  315. MUL a7, c21, ALPHA_R
  316. MUL a8, c22, ALPHA_I
  317. SUB a7, a7, a8
  318. ST a7, C0, 0x10 // C0[2] = c21*ALPHA_R - c22*ALPHA_I
  319. MUL a7, c22, ALPHA_R
  320. MUL a8, c21, ALPHA_I
  321. ADD a8, a7, a8
  322. ST a8, C0, 0x18 // C0[3] = c22*ALPHA_R + c21*ALPHA_I
  323. MUL b5, c31, ALPHA_R
  324. MUL b6, c32, ALPHA_I
  325. SUB b5, b5, b6
  326. ST b5, C1, 0x00 // C1[0] = c31*ALPHA_R - c32*ALPHA_I
  327. MUL b5, c32, ALPHA_R
  328. MUL b6, c31, ALPHA_I
  329. ADD b6, b5, b6
  330. ST b6, C1, 0x08 // C1[1] = c32*ALPHA_R + c31*ALPHA_I
  331. MUL b7, c41, ALPHA_R
  332. MUL b8, c42, ALPHA_I
  333. SUB b7, b7, b8
  334. ST b7, C1, 0x10 // C1[2] = c41*ALPHA_R - c42*ALPHA_I
  335. MUL b7, c42, ALPHA_R
  336. MUL b8, c41, ALPHA_I
  337. ADD b8, b7, b8
  338. ST b8, C1, 0x18 // C1[3] = c42*ALPHA_R + c41*ALPHA_I
  339. #else
  340. LD a5, C0, 0x00 //C0[0]
  341. LD a6, C0, 0x08 //C0[1]
  342. LD a7, C0, 0x10 //C0[2]
  343. LD a8, C0, 0x18 //C0[3]
  344. LD b5, C1, 0x00 //C1[0]
  345. LD b6, C1, 0x08 //C1[1]
  346. LD b7, C1, 0x10 //C1[2]
  347. LD b8, C1, 0x18 //C1[3]
  348. MADD a5, c11, ALPHA_R, a5 // non-TRMM build: accumulate the alpha-scaled result onto the loaded C values
  349. MADD a6, c12, ALPHA_R, a6
  350. NMSUB a5, c12, ALPHA_I, a5 // presumably a5 -= c12*ALPHA_I (NMSUB is defined outside this file - confirm)
  351. MADD a6, c11, ALPHA_I, a6
  352. ST a5, C0, 0x00
  353. ST a6, C0, 0x08
  354. MADD a7, c21, ALPHA_R, a7
  355. MADD a8, c22, ALPHA_R, a8
  356. NMSUB a7, c22, ALPHA_I, a7
  357. MADD a8, c21, ALPHA_I, a8
  358. ST a7, C0, 0x10
  359. ST a8, C0, 0x18
  360. MADD b5, c31, ALPHA_R, b5
  361. MADD b6, c32, ALPHA_R, b6
  362. NMSUB b5, c32, ALPHA_I, b5
  363. MADD b6, c31, ALPHA_I, b6
  364. ST b5, C1, 0x00
  365. ST b6, C1, 0x08
  366. MADD b7, c41, ALPHA_R, b7
  367. MADD b8, c42, ALPHA_R, b8
  368. NMSUB b7, c42, ALPHA_I, b7
  369. MADD b8, c41, ALPHA_I, b8
  370. ST b7, C1, 0x10
  371. ST b8, C1, 0x18
  372. #endif
  373. #if defined(TRMMKERNEL)
  374. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  375. sub.d TL, K, OFF // in this configuration the k loops ran OFF + 2 steps ...
  376. #ifdef LEFT
  377. addi.d TL, TL, -2
  378. #else
  379. addi.d TL, TL, -2
  380. #endif
  381. slli.d C3, TL, 0x05 // ... so (bk - OFF - 2) * 0x20 bytes were not consumed
  382. add.d A0, A0, C3 // step A0 over the unconsumed k-steps
  383. add.d B0, B0, C3 // same for B0 (B0 is reassigned from B at the top of .L11)
  384. #endif
  385. #ifdef LEFT
  386. addi.d OFF, OFF, 2 // OFF += 2 after each tile of this loop
  387. #endif
  388. #endif // #if defined(TRMMKERNEL)
  389. addi.d C0, C0, 0x20 // move both column pointers 0x20 bytes down to the next tile
  390. addi.d C1, C1, 0x20
  391. addi.d I, I, 1
  392. blt I, T0, .L11 // loop while i < bm >> 1
  393. .L150: // leftover row of the column pair (bm odd)
  394. move I, $r0
  395. andi T0, M, 1 // T0 = bm & 1
  396. beq I, T0, .L18 // bm even: nothing left for this column pair
  397. .L15: /* for(i=0; i<(bm&1); i+=1): set up one tile that is 1 wide in bm and 2 wide in bn */
  398. move B0, B //ptrbb
  399. move TL, K /* TL = bk */
  400. #if defined(TRMMKERNEL)
  401. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  402. move B0, B //ptrbb (repeats the assignment made just before the #if)
  403. #else
  404. slli.d C3, OFF, 0x04 // A side: OFF * 0x10 bytes (0x10 bytes of A per k-step in this tile)
  405. add.d A0, A0, C3
  406. slli.d C3, OFF, 0x05 // B side: OFF * 0x20 bytes (0x20 bytes of B per k-step in this tile)
  407. add.d B0, B, C3
  408. #endif
  409. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  410. sub.d TL, K, OFF // TL = bk - OFF
  411. #elif defined(LEFT)
  412. addi.d TL, OFF, 1 // TL = OFF + 1
  413. #else
  414. addi.d TL, OFF, 2 // TL = OFF + 2
  415. #endif
  416. #endif // #if defined(TRMMKERNEL)
  417. MTC c11, $r0 // clear the four accumulators
  418. MTC c12, $r0
  419. MTC c21, $r0
  420. MTC c22, $r0
  421. move L, $r0 //cycle param k
  422. beq L, TL, .L17 // TL == 0: skip the k loop
  423. blt TL, L, .L17 // TL < 0: skip the k loop
  424. .L16: /* for (k=0; k<TL; k+=1): TL is bk unless adjusted for TRMM; no unrolling */
  425. LD a1, A0, 0x00 //load0
  426. LD b1, B0, 0x00 //load1
  427. MADD1 c11, a1, b1, c11 //res0
  428. LD a2, A0, 0x08 //load2
  429. MADD2 c12, a2, b1, c12 //res1
  430. LD b2, B0, 0x08 //load3
  431. MADD3 c11, a2, b2, c11
  432. MADD4 c12, a1, b2, c12
  433. LD b3, B0, 0x10 //load4
  434. MADD1 c21, a1, b3, c21 //res2
  435. MADD2 c22, a2, b3, c22 //res3
  436. LD b4, B0, 0x18 //load5
  437. MADD3 c21, a2, b4, c21
  438. MADD4 c22, a1, b4, c22
  439. addi.d A0, A0, 0x10 // A0 += 0x10 bytes per k-step
  440. addi.d B0, B0, 0x20 // B0 += 0x20 bytes per k-step
  441. addi.d L, L, 1
  442. blt L, TL, .L16 // loop while L < TL
  443. .L17: // write back: c11/c12 go to C0, c21/c22 go to C1
  444. #if defined(TRMMKERNEL)
  445. MUL a5, c11, ALPHA_R // TRMM build: C is overwritten with alpha * accumulator (no load of C)
  446. MUL a6, c12, ALPHA_I
  447. SUB a5, a5, a6
  448. ST a5, C0, 0x00 // C0[0] = c11*ALPHA_R - c12*ALPHA_I
  449. MUL a5, c12, ALPHA_R
  450. MUL a6, c11, ALPHA_I
  451. ADD a6, a5, a6
  452. ST a6, C0, 0x08 // C0[1] = c12*ALPHA_R + c11*ALPHA_I
  453. MUL b5, c21, ALPHA_R
  454. MUL b6, c22, ALPHA_I
  455. SUB b5, b5, b6
  456. ST b5, C1, 0x00 // C1[0] = c21*ALPHA_R - c22*ALPHA_I
  457. MUL b5, c22, ALPHA_R
  458. MUL b6, c21, ALPHA_I
  459. ADD b6, b5, b6
  460. ST b6, C1, 0x08 // C1[1] = c22*ALPHA_R + c21*ALPHA_I
  461. #else
  462. LD a5, C0, 0x00 //C0[0]
  463. LD a6, C0, 0x08 //C0[1]
  464. LD b5, C1, 0x00 //C1[0]
  465. LD b6, C1, 0x08 //C1[1]
  466. MADD a5, c11, ALPHA_R, a5 // non-TRMM build: accumulate the alpha-scaled result onto the loaded C values
  467. MADD a6, c12, ALPHA_R, a6
  468. NMSUB a5, c12, ALPHA_I, a5 // presumably a5 -= c12*ALPHA_I (NMSUB is defined outside this file - confirm)
  469. MADD a6, c11, ALPHA_I, a6
  470. ST a5, C0, 0x00
  471. ST a6, C0, 0x08
  472. MADD b5, c21, ALPHA_R, b5
  473. MADD b6, c22, ALPHA_R, b6
  474. NMSUB b5, c22, ALPHA_I, b5
  475. MADD b6, c21, ALPHA_I, b6
  476. ST b5, C1, 0x00
  477. ST b6, C1, 0x08
  478. #endif
  479. #if defined(TRMMKERNEL)
  480. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  481. sub.d TL, K, OFF // TL = k-steps not consumed by the loop above
  482. #ifdef LEFT
  483. addi.d TL, TL, -1 // loop ran OFF + 1 steps when LEFT
  484. #else
  485. addi.d TL, TL, -2 // loop ran OFF + 2 steps otherwise
  486. #endif
  487. slli.d C3, TL, 0x04 // A side: 0x10 bytes per k-step
  488. add.d A0, A0, C3
  489. slli.d C3, TL, 0x05 // B side: 0x20 bytes per k-step
  490. add.d B0, B0, C3
  491. #endif
  492. #ifdef LEFT
  493. addi.d OFF, OFF, 1 // OFF += 1 after each tile of this loop
  494. #endif
  495. #endif // #if defined(TRMMKERNEL)
  496. addi.d C0, C0, 0x10 // move both column pointers 0x10 bytes down
  497. addi.d C1, C1, 0x10
  498. addi.d I, I, 1
  499. blt I, T0, .L15 // loop while i < bm & 1
  500. .L18: // end of one column pair: advance B and C, then loop
  501. #if defined(TRMMKERNEL) && !defined(LEFT)
  502. addi.d OFF, OFF, 2 // OFF += 2 after each column pair
  503. #endif
  504. slli.d L, K, 0x05 // L used as scratch: bk * 0x20 bytes of B per column pair
  505. add.d B, B, L
  506. slli.d I, LDC, 0x02 // I used as scratch: 4 * (shifted LDC), twice the C0-to-C1 distance
  507. add.d C, C, I
  508. addi.d J, J, 1
  509. srai.d T0, N, 1 // recompute the bound; T0 was reused by the inner loops
  510. blt J, T0, .L10 // loop while j < bn >> 1
  511. .L19: // leftover single column (bn odd)
  512. move J, $r0
  513. andi T0, N, 1 // T0 = bn & 1
  514. beq J, T0, .L30 // bn even: done
  515. .L20: /* for (j=0; j<(bn&1); j+=1) */
  516. #if defined(TRMMKERNEL) && defined(LEFT)
  517. move OFF, OFFSET // restart the offset for this column
  518. #endif
  519. move C0, C // only one output column here, so C1 is not used
  520. move A0, A //ptrba
  521. move I, $r0
  522. srai.d T0, M, 1 // T0 = bm >> 1 (number of row pairs)
  523. beq I, T0, .L24 // no row pair: go to the bm & 1 part
  524. .L21: /* for (i=0; i<bm/2; i+=1): set up one tile that is 2 wide in bm and 1 wide in bn */
  525. move B0, B //ptrbb
  526. move TL, K /* TL = bk */
  527. #if defined(TRMMKERNEL)
  528. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  529. move B0, B //ptrbb (repeats the assignment made just before the #if)
  530. #else
  531. slli.d C3, OFF, 0x05 // A side: OFF * 0x20 bytes (0x20 bytes of A per k-step in this tile)
  532. add.d A0, A0, C3
  533. slli.d C3, OFF, 0x04 // B side: OFF * 0x10 bytes (0x10 bytes of B per k-step in this tile)
  534. add.d B0, B, C3
  535. #endif
  536. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  537. sub.d TL, K, OFF // TL = bk - OFF
  538. #elif defined(LEFT)
  539. addi.d TL, OFF, 2 // TL = OFF + 2
  540. #else
  541. addi.d TL, OFF, 1 // TL = OFF + 1
  542. #endif
  543. #endif // #if defined(TRMMKERNEL)
  544. MTC c11, $r0 // clear the four accumulators
  545. MTC c12, $r0
  546. MTC c21, $r0
  547. MTC c22, $r0
  548. move L, $r0 //cycle param k
  549. beq L, TL, .L23 // TL == 0: skip the k loop
  550. blt TL, L, .L23 // TL < 0: skip the k loop
  551. .L22: /* for (k=0; k<TL; k+=1): TL is bk unless adjusted for TRMM; no unrolling */
  552. LD a1, A0, 0x00 //load0
  553. LD b1, B0, 0x00 //load1
  554. MADD1 c11, a1, b1, c11 //res0
  555. LD a2, A0, 0x08 //load2
  556. MADD2 c12, a2, b1, c12 //res1
  557. LD b2, B0, 0x08 //load3
  558. MADD3 c11, a2, b2, c11
  559. MADD4 c12, a1, b2, c12
  560. LD a3, A0, 0x10 //load4
  561. MADD1 c21, a3, b1, c21 //res2
  562. LD a4, A0, 0x18 //load5
  563. MADD2 c22, a4, b1, c22 //res3
  564. MADD3 c21, a4, b2, c21
  565. MADD4 c22, a3, b2, c22
  566. addi.d A0, A0, 0x20 // A0 += 0x20 bytes per k-step
  567. addi.d B0, B0, 0x10 // B0 += 0x10 bytes per k-step
  568. addi.d L, L, 1
  569. blt L, TL, .L22 // loop while L < TL
  570. .L23: // write back: c11/c12 and c21/c22 all go to the single column C0
  571. #if defined(TRMMKERNEL)
  572. MUL a5, c11, ALPHA_R // TRMM build: C is overwritten with alpha * accumulator (no load of C)
  573. MUL a6, c12, ALPHA_I
  574. SUB a5, a5, a6
  575. ST a5, C0, 0x00 // C0[0] = c11*ALPHA_R - c12*ALPHA_I
  576. MUL a5, c12, ALPHA_R
  577. MUL a6, c11, ALPHA_I
  578. ADD a6, a5, a6
  579. ST a6, C0, 0x08 // C0[1] = c12*ALPHA_R + c11*ALPHA_I
  580. MUL a7, c21, ALPHA_R
  581. MUL a8, c22, ALPHA_I
  582. SUB a7, a7, a8
  583. ST a7, C0, 0x10 // C0[2] = c21*ALPHA_R - c22*ALPHA_I
  584. MUL a7, c22, ALPHA_R
  585. MUL a8, c21, ALPHA_I
  586. ADD a8, a7, a8
  587. ST a8, C0, 0x18 // C0[3] = c22*ALPHA_R + c21*ALPHA_I
  588. #else
  589. LD a5, C0, 0x00 //C0[0]
  590. LD a6, C0, 0x08 //C0[1]
  591. LD a7, C0, 0x10 //C0[2]
  592. LD a8, C0, 0x18 //C0[3]
  593. MADD a5, c11, ALPHA_R, a5 // non-TRMM build: accumulate the alpha-scaled result onto the loaded C values
  594. MADD a6, c12, ALPHA_R, a6
  595. NMSUB a5, c12, ALPHA_I, a5 // presumably a5 -= c12*ALPHA_I (NMSUB is defined outside this file - confirm)
  596. MADD a6, c11, ALPHA_I, a6
  597. MADD a7, c21, ALPHA_R, a7
  598. MADD a8, c22, ALPHA_R, a8
  599. NMSUB a7, c22, ALPHA_I, a7
  600. MADD a8, c21, ALPHA_I, a8
  601. ST a5, C0, 0x00
  602. ST a6, C0, 0x08
  603. ST a7, C0, 0x10
  604. ST a8, C0, 0x18
  605. #endif
  606. #if defined(TRMMKERNEL)
  607. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  608. sub.d TL, K, OFF // TL = k-steps not consumed by the loop above
  609. #ifdef LEFT
  610. addi.d TL, TL, -2 // loop ran OFF + 2 steps when LEFT
  611. #else
  612. addi.d TL, TL, -1 // loop ran OFF + 1 steps otherwise
  613. #endif
  614. slli.d C3, TL, 0x05 // A side: 0x20 bytes per k-step
  615. add.d A0, A0, C3
  616. slli.d C3, TL, 0x04 // B side: 0x10 bytes per k-step
  617. add.d B0, B0, C3
  618. #endif
  619. #ifdef LEFT
  620. addi.d OFF, OFF, 2 // OFF += 2 after each tile of this loop
  621. #endif
  622. #endif // #if defined(TRMMKERNEL)
  623. addi.d C0, C0, 0x20 // move the column pointer 0x20 bytes down to the next tile
  624. addi.d I, I, 1
  625. blt I, T0, .L21 // loop while i < bm >> 1
  626. .L24: // leftover row of the single column (bm odd)
  627. move I, $r0
  628. andi T1, M, 1 //bm&1
  629. beq I, T1, .L28 // bm even: nothing left for this column
  630. .L25: /* for (i=0; i<(bm&1); i+=1): set up the 1x1 tile */
  631. move B0, B //ptrbb
  632. move TL, K /* TL = bk */
  633. #if defined(TRMMKERNEL)
  634. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  635. move B0, B //ptrbb (repeats the assignment made just before the #if)
  636. #else
  637. slli.d C3, OFF, 0x04 // OFF * 0x10 bytes (0x10 bytes of A and of B per k-step in this tile)
  638. add.d A0, A0, C3
  639. add.d B0, B, C3
  640. #endif
  641. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  642. sub.d TL, K, OFF // TL = bk - OFF
  643. #elif defined(LEFT)
  644. addi.d TL, OFF, 1 // TL = OFF + 1
  645. #else
  646. addi.d TL, OFF, 1 // TL = OFF + 1 (same value as the LEFT arm)
  647. #endif
  648. #endif // #if defined(TRMMKERNEL)
  649. MTC c11, $r0 // clear the two accumulators
  650. MTC c12, $r0
  651. move L, $r0 //cycle param k
  652. beq L, TL, .L27 // TL == 0: skip the k loop
  653. blt TL, L, .L27 // TL < 0: skip the k loop
  654. .L26: /* for (k=0; k<TL; k+=1): TL is bk unless adjusted for TRMM; no unrolling */
  655. LD a1, A0, 0x00 //load0
  656. LD b1, B0, 0x00 //load1
  657. MADD1 c11, a1, b1, c11 //res0
  658. LD a2, A0, 0x08 //load2
  659. MADD2 c12, a2, b1, c12 //res1
  660. LD b2, B0, 0x08 //load3
  661. MADD3 c11, a2, b2, c11
  662. MADD4 c12, a1, b2, c12
  663. addi.d A0, A0, 0x10 // A0 += 0x10 bytes per k-step
  664. addi.d B0, B0, 0x10 // B0 += 0x10 bytes per k-step
  665. addi.d L, L, 1
  666. blt L, TL, .L26 // loop while L < TL
  667. .L27: // write back the 1x1 tile: c11/c12 go to C0
  668. #if defined(TRMMKERNEL)
  669. MUL a5, c11, ALPHA_R // TRMM build: C is overwritten with alpha * accumulator (no load of C)
  670. MUL a6, c12, ALPHA_I
  671. SUB a5, a5, a6
  672. ST a5, C0, 0x00 // C0[0] = c11*ALPHA_R - c12*ALPHA_I
  673. MUL a5, c12, ALPHA_R
  674. MUL a6, c11, ALPHA_I
  675. ADD a6, a5, a6
  676. ST a6, C0, 0x08 // C0[1] = c12*ALPHA_R + c11*ALPHA_I
  677. #else
  678. LD a5, C0, 0x00 //C0[0]
  679. LD a6, C0, 0x08 //C0[1]
  680. MADD a5, c11, ALPHA_R, a5 // non-TRMM build: accumulate the alpha-scaled result onto the loaded C values
  681. MADD a6, c12, ALPHA_R, a6
  682. NMSUB a5, c12, ALPHA_I, a5 // presumably a5 -= c12*ALPHA_I (NMSUB is defined outside this file - confirm)
  683. MADD a6, c11, ALPHA_I, a6
  684. ST a5, C0, 0x00
  685. ST a6, C0, 0x08
  686. #endif
  687. #if defined(TRMMKERNEL)
  688. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  689. sub.d TL, K, OFF // TL = k-steps not consumed by the loop above (it ran OFF + 1 steps)
  690. #ifdef LEFT
  691. addi.d TL, TL, -1
  692. #else
  693. addi.d TL, TL, -1
  694. #endif
  695. slli.d C3, TL, 0x04 // 0x10 bytes per k-step on both sides
  696. add.d A0, A0, C3
  697. add.d B0, B0, C3
  698. #endif
  699. #ifdef LEFT
  700. addi.d OFF, OFF, 1 // OFF += 1 after this tile
  701. #endif
  702. #endif // #if defined(TRMMKERNEL)
  703. addi.d C0, C0, 0x10 // move the column pointer 0x10 bytes down
  704. addi.d I, I, 1
  705. blt I, T1, .L25 // loop while i < bm & 1
  706. .L28: // end of the single leftover column: advance B and C, then loop
  707. slli.d L, K, 4 // L used as scratch: bk * 0x10 bytes of B per single column
  708. add.d B, B, L
  709. slli.d I, LDC, 1 // I used as scratch: 2 * (shifted LDC)
  710. add.d C, C, I
  711. addi.d J, J, 1
  712. andi T0, N, 1 // recompute the bound; T0 was reused by the .L21 loop
  713. blt J, T0, .L20 // loop while j < bn & 1 (at most one pass)
  714. .L30: // common exit: restore the registers saved in the prologue, from the same slots
  715. LDARG $r23, $sp, 0 // restore $r23 (C3)
  716. LDARG $r24, $sp, 8 // restore $r24 (T0)
  717. LDARG $r25, $sp, 16 // restore $r25 (T1)
  718. LDARG $r26, $sp, 24 // restore $r26 (OFF)
  719. LD $f23, $sp, 32 // restore $f23..$f29
  720. LD $f24, $sp, 40
  721. LD $f25, $sp, 48
  722. LD $f26, $sp, 56
  723. LD $f27, $sp, 64
  724. LD $f28, $sp, 72
  725. LD $f29, $sp, 80
  726. addi.d $sp, $sp, 88 // release the 88-byte frame reserved at entry
  727. jirl $r0, $r1, 0x0 // return: jump to the address in $r1, discarding the link value into $r0
  728. EPILOGUE // function end macro, defined outside this file