You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm_kernel.S 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047
  1. /***************************************************************************
  2. Copyright (c) 2020, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *****************************************************************************/
  /*
   * Build configuration, register aliases and multiply-add selection.
   * Every name defined below is a preprocessor alias that the rest of the
   * file uses instead of a raw register name.
   */
  27. #define ASSEMBLER
  28. #include "common.h"
  /*
   * Integer operands as the code below reads them: M, N and K are the three
   * loop extents, A/B/C are base pointers and LDC is the distance between
   * the CO1..CO4 output pointers (presumably columns of C); LDC is scaled by
   * ZBASE_SHIFT in the prologue.
   */
  29. #define M $r4
  30. #define N $r5
  31. #define K $r6
  32. #define A $r7
  33. #define B $r8
  34. #define C $r9
  35. #define LDC $r10
  /* AO/BO: running read pointers derived from A and B. */
  36. #define AO $r12
  37. #define BO $r13
  /* I, J, L: countdown loop counters (loaded from M, from N, and from K or TEMP). */
  38. #define I $r17
  39. #define J $r18
  40. #define L $r25
  /* CO1..CO4: write pointers for up to four outputs, spaced LDC apart. */
  41. #define CO1 $r14
  42. #define CO2 $r15
  43. #define CO3 $r23
  44. #define CO4 $r24
  /* Extra integer state that exists only in the TRMMKERNEL build. */
  45. #if defined(TRMMKERNEL)
  46. #define OFFSET $r11
  47. #define KK $r26
  48. #define TEMP $r27
  49. #endif
  /* a1..a4 receive values loaded through AO. */
  50. #define a1 $f22
  51. #define a2 $f8
  52. #define a3 $f28
  53. #define a4 $f29
  /*
   * b1..b8 receive values loaded through BO; at write-back they are reused
   * to hold the loaded C values and the results that are stored.
   */
  54. #define b1 $f23
  55. #define b2 $f9
  56. #define b3 $f10
  57. #define b4 $f11
  58. #define b5 $f12
  59. #define b6 $f13
  60. #define b7 $f14
  61. #define b8 $f15
  /* a5 shares its register with b8 (not referenced elsewhere in this file). */
  62. #define a5 b8
  /*
   * Accumulators. For each output pointer four partial sums are kept and
   * folded at write-back, e.g. for CO1: (c11 + c22) goes to offset 0 and
   * (c12 + c21) to offset 1; CO2 uses c31/c32/c41/c42, CO3 uses
   * c51/c52/c61/c62 and CO4 uses c71/c72/c81/c82.
   */
  63. #define c11 $f16
  64. #define c12 $f17
  65. #define c21 $f3
  66. #define c22 $f4
  67. #define c31 $f2
  68. #define c32 $f5
  69. #define c41 $f6
  70. #define c42 $f7
  71. #define c51 $f18
  72. #define c52 $f19
  73. #define c61 $f20
  74. #define c62 $f21
  75. #define c71 $f24
  76. #define c72 $f25
  77. #define c81 $f26
  78. #define c82 $f27
  /* The two components of the scaling factor applied at write-back. */
  79. #define ALPHA_R $f0
  80. #define ALPHA_I $f1
  /*
   * MADD1..MADD4 are mapped to MADD or NMSUB depending on which variant
   * macro is defined (NN ... CC). In the loops, MADD1/MADD3 combine the B
   * value at even/odd offset with the A value at even offset, MADD2/MADD4
   * combine them with the A value at odd offset.
   * NOTE(review): MADD and NMSUB are not defined in this file (presumably
   * fused multiply-add / negated multiply-subtract from common.h), and the
   * even/odd elements are presumably real/imaginary parts - confirm.
   */
  81. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  82. #define MADD1 MADD
  83. #define MADD2 MADD
  84. #define MADD3 MADD
  85. #define MADD4 NMSUB
  86. #endif
  87. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  88. #define MADD1 MADD
  89. #define MADD2 MADD
  90. #define MADD3 NMSUB
  91. #define MADD4 MADD
  92. #endif
  93. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  94. #define MADD1 MADD
  95. #define MADD2 NMSUB
  96. #define MADD3 MADD
  97. #define MADD4 MADD
  98. #endif
  99. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  100. #define MADD1 MADD
  101. #define MADD2 NMSUB
  102. #define MADD3 NMSUB
  103. #define MADD4 NMSUB
  104. #endif
  /*
   * Function entry. Reserves a 128-byte stack frame and spills the registers
   * that are reloaded again at .L999:
   *   $r23 (CO3) at 0, $r24 (CO4) at 8, $r25 (L) at 64,
   *   $f24..$f29 (c71, c72, c81, c82, a3, a4) at 16..56,
   *   TRMMKERNEL only: $r26 (KK) at 72, $r27 (TEMP) at 80,
   *   when __64BIT__ is undefined: $f18..$f21 (c51, c52, c61, c62) at 88..112.
   * Then scales LDC by ZBASE_SHIFT, initialises KK for the non-LEFT TRMM
   * case, and sets J = N >> 2; if that is not positive the 4-wide section
   * is skipped by branching to .L20.
   */
  105. PROLOGUE
  106. addi.d $sp, $sp, -128
  107. SDARG $r23, $sp, 0
  108. SDARG $r24, $sp, 8
  109. SDARG $r25, $sp, 64
  110. fst.d $f24, $sp, 16
  111. fst.d $f25, $sp, 24
  112. fst.d $f26, $sp, 32
  113. fst.d $f27, $sp, 40
  114. fst.d $f28, $sp, 48
  115. fst.d $f29, $sp, 56
  116. #if defined(TRMMKERNEL)
  117. SDARG $r26, $sp, 72
  118. SDARG $r27, $sp, 80
  119. #endif
  120. #ifndef __64BIT__
  121. fst.d $f18, $sp, 88
  122. fst.d $f19, $sp, 96
  123. fst.d $f20, $sp, 104
  124. fst.d $f21, $sp, 112
  125. #endif
  /* LDC <<= ZBASE_SHIFT (presumably converts an element count to bytes - confirm in common.h). */
  126. slli.d LDC, LDC, ZBASE_SHIFT
  127. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* KK = -OFFSET */
  128. sub.d KK, $r0, OFFSET
  129. #endif
  /* J = number of 4-wide groups; skip to .L20 when there are none. */
  130. srai.d J, N, 2
  131. nop
  132. bge $r0, J, .L20
  /*
   * .L10: start of one 4-wide group (outer loop, counted down in J).
   * Sets CO1..CO4 to C, C+LDC, C+2*LDC, C+3*LDC, advances C past the group,
   * rewinds AO to A, zeroes c11 and copies it into c21/c31/c41/c51/c61,
   * and loads the row counter I from M. In the LEFT TRMM build KK restarts
   * from OFFSET. Branches to .L19 when I is not positive.
   * NOTE(review): MTC/MOV are macros from outside this file; MTC c11, $r0
   * presumably moves integer zero into the FP register.
   */
  133. .L10:
  134. move CO1, C
  135. MTC c11, $r0
  136. add.d CO2, C, LDC
  137. move AO, A
  138. add.d CO3, CO2, LDC
  139. addi.d J, J, -1
  140. add.d CO4, CO3, LDC
  141. MOV c21, c11
  142. MOV c31, c11
  143. #if defined(TRMMKERNEL) && defined(LEFT)
  144. move KK, OFFSET
  145. #endif
  146. MOV c41, c11
  147. MOV c51, c11
  148. move I, M
  149. add.d C, CO4, LDC
  150. MOV c61, c11
  151. bge $r0, I, .L19
  /*
   * .L11: set-up for one output tile of the 4-wide group (one iteration of
   * the I loop). Preloads a1/a3 from AO and b1..b7 from B/BO, copies the
   * zero in c11 into the remaining accumulators (c71, c81, c12 .. c82) and
   * computes the unrolled trip count L.
   *
   * TRMMKERNEL build: BO starts at B, or (in the #else arm) AO is advanced
   * by KK << ZBASE_SHIFT and BO is set to B + (KK << (2 + ZBASE_SHIFT)).
   * TEMP becomes the inner trip count (K - KK, KK + 1 or KK + 4 depending
   * on LEFT/TRANSA) and L = TEMP >> 2.
   * Plain build: L = K >> 2 and BO = B.
   *
   * If L is not positive control goes to the remainder code at .L15.
   * Otherwise the first four multiply-adds of the first pass are issued
   * here, L is decremented, and the code either falls into the loop at
   * .L12 or, when only one pass remains, jumps to its final copy at .L13.
   */
  152. .L11:
  153. #if defined(TRMMKERNEL)
  154. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  155. move BO, B
  156. #else
  157. slli.d L, KK, ZBASE_SHIFT
  158. slli.d TEMP, KK, 2 + ZBASE_SHIFT
  159. add.d AO, AO, L
  160. add.d BO, B, TEMP
  161. #endif
  162. LD a1, AO, 0 * SIZE
  163. MOV c71, c11
  164. LD b1, BO, 0 * SIZE
  165. MOV c81, c11
  166. LD a3, AO, 4 * SIZE
  167. MOV c12, c11
  168. LD b2, BO, 1 * SIZE
  169. MOV c22, c11
  170. MOV c32, c11
  171. LD b3, BO, 2 * SIZE
  172. MOV c42, c11
  173. LD b4, BO, 3 * SIZE
  174. MOV c52, c11
  175. LD b5, BO, 4 * SIZE
  176. MOV c62, c11
  177. LD b6, BO, 8 * SIZE
  178. MOV c72, c11
  179. LD b7, BO, 12 * SIZE
  180. MOV c82, c11
  181. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  182. sub.d TEMP, K, KK
  183. #elif defined(LEFT)
  184. addi.d TEMP, KK, 1
  185. #else
  186. addi.d TEMP, KK, 4
  187. #endif
  188. srai.d L, TEMP, 2
  189. bge $r0, L, .L15
  190. #else
  191. LD a1, AO, 0 * SIZE
  192. MOV c71, c11
  193. LD b1, B, 0 * SIZE
  194. MOV c81, c11
  195. LD a3, AO, 4 * SIZE
  196. MOV c12, c11
  197. LD b2, B, 1 * SIZE
  198. MOV c22, c11
  199. srai.d L, K, 2
  200. MOV c32, c11
  201. LD b3, B, 2 * SIZE
  202. MOV c42, c11
  203. LD b4, B, 3 * SIZE
  204. MOV c52, c11
  205. LD b5, B, 4 * SIZE
  206. MOV c62, c11
  207. LD b6, B, 8 * SIZE
  208. MOV c72, c11
  209. LD b7, B, 12 * SIZE
  210. MOV c82, c11
  211. move BO, B
  212. bge $r0, L, .L15
  213. #endif
  /* Head of the software pipeline: first four products of the first pass. */
  214. MADD1 c11, b1, a1, c11
  215. LD a2, AO, 1 * SIZE
  216. MADD3 c21, b2, a1, c21
  217. addi.d L, L, -1
  218. MADD1 c31, b3, a1, c31
  219. MADD3 c41, b4, a1, c41
  220. bge $r0, L, .L13
  /*
   * .L12: main inner loop of the 4-wide tile. One pass consumes four K
   * steps: AO advances by 8 * SIZE and BO by 32 * SIZE, and L is
   * decremented once. Loads for later steps are interleaved with the
   * multiply-adds of the current step, so instruction order matters.
   * On entry the first four products (into c11/c21/c31/c41) of the pass
   * have already been issued; the last five instructions before the branch
   * issue the same four products for the following pass.
   * After the pointer bumps the loads use small offsets again (12, 1, 2, 3)
   * because BO already points at the next group of values.
   */
  221. .align 3
  222. .L12:
  223. MADD2 c12, b1, a2, c12
  224. LD b1, BO, 16 * SIZE
  225. MADD4 c22, b2, a2, c22
  226. LD b2, BO, 5 * SIZE
  227. MADD2 c32, b3, a2, c32
  228. LD b3, BO, 6 * SIZE
  229. MADD4 c42, b4, a2, c42
  230. LD b4, BO, 7 * SIZE
  231. MADD1 c51, b5, a1, c51
  232. MADD3 c61, b2, a1, c61
  233. LD a4, AO, 2 * SIZE
  234. MADD1 c71, b3, a1, c71
  235. MADD3 c81, b4, a1, c81
  236. LD a1, AO, 8 * SIZE
  237. MADD2 c52, b5, a2, c52
  238. LD b5, BO, 20 * SIZE
  239. MADD4 c62, b2, a2, c62
  240. LD b2, BO, 9 * SIZE
  241. MADD2 c72, b3, a2, c72
  242. LD b3, BO, 10 * SIZE
  243. MADD4 c82, b4, a2, c82
  244. LD b4, BO, 11 * SIZE
  245. MADD1 c11, b6, a4, c11
  246. LD a2, AO, 3 * SIZE
  247. MADD3 c21, b2, a4, c21
  248. MADD1 c31, b3, a4, c31
  249. MADD3 c41, b4, a4, c41
  250. MADD2 c12, b6, a2, c12
  251. LD b6, BO, 24 * SIZE
  252. MADD4 c22, b2, a2, c22
  253. LD b2, BO, 13 * SIZE
  254. MADD2 c32, b3, a2, c32
  255. LD b3, BO, 14 * SIZE
  256. MADD4 c42, b4, a2, c42
  257. LD b4, BO, 15 * SIZE
  258. MADD1 c51, b7, a4, c51
  259. MADD3 c61, b2, a4, c61
  260. MADD1 c71, b3, a4, c71
  261. MADD3 c81, b4, a4, c81
  262. MADD2 c52, b7, a2, c52
  263. LD b7, BO, 28 * SIZE
  264. MADD4 c62, b2, a2, c62
  265. LD b2, BO, 17 * SIZE
  266. MADD2 c72, b3, a2, c72
  267. LD b3, BO, 18 * SIZE
  268. MADD4 c82, b4, a2, c82
  269. LD b4, BO, 19 * SIZE
  270. MADD1 c11, b1, a3, c11
  271. LD a2, AO, 5 * SIZE
  272. MADD3 c21, b2, a3, c21
  273. MADD1 c31, b3, a3, c31
  274. MADD3 c41, b4, a3, c41
  275. MADD2 c12, b1, a2, c12
  276. LD b1, BO, 32 * SIZE
  277. MADD4 c22, b2, a2, c22
  278. LD b2, BO, 21 * SIZE
  279. MADD2 c32, b3, a2, c32
  280. LD b3, BO, 22 * SIZE
  281. MADD4 c42, b4, a2, c42
  282. LD b4, BO, 23 * SIZE
  283. MADD1 c51, b5, a3, c51
  284. MADD3 c61, b2, a3, c61
  285. LD a4, AO, 6 * SIZE
  286. MADD1 c71, b3, a3, c71
  287. MADD3 c81, b4, a3, c81
  288. LD a3, AO, 12 * SIZE
  289. MADD2 c52, b5, a2, c52
  290. LD b5, BO, 36 * SIZE
  291. MADD4 c62, b2, a2, c62
  292. LD b2, BO, 25 * SIZE
  293. MADD2 c72, b3, a2, c72
  294. LD b3, BO, 26 * SIZE
  295. MADD4 c82, b4, a2, c82
  296. LD b4, BO, 27 * SIZE
  297. MADD1 c11, b6, a4, c11
  298. LD a2, AO, 7 * SIZE
  299. MADD3 c21, b2, a4, c21
  300. MADD1 c31, b3, a4, c31
  301. MADD3 c41, b4, a4, c41
  302. addi.d L, L, -1
  303. MADD2 c12, b6, a2, c12
  304. LD b6, BO, 40 * SIZE
  305. MADD4 c22, b2, a2, c22
  306. LD b2, BO, 29 * SIZE
  307. MADD2 c32, b3, a2, c32
  308. LD b3, BO, 30 * SIZE
  309. MADD4 c42, b4, a2, c42
  310. LD b4, BO, 31 * SIZE
  311. MADD1 c51, b7, a4, c51
  312. addi.d BO, BO, 32 * SIZE
  313. MADD3 c61, b2, a4, c61
  314. addi.d AO, AO, 8 * SIZE
  315. MADD1 c71, b3, a4, c71
  316. MADD3 c81, b4, a4, c81
  317. MADD2 c52, b7, a2, c52
  318. LD b7, BO, 12 * SIZE
  319. MADD4 c62, b2, a2, c62
  320. LD b2, BO, 1 * SIZE
  321. MADD2 c72, b3, a2, c72
  322. LD b3, BO, 2 * SIZE
  323. MADD4 c82, b4, a2, c82
  324. LD b4, BO, 3 * SIZE
  /* Pre-issue the first four products of the next pass, then loop while L > 0. */
  325. MADD1 c11, b1, a1, c11
  326. LD a2, AO, 1 * SIZE
  327. MADD3 c21, b2, a1, c21
  328. MADD1 c31, b3, a1, c31
  329. MADD3 c41, b4, a1, c41
  330. blt $r0, L, .L12
  /*
   * .L13: final unrolled pass of the 4-wide tile. The instruction sequence
   * is the same as the body of .L12, except that L is not decremented and
   * no products are pre-issued for a following pass; control then falls
   * through to the remainder handling at .L15. AO and BO are still advanced
   * by 8 * SIZE and 32 * SIZE, and a1, a3 and b1..b7 are left loaded with
   * the values the remainder loop at .L16 starts from.
   */
  331. .align 3
  332. .L13:
  333. MADD2 c12, b1, a2, c12
  334. LD b1, BO, 16 * SIZE
  335. MADD4 c22, b2, a2, c22
  336. LD b2, BO, 5 * SIZE
  337. MADD2 c32, b3, a2, c32
  338. LD b3, BO, 6 * SIZE
  339. MADD4 c42, b4, a2, c42
  340. LD b4, BO, 7 * SIZE
  341. MADD1 c51, b5, a1, c51
  342. MADD3 c61, b2, a1, c61
  343. LD a4, AO, 2 * SIZE
  344. MADD1 c71, b3, a1, c71
  345. MADD3 c81, b4, a1, c81
  346. LD a1, AO, 8 * SIZE
  347. MADD2 c52, b5, a2, c52
  348. LD b5, BO, 20 * SIZE
  349. MADD4 c62, b2, a2, c62
  350. LD b2, BO, 9 * SIZE
  351. MADD2 c72, b3, a2, c72
  352. LD b3, BO, 10 * SIZE
  353. MADD4 c82, b4, a2, c82
  354. LD b4, BO, 11 * SIZE
  355. MADD1 c11, b6, a4, c11
  356. LD a2, AO, 3 * SIZE
  357. MADD3 c21, b2, a4, c21
  358. MADD1 c31, b3, a4, c31
  359. MADD3 c41, b4, a4, c41
  360. MADD2 c12, b6, a2, c12
  361. LD b6, BO, 24 * SIZE
  362. MADD4 c22, b2, a2, c22
  363. LD b2, BO, 13 * SIZE
  364. MADD2 c32, b3, a2, c32
  365. LD b3, BO, 14 * SIZE
  366. MADD4 c42, b4, a2, c42
  367. LD b4, BO, 15 * SIZE
  368. MADD1 c51, b7, a4, c51
  369. MADD3 c61, b2, a4, c61
  370. MADD1 c71, b3, a4, c71
  371. MADD3 c81, b4, a4, c81
  372. MADD2 c52, b7, a2, c52
  373. LD b7, BO, 28 * SIZE
  374. MADD4 c62, b2, a2, c62
  375. LD b2, BO, 17 * SIZE
  376. MADD2 c72, b3, a2, c72
  377. LD b3, BO, 18 * SIZE
  378. MADD4 c82, b4, a2, c82
  379. LD b4, BO, 19 * SIZE
  380. MADD1 c11, b1, a3, c11
  381. LD a2, AO, 5 * SIZE
  382. MADD3 c21, b2, a3, c21
  383. MADD1 c31, b3, a3, c31
  384. MADD3 c41, b4, a3, c41
  385. MADD2 c12, b1, a2, c12
  386. LD b1, BO, 32 * SIZE
  387. MADD4 c22, b2, a2, c22
  388. LD b2, BO, 21 * SIZE
  389. MADD2 c32, b3, a2, c32
  390. LD b3, BO, 22 * SIZE
  391. MADD4 c42, b4, a2, c42
  392. LD b4, BO, 23 * SIZE
  393. MADD1 c51, b5, a3, c51
  394. MADD3 c61, b2, a3, c61
  395. LD a4, AO, 6 * SIZE
  396. MADD1 c71, b3, a3, c71
  397. MADD3 c81, b4, a3, c81
  398. LD a3, AO, 12 * SIZE
  399. MADD2 c52, b5, a2, c52
  400. LD b5, BO, 36 * SIZE
  401. MADD4 c62, b2, a2, c62
  402. LD b2, BO, 25 * SIZE
  403. MADD2 c72, b3, a2, c72
  404. LD b3, BO, 26 * SIZE
  405. MADD4 c82, b4, a2, c82
  406. LD b4, BO, 27 * SIZE
  407. MADD1 c11, b6, a4, c11
  408. LD a2, AO, 7 * SIZE
  409. MADD3 c21, b2, a4, c21
  410. MADD1 c31, b3, a4, c31
  411. MADD3 c41, b4, a4, c41
  412. MADD2 c12, b6, a2, c12
  413. LD b6, BO, 40 * SIZE
  414. MADD4 c22, b2, a2, c22
  415. LD b2, BO, 29 * SIZE
  416. MADD2 c32, b3, a2, c32
  417. LD b3, BO, 30 * SIZE
  418. MADD4 c42, b4, a2, c42
  419. LD b4, BO, 31 * SIZE
  420. MADD1 c51, b7, a4, c51
  421. addi.d BO, BO, 32 * SIZE
  422. MADD3 c61, b2, a4, c61
  423. addi.d AO, AO, 8 * SIZE
  424. MADD1 c71, b3, a4, c71
  425. MADD3 c81, b4, a4, c81
  426. MADD2 c52, b7, a2, c52
  427. LD b7, BO, 12 * SIZE
  428. MADD4 c62, b2, a2, c62
  429. LD b2, BO, 1 * SIZE
  430. MADD2 c72, b3, a2, c72
  431. LD b3, BO, 2 * SIZE
  432. MADD4 c82, b4, a2, c82
  433. LD b4, BO, 3 * SIZE
  /*
   * .L15: remainder of the inner dimension for the 4-wide tile.
   * L = (K or, in the TRMM build, TEMP) & 3; when that is zero the code
   * goes straight to the write-back at .L18.
   */
  434. .align 3
  435. .L15:
  436. #ifndef TRMMKERNEL
  437. andi L, K, 3
  438. #else
  439. andi L, TEMP, 3
  440. #endif
  441. bge $r0, L, .L18
  /*
   * .L16: one K step per iteration. Consumes a1/a2 and b1..b5 (b2..b4 are
   * reloaded mid-step for the second half of the tile), advances AO by
   * 2 * SIZE and BO by 8 * SIZE, and reloads a1 and b1..b5 for the next
   * iteration.
   */
  442. .align 3
  443. .L16:
  444. MADD1 c11, b1, a1, c11
  445. LD a2, AO, 1 * SIZE
  446. MADD3 c21, b2, a1, c21
  447. MADD1 c31, b3, a1, c31
  448. MADD3 c41, b4, a1, c41
  449. MADD2 c12, b1, a2, c12
  450. LD b1, BO, 8 * SIZE
  451. MADD4 c22, b2, a2, c22
  452. LD b2, BO, 5 * SIZE
  453. MADD2 c32, b3, a2, c32
  454. LD b3, BO, 6 * SIZE
  455. MADD4 c42, b4, a2, c42
  456. LD b4, BO, 7 * SIZE
  457. MADD1 c51, b5, a1, c51
  458. addi.d L, L, -1
  459. MADD3 c61, b2, a1, c61
  460. addi.d AO, AO, 2 * SIZE
  461. MADD1 c71, b3, a1, c71
  462. addi.d BO, BO, 8 * SIZE
  463. MADD3 c81, b4, a1, c81
  464. LD a1, AO, 0 * SIZE
  465. MADD2 c52, b5, a2, c52
  466. LD b5, BO, 4 * SIZE
  467. MADD4 c62, b2, a2, c62
  468. LD b2, BO, 1 * SIZE
  469. MADD2 c72, b3, a2, c72
  470. LD b3, BO, 2 * SIZE
  471. MADD4 c82, b4, a2, c82
  472. LD b4, BO, 3 * SIZE
  473. blt $r0, L, .L16
  /*
   * .L18: write-back of the 4-wide tile.
   * The partial sums are folded pairwise (c11 += c22, c12 += c21,
   * c31 += c42, c32 += c41, c51 += c62, c52 += c61, c71 += c82,
   * c72 += c81), giving two values per output pointer. These are scaled by
   * (ALPHA_R, ALPHA_I): the first value of each pair takes ALPHA_R * cX1 and
   * an NMSUB with ALPHA_I * cX2, the second takes ALPHA_R * cX2 and a MADD
   * with ALPHA_I * cX1.
   * NOTE(review): assumes MADD d,x,y,z = x*y + z and NMSUB d,x,y,z = z - x*y
   * as provided outside this file - confirm.
   *
   * Plain build: the existing values at CO1..CO4 are loaded into b1..b8 and
   * the scaled result is accumulated onto them before storing.
   * TRMMKERNEL build: the result starts from MUL (C is not read); after the
   * stores AO/BO are advanced past the unused part of the inner dimension
   * (for the LEFT&&TRANSA / !LEFT&&!TRANSA cases) and KK is incremented in
   * the LEFT case.
   *
   * In both builds CO1..CO4 advance by 2 * SIZE (stores use negative
   * offsets afterwards), I is decremented, c11/c21/c31/c41/c51/c61 are
   * re-zeroed for the next tile, and the loop continues at .L11 while I > 0.
   */
  474. .L18:
  475. #ifndef TRMMKERNEL
  476. LD b1, CO1, 0 * SIZE
  477. ADD c11, c11, c22
  478. LD b2, CO1, 1 * SIZE
  479. ADD c12, c12, c21
  480. LD b3, CO2, 0 * SIZE
  481. ADD c31, c31, c42
  482. LD b4, CO2, 1 * SIZE
  483. ADD c32, c32, c41
  484. LD b5, CO3, 0 * SIZE
  485. ADD c51, c51, c62
  486. LD b6, CO3, 1 * SIZE
  487. ADD c52, c52, c61
  488. LD b7, CO4, 0 * SIZE
  489. ADD c71, c71, c82
  490. LD b8, CO4, 1 * SIZE
  491. ADD c72, c72, c81
  492. MADD b1, c11, ALPHA_R, b1
  493. addi.d CO1,CO1, 2 * SIZE
  494. MADD b2, c12, ALPHA_R, b2
  495. addi.d CO2,CO2, 2 * SIZE
  496. MADD b3, c31, ALPHA_R, b3
  497. addi.d CO3,CO3, 2 * SIZE
  498. MADD b4, c32, ALPHA_R, b4
  499. addi.d CO4,CO4, 2 * SIZE
  500. MADD b5, c51, ALPHA_R, b5
  501. addi.d I, I, -1
  502. MADD b6, c52, ALPHA_R, b6
  503. MADD b7, c71, ALPHA_R, b7
  504. MADD b8, c72, ALPHA_R, b8
  505. NMSUB b1, c12, ALPHA_I, b1
  506. MADD b2, c11, ALPHA_I, b2
  /* c11 has been consumed above; zero it so it can seed the next tile's accumulators. */
  507. MTC c11, $r0
  508. NMSUB b3, c32, ALPHA_I, b3
  509. MADD b4, c31, ALPHA_I, b4
  510. ST b1, CO1, -2 * SIZE
  511. NMSUB b5, c52, ALPHA_I, b5
  512. ST b2, CO1, -1 * SIZE
  513. MADD b6, c51, ALPHA_I, b6
  514. ST b3, CO2, -2 * SIZE
  515. NMSUB b7, c72, ALPHA_I, b7
  516. ST b4, CO2, -1 * SIZE
  517. MADD b8, c71, ALPHA_I, b8
  518. ST b5, CO3, -2 * SIZE
  519. MOV c21, c11
  520. ST b6, CO3, -1 * SIZE
  521. MOV c31, c11
  522. ST b7, CO4, -2 * SIZE
  523. MOV c41, c11
  524. ST b8, CO4, -1 * SIZE
  525. MOV c51, c11
  526. #else
  527. ADD c11, c11, c22
  528. addi.d CO1,CO1, 2 * SIZE
  529. ADD c12, c12, c21
  530. addi.d CO2,CO2, 2 * SIZE
  531. ADD c31, c31, c42
  532. addi.d CO3,CO3, 2 * SIZE
  533. ADD c32, c32, c41
  534. addi.d CO4,CO4, 2 * SIZE
  535. ADD c51, c51, c62
  536. addi.d I, I, -1
  537. ADD c52, c52, c61
  538. ADD c71, c71, c82
  539. ADD c72, c72, c81
  540. MUL b1, ALPHA_R, c11
  541. MUL b2, ALPHA_R, c12
  542. MUL b3, ALPHA_R, c31
  543. MUL b4, ALPHA_R, c32
  544. MUL b5, ALPHA_R, c51
  545. MUL b6, ALPHA_R, c52
  546. MUL b7, ALPHA_R, c71
  547. MUL b8, ALPHA_R, c72
  548. NMSUB b1, c12, ALPHA_I, b1
  549. MADD b2, c11, ALPHA_I, b2
  550. MTC c11, $r0
  551. NMSUB b3, c32, ALPHA_I, b3
  552. MADD b4, c31, ALPHA_I, b4
  553. ST b1, CO1, -2 * SIZE
  554. NMSUB b5, c52, ALPHA_I, b5
  555. ST b2, CO1, -1 * SIZE
  556. MADD b6, c51, ALPHA_I, b6
  557. ST b3, CO2, -2 * SIZE
  558. NMSUB b7, c72, ALPHA_I, b7
  559. ST b4, CO2, -1 * SIZE
  560. MADD b8, c71, ALPHA_I, b8
  561. ST b5, CO3, -2 * SIZE
  562. MOV c21, c11
  563. ST b6, CO3, -1 * SIZE
  564. MOV c31, c11
  565. ST b7, CO4, -2 * SIZE
  566. MOV c41, c11
  567. ST b8, CO4, -1 * SIZE
  568. MOV c51, c11
  /* TEMP = K - KK - (1 or 4); AO += TEMP << ZBASE_SHIFT; BO += TEMP << (2 + ZBASE_SHIFT). */
  569. #if ( defined(LEFT) && defined(TRANSA)) || \
  570. (!defined(LEFT) && !defined(TRANSA))
  571. sub.d TEMP, K, KK
  572. #ifdef LEFT
  573. addi.d TEMP, TEMP, -1
  574. #else
  575. addi.d TEMP, TEMP, -4
  576. #endif
  577. slli.d L, TEMP, ZBASE_SHIFT
  578. slli.d TEMP, TEMP, 2 + ZBASE_SHIFT
  579. add.d AO, AO, L
  580. add.d BO, BO, TEMP
  581. #endif
  582. #ifdef LEFT
  583. addi.d KK, KK, 1
  584. #endif
  585. #endif
  586. MOV c61, c11
  587. blt $r0, I, .L11
  /*
   * .L19: end of one 4-wide group. In the non-LEFT TRMM build KK advances
   * by 4. B is moved to where BO stopped, and the outer loop repeats at
   * .L10 while J > 0.
   * NOTE(review): when reached directly from .L10 (I not positive) BO has
   * not been set for this group - presumably M > 0 is guaranteed by callers.
   */
  588. .align 3
  589. .L19:
  590. #if defined(TRMMKERNEL) && !defined(LEFT)
  591. addi.d KK, KK, 4
  592. #endif
  593. move B, BO
  594. blt $r0, J, .L10
  /*
   * .L20: 2-wide section, executed once when bit 1 of N is set (J = N & 2);
   * otherwise control jumps to .L30. Zeroes c11, sets CO1 = C and
   * CO2 = C + LDC, advances C by two LDC strides, rewinds AO to A and loads
   * the row counter I from M (KK restarts from OFFSET in the LEFT TRMM
   * build). Branches to .L29 when I is not positive.
   */
  595. .align 3
  596. .L20:
  597. andi J, N, 2
  598. MTC c11, $r0
  599. move CO1, C
  600. bge $r0, J, .L30
  601. add.d CO2, C, LDC
  602. add.d C, CO2, LDC
  603. #if defined(TRMMKERNEL) && defined(LEFT)
  604. move KK, OFFSET
  605. #endif
  606. move I, M
  607. move AO, A
  608. bge $r0, I, .L29
  /*
   * .L21: set-up for one output tile of the 2-wide section. Preloads a1/a3
   * from AO and b1..b5 from B/BO, copies the zero in c11 into
   * c21/c31/c41/c12/c22/c32/c42 and computes the unrolled trip count L.
   *
   * TRMMKERNEL build: BO starts at B, or AO is advanced by
   * KK << ZBASE_SHIFT and BO = B + (KK << (1 + ZBASE_SHIFT)); TEMP is the
   * inner trip count (K - KK, KK + 1 or KK + 2) and L = TEMP >> 2.
   * Plain build: L = K >> 2 and BO = B.
   * When L is not positive control goes to the remainder code at .L25.
   */
  609. .align 3
  610. .L21:
  611. #if defined(TRMMKERNEL)
  612. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  613. move BO, B
  614. #else
  615. slli.d L, KK, ZBASE_SHIFT
  616. slli.d TEMP, KK, 1 + ZBASE_SHIFT
  617. add.d AO, AO, L
  618. add.d BO, B, TEMP
  619. #endif
  620. LD a1, AO, 0 * SIZE
  621. MOV c21, c11
  622. LD b1, BO, 0 * SIZE
  623. MOV c31, c11
  624. LD a3, AO, 4 * SIZE
  625. MOV c41, c11
  626. LD b2, BO, 1 * SIZE
  627. LD b3, BO, 2 * SIZE
  628. MOV c12, c11
  629. LD b4, BO, 3 * SIZE
  630. MOV c22, c11
  631. LD b5, BO, 4 * SIZE
  632. MOV c32, c11
  633. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  634. sub.d TEMP, K, KK
  635. #elif defined(LEFT)
  636. addi.d TEMP, KK, 1
  637. #else
  638. addi.d TEMP, KK, 2
  639. #endif
  640. srai.d L, TEMP, 2
  641. MOV c42, c11
  642. bge $r0, L, .L25
  643. #else
  644. LD a1, AO, 0 * SIZE
  645. MOV c21, c11
  646. LD b1, B, 0 * SIZE
  647. MOV c31, c11
  648. LD a3, AO, 4 * SIZE
  649. MOV c41, c11
  650. LD b2, B, 1 * SIZE
  651. srai.d L, K, 2
  652. LD b3, B, 2 * SIZE
  653. MOV c12, c11
  654. LD b4, B, 3 * SIZE
  655. MOV c22, c11
  656. LD b5, B, 4 * SIZE
  657. MOV c32, c11
  658. MOV c42, c11
  659. move BO, B
  660. bge $r0, L, .L25
  661. #endif
  /*
   * .L22: main inner loop of the 2-wide tile. One pass consumes four K
   * steps: AO advances by 8 * SIZE, BO by 16 * SIZE, and L is decremented
   * once. Accumulates into c11/c12/c21/c22 (first output) and
   * c31/c32/c41/c42 (second output). Loads for upcoming steps are
   * interleaved with the multiply-adds; note that AO is bumped in the
   * middle of the pass, so the later AO offsets are relative to the new
   * value while BO is only bumped at the end.
   */
  662. .align 3
  663. .L22:
  664. MADD1 c11, b1, a1, c11
  665. LD a2, AO, 1 * SIZE
  666. MADD3 c21, b2, a1, c21
  667. addi.d L, L, -1
  668. MADD1 c31, b3, a1, c31
  669. MADD3 c41, b4, a1, c41
  670. LD a1, AO, 2 * SIZE
  671. MADD2 c12, b1, a2, c12
  672. LD b1, BO, 8 * SIZE
  673. MADD4 c22, b2, a2, c22
  674. LD b2, BO, 5 * SIZE
  675. MADD2 c32, b3, a2, c32
  676. LD b3, BO, 6 * SIZE
  677. MADD4 c42, b4, a2, c42
  678. LD b4, BO, 7 * SIZE
  679. MADD1 c11, b5, a1, c11
  680. LD a2, AO, 3 * SIZE
  681. MADD3 c21, b2, a1, c21
  682. MADD1 c31, b3, a1, c31
  683. MADD3 c41, b4, a1, c41
  684. LD a1, AO, 8 * SIZE
  685. MADD2 c12, b5, a2, c12
  686. LD b5, BO, 12 * SIZE
  687. MADD4 c22, b2, a2, c22
  688. LD b2, BO, 9 * SIZE
  689. MADD2 c32, b3, a2, c32
  690. LD b3, BO, 10 * SIZE
  691. MADD4 c42, b4, a2, c42
  692. LD b4, BO, 11 * SIZE
  693. MADD1 c11, b1, a3, c11
  694. LD a2, AO, 5 * SIZE
  695. MADD3 c21, b2, a3, c21
  696. MADD1 c31, b3, a3, c31
  697. MADD3 c41, b4, a3, c41
  698. LD a3, AO, 6 * SIZE
  699. MADD2 c12, b1, a2, c12
  700. LD b1, BO, 16 * SIZE
  701. MADD4 c22, b2, a2, c22
  702. LD b2, BO, 13 * SIZE
  703. MADD2 c32, b3, a2, c32
  704. LD b3, BO, 14 * SIZE
  705. MADD4 c42, b4, a2, c42
  706. LD b4, BO, 15 * SIZE
  707. MADD1 c11, b5, a3, c11
  708. LD a2, AO, 7 * SIZE
  709. MADD3 c21, b2, a3, c21
  710. addi.d AO, AO, 8 * SIZE
  711. MADD1 c31, b3, a3, c31
  712. MADD3 c41, b4, a3, c41
  713. LD a3, AO, 4 * SIZE
  714. MADD2 c12, b5, a2, c12
  715. LD b5, BO, 20 * SIZE
  716. MADD4 c22, b2, a2, c22
  717. LD b2, BO, 17 * SIZE
  718. MADD2 c32, b3, a2, c32
  719. LD b3, BO, 18 * SIZE
  720. MADD4 c42, b4, a2, c42
  721. LD b4, BO, 19 * SIZE
  722. addi.d BO, BO, 16 * SIZE
  723. blt $r0, L, .L22
  /*
   * .L25: remainder of the inner dimension for the 2-wide tile.
   * L = (K or, in the TRMM build, TEMP) & 3; zero means go straight to the
   * write-back at .L28.
   */
  724. .align 3
  725. .L25:
  726. #ifndef TRMMKERNEL
  727. andi L, K, 3
  728. #else
  729. andi L, TEMP, 3
  730. #endif
  731. bge $r0, L, .L28
  /*
   * .L26: one K step per iteration; advances AO by 2 * SIZE and BO by
   * 4 * SIZE and reloads a1 and b1..b4 for the next iteration. BO is bumped
   * before the reloads (offsets 0..3), AO after its reload (offset 2).
   */
  732. .align 3
  733. .L26:
  734. MADD1 c11, b1, a1, c11
  735. LD a2, AO, 1 * SIZE
  736. MADD3 c21, b2, a1, c21
  737. addi.d L, L, -1
  738. MADD1 c31, b3, a1, c31
  739. addi.d BO, BO, 4 * SIZE
  740. MADD3 c41, b4, a1, c41
  741. LD a1, AO, 2 * SIZE
  742. MADD2 c12, b1, a2, c12
  743. LD b1, BO, 0 * SIZE
  744. MADD4 c22, b2, a2, c22
  745. LD b2, BO, 1 * SIZE
  746. MADD2 c32, b3, a2, c32
  747. LD b3, BO, 2 * SIZE
  748. MADD4 c42, b4, a2, c42
  749. LD b4, BO, 3 * SIZE
  750. addi.d AO, AO, 2 * SIZE
  751. blt $r0, L, .L26
  /*
   * .L28: write-back of the 2-wide tile. Folds the partial sums
   * (c11 += c22, c12 += c21, c31 += c42, c32 += c41), scales them by
   * (ALPHA_R, ALPHA_I) and stores two values each through CO1 and CO2.
   * Plain build: the existing C values are loaded into b1..b4 and
   * accumulated onto. TRMMKERNEL build: the result starts from MUL (C is
   * not read) and AO/BO/KK are adjusted afterwards.
   * CO1/CO2 advance by 2 * SIZE, I is decremented and c11 is re-zeroed;
   * the final store of b4 is shared by both builds, after which the loop
   * continues at .L21 while I > 0.
   */
  752. .L28:
  753. #ifndef TRMMKERNEL
  754. LD b1, CO1, 0 * SIZE
  755. ADD c11, c11, c22
  756. LD b2, CO1, 1 * SIZE
  757. ADD c12, c12, c21
  758. LD b3, CO2, 0 * SIZE
  759. ADD c31, c31, c42
  760. LD b4, CO2, 1 * SIZE
  761. ADD c32, c32, c41
  762. MADD b1, c11, ALPHA_R, b1
  763. addi.d CO1,CO1, 2 * SIZE
  764. MADD b2, c12, ALPHA_R, b2
  765. addi.d CO2,CO2, 2 * SIZE
  766. MADD b3, c31, ALPHA_R, b3
  767. addi.d I, I, -1
  768. MADD b4, c32, ALPHA_R, b4
  769. NMSUB b1, c12, ALPHA_I, b1
  770. MADD b2, c11, ALPHA_I, b2
  771. MTC c11, $r0
  772. NMSUB b3, c32, ALPHA_I, b3
  773. MADD b4, c31, ALPHA_I, b4
  774. ST b1, CO1, -2 * SIZE
  775. ST b2, CO1, -1 * SIZE
  776. ST b3, CO2, -2 * SIZE
  777. #else
  778. ADD c11, c11, c22
  779. ADD c12, c12, c21
  780. ADD c31, c31, c42
  781. ADD c32, c32, c41
  782. MUL b1, ALPHA_R, c11
  783. addi.d CO1,CO1, 2 * SIZE
  784. MUL b2, ALPHA_R, c12
  785. addi.d CO2,CO2, 2 * SIZE
  786. MUL b3, ALPHA_R, c31
  787. addi.d I, I, -1
  788. MUL b4, ALPHA_R, c32
  789. NMSUB b1, c12, ALPHA_I, b1
  790. MADD b2, c11, ALPHA_I, b2
  791. MTC c11, $r0
  792. NMSUB b3, c32, ALPHA_I, b3
  793. MADD b4, c31, ALPHA_I, b4
  794. ST b1, CO1, -2 * SIZE
  795. ST b2, CO1, -1 * SIZE
  796. ST b3, CO2, -2 * SIZE
  /* TEMP = K - KK - (1 or 2); AO += TEMP << ZBASE_SHIFT; BO += TEMP << (1 + ZBASE_SHIFT). */
  797. #if ( defined(LEFT) && defined(TRANSA)) || \
  798. (!defined(LEFT) && !defined(TRANSA))
  799. sub.d TEMP, K, KK
  800. #ifdef LEFT
  801. addi.d TEMP, TEMP, -1
  802. #else
  803. addi.d TEMP, TEMP, -2
  804. #endif
  805. slli.d L, TEMP, ZBASE_SHIFT
  806. slli.d TEMP, TEMP, 1 + ZBASE_SHIFT
  807. add.d AO, AO, L
  808. add.d BO, BO, TEMP
  809. #endif
  810. #ifdef LEFT
  811. addi.d KK, KK, 1
  812. #endif
  813. #endif
  814. ST b4, CO2, -1 * SIZE
  815. blt $r0, I, .L21
  /*
   * .L29: end of the 2-wide section. In the non-LEFT TRMM build KK advances
   * by 2; B is moved to where BO stopped.
   */
  816. .align 3
  817. .L29:
  818. #if defined(TRMMKERNEL) && !defined(LEFT)
  819. addi.d KK, KK, 2
  820. #endif
  821. move B, BO
  /*
   * .L30: 1-wide section, executed once when bit 0 of N is set (J = N & 1);
   * otherwise control jumps to the epilogue at .L999. Zeroes c11, sets
   * CO1 = C, advances C by one LDC stride, rewinds AO to A and loads the
   * row counter I from M (KK restarts from OFFSET in the LEFT TRMM build).
   * Branches to .L39 when I is not positive.
   */
  822. .align 3
  823. .L30:
  824. andi J, N, 1
  825. MTC c11, $r0
  826. move CO1, C
  827. bge $r0, J, .L999
  828. #if defined(TRMMKERNEL) && defined(LEFT)
  829. move KK, OFFSET
  830. #endif
  831. move I, M
  832. add.d C, CO1, LDC
  833. move AO, A
  834. bge $r0, I, .L39
  /*
   * .L31: set-up for one output tile of the 1-wide section. Preloads
   * a1/a2/a3 from AO and b1/b2/b3 from B/BO, copies the zero in c11 into
   * c21/c31/c41/c12/c22/c32/c42 (only c11/c12/c21/c22 are accumulated into
   * by the loops that follow) and computes the unrolled trip count L.
   *
   * TRMMKERNEL build: BO starts at B, or both AO and BO are offset by
   * KK << ZBASE_SHIFT; TEMP is the inner trip count (K - KK or KK + 1; the
   * #elif and #else arms are identical here) and L = TEMP >> 2.
   * Plain build: L = K >> 2 and BO = B.
   * When L is not positive control goes to the remainder code at .L35.
   */
  835. .align 3
  836. .L31:
  837. #if defined(TRMMKERNEL)
  838. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  839. move BO, B
  840. #else
  841. slli.d TEMP, KK, ZBASE_SHIFT
  842. add.d AO, AO, TEMP
  843. add.d BO, B, TEMP
  844. #endif
  845. LD a1, AO, 0 * SIZE
  846. MOV c21, c11
  847. LD b1, BO, 0 * SIZE
  848. MOV c31, c11
  849. LD a2, AO, 1 * SIZE
  850. MOV c41, c11
  851. LD b2, BO, 1 * SIZE
  852. MOV c12, c11
  853. MOV c22, c11
  854. LD a3, AO, 4 * SIZE
  855. MOV c32, c11
  856. LD b3, BO, 4 * SIZE
  857. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  858. sub.d TEMP, K, KK
  859. #elif defined(LEFT)
  860. addi.d TEMP, KK, 1
  861. #else
  862. addi.d TEMP, KK, 1
  863. #endif
  864. srai.d L, TEMP, 2
  865. MOV c42, c11
  866. bge $r0, L, .L35
  867. #else
  868. LD a1, AO, 0 * SIZE
  869. MOV c21, c11
  870. LD b1, B, 0 * SIZE
  871. MOV c31, c11
  872. LD a2, AO, 1 * SIZE
  873. MOV c41, c11
  874. LD b2, B, 1 * SIZE
  875. MOV c12, c11
  876. srai.d L, K, 2
  877. MOV c22, c11
  878. LD a3, AO, 4 * SIZE
  879. MOV c32, c11
  880. LD b3, B, 4 * SIZE
  881. MOV c42, c11
  882. move BO, B
  883. bge $r0, L, .L35
  884. #endif
  /*
   * .L32: main inner loop of the 1-wide tile. One pass consumes four K
   * steps, accumulating into c11/c12/c21/c22 only. AO and BO both advance
   * by 8 * SIZE at the end of the pass and L is decremented once. Each load
   * refills a register for a later step of the same or the next pass, so
   * the instruction order must be kept.
   */
  885. .align 3
  886. .L32:
  887. MADD1 c11, b1, a1, c11
  888. LD b4, BO, 3 * SIZE
  889. MADD3 c21, b2, a1, c21
  890. LD a1, AO, 2 * SIZE
  891. MADD2 c12, b1, a2, c12
  892. LD b1, BO, 2 * SIZE
  893. MADD4 c22, b2, a2, c22
  894. LD a2, AO, 3 * SIZE
  895. MADD1 c11, b1, a1, c11
  896. LD b2, BO, 5 * SIZE
  897. MADD3 c21, b4, a1, c21
  898. LD a1, AO, 8 * SIZE
  899. MADD2 c12, b1, a2, c12
  900. LD b1, BO, 8 * SIZE
  901. MADD4 c22, b4, a2, c22
  902. LD a2, AO, 5 * SIZE
  903. MADD1 c11, b3, a3, c11
  904. LD b4, BO, 7 * SIZE
  905. MADD3 c21, b2, a3, c21
  906. LD a3, AO, 6 * SIZE
  907. MADD2 c12, b3, a2, c12
  908. LD b3, BO, 6 * SIZE
  909. MADD4 c22, b2, a2, c22
  910. LD a2, AO, 7 * SIZE
  911. MADD1 c11, b3, a3, c11
  912. LD b2, BO, 9 * SIZE
  913. MADD3 c21, b4, a3, c21
  914. LD a3, AO, 12 * SIZE
  915. MADD2 c12, b3, a2, c12
  916. LD b3, BO, 12 * SIZE
  917. MADD4 c22, b4, a2, c22
  918. LD a2, AO, 9 * SIZE
  919. addi.d AO, AO, 8 * SIZE
  920. addi.d L, L, -1
  921. addi.d BO, BO, 8 * SIZE
  922. blt $r0, L, .L32
  /*
   * .L35: remainder of the inner dimension for the 1-wide tile.
   * L = (K or, in the TRMM build, TEMP) & 3; zero means go straight to the
   * write-back at .L38.
   */
  923. .align 3
  924. .L35:
  925. #ifndef TRMMKERNEL
  926. andi L, K, 3
  927. #else
  928. andi L, TEMP, 3
  929. #endif
  930. bge $r0, L, .L38
  /*
   * .L36: one K step per iteration; reloads a1/a2 and b1/b2 from offsets
   * 2 and 3 and then advances both AO and BO by 2 * SIZE.
   */
  931. .align 3
  932. .L36:
  933. MADD1 c11, b1, a1, c11
  934. addi.d L, L, -1
  935. MADD3 c21, b2, a1, c21
  936. LD a1, AO, 2 * SIZE
  937. MADD2 c12, b1, a2, c12
  938. LD b1, BO, 2 * SIZE
  939. MADD4 c22, b2, a2, c22
  940. LD a2, AO, 3 * SIZE
  941. LD b2, BO, 3 * SIZE
  942. addi.d BO, BO, 2 * SIZE
  943. addi.d AO, AO, 2 * SIZE
  944. blt $r0, L, .L36
  /*
   * .L38: write-back of the 1-wide tile. Folds the partial sums
   * (c11 += c22, c12 += c21), scales them by (ALPHA_R, ALPHA_I) and stores
   * two values through CO1, which advances by 2 * SIZE.
   * Plain build: the existing C values are loaded into b1/b2 and
   * accumulated onto. TRMMKERNEL build: the result starts from MUL (C is
   * not read); AO and BO are then both advanced by
   * (K - KK - 1) << ZBASE_SHIFT for the LEFT&&TRANSA / !LEFT&&!TRANSA
   * cases, and KK is incremented in the LEFT case.
   * Each build has its own loop-back branch to .L31 (taken while I > 0);
   * c11 is re-zeroed beforehand.
   */
  945. .L38:
  946. #ifndef TRMMKERNEL
  947. LD b1, CO1, 0 * SIZE
  948. ADD c11, c11, c22
  949. LD b2, CO1, 1 * SIZE
  950. ADD c12, c12, c21
  951. MADD b1, c11, ALPHA_R, b1
  952. addi.d CO1,CO1, 2 * SIZE
  953. MADD b2, c12, ALPHA_R, b2
  954. addi.d I, I, -1
  955. NMSUB b1, c12, ALPHA_I, b1
  956. MADD b2, c11, ALPHA_I, b2
  957. MTC c11, $r0
  958. ST b1, CO1, -2 * SIZE
  959. ST b2, CO1, -1 * SIZE
  960. blt $r0, I, .L31
  961. #else
  962. ADD c11, c11, c22
  963. ADD c12, c12, c21
  964. MUL b1, ALPHA_R, c11
  965. addi.d CO1,CO1, 2 * SIZE
  966. MUL b2, ALPHA_R, c12
  967. addi.d I, I, -1
  968. NMSUB b1, c12, ALPHA_I, b1
  969. MADD b2, c11, ALPHA_I, b2
  970. MTC c11, $r0
  971. #if ( defined(LEFT) && defined(TRANSA)) || \
  972. (!defined(LEFT) && !defined(TRANSA))
  973. sub.d TEMP, K, KK
  974. #ifdef LEFT
  975. addi.d TEMP, TEMP, -1
  976. #else
  977. addi.d TEMP, TEMP, -1
  978. #endif
  979. slli.d TEMP, TEMP, ZBASE_SHIFT
  980. add.d AO, AO, TEMP
  981. add.d BO, BO, TEMP
  982. #endif
  983. #ifdef LEFT
  984. addi.d KK, KK, 1
  985. #endif
  986. ST b1, CO1, -2 * SIZE
  987. ST b2, CO1, -1 * SIZE
  988. blt $r0, I, .L31
  989. #endif
  /*
   * .L39: end of the 1-wide section. In the non-LEFT TRMM build KK advances
   * by 1; B is moved to where BO stopped. Control then falls through to the
   * epilogue at .L999.
   */
  990. .align 3
  991. .L39:
  992. #if defined(TRMMKERNEL) && !defined(LEFT)
  993. addi.d KK, KK, 1
  994. #endif
  995. move B, BO
  /*
   * .L999: function exit. Reloads every register spilled in the prologue
   * from the same stack offsets, releases the 128-byte frame and returns
   * through $r1.
   * NOTE(review): just before returning, $r4, $f0 and $f1 are overwritten
   * with $r17 (I), $f22 (a1) and $f23 (b1); the reason is not evident from
   * this file - confirm whether callers use these values.
   */
  996. .align 3
  997. .L999:
  998. LDARG $r23, $sp, 0
  999. LDARG $r24, $sp, 8
  1000. LDARG $r25, $sp, 64
  1001. fld.d $f24, $sp, 16
  1002. fld.d $f25, $sp, 24
  1003. fld.d $f26, $sp, 32
  1004. fld.d $f27, $sp, 40
  1005. fld.d $f28, $sp, 48
  1006. fld.d $f29, $sp, 56
  1007. #if defined(TRMMKERNEL)
  1008. LDARG $r26, $sp, 72
  1009. LDARG $r27, $sp, 80
  1010. #endif
  1011. #ifndef __64BIT__
  1012. fld.d $f18, $sp, 88
  1013. fld.d $f19, $sp, 96
  1014. fld.d $f20, $sp, 104
  1015. fld.d $f21, $sp, 112
  1016. #endif
  1017. addi.d $sp, $sp, 128
  1018. move $r4, $r17
  1019. fmov.d $f0, $f22
  1020. fmov.d $f1, $f23
  1021. jirl $r0, $r1, 0x0
  1022. EPILOGUE