You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cgemm_kernel_8x4_lsx.S 104 kB


  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
  29. /* Function parameters */
  30. #define M $r4 // param 1: bm
  31. #define N $r5 // param 2: bn
  32. #define K $r6 // param 3: bk
  33. #define ALPHA_R $f0 // param 4: alphar
  34. #define ALPHA_I $f1 // param 5: alphai
  35. #define A $r7 // param 6: ba
  36. #define B $r8 // param 7: bb
  37. #define C $r9 // param 8: bc
  38. #define LDC $r10 // param 9: ldc
  39. #if defined (TRMMKERNEL)
  40. #define OFFSET $r11 // param 10: offset
  41. #endif
  42. #define OFF $r26 // TRMM offset counter; NOTE(review): same register as T2 ($r26)
  43. #define I $r12 // loop counter over row blocks of bm
  44. #define J $r13 // loop counter over column blocks of bn
  45. #define L $r14 // inner-loop counter over k
  46. #define TL $r15 // inner-loop trip count; also scratch for the C column stride
  47. #define A0 $r16 // current read pointer into packed A (ptrba)
  48. #define B0 $r17 // current read pointer into packed B (ptrbb)
  49. #define C0 $r18 // C0..C3: pointers to four C columns, one scaled-LDC stride apart
  50. #define C1 $r19
  51. #define C2 $r20
  52. #define C3 $r23
  53. #define T0 $r24 // loop bound scratch (bn/4, bm/8, bm&4)
  54. #define T1 $r25
  55. #define T2 $r26 // NOTE(review): aliases OFF ($r26); a write to T2 would clobber OFF
  56. #define T3 $r27 // scratch for byte offsets when adjusting A0/B0 in the TRMM path
  57. #define a1 $f2
  58. #define a2 $f3
  59. #define a3 $f4
  60. #define a4 $f5
  61. #define a5 $f6
  62. #define a6 $f7
  63. #define a7 $f8
  64. #define a8 $f9
  65. #define b1 $f10
  66. #define b2 $f11
  67. #define b3 $f12
  68. #define b4 $f13
  69. #define b5 $f14
  70. #define b6 $f15
  71. #define b7 $f16
  72. #define b8 $f17
  73. #define c11 $f18
  74. #define c12 $f19
  75. #define c21 $f20
  76. #define c22 $f21
  77. #define c31 $f22
  78. #define c32 $f23
  79. #define c41 $f24
  80. #define c42 $f25
  81. /* LSX vectors */
  82. #define U0 $vr30
  83. #define U1 $vr31
  84. #define U2 $vr2
  85. #define U3 $vr3
  86. #define U4 $vr4
  87. #define U5 $vr5
  88. #define U6 $vr6
  89. #define U7 $vr7
  90. #define U8 $vr8
  91. #define U9 $vr9
  92. #define U10 $vr10
  93. #define U11 $vr11
  94. #define U12 $vr12
  95. #define U13 $vr13
  96. #define U14 $vr14
  97. #define U15 $vr15
  98. #define D0 $vr16 // D0..D11: temporaries for loaded A/B/C data and shuffle results
  99. #define D1 $vr17
  100. #define D2 $vr18
  101. #define D3 $vr19
  102. #define D4 $vr20
  103. #define D5 $vr21
  104. #define D6 $vr22
  105. #define D7 $vr23
  106. #define D8 $vr24
  107. #define D9 $vr25
  108. #define D10 $vr26
  109. #define D11 $vr27
  110. #define D12 $vr28 // NOTE(review): same register as VALPHAR; writing D12 would destroy alpha
  111. #define D13 $vr29 // NOTE(review): same register as VALPHAI; writing D13 would destroy alpha
  112. #define VALPHAR $vr28 // real part of alpha, replicated into every 32-bit lane by the prologue
  113. #define VALPHAI $vr29 // imaginary part of alpha, replicated into every 32-bit lane by the prologue
  114. #if defined(NN) || defined(NT) || defined(TN) || defined(TT) // presumably: neither operand conjugated — TODO confirm
  115. #define VMADD1 VFMADD // real accumulator, a_r * b_r term
  116. #define VMADD2 VFMADD // imag accumulator, a_i * b_r term
  117. #define VMADD3 VNMSUB // real accumulator, a_i * b_i term (assumes VNMSUB = acc - x*y; defined outside this file)
  118. #define VMADD4 VFMADD // imag accumulator, a_r * b_i term
  119. #define MADD1 MADD // MADD1..4: presumably the scalar counterparts of VMADD1..4 (no use visible in this part of the file)
  120. #define MADD2 MADD
  121. #define MADD3 NMSUB
  122. #define MADD4 MADD
  123. #endif
  124. #if defined(NR) || defined(NC) || defined(TR) || defined(TC) // presumably: B operand conjugated — TODO confirm
  125. #define VMADD1 VFMADD // real accumulator, a_r * b_r term
  126. #define VMADD2 VFMADD // imag accumulator, a_i * b_r term
  127. #define VMADD3 VFMADD // real accumulator, a_i * b_i term
  128. #define VMADD4 VNMSUB // imag accumulator, a_r * b_i term
  129. #define MADD1 MADD
  130. #define MADD2 MADD
  131. #define MADD3 MADD
  132. #define MADD4 NMSUB
  133. #endif
  134. #if defined(RN) || defined(RT) || defined(CN) || defined(CT) // presumably: A operand conjugated — TODO confirm
  135. #define VMADD1 VFMADD // real accumulator, a_r * b_r term
  136. #define VMADD2 VNMSUB // imag accumulator, a_i * b_r term
  137. #define VMADD3 VFMADD // real accumulator, a_i * b_i term
  138. #define VMADD4 VFMADD // imag accumulator, a_r * b_i term
  139. #define MADD1 MADD
  140. #define MADD2 NMSUB
  141. #define MADD3 MADD
  142. #define MADD4 MADD
  143. #endif
  144. #if defined(RR) || defined(RC) || defined(CR) || defined(CC) // presumably: both operands conjugated — TODO confirm
  145. #define VMADD1 VFMADD // real accumulator, a_r * b_r term
  146. #define VMADD2 VNMSUB // imag accumulator, a_i * b_r term
  147. #define VMADD3 VNMSUB // real accumulator, a_i * b_i term
  148. #define VMADD4 VNMSUB // imag accumulator, a_r * b_i term
  149. #define MADD1 MADD
  150. #define MADD2 NMSUB
  151. #define MADD3 NMSUB
  152. #define MADD4 NMSUB
  153. #endif
  154. PROLOGUE // kernel entry (macro from common.h)
  155. addi.d $sp, $sp, -128 // 128-byte frame: 5 GPR slots, 9 FP slots, 2 alpha slots
  156. SDARG $r23, $sp, 0 // save the GPRs used as C3, T0, T1, T2/OFF, T3
  157. SDARG $r24, $sp, 8
  158. SDARG $r25, $sp, 16
  159. SDARG $r26, $sp, 24
  160. SDARG $r27, $sp, 32
  161. ST $f23, $sp, 40 // NOTE(review): ST is defined outside this file; if it is a 32-bit store in this build, only the low word of each saved FP register is preserved — confirm against the ABI
  162. ST $f24, $sp, 48
  163. ST $f25, $sp, 56
  164. ST $f26, $sp, 64
  165. ST $f27, $sp, 72
  166. ST $f28, $sp, 80
  167. ST $f29, $sp, 88
  168. ST $f30, $sp, 96
  169. ST $f31, $sp, 104
  170. ST ALPHA_R,$sp, 112 // spill alpha so it can be reloaded as a lane broadcast
  171. ST ALPHA_I,$sp, 120
  172. vldrepl.w VALPHAR, $sp, 112 // VALPHAR = alpha real part in every 32-bit lane
  173. vldrepl.w VALPHAI, $sp, 120 // VALPHAI = alpha imaginary part in every 32-bit lane
  174. #if defined (TRMMKERNEL) && !defined(LEFT)
  175. sub.d OFF, $r0, OFFSET // OFF = -OFFSET
  176. #else
  177. xor OFF, OFF, OFF // OFF = 0
  178. #endif
  179. slli.d LDC, LDC, 2 // LDC *= 4; .L10 doubles it again, giving 8 bytes per LDC unit
  180. move J, $r0 // j = 0
  181. srai.d T0, N, 2 //bn/4
  182. beq J, T0, .L19 // no full 4-column block: skip to .L19 (outside this view)
  183. .L10: /* for(j=0; j<bn/4; j+=1) */
  184. move C0, C
  185. slli.d TL, LDC, 1
  186. add.d C1, C0, TL
  187. add.d C2, C1, TL
  188. add.d C3, C2, TL
  189. move A0, A //ptrba
  190. #if defined(TRMMKERNEL) && defined(LEFT)
  191. move OFF, OFFSET
  192. #endif
  193. move I, $r0
  194. srai.d T0, M, 3 //bm/8
  195. beq I, T0, .L150
  196. .L11: /* for(i=0; i<bm/8; i+=1) */
  197. move B0, B //ptrbb
  198. move TL, K /* TL = bk */
  199. #if defined(TRMMKERNEL)
  200. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  201. move B0, B //ptrbb
  202. #else
  203. slli.d T3, OFF, 0x06
  204. add.d A0, A0, T3
  205. slli.d T3, OFF, 0x05
  206. add.d B0, B, T3
  207. #endif
  208. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. sub.d TL, K, OFF //temp
  210. #elif defined(LEFT)
  211. addi.d TL, OFF, 8
  212. #else
  213. addi.d TL, OFF, 4
  214. #endif
  215. #endif // #if defined(TRMMKERNEL)
  216. vxor.v U0, U0, U0
  217. vxor.v U1, U1, U1
  218. vxor.v U2, U2, U2
  219. vxor.v U3, U3, U3
  220. vxor.v U4, U4, U4
  221. vxor.v U5, U5, U5
  222. vxor.v U6, U6, U6
  223. vxor.v U7, U7, U7
  224. vxor.v U8, U8, U8
  225. vxor.v U9, U9, U9
  226. vxor.v U10, U10, U10
  227. vxor.v U11, U11, U11
  228. vxor.v U12, U12, U12
  229. vxor.v U13, U13, U13
  230. vxor.v U14, U14, U14
  231. vxor.v U15, U15, U15
  232. move L, $r0 //cycle param k
  233. beq L, TL, .L13
  234. blt TL, L, .L13
  235. .L12: /* for(k=0; k<temp; k+=1): one k step of the 8x4 complex block */
  236. vld D0, A0, 0x00 // a0ri a1ri
  237. vld D2, B0, 0x00 // b0ri b1ri
  238. vld D3, B0, 0x10 // b2ri b3ri
  239. vshuf4i.w D4, D0, 0x00 //a0r broadcast to all lanes
  240. vshuf4i.w D5, D0, 0x55 //a0i broadcast to all lanes
  241. vpackev.w D6, D3, D2 // even words of D2/D3 = real parts of b0..b3 (interleaved order)
  242. vshuf4i.w D6, D6, 0xd8 //b0r b1r b2r b3r
  243. vpackod.w D7, D3, D2 // odd words of D2/D3 = imaginary parts of b0..b3 (interleaved order)
  244. vshuf4i.w D7, D7, 0xd8 //b0i b1i b2i b3i
  245. VMADD1 U0, D4, D6, U0 //00r 10r 20r 30r
  246. VMADD2 U1, D5, D6, U1 //00i 10i 20i 30i
  247. VMADD3 U0, D5, D7, U0 // real accumulator: a_i * b_i term
  248. VMADD4 U1, D4, D7, U1 // imag accumulator: a_r * b_i term
  249. vshuf4i.w D4, D0, 0xaa //a1r
  250. vshuf4i.w D5, D0, 0xff //a1i
  251. VMADD1 U2, D4, D6, U2 //01r 11r 21r 31r
  252. VMADD2 U3, D5, D6, U3 //01i 11i 21i 31i
  253. VMADD3 U2, D5, D7, U2
  254. VMADD4 U3, D4, D7, U3
  255. vld D0, A0, 0x10 // a2ri a3ri
  256. vshuf4i.w D4, D0, 0x00 //a2r
  257. vshuf4i.w D5, D0, 0x55 //a2i
  258. VMADD1 U4, D4, D6, U4 //02r 12r 22r 32r
  259. VMADD2 U5, D5, D6, U5 //02i 12i 22i 32i
  260. VMADD3 U4, D5, D7, U4
  261. VMADD4 U5, D4, D7, U5
  262. vshuf4i.w D4, D0, 0xaa //a3r
  263. vshuf4i.w D5, D0, 0xff //a3i
  264. VMADD1 U6, D4, D6, U6 //03r 13r 23r 33r
  265. VMADD2 U7, D5, D6, U7 //03i 13i 23i 33i
  266. VMADD3 U6, D5, D7, U6
  267. VMADD4 U7, D4, D7, U7
  268. vld D0, A0, 0x20 // a4ri a5ri
  269. vshuf4i.w D4, D0, 0x00 //a4r
  270. vshuf4i.w D5, D0, 0x55 //a4i
  271. VMADD1 U8, D4, D6, U8 //04r 14r 24r 34r
  272. VMADD2 U9, D5, D6, U9 //04i 14i 24i 34i
  273. VMADD3 U8, D5, D7, U8
  274. VMADD4 U9, D4, D7, U9
  275. vshuf4i.w D4, D0, 0xaa //a5r
  276. vshuf4i.w D5, D0, 0xff //a5i
  277. VMADD1 U10, D4, D6, U10 //05r 15r 25r 35r
  278. VMADD2 U11, D5, D6, U11 //05i 15i 25i 35i
  279. VMADD3 U10, D5, D7, U10
  280. VMADD4 U11, D4, D7, U11
  281. vld D0, A0, 0x30 // a6ri a7ri
  282. vshuf4i.w D4, D0, 0x00 //a6r
  283. vshuf4i.w D5, D0, 0x55 //a6i
  284. VMADD1 U12, D4, D6, U12 //06r 16r 26r 36r
  285. VMADD2 U13, D5, D6, U13 //06i 16i 26i 36i
  286. VMADD3 U12, D5, D7, U12
  287. VMADD4 U13, D4, D7, U13
  288. vshuf4i.w D4, D0, 0xaa //a7r
  289. vshuf4i.w D5, D0, 0xff //a7i
  290. VMADD1 U14, D4, D6, U14 //07r 17r 27r 37r
  291. VMADD2 U15, D5, D6, U15 //07i 17i 27i 37i
  292. VMADD3 U14, D5, D7, U14
  293. VMADD4 U15, D4, D7, U15
  294. addi.d A0, A0, 0x40 // advance A by the 64 bytes (a0..a7) consumed this step
  295. addi.d B0, B0, 0x20 // advance B by the 32 bytes (b0..b3) consumed this step
  296. addi.d L, L, 1
  297. blt L, TL, .L12 // repeat while k < TL
  298. .L13:
  299. #if defined(TRMMKERNEL)
  300. //res00 res10 res20 res30
  301. vld D0, C0, 0x00 //c0: 0 1 2 3
  302. vld D1, C1, 0x00 //c1: 0 1 2 3
  303. vld D2, C2, 0x00 //c2: 0 1 2 3
  304. vld D3, C3, 0x00 //c3: 0 1 2 3
  305. vand.v D4, D1, D1
  306. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  307. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  308. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  309. vand.v D5, D3, D3
  310. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  311. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  312. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  313. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  314. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  315. vfmul.s D8, U0, VALPHAR
  316. vfmul.s D9, U1, VALPHAR
  317. VNMSUB D8, U1, VALPHAI, D8
  318. VFMADD D9, U0, VALPHAI, D9
  319. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  320. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  321. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  322. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  323. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  324. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  325. //res01 res11 res21 res31
  326. vand.v D4, D1, D1
  327. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  328. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  329. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  330. vand.v D5, D3, D3
  331. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  332. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  333. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  334. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  335. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  336. vfmul.s D8, U2, VALPHAR
  337. vfmul.s D9, U3, VALPHAR
  338. VNMSUB D8, U3, VALPHAI, D8
  339. VFMADD D9, U2, VALPHAI, D9
  340. vand.v D4, D9, D9
  341. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  342. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  343. vand.v D2, D4, D4
  344. vand.v D5, D9, D9
  345. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  346. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  347. vand.v D3, D5, D5
  348. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  349. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  350. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  351. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  352. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  353. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  354. vst D4, C0, 0x00
  355. vst D2, C1, 0x00
  356. vst D5, C2, 0x00
  357. vst D3, C3, 0x00
  358. addi.d C0, C0, 0x10
  359. addi.d C1, C1, 0x10
  360. addi.d C2, C2, 0x10
  361. addi.d C3, C3, 0x10
  362. //res02 res12 res22 res32
  363. vld D0, C0, 0x00 //c0: 0 1 2 3
  364. vld D1, C1, 0x00 //c1: 0 1 2 3
  365. vld D2, C2, 0x00 //c2: 0 1 2 3
  366. vld D3, C3, 0x00 //c3: 0 1 2 3
  367. vand.v D4, D1, D1
  368. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  369. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  370. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  371. vand.v D5, D3, D3
  372. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  373. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  374. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  375. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  376. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  377. vfmul.s D8, U4, VALPHAR
  378. vfmul.s D9, U5, VALPHAR
  379. VNMSUB D8, U5, VALPHAI, D8
  380. VFMADD D9, U4, VALPHAI, D9
  381. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  382. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  383. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  384. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  385. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  386. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  387. //res03 res13 res23 res33
  388. vand.v D4, D1, D1
  389. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  390. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  391. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  392. vand.v D5, D3, D3
  393. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  394. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  395. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  396. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  397. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  398. vfmul.s D8, U6, VALPHAR
  399. vfmul.s D9, U7, VALPHAR
  400. VNMSUB D8, U7, VALPHAI, D8
  401. VFMADD D9, U6, VALPHAI, D9
  402. vand.v D4, D9, D9
  403. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  404. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  405. vand.v D2, D4, D4
  406. vand.v D5, D9, D9
  407. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  408. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  409. vand.v D3, D5, D5
  410. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  411. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  412. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  413. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  414. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  415. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  416. vst D4, C0, 0x00
  417. vst D2, C1, 0x00
  418. vst D5, C2, 0x00
  419. vst D3, C3, 0x00
  420. addi.d C0, C0, 0x10
  421. addi.d C1, C1, 0x10
  422. addi.d C2, C2, 0x10
  423. addi.d C3, C3, 0x10
  424. //res04 res14 res24 res34
  425. vld D0, C0, 0x00 //c0: 0 1 2 3
  426. vld D1, C1, 0x00 //c1: 0 1 2 3
  427. vld D2, C2, 0x00 //c2: 0 1 2 3
  428. vld D3, C3, 0x00 //c3: 0 1 2 3
  429. vand.v D4, D1, D1
  430. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  431. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  432. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  433. vand.v D5, D3, D3
  434. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  435. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  436. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  437. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  438. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  439. vfmul.s D8, U8, VALPHAR
  440. vfmul.s D9, U9, VALPHAR
  441. VNMSUB D8, U9, VALPHAI, D8
  442. VFMADD D9, U8, VALPHAI, D9
  443. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  444. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  445. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  446. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  447. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  448. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  449. //res05 res15 res25 res35
  450. vand.v D4, D1, D1
  451. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  452. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  453. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  454. vand.v D5, D3, D3
  455. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  456. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  457. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  458. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  459. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  460. vfmul.s D8, U10, VALPHAR
  461. vfmul.s D9, U11, VALPHAR
  462. VNMSUB D8, U11, VALPHAI, D8
  463. VFMADD D9, U10, VALPHAI, D9
  464. vand.v D4, D9, D9
  465. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  466. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  467. vand.v D2, D4, D4
  468. vand.v D5, D9, D9
  469. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  470. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  471. vand.v D3, D5, D5
  472. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  473. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  474. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  475. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  476. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  477. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  478. vst D4, C0, 0x00
  479. vst D2, C1, 0x00
  480. vst D5, C2, 0x00
  481. vst D3, C3, 0x00
  482. addi.d C0, C0, 0x10
  483. addi.d C1, C1, 0x10
  484. addi.d C2, C2, 0x10
  485. addi.d C3, C3, 0x10
  486. //res06 res16 res26 res36
  487. vld D0, C0, 0x00 //c0: 0 1 2 3
  488. vld D1, C1, 0x00 //c1: 0 1 2 3
  489. vld D2, C2, 0x00 //c2: 0 1 2 3
  490. vld D3, C3, 0x00 //c3: 0 1 2 3
  491. vand.v D4, D1, D1
  492. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  493. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  494. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  495. vand.v D5, D3, D3
  496. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  497. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  498. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  499. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  500. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  501. vfmul.s D8, U12, VALPHAR
  502. vfmul.s D9, U13, VALPHAR
  503. VNMSUB D8, U13, VALPHAI, D8
  504. VFMADD D9, U12, VALPHAI, D9
  505. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  506. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  507. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  508. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  509. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  510. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  511. //res07 res17 res27 res37
  512. vand.v D4, D1, D1
  513. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  514. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  515. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  516. vand.v D5, D3, D3
  517. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  518. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  519. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  520. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  521. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  522. vfmul.s D8, U14, VALPHAR
  523. vfmul.s D9, U15, VALPHAR
  524. VNMSUB D8, U15, VALPHAI, D8
  525. VFMADD D9, U14, VALPHAI, D9
  526. vand.v D4, D9, D9
  527. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  528. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  529. vand.v D2, D4, D4
  530. vand.v D5, D9, D9
  531. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  532. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  533. vand.v D3, D5, D5
  534. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  535. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  536. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  537. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  538. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  539. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  540. vst D4, C0, 0x00
  541. vst D2, C1, 0x00
  542. vst D5, C2, 0x00
  543. vst D3, C3, 0x00
  544. addi.d C0, C0, 0x10
  545. addi.d C1, C1, 0x10
  546. addi.d C2, C2, 0x10
  547. addi.d C3, C3, 0x10
  548. #else
  549. //res00 res10 res20 res30
  550. vld D0, C0, 0x00 //c0: 0 1 2 3
  551. vld D1, C1, 0x00 //c1: 0 1 2 3
  552. vld D2, C2, 0x00 //c2: 0 1 2 3
  553. vld D3, C3, 0x00 //c3: 0 1 2 3
  554. vand.v D4, D1, D1
  555. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  556. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  557. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  558. vand.v D5, D3, D3
  559. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  560. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  561. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  562. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  563. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  564. VFMADD D8, U0, VALPHAR, D8
  565. VFMADD D9, U1, VALPHAR, D9
  566. VNMSUB D8, U1, VALPHAI, D8
  567. VFMADD D9, U0, VALPHAI, D9
  568. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  569. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  570. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  571. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  572. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  573. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  574. //res01 res11 res21 res31
  575. vand.v D4, D1, D1
  576. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  577. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  578. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  579. vand.v D5, D3, D3
  580. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  581. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  582. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  583. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  584. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  585. VFMADD D8, U2, VALPHAR, D8
  586. VFMADD D9, U3, VALPHAR, D9
  587. VNMSUB D8, U3, VALPHAI, D8
  588. VFMADD D9, U2, VALPHAI, D9
  589. vand.v D4, D9, D9
  590. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  591. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  592. vand.v D2, D4, D4
  593. vand.v D5, D9, D9
  594. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  595. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  596. vand.v D3, D5, D5
  597. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  598. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  599. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  600. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  601. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  602. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  603. vst D4, C0, 0x00
  604. vst D2, C1, 0x00
  605. vst D5, C2, 0x00
  606. vst D3, C3, 0x00
  607. addi.d C0, C0, 0x10
  608. addi.d C1, C1, 0x10
  609. addi.d C2, C2, 0x10
  610. addi.d C3, C3, 0x10
  611. //res02 res12 res22 res32
  612. vld D0, C0, 0x00 //c0: 0 1 2 3
  613. vld D1, C1, 0x00 //c1: 0 1 2 3
  614. vld D2, C2, 0x00 //c2: 0 1 2 3
  615. vld D3, C3, 0x00 //c3: 0 1 2 3
  616. vand.v D4, D1, D1
  617. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  618. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  619. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  620. vand.v D5, D3, D3
  621. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  622. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  623. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  624. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  625. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  626. VFMADD D8, U4, VALPHAR, D8
  627. VFMADD D9, U5, VALPHAR, D9
  628. VNMSUB D8, U5, VALPHAI, D8
  629. VFMADD D9, U4, VALPHAI, D9
  630. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  631. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  632. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  633. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  634. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  635. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  636. //res03 res13 res23 res33
  637. vand.v D4, D1, D1
  638. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  639. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  640. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  641. vand.v D5, D3, D3
  642. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  643. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  644. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  645. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  646. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  647. VFMADD D8, U6, VALPHAR, D8
  648. VFMADD D9, U7, VALPHAR, D9
  649. VNMSUB D8, U7, VALPHAI, D8
  650. VFMADD D9, U6, VALPHAI, D9
  651. vand.v D4, D9, D9
  652. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  653. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  654. vand.v D2, D4, D4
  655. vand.v D5, D9, D9
  656. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  657. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  658. vand.v D3, D5, D5
  659. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  660. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  661. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  662. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  663. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  664. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  665. vst D4, C0, 0x00
  666. vst D2, C1, 0x00
  667. vst D5, C2, 0x00
  668. vst D3, C3, 0x00
  669. addi.d C0, C0, 0x10
  670. addi.d C1, C1, 0x10
  671. addi.d C2, C2, 0x10
  672. addi.d C3, C3, 0x10
  673. //res04 res14 res24 res34
  674. vld D0, C0, 0x00 //c0: 0 1 2 3
  675. vld D1, C1, 0x00 //c1: 0 1 2 3
  676. vld D2, C2, 0x00 //c2: 0 1 2 3
  677. vld D3, C3, 0x00 //c3: 0 1 2 3
  678. vand.v D4, D1, D1
  679. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  680. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  681. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  682. vand.v D5, D3, D3
  683. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  684. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  685. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  686. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  687. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  688. VFMADD D8, U8, VALPHAR, D8
  689. VFMADD D9, U9, VALPHAR, D9
  690. VNMSUB D8, U9, VALPHAI, D8
  691. VFMADD D9, U8, VALPHAI, D9
  692. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  693. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  694. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  695. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  696. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  697. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  698. //res05 res15 res25 res35
  699. vand.v D4, D1, D1
  700. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  701. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  702. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  703. vand.v D5, D3, D3
  704. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  705. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  706. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  707. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  708. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  709. VFMADD D8, U10, VALPHAR, D8
  710. VFMADD D9, U11, VALPHAR, D9
  711. VNMSUB D8, U11, VALPHAI, D8
  712. VFMADD D9, U10, VALPHAI, D9
  713. vand.v D4, D9, D9
  714. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  715. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  716. vand.v D2, D4, D4
  717. vand.v D5, D9, D9
  718. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  719. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  720. vand.v D3, D5, D5
  721. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  722. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  723. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  724. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  725. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  726. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  727. vst D4, C0, 0x00
  728. vst D2, C1, 0x00
  729. vst D5, C2, 0x00
  730. vst D3, C3, 0x00
  731. addi.d C0, C0, 0x10
  732. addi.d C1, C1, 0x10
  733. addi.d C2, C2, 0x10
  734. addi.d C3, C3, 0x10
  735. //res06 res16 res26 res36
  736. vld D0, C0, 0x00 //c0: 0 1 2 3
  737. vld D1, C1, 0x00 //c1: 0 1 2 3
  738. vld D2, C2, 0x00 //c2: 0 1 2 3
  739. vld D3, C3, 0x00 //c3: 0 1 2 3
  740. vand.v D4, D1, D1
  741. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  742. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  743. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  744. vand.v D5, D3, D3
  745. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  746. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  747. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  748. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  749. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  750. VFMADD D8, U12, VALPHAR, D8
  751. VFMADD D9, U13, VALPHAR, D9
  752. VNMSUB D8, U13, VALPHAI, D8
  753. VFMADD D9, U12, VALPHAI, D9
  754. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  755. vand.v D11, D9, D9 //c0[0] c1[0] c2[0] c3[0]
  756. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  757. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  758. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  759. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  760. //res07 res17 res27 res37
  761. vand.v D4, D1, D1
  762. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  763. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  764. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  765. vand.v D5, D3, D3
  766. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  767. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  768. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  769. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  770. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  771. VFMADD D8, U14, VALPHAR, D8
  772. VFMADD D9, U15, VALPHAR, D9
  773. VNMSUB D8, U15, VALPHAI, D8
  774. VFMADD D9, U14, VALPHAI, D9
  775. vand.v D4, D9, D9
  776. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  777. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  778. vand.v D2, D4, D4
  779. vand.v D5, D9, D9
  780. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  781. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  782. vand.v D3, D5, D5
  783. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  784. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  785. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  786. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  787. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  788. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  789. vst D4, C0, 0x00
  790. vst D2, C1, 0x00
  791. vst D5, C2, 0x00
  792. vst D3, C3, 0x00
  793. addi.d C0, C0, 0x10
  794. addi.d C1, C1, 0x10
  795. addi.d C2, C2, 0x10
  796. addi.d C3, C3, 0x10
  797. #endif
  798. #if defined(TRMMKERNEL)
  799. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  800. sub.d TL, K, OFF
  801. #ifdef LEFT
  802. addi.d TL, TL, -8
  803. #else
  804. addi.d TL, TL, -4
  805. #endif
  806. slli.d T3, TL, 0x06
  807. add.d A0, A0, T3
  808. slli.d T3, TL, 0x05
  809. add.d B0, B0, T3
  810. #endif
  811. #ifdef LEFT
  812. addi.d OFF, OFF, 8
  813. #endif
  814. #endif // #if defined(TRMMKERNEL)
  815. addi.d I, I, 1
  816. blt I, T0, .L11
  817. .L150: // 4-row remainder of the current 4-column panel: set up the 4x4 tile
  818. move I, $r0
  819. andi T0, M, 4
  820. beq I, T0, .L18 // (M & 4) == 0: no 4-row tile, continue with the 2-row tail
  821. .L15: /* if (bm & 4) */
  822. move B0, B //ptrbb
  823. move TL, K /* TL = bk */
  824. #if defined(TRMMKERNEL)
  825. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  826. move B0, B //ptrbb
  827. #else
  828. slli.d T3, OFF, 0x05 // OFF * 0x20: the k-loop below consumes 0x20 bytes of A per step
  829. add.d A0, A0, T3 // skip the first OFF k-steps of A
  830. slli.d T3, OFF, 0x05 // OFF * 0x20: the k-loop below consumes 0x20 bytes of B per step
  831. add.d B0, B, T3 // skip the first OFF k-steps of B
  832. #endif
  833. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  834. sub.d TL, K, OFF // trip count = K - OFF
  835. #elif defined(LEFT)
  836. addi.d TL, OFF, 4 // trip count = OFF + 4
  837. #else
  838. addi.d TL, OFF, 4 // trip count = OFF + 4 (same value as the LEFT case for this tile)
  839. #endif
  840. #endif // #if defined(TRMMKERNEL)
  841. vxor.v U0, U0, U0 // clear the eight accumulators U0..U7
  842. vxor.v U1, U1, U1
  843. vxor.v U2, U2, U2
  844. vxor.v U3, U3, U3
  845. vxor.v U4, U4, U4
  846. vxor.v U5, U5, U5
  847. vxor.v U6, U6, U6
  848. vxor.v U7, U7, U7
  849. move L, $r0 //cycle param k
  850. beq L, TL, .L17 // TL == 0: skip the k-loop
  851. blt TL, L, .L17 // TL < 0: skip the k-loop
  852. .L16: /* for (k=0; k<temp; k++) */ // 4x4 tile: one k-step reads 0x20 bytes of A and 0x20 bytes of B
  853. vld D0, A0, 0x00 // a0ri a1ri
  854. vld D2, B0, 0x00 // b0ri b1ri
  855. vld D3, B0, 0x10 // b2ri b3ri
  856. vshuf4i.w D4, D0, 0x00 //a0r
  857. vshuf4i.w D5, D0, 0x55 //a0i
  858. vpackev.w D6, D3, D2 //even words: b0r b2r b1r b3r
  859. vshuf4i.w D6, D6, 0xd8 //b0r b1r b2r b3r
  860. vpackod.w D7, D3, D2 //odd words: b0i b2i b1i b3i
  861. vshuf4i.w D7, D7, 0xd8 //b0i b1i b2i b3i
  862. VMADD1 U0, D4, D6, U0 //00r 10r 20r 30r
  863. VMADD2 U1, D5, D6, U1 //00i 10i 20i 30i
  864. VMADD3 U0, D5, D7, U0 // VMADD1..4 are macros defined outside this section; presumably they pick add/sub per conjugation variant - confirm
  865. VMADD4 U1, D4, D7, U1
  866. vshuf4i.w D4, D0, 0xaa //a1r
  867. vshuf4i.w D5, D0, 0xff //a1i
  868. VMADD1 U2, D4, D6, U2 //01r 11r 21r 31r
  869. VMADD2 U3, D5, D6, U3 //01i 11i 21i 31i
  870. VMADD3 U2, D5, D7, U2
  871. VMADD4 U3, D4, D7, U3
  872. vld D0, A0, 0x10 // a2ri a3ri
  873. vshuf4i.w D4, D0, 0x00 //a2r
  874. vshuf4i.w D5, D0, 0x55 //a2i
  875. VMADD1 U4, D4, D6, U4 //02r 12r 22r 32r
  876. VMADD2 U5, D5, D6, U5 //02i 12i 22i 32i
  877. VMADD3 U4, D5, D7, U4
  878. VMADD4 U5, D4, D7, U5
  879. vshuf4i.w D4, D0, 0xaa //a3r
  880. vshuf4i.w D5, D0, 0xff //a3i
  881. VMADD1 U6, D4, D6, U6 //03r 13r 23r 33r
  882. VMADD2 U7, D5, D6, U7 //03i 13i 23i 33i
  883. VMADD3 U6, D5, D7, U6
  884. VMADD4 U7, D4, D7, U7
  885. addi.d A0, A0, 0x20 // next k-step of A
  886. addi.d B0, B0, 0x20 // next k-step of B
  887. addi.d L, L, 1
  888. blt L, TL, .L16
  889. .L17: // write back the 4x4 tile from U0..U7; each pass handles one 16-byte vector per column C0..C3
  890. #if defined(TRMMKERNEL)
  891. //res00 res10 res20 res30
  892. vld D0, C0, 0x00 //c0: 0 1 2 3
  893. vld D1, C1, 0x00 //c1: 0 1 2 3
  894. vld D2, C2, 0x00 //c2: 0 1 2 3
  895. vld D3, C3, 0x00 //c3: 0 1 2 3
  896. vand.v D4, D1, D1
  897. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  898. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  899. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  900. vand.v D5, D3, D3
  901. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  902. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  903. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  904. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  905. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  906. vfmul.s D8, U0, VALPHAR // TRMM: D8/D9 are overwritten here, so the C values gathered above do not reach the result
  907. vfmul.s D9, U1, VALPHAR
  908. VNMSUB D8, U1, VALPHAI, D8 // VNMSUB/VFMADD are macros defined outside this section; presumably d = c - a*b and d = a*b + c
  909. VFMADD D9, U0, VALPHAI, D9
  910. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  911. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  912. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  913. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  914. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  915. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  916. //res01 res11 res21 res31
  917. vand.v D4, D1, D1
  918. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  919. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  920. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  921. vand.v D5, D3, D3
  922. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  923. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  924. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  925. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  926. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  927. vfmul.s D8, U2, VALPHAR
  928. vfmul.s D9, U3, VALPHAR
  929. VNMSUB D8, U3, VALPHAI, D8
  930. VFMADD D9, U2, VALPHAI, D9
  931. vand.v D4, D9, D9
  932. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  933. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  934. vand.v D2, D4, D4
  935. vand.v D5, D9, D9
  936. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  937. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  938. vand.v D3, D5, D5
  939. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  940. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  941. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  942. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  943. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  944. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  945. vst D4, C0, 0x00
  946. vst D2, C1, 0x00
  947. vst D5, C2, 0x00
  948. vst D3, C3, 0x00
  949. addi.d C0, C0, 0x10
  950. addi.d C1, C1, 0x10
  951. addi.d C2, C2, 0x10
  952. addi.d C3, C3, 0x10
  953. //res02 res12 res22 res32
  954. vld D0, C0, 0x00 //c0: 0 1 2 3
  955. vld D1, C1, 0x00 //c1: 0 1 2 3
  956. vld D2, C2, 0x00 //c2: 0 1 2 3
  957. vld D3, C3, 0x00 //c3: 0 1 2 3
  958. vand.v D4, D1, D1
  959. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  960. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  961. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  962. vand.v D5, D3, D3
  963. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  964. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  965. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  966. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  967. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  968. vfmul.s D8, U4, VALPHAR
  969. vfmul.s D9, U5, VALPHAR
  970. VNMSUB D8, U5, VALPHAI, D8
  971. VFMADD D9, U4, VALPHAI, D9
  972. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  973. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  974. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  975. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  976. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  977. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  978. //res03 res13 res23 res33
  979. vand.v D4, D1, D1
  980. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  981. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  982. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  983. vand.v D5, D3, D3
  984. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  985. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  986. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  987. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  988. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  989. vfmul.s D8, U6, VALPHAR
  990. vfmul.s D9, U7, VALPHAR
  991. VNMSUB D8, U7, VALPHAI, D8
  992. VFMADD D9, U6, VALPHAI, D9
  993. vand.v D4, D9, D9
  994. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  995. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  996. vand.v D2, D4, D4
  997. vand.v D5, D9, D9
  998. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  999. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  1000. vand.v D3, D5, D5
  1001. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  1002. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  1003. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  1004. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  1005. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  1006. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  1007. vst D4, C0, 0x00
  1008. vst D2, C1, 0x00
  1009. vst D5, C2, 0x00
  1010. vst D3, C3, 0x00
  1011. addi.d C0, C0, 0x10
  1012. addi.d C1, C1, 0x10
  1013. addi.d C2, C2, 0x10
  1014. addi.d C3, C3, 0x10
  1015. #else
  1016. //res00 res10 res20 res30
  1017. vld D0, C0, 0x00 //c0: 0 1 2 3
  1018. vld D1, C1, 0x00 //c1: 0 1 2 3
  1019. vld D2, C2, 0x00 //c2: 0 1 2 3
  1020. vld D3, C3, 0x00 //c3: 0 1 2 3
  1021. vand.v D4, D1, D1
  1022. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  1023. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  1024. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  1025. vand.v D5, D3, D3
  1026. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  1027. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  1028. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  1029. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  1030. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  1031. VFMADD D8, U0, VALPHAR, D8 // GEMM: accumulate onto the C values gathered above
  1032. VFMADD D9, U1, VALPHAR, D9
  1033. VNMSUB D8, U1, VALPHAI, D8
  1034. VFMADD D9, U0, VALPHAI, D9
  1037. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  1038. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  1039. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  1040. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  1041. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  1042. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  1043. //res01 res11 res21 res31
  1044. vand.v D4, D1, D1
  1045. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  1046. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  1047. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  1048. vand.v D5, D3, D3
  1049. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  1050. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  1051. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  1052. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  1053. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  1054. VFMADD D8, U2, VALPHAR, D8
  1055. VFMADD D9, U3, VALPHAR, D9
  1056. VNMSUB D8, U3, VALPHAI, D8
  1057. VFMADD D9, U2, VALPHAI, D9
  1058. vand.v D4, D9, D9
  1059. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  1060. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  1061. vand.v D2, D4, D4
  1062. vand.v D5, D9, D9
  1063. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  1064. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  1065. vand.v D3, D5, D5
  1066. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  1067. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  1068. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  1069. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  1070. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  1071. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  1082. vst D4, C0, 0x00
  1083. vst D2, C1, 0x00
  1084. vst D5, C2, 0x00
  1085. vst D3, C3, 0x00
  1102. addi.d C0, C0, 0x10
  1103. addi.d C1, C1, 0x10
  1104. addi.d C2, C2, 0x10
  1105. addi.d C3, C3, 0x10
  1106. //res02 res12 res22 res32
  1107. vld D0, C0, 0x00 //c0: 0 1 2 3
  1108. vld D1, C1, 0x00 //c1: 0 1 2 3
  1109. vld D2, C2, 0x00 //c2: 0 1 2 3
  1110. vld D3, C3, 0x00 //c3: 0 1 2 3
  1111. vand.v D4, D1, D1
  1112. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  1113. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  1114. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  1115. vand.v D5, D3, D3
  1116. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  1117. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  1118. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  1119. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  1120. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  1121. VFMADD D8, U4, VALPHAR, D8
  1122. VFMADD D9, U5, VALPHAR, D9
  1123. VNMSUB D8, U5, VALPHAI, D8
  1124. VFMADD D9, U4, VALPHAI, D9
  1125. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  1126. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  1127. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  1128. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  1129. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  1130. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  1131. //res03 res13 res23 res33
  1132. vand.v D4, D1, D1
  1133. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  1134. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  1135. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  1136. vand.v D5, D3, D3
  1137. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  1138. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  1139. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  1140. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  1141. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  1142. VFMADD D8, U6, VALPHAR, D8
  1143. VFMADD D9, U7, VALPHAR, D9
  1144. VNMSUB D8, U7, VALPHAI, D8
  1145. VFMADD D9, U6, VALPHAI, D9
  1146. vand.v D4, D9, D9
  1147. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  1148. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  1149. vand.v D2, D4, D4
  1150. vand.v D5, D9, D9
  1151. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  1152. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  1153. vand.v D3, D5, D5
  1154. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  1155. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  1156. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  1157. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  1158. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  1159. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  1160. vst D4, C0, 0x00
  1161. vst D2, C1, 0x00
  1162. vst D5, C2, 0x00
  1163. vst D3, C3, 0x00
  1164. addi.d C0, C0, 0x10
  1165. addi.d C1, C1, 0x10
  1166. addi.d C2, C2, 0x10
  1167. addi.d C3, C3, 0x10
  1168. #endif
  1169. #if defined(TRMMKERNEL)
  1170. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1171. sub.d TL, K, OFF
  1172. #ifdef LEFT
  1173. addi.d TL, TL, -4
  1174. #else
  1175. addi.d TL, TL, -4
  1176. #endif
  1177. slli.d T3, TL, 0x05 // TL * 0x20: A and B both use 0x20 bytes per k-step in this tile
  1178. add.d A0, A0, T3 // step A0 over the k-steps the loop did not consume
  1179. add.d B0, B0, T3 // step B0 over the k-steps the loop did not consume
  1180. #endif
  1181. #ifdef LEFT
  1182. addi.d OFF, OFF, 4 // four more rows done
  1183. #endif
  1184. #endif // #if defined(TRMMKERNEL)
  1185. .L18: /* if (bm & 2) */ // 2-row remainder of the current 4-column panel: set up the 2x4 tile
  1186. move I, $r0
  1187. andi T0, M, 2
  1188. beq I, T0, .L183 // (M & 2) == 0: go to the 1-row tail
  1189. move B0, B //ptrbb
  1190. move TL, K /* TL = bk */
  1191. #if defined(TRMMKERNEL)
  1192. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1193. move B0, B //ptrbb
  1194. #else
  1195. slli.d T3, OFF, 0x04 // OFF * 0x10: the k-loop below consumes 0x10 bytes of A per step
  1196. add.d A0, A0, T3 // skip the first OFF k-steps of A
  1197. slli.d T3, OFF, 0x05 // OFF * 0x20: the k-loop below consumes 0x20 bytes of B per step
  1198. add.d B0, B, T3 // skip the first OFF k-steps of B
  1199. #endif
  1200. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1201. sub.d TL, K, OFF // trip count = K - OFF
  1202. #elif defined(LEFT)
  1203. addi.d TL, OFF, 2 // trip count = OFF + 2 (tile rows)
  1204. #else
  1205. addi.d TL, OFF, 4 // trip count = OFF + 4 (tile columns)
  1206. #endif
  1207. #endif // #if defined(TRMMKERNEL)
  1208. vxor.v U0, U0, U0 // clear the four accumulators U0..U3
  1209. vxor.v U1, U1, U1
  1210. vxor.v U2, U2, U2
  1211. vxor.v U3, U3, U3
  1212. move L, $r0 //cycle param k
  1213. beq L, TL, .L182 // TL == 0: skip the k-loop
  1214. blt TL, L, .L182 // TL < 0: skip the k-loop
  1215. .L181: /* for (k=0; k<temp; k++) */ // 2x4 tile: one k-step reads 0x10 bytes of A and 0x20 bytes of B
  1216. vld D0, A0, 0x00 // a0ri a1ri
  1217. vld D2, B0, 0x00 // b0ri b1ri
  1218. vld D3, B0, 0x10 // b2ri b3ri
  1219. vshuf4i.w D4, D0, 0x00 //a0r
  1220. vshuf4i.w D5, D0, 0x55 //a0i
  1221. vpackev.w D6, D3, D2 //even words: b0r b2r b1r b3r
  1222. vshuf4i.w D6, D6, 0xd8 //b0r b1r b2r b3r
  1223. vpackod.w D7, D3, D2 //odd words: b0i b2i b1i b3i
  1224. vshuf4i.w D7, D7, 0xd8 //b0i b1i b2i b3i
  1225. VMADD1 U0, D4, D6, U0 //00r 10r 20r 30r
  1226. VMADD2 U1, D5, D6, U1 //00i 10i 20i 30i
  1227. VMADD3 U0, D5, D7, U0 // VMADD1..4 are macros defined outside this section; presumably they pick add/sub per conjugation variant - confirm
  1228. VMADD4 U1, D4, D7, U1
  1229. vshuf4i.w D4, D0, 0xaa //a1r
  1230. vshuf4i.w D5, D0, 0xff //a1i
  1231. VMADD1 U2, D4, D6, U2 //01r 11r 21r 31r
  1232. VMADD2 U3, D5, D6, U3 //01i 11i 21i 31i
  1233. VMADD3 U2, D5, D7, U2
  1234. VMADD4 U3, D4, D7, U3
  1235. addi.d A0, A0, 0x10 // next k-step of A
  1236. addi.d B0, B0, 0x20 // next k-step of B
  1237. addi.d L, L, 1
  1238. blt L, TL, .L181
  1239. .L182: // write back the 2x4 tile from U0..U3: one 16-byte vector per column C0..C3
  1240. #if defined(TRMMKERNEL)
  1241. //res00 res10 res20 res30
  1242. vld D0, C0, 0x00 //c0: 0 1 2 3
  1243. vld D1, C1, 0x00 //c1: 0 1 2 3
  1244. vld D2, C2, 0x00 //c2: 0 1 2 3
  1245. vld D3, C3, 0x00 //c3: 0 1 2 3
  1246. vand.v D4, D1, D1
  1247. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  1248. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  1249. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  1250. vand.v D5, D3, D3
  1251. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  1252. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  1253. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  1254. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  1255. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  1256. vfmul.s D8, U0, VALPHAR // TRMM: D8/D9 are overwritten here, so the C values gathered above do not reach the result
  1257. vfmul.s D9, U1, VALPHAR
  1258. VNMSUB D8, U1, VALPHAI, D8 // VNMSUB/VFMADD are macros defined outside this section; presumably d = c - a*b and d = a*b + c
  1259. VFMADD D9, U0, VALPHAI, D9
  1260. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  1261. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  1262. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  1263. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  1264. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  1265. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  1266. //res01 res11 res21 res31
  1267. vand.v D4, D1, D1
  1268. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  1269. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  1270. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  1271. vand.v D5, D3, D3
  1272. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  1273. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  1274. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  1275. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  1276. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  1277. vfmul.s D8, U2, VALPHAR
  1278. vfmul.s D9, U3, VALPHAR
  1279. VNMSUB D8, U3, VALPHAI, D8
  1280. VFMADD D9, U2, VALPHAI, D9
  1281. vand.v D4, D9, D9
  1282. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  1283. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  1284. vand.v D2, D4, D4
  1285. vand.v D5, D9, D9
  1286. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  1287. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  1288. vand.v D3, D5, D5
  1289. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  1290. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  1291. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  1292. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  1293. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  1294. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  1295. vst D4, C0, 0x00
  1296. vst D2, C1, 0x00
  1297. vst D5, C2, 0x00
  1298. vst D3, C3, 0x00
  1299. addi.d C0, C0, 0x10
  1300. addi.d C1, C1, 0x10
  1301. addi.d C2, C2, 0x10
  1302. addi.d C3, C3, 0x10
  1303. #else
  1304. //res00 res10 res20 res30
  1305. vld D0, C0, 0x00 //c0: 0 1 2 3
  1306. vld D1, C1, 0x00 //c1: 0 1 2 3
  1307. vld D2, C2, 0x00 //c2: 0 1 2 3
  1308. vld D3, C3, 0x00 //c3: 0 1 2 3
  1309. vand.v D4, D1, D1
  1310. vpermi.w D4, D0, 0x44 //c0:0 1, c1:0 1
  1311. vshuf4i.w D6, D4, 0xd8 //c0[0] c1[0] c0[1] c1[1]
  1312. vshuf4i.w D7, D4, 0x8d //c0[1] c1[1] c0[0] c1[0]
  1313. vand.v D5, D3, D3
  1314. vpermi.w D5, D2, 0x44 //c2:0 1, c3:0 1
  1315. vshuf4i.w D8, D5, 0xd8 //c2[0] c3[0] c2[1] c3[1]
  1316. vshuf4i.w D9, D5, 0x8d //c2[1] c3[1] c2[0] c3[0]
  1317. vpermi.w D8, D6, 0x44 //c0[0] c1[0] c2[0] c3[0]
  1318. vpermi.w D9, D7, 0x44 //c0[1] c1[1] c2[1] c3[1]
  1319. VFMADD D8, U0, VALPHAR, D8 // GEMM: accumulate onto the C values gathered above
  1320. VFMADD D9, U1, VALPHAR, D9
  1321. VNMSUB D8, U1, VALPHAI, D8
  1322. VFMADD D9, U0, VALPHAI, D9
  1323. vand.v D10, D9, D9 //c0[1] c1[1] c2[1] c3[1]
  1324. vand.v D11, D9, D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
  1325. vpermi.w D10, D8, 0x44 //c0[0] c1[0] c0[1] c1[1]
  1326. vshuf4i.w D10, D10, 0xd8 //c0[0] c0[1] c1[0] c1[1]
  1327. vpermi.w D11, D8, 0xee //c2[0] c3[0] c2[1] c3[1]
  1328. vshuf4i.w D11, D11, 0xd8 //c2[0] c2[1] c3[0] c3[1]
  1329. //res01 res11 res21 res31
  1330. vand.v D4, D1, D1
  1331. vpermi.w D4, D0, 0xee //c0:2 3, c1:2 3
  1332. vshuf4i.w D6, D4, 0xd8 //c0[2] c1[2] c0[3] c1[3]
  1333. vshuf4i.w D7, D4, 0x8d //c0[3] c1[3] c0[2] c1[2]
  1334. vand.v D5, D3, D3
  1335. vpermi.w D5, D2, 0xee //c2:2 3, c3:2 3
  1336. vshuf4i.w D8, D5, 0xd8 //c2[2] c3[2] c2[3] c3[3]
  1337. vshuf4i.w D9, D5, 0x8d //c2[3] c3[3] c2[2] c3[2]
  1338. vpermi.w D8, D6, 0x44 //c0[2] c1[2] c2[2] c3[2]
  1339. vpermi.w D9, D7, 0x44 //c0[3] c1[3] c2[3] c3[3]
  1340. VFMADD D8, U2, VALPHAR, D8
  1341. VFMADD D9, U3, VALPHAR, D9
  1342. VNMSUB D8, U3, VALPHAI, D8
  1343. VFMADD D9, U2, VALPHAI, D9
  1344. vand.v D4, D9, D9
  1345. vpermi.w D4, D8, 0x44 //c0[2] c1[2] c0[3] c1[3]
  1346. vshuf4i.w D4, D4, 0xd8 //c0[2] c0[3] c1[2] c1[3]
  1347. vand.v D2, D4, D4
  1348. vand.v D5, D9, D9
  1349. vpermi.w D5, D8, 0xee //c2[2] c3[2] c2[3] c3[3]
  1350. vshuf4i.w D5, D5, 0xd8 //c2[2] c2[3] c3[2] c3[3]
  1351. vand.v D3, D5, D5
  1352. vand.v D0, D10, D10 //c0[0] c0[1] c1[0] c1[1]
  1353. vand.v D1, D11, D11 //c2[0] c2[1] c3[0] c3[1]
  1354. vpermi.w D4, D0, 0x44 //c0: 0 1 2 3
  1355. vpermi.w D2, D0, 0xee //c1: 0 1 2 3
  1356. vpermi.w D5, D1, 0x44 //c2: 0 1 2 3
  1357. vpermi.w D3, D1, 0xee //c3: 0 1 2 3
  1358. vst D4, C0, 0x00
  1359. vst D2, C1, 0x00
  1360. vst D5, C2, 0x00
  1361. vst D3, C3, 0x00
  1362. addi.d C0, C0, 0x10
  1363. addi.d C1, C1, 0x10
  1364. addi.d C2, C2, 0x10
  1365. addi.d C3, C3, 0x10
  1366. #endif
  1367. #if defined(TRMMKERNEL)
  1368. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1369. sub.d TL, K, OFF
  1370. #ifdef LEFT
  1371. addi.d TL, TL, -2
  1372. #else
  1373. addi.d TL, TL, -4
  1374. #endif
  1375. slli.d T3, TL, 0x04 // TL * 0x10: A bytes per k-step in this tile
  1376. add.d A0, A0, T3 // step A0 over the k-steps the loop did not consume
  1377. slli.d T3, TL, 0x05 // TL * 0x20: B bytes per k-step in this panel
  1378. add.d B0, B0, T3 // step B0 over the k-steps the loop did not consume
  1379. #endif
  1380. #ifdef LEFT
  1381. addi.d OFF, OFF, 2 // two more rows done
  1382. #endif
  1383. #endif // #if defined(TRMMKERNEL)
  1384. .L183: /* if (bm & 1) */ // last single row of the current 4-column panel: set up the 1x4 tile (scalar code)
  1385. move I, $r0
  1386. andi T0, M, 1
  1387. beq I, T0, .L186 // (M & 1) == 0: panel finished
  1388. move B0, B //ptrbb
  1389. move TL, K /* TL = bk */
  1390. #if defined(TRMMKERNEL)
  1391. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1392. move B0, B //ptrbb
  1393. #else
  1394. slli.d T3, OFF, 0x03 // OFF * 0x08: the k-loop below consumes 0x08 bytes of A per step
  1395. add.d A0, A0, T3 // skip the first OFF k-steps of A
  1396. slli.d T3, OFF, 0x05 // OFF * 0x20: the k-loop below consumes 0x20 bytes of B per step
  1397. add.d B0, B, T3 // skip the first OFF k-steps of B
  1398. #endif
  1399. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1400. sub.d TL, K, OFF // trip count = K - OFF
  1401. #elif defined(LEFT)
  1402. addi.d TL, OFF, 1 // trip count = OFF + 1 (tile rows)
  1403. #else
  1404. addi.d TL, OFF, 4 // trip count = OFF + 4 (tile columns)
  1405. #endif
  1406. #endif // #if defined(TRMMKERNEL)
  1407. MTC c11, $r0 // zero the eight scalar accumulators (real/imag per column); MTC is a macro defined outside this section
  1408. MTC c12, $r0
  1409. MTC c21, $r0
  1410. MTC c22, $r0
  1411. MTC c31, $r0
  1412. MTC c32, $r0
  1413. MTC c41, $r0
  1414. MTC c42, $r0
  1415. move L, $r0 //cycle param k
  1416. beq L, TL, .L185 // TL == 0: skip the k-loop
  1417. blt TL, L, .L185 // TL < 0: skip the k-loop
  1418. .L184: /* for (k=0; k<temp; k++) */ // 1x4 tile: one k-step reads 0x08 bytes of A and 0x20 bytes of B
  1419. LD a1, A0, 0x00 //a0r
  1420. LD a2, A0, 0x04 //a0i
  1421. LD b1, B0, 0x00 //b0r
  1422. LD b2, B0, 0x04 //b0i
  1423. LD b3, B0, 0x08 //b1r
  1424. LD b4, B0, 0x0c //b1i
  1425. LD b5, B0, 0x10 //b2r
  1426. LD b6, B0, 0x14 //b2i
  1427. LD b7, B0, 0x18 //b3r
  1428. LD b8, B0, 0x1c //b3i
  1429. MADD1 c11, a1, b1, c11 //res00r
  1430. MADD2 c12, a2, b1, c12 //res00i
  1431. MADD3 c11, a2, b2, c11 // MADD1..4 are macros defined outside this section; presumably they pick add/sub per conjugation variant - confirm
  1432. MADD4 c12, a1, b2, c12
  1433. MADD1 c21, a1, b3, c21 //res10r
  1434. MADD2 c22, a2, b3, c22 //res10i
  1435. MADD3 c21, a2, b4, c21
  1436. MADD4 c22, a1, b4, c22
  1437. MADD1 c31, a1, b5, c31 //res20r
  1438. MADD2 c32, a2, b5, c32 //res20i
  1439. MADD3 c31, a2, b6, c31
  1440. MADD4 c32, a1, b6, c32
  1441. MADD1 c41, a1, b7, c41 //res30r
  1442. MADD2 c42, a2, b7, c42 //res30i
  1443. MADD3 c41, a2, b8, c41
  1444. MADD4 c42, a1, b8, c42
  1445. addi.d A0, A0, 0x08 // next k-step of A
  1446. addi.d B0, B0, 0x20 // next k-step of B
  1447. addi.d L, L, 1
  1448. blt L, TL, .L184
  1449. .L185: // write back the 1x4 tile: one complex value (8 bytes) per column C0..C3
  1450. #if defined(TRMMKERNEL)
  1451. //res00 res10 res20 res30
  1452. LD a5, C0, 0x00 //C0[0]
  1453. LD a6, C0, 0x04 //C0[1]
  1454. MUL a5, c11, ALPHA_R // TRMM: MUL is a macro defined outside this section; presumably it overwrites a5, making the load above unused - confirm
  1455. MUL a6, c12, ALPHA_R
  1456. NMSUB a5, c12, ALPHA_I, a5
  1457. MADD a6, c11, ALPHA_I, a6
  1458. ST a5, C0, 0x00
  1459. ST a6, C0, 0x04
  1460. LD a5, C1, 0x00 //C1[0]
  1461. LD a6, C1, 0x04 //C1[1]
  1462. MUL a5, c21, ALPHA_R
  1463. MUL a6, c22, ALPHA_R
  1464. NMSUB a5, c22, ALPHA_I, a5
  1465. MADD a6, c21, ALPHA_I, a6
  1466. ST a5, C1, 0x00
  1467. ST a6, C1, 0x04
  1468. LD a5, C2, 0x00 //C2[0]
  1469. LD a6, C2, 0x04 //C2[1]
  1470. MUL a5, c31, ALPHA_R
  1471. MUL a6, c32, ALPHA_R
  1472. NMSUB a5, c32, ALPHA_I, a5
  1473. MADD a6, c31, ALPHA_I, a6
  1474. ST a5, C2, 0x00
  1475. ST a6, C2, 0x04
  1476. LD a5, C3, 0x00 //C3[0]
  1477. LD a6, C3, 0x04 //C3[1]
  1478. MUL a5, c41, ALPHA_R
  1479. MUL a6, c42, ALPHA_R
  1480. NMSUB a5, c42, ALPHA_I, a5
  1481. MADD a6, c41, ALPHA_I, a6
  1482. ST a5, C3, 0x00
  1483. ST a6, C3, 0x04
  1484. addi.d C0, C0, 0x08
  1485. addi.d C1, C1, 0x08
  1486. addi.d C2, C2, 0x08
  1487. addi.d C3, C3, 0x08
  1488. #else
  1489. //res00 res10 res20 res30
  1490. LD a5, C0, 0x00 //C0[0]
  1491. LD a6, C0, 0x04 //C0[1]
  1492. MADD a5, c11, ALPHA_R, a5 // GEMM: accumulate onto the loaded C value
  1493. MADD a6, c12, ALPHA_R, a6
  1494. NMSUB a5, c12, ALPHA_I, a5
  1495. MADD a6, c11, ALPHA_I, a6
  1496. ST a5, C0, 0x00
  1497. ST a6, C0, 0x04
  1498. LD a5, C1, 0x00 //C1[0]
  1499. LD a6, C1, 0x04 //C1[1]
  1500. MADD a5, c21, ALPHA_R, a5
  1501. MADD a6, c22, ALPHA_R, a6
  1502. NMSUB a5, c22, ALPHA_I, a5
  1503. MADD a6, c21, ALPHA_I, a6
  1504. ST a5, C1, 0x00
  1505. ST a6, C1, 0x04
  1506. LD a5, C2, 0x00 //C2[0]
  1507. LD a6, C2, 0x04 //C2[1]
  1508. MADD a5, c31, ALPHA_R, a5
  1509. MADD a6, c32, ALPHA_R, a6
  1510. NMSUB a5, c32, ALPHA_I, a5
  1511. MADD a6, c31, ALPHA_I, a6
  1512. ST a5, C2, 0x00
  1513. ST a6, C2, 0x04
  1514. LD a5, C3, 0x00 //C3[0]
  1515. LD a6, C3, 0x04 //C3[1]
  1516. MADD a5, c41, ALPHA_R, a5
  1517. MADD a6, c42, ALPHA_R, a6
  1518. NMSUB a5, c42, ALPHA_I, a5
  1519. MADD a6, c41, ALPHA_I, a6
  1520. ST a5, C3, 0x00
  1521. ST a6, C3, 0x04
  1522. addi.d C0, C0, 0x08
  1523. addi.d C1, C1, 0x08
  1524. addi.d C2, C2, 0x08
  1525. addi.d C3, C3, 0x08
  1526. #endif
  1527. #if defined(TRMMKERNEL)
  1528. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1529. sub.d TL, K, OFF
  1530. #ifdef LEFT
  1531. addi.d TL, TL, -1
  1532. #else
  1533. addi.d TL, TL, -4
  1534. #endif
  1535. slli.d T3, TL, 0x03 // TL * 0x08: A bytes per k-step in this tile
  1536. add.d A0, A0, T3 // step A0 over the k-steps the loop did not consume
  1537. slli.d T3, TL, 0x05 // TL * 0x20: B bytes per k-step; the destination must be T3, not the C3 column pointer
  1538. add.d B0, B0, T3 // step B0 over the k-steps the loop did not consume
  1539. #endif
  1540. #ifdef LEFT
  1541. addi.d OFF, OFF, 1 // one more row done
  1542. #endif
  1543. #endif // #if defined(TRMMKERNEL)
  1544. .L186: // end of one 4-column panel: advance B and C, then loop over J
  1545. #if defined(TRMMKERNEL) && !defined(LEFT)
  1546. addi.d OFF, OFF, 4 // four more columns done
  1547. #endif
  1548. slli.d L, K, 0x05 // K * 0x20: bytes of B consumed by one 4-column panel
  1549. add.d B, B, L
  1550. slli.d I, LDC, 0x03 // LDC * 8; LDC * 2 is used as the one-column stride below, so this is four columns
  1551. add.d C, C, I
  1552. addi.d J, J, 1
  1553. srai.d T0, N, 2 // number of 4-column panels
  1554. blt J, T0, .L10
  1555. .L19: // 2-column remainder of N
  1556. move J, $r0
  1557. andi T0, N, 2
  1558. beq J, T0, .L30 // (N & 2) == 0: no 2-column panel
  1559. .L20: /* for (j=0; j<(bn&2); j+=2) */
  1560. #if defined(TRMMKERNEL) && defined(LEFT)
  1561. move OFF, OFFSET // restart the offset for this panel
  1562. #endif
  1563. move C0, C // first column of the panel
  1564. slli.d TL, LDC, 1 // LDC * 2 is the distance between adjacent columns (LDC appears to be pre-scaled before this section - confirm)
  1565. add.d C1, C0, TL // second column of the panel
  1566. move A0, A //ptrba
  1567. move I, $r0
  1568. srai.d T0, M, 3 //bm/8
  1569. beq I, T0, .L24 // no full 8-row tile
  1570. .L21: /* for (i=0; i<bm/8; i+=1) */ // set up one 8x2 tile
  1571. move B0, B //ptrbb
  1572. move TL, K /* TL = bk */
  1573. #if defined(TRMMKERNEL)
  1574. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1575. move B0, B //ptrbb
  1576. #else
  1577. slli.d T3, OFF, 0x06 // OFF * 0x40: the k-loop below consumes 0x40 bytes of A per step
  1578. add.d A0, A0, T3 // skip the first OFF k-steps of A
  1579. slli.d T3, OFF, 0x04 // OFF * 0x10: the k-loop below consumes 0x10 bytes of B per step
  1580. add.d B0, B, T3 // skip the first OFF k-steps of B
  1581. #endif
  1582. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1583. sub.d TL, K, OFF // trip count = K - OFF
  1584. #elif defined(LEFT)
  1585. addi.d TL, OFF, 8 // trip count = OFF + 8 (tile rows)
  1586. #else
  1587. addi.d TL, OFF, 2 // trip count = OFF + 2 (tile columns)
  1588. #endif
  1589. #endif // #if defined(TRMMKERNEL)
  1590. vxor.v U0, U0, U0 // clear the eight accumulators U0..U7
  1591. vxor.v U1, U1, U1
  1592. vxor.v U2, U2, U2
  1593. vxor.v U3, U3, U3
  1594. vxor.v U4, U4, U4
  1595. vxor.v U5, U5, U5
  1596. vxor.v U6, U6, U6
  1597. vxor.v U7, U7, U7
  1598. move L, $r0 //cycle param k
  1599. beq L, TL, .L23 // TL == 0: skip the k-loop
  1600. blt TL, L, .L23 // TL < 0: skip the k-loop
  1601. .L22: /* for (k=0; k<temp; k++) */ // 8x2 tile: one k-step reads 0x40 bytes of A and 0x10 bytes of B
  1602. vld D0, A0, 0x00 // a0ri a1ri
  1603. vld D2, B0, 0x00 // b0ri b1ri
  1604. vshuf4i.w D4, D0, 0xa0 //a0rr a1rr
  1605. vshuf4i.w D5, D0, 0xf5 //a0ii a1ii
  1606. vshuf4i.w D6, D2, 0x88 //b0r b1r b0r b1r
  1607. vshuf4i.w D7, D2, 0xdd //b0i b1i b0i b1i
  1608. VMADD1 U0, D4, D6, U0 //00r 10r 01r 11r
  1609. VMADD2 U1, D5, D6, U1 //00i 10i 01i 11i
  1610. VMADD3 U0, D5, D7, U0 // VMADD1..4 are macros defined outside this section; presumably they pick add/sub per conjugation variant - confirm
  1611. VMADD4 U1, D4, D7, U1
  1612. vld D0, A0, 0x10 // a2ri a3ri
  1613. vshuf4i.w D4, D0, 0xa0 //a2rr a3rr
  1614. vshuf4i.w D5, D0, 0xf5 //a2ii a3ii
  1615. VMADD1 U2, D4, D6, U2 //02r 12r 03r 13r
  1616. VMADD2 U3, D5, D6, U3 //02i 12i 03i 13i
  1617. VMADD3 U2, D5, D7, U2
  1618. VMADD4 U3, D4, D7, U3
  1619. vld D0, A0, 0x20 // a4ri a5ri
  1620. vshuf4i.w D4, D0, 0xa0 //a4rr a5rr
  1621. vshuf4i.w D5, D0, 0xf5 //a4ii a5ii
  1622. VMADD1 U4, D4, D6, U4 //04r 14r 05r 15r
  1623. VMADD2 U5, D5, D6, U5 //04i 14i 05i 15i
  1624. VMADD3 U4, D5, D7, U4
  1625. VMADD4 U5, D4, D7, U5
  1626. vld D0, A0, 0x30 // a6ri a7ri
  1627. vshuf4i.w D4, D0, 0xa0 //a6rr a7rr
  1628. vshuf4i.w D5, D0, 0xf5 //a6ii a7ii
  1629. VMADD1 U6, D4, D6, U6 //06r 16r 07r 17r
  1630. VMADD2 U7, D5, D6, U7 //06i 16i 07i 17i
  1631. VMADD3 U6, D5, D7, U6
  1632. VMADD4 U7, D4, D7, U7
  1633. addi.d A0, A0, 0x40 // next k-step of A
  1634. addi.d B0, B0, 0x10 // next k-step of B
  1635. addi.d L, L, 1
  1636. blt L, TL, .L22
  1637. .L23: // write back the 8x2 tile from U0..U7: four passes, one 16-byte vector per column C0/C1 each
  1638. #if defined(TRMMKERNEL)
  1639. //res00 res10 res01 res11
  1640. vld D0, C0, 0x00 //c0: 0 1 2 3
  1641. vld D1, C1, 0x00 //c1: 0 1 2 3
  1642. vpackev.w D2, D1, D0 //0 4 2 6
  1643. vpackod.w D3, D1, D0 //1 5 3 7
  1644. vfmul.s D2, U0, VALPHAR // TRMM: D2/D3 are overwritten here, so the C values packed above do not reach the result
  1645. vfmul.s D3, U1, VALPHAR
  1646. VNMSUB D2, U1, VALPHAI, D2 // VNMSUB/VFMADD are macros defined outside this section; presumably d = c - a*b and d = a*b + c
  1647. VFMADD D3, U0, VALPHAI, D3
  1648. vpackev.w D4, D3, D2 //0 1 2 3
  1649. vpackod.w D5, D3, D2 //4 5 6 7
  1650. vst D4, C0, 0x00 //c0: 0 1 2 3
  1651. vst D5, C1, 0x00 //c1: 0 1 2 3
  1652. addi.d C0, C0, 0x10
  1653. addi.d C1, C1, 0x10
  1654. //res02 res12 res03 res13
  1655. vld D0, C0, 0x00 //c0: 0 1 2 3
  1656. vld D1, C1, 0x00 //c1: 0 1 2 3
  1657. vpackev.w D2, D1, D0 //0 4 2 6
  1658. vpackod.w D3, D1, D0 //1 5 3 7
  1659. vfmul.s D2, U2, VALPHAR
  1660. vfmul.s D3, U3, VALPHAR
  1661. VNMSUB D2, U3, VALPHAI, D2
  1662. VFMADD D3, U2, VALPHAI, D3
  1663. vpackev.w D4, D3, D2 //0 1 2 3
  1664. vpackod.w D5, D3, D2 //4 5 6 7
  1665. vst D4, C0, 0x00 //c0: 0 1 2 3
  1666. vst D5, C1, 0x00 //c1: 0 1 2 3
  1667. addi.d C0, C0, 0x10
  1668. addi.d C1, C1, 0x10
  1669. //res04 res14 res05 res15
  1670. vld D0, C0, 0x00 //c0: 0 1 2 3
  1671. vld D1, C1, 0x00 //c1: 0 1 2 3
  1672. vpackev.w D2, D1, D0 //0 4 2 6
  1673. vpackod.w D3, D1, D0 //1 5 3 7
  1674. vfmul.s D2, U4, VALPHAR
  1675. vfmul.s D3, U5, VALPHAR
  1676. VNMSUB D2, U5, VALPHAI, D2
  1677. VFMADD D3, U4, VALPHAI, D3
  1678. vpackev.w D4, D3, D2 //0 1 2 3
  1679. vpackod.w D5, D3, D2 //4 5 6 7
  1680. vst D4, C0, 0x00 //c0: 0 1 2 3
  1681. vst D5, C1, 0x00 //c1: 0 1 2 3
  1682. addi.d C0, C0, 0x10
  1683. addi.d C1, C1, 0x10
  1684. //res06 res16 res07 res17
  1685. vld D0, C0, 0x00 //c0: 0 1 2 3
  1686. vld D1, C1, 0x00 //c1: 0 1 2 3
  1687. vpackev.w D2, D1, D0 //0 4 2 6
  1688. vpackod.w D3, D1, D0 //1 5 3 7
  1689. vfmul.s D2, U6, VALPHAR
  1690. vfmul.s D3, U7, VALPHAR
  1691. VNMSUB D2, U7, VALPHAI, D2
  1692. VFMADD D3, U6, VALPHAI, D3
  1693. vpackev.w D4, D3, D2 //0 1 2 3
  1694. vpackod.w D5, D3, D2 //4 5 6 7
  1695. vst D4, C0, 0x00 //c0: 0 1 2 3
  1696. vst D5, C1, 0x00 //c1: 0 1 2 3
  1697. addi.d C0, C0, 0x10
  1698. addi.d C1, C1, 0x10
  1699. #else
  1700. //res00 res10 res01 res11
  1701. vld D0, C0, 0x00 //c0: 0 1 2 3
  1702. vld D1, C1, 0x00 //c1: 0 1 2 3
  1703. vpackev.w D2, D1, D0 //0 4 2 6
  1704. vpackod.w D3, D1, D0 //1 5 3 7
  1705. VFMADD D2, U0, VALPHAR, D2 // GEMM: accumulate onto the C values packed above
  1706. VFMADD D3, U1, VALPHAR, D3
  1707. VNMSUB D2, U1, VALPHAI, D2
  1708. VFMADD D3, U0, VALPHAI, D3
  1709. vpackev.w D4, D3, D2 //0 1 2 3
  1710. vpackod.w D5, D3, D2 //4 5 6 7
  1711. vst D4, C0, 0x00 //c0: 0 1 2 3
  1712. vst D5, C1, 0x00 //c1: 0 1 2 3
  1713. addi.d C0, C0, 0x10
  1714. addi.d C1, C1, 0x10
  1715. //res02 res12 res03 res13
  1716. vld D0, C0, 0x00 //c0: 0 1 2 3
  1717. vld D1, C1, 0x00 //c1: 0 1 2 3
  1718. vpackev.w D2, D1, D0 //0 4 2 6
  1719. vpackod.w D3, D1, D0 //1 5 3 7
  1720. VFMADD D2, U2, VALPHAR, D2
  1721. VFMADD D3, U3, VALPHAR, D3
  1722. VNMSUB D2, U3, VALPHAI, D2
  1723. VFMADD D3, U2, VALPHAI, D3
  1724. vpackev.w D4, D3, D2 //0 1 2 3
  1725. vpackod.w D5, D3, D2 //4 5 6 7
  1726. vst D4, C0, 0x00 //c0: 0 1 2 3
  1727. vst D5, C1, 0x00 //c1: 0 1 2 3
  1728. addi.d C0, C0, 0x10
  1729. addi.d C1, C1, 0x10
  1730. //res04 res14 res05 res15
  1731. vld D0, C0, 0x00 //c0: 0 1 2 3
  1732. vld D1, C1, 0x00 //c1: 0 1 2 3
  1733. vpackev.w D2, D1, D0 //0 4 2 6
  1734. vpackod.w D3, D1, D0 //1 5 3 7
  1735. VFMADD D2, U4, VALPHAR, D2
  1736. VFMADD D3, U5, VALPHAR, D3
  1737. VNMSUB D2, U5, VALPHAI, D2
  1738. VFMADD D3, U4, VALPHAI, D3
  1739. vpackev.w D4, D3, D2 //0 1 2 3
  1740. vpackod.w D5, D3, D2 //4 5 6 7
  1741. vst D4, C0, 0x00 //c0: 0 1 2 3
  1742. vst D5, C1, 0x00 //c1: 0 1 2 3
  1743. addi.d C0, C0, 0x10
  1744. addi.d C1, C1, 0x10
  1745. //res06 res16 res07 res17
  1746. vld D0, C0, 0x00 //c0: 0 1 2 3
  1747. vld D1, C1, 0x00 //c1: 0 1 2 3
  1748. vpackev.w D2, D1, D0 //0 4 2 6
  1749. vpackod.w D3, D1, D0 //1 5 3 7
  1750. VFMADD D2, U6, VALPHAR, D2
  1751. VFMADD D3, U7, VALPHAR, D3
  1752. VNMSUB D2, U7, VALPHAI, D2
  1753. VFMADD D3, U6, VALPHAI, D3
  1754. vpackev.w D4, D3, D2 //0 1 2 3
  1755. vpackod.w D5, D3, D2 //4 5 6 7
  1756. vst D4, C0, 0x00 //c0: 0 1 2 3
  1757. vst D5, C1, 0x00 //c1: 0 1 2 3
  1758. addi.d C0, C0, 0x10
  1759. addi.d C1, C1, 0x10
  1760. #endif
  1761. #if defined(TRMMKERNEL)
  1762. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1763. sub.d TL, K, OFF
  1764. #ifdef LEFT
  1765. addi.d TL, TL, -8
  1766. #else
  1767. addi.d TL, TL, -2
  1768. #endif
  1769. slli.d T3, TL, 0x06 // TL * 0x40: A bytes per k-step in this tile
  1770. add.d A0, A0, T3 // step A0 over the k-steps the loop did not consume
  1771. slli.d T3, TL, 0x04 // TL * 0x10: B bytes per k-step in this panel
  1772. add.d B0, B0, T3 // step B0 over the k-steps the loop did not consume
  1773. #endif
  1774. #ifdef LEFT
  1775. addi.d OFF, OFF, 8 // eight more rows done
  1776. #endif
  1777. #endif // #if defined(TRMMKERNEL)
  1778. addi.d I, I, 1
  1779. blt I, T0, .L21 // next 8-row tile while I < M/8
  1780. .L24: /* if ( bm & 4 ): one tile of 4 rows x 2 columns of C (columns C0 and C1) */
  1781. move I, $r0
  1782. andi T1, M, 4 //bm&4
  1783. beq I, T1, .L280 //bit 2 of M clear: go on to the bm&2 tile
  1784. .L25:
  1785. move B0, B //ptrbb
  1786. move TL, K /* TL = bk */
  1787. #if defined(TRMMKERNEL)
  1788. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1789. move B0, B //ptrbb
  1790. #else
  1791. slli.d T3, OFF, 0x05 //OFF * 32 bytes, the per-k stride of A0 in .L26
  1792. add.d A0, A0, T3
  1793. slli.d T3, OFF, 0x04 //OFF * 16 bytes, the per-k stride of B0 in .L26
  1794. add.d B0, B, T3
  1795. #endif
  1796. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1797. sub.d TL, K, OFF //trip count = bk - off
  1798. #elif defined(LEFT)
  1799. addi.d TL, OFF, 4 //trip count = off + 4 (4 = rows in this tile)
  1800. #else
  1801. addi.d TL, OFF, 2 //trip count = off + 2 (2 = columns in this tile)
  1802. #endif
  1803. #endif // #if defined(TRMMKERNEL)
  1804. vxor.v U0, U0, U0 //clear the four accumulators (x ^ x = 0)
  1805. vxor.v U1, U1, U1
  1806. vxor.v U2, U2, U2
  1807. vxor.v U3, U3, U3
  1808. move L, $r0 //cycle param k
  1809. beq L, TL, .L27 //TL == 0: no k iterations
  1810. blt TL, L, .L27 //TL < 0: no k iterations
  1811. .L26: /* for (k=0; k<temp; k++) */
  1812. vld D0, A0, 0x00 // a0ri a1ri
  1813. vld D2, B0, 0x00 // b0ri b1ri
  1814. vshuf4i.w D4, D0, 0xa0 //a0rr a1rr
  1815. vshuf4i.w D5, D0, 0xf5 //a0ii a1ii
  1816. vshuf4i.w D6, D2, 0x88 //b0r b1r b0r b1r
  1817. vshuf4i.w D7, D2, 0xdd //b0i b1i b0i b1i
  1818. VMADD1 U0, D4, D6, U0 //00r 10r 01r 11r (VMADD1..4 are macros defined outside this chunk)
  1819. VMADD2 U1, D5, D6, U1 //00i 10i 01i 11i
  1820. VMADD3 U0, D5, D7, U0
  1821. VMADD4 U1, D4, D7, U1
  1822. vld D0, A0, 0x10 // a2ri a3ri
  1823. vshuf4i.w D4, D0, 0xa0 //a2rr a3rr
  1824. vshuf4i.w D5, D0, 0xf5 //a2ii a3ii
  1825. VMADD1 U2, D4, D6, U2 //02r 12r 03r 13r
  1826. VMADD2 U3, D5, D6, U3 //02i 12i 03i 13i
  1827. VMADD3 U2, D5, D7, U2
  1828. VMADD4 U3, D4, D7, U3
  1829. addi.d A0, A0, 0x20 //A advances 32 bytes per k
  1830. addi.d B0, B0, 0x10 //B advances 16 bytes per k
  1831. addi.d L, L, 1
  1832. blt L, TL, .L26
  1833. .L27:
  1834. #if defined(TRMMKERNEL)
  1835. //res00 res10 res01 res11 (TRMM path: C is overwritten with alpha * acc)
  1836. vld D0, C0, 0x00 //c0: 0 1 2 3 (NOTE(review): the C values loaded here are never used; D2/D3 are overwritten by vfmul.s below)
  1837. vld D1, C1, 0x00 //c1: 0 1 2 3
  1838. vpackev.w D2, D1, D0 //0 4 2 6
  1839. vpackod.w D3, D1, D0 //1 5 3 7
  1840. vfmul.s D2, U0, VALPHAR //D2 = U0 * VALPHAR
  1841. vfmul.s D3, U1, VALPHAR //D3 = U1 * VALPHAR
  1842. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI (macro defined outside this chunk)
  1843. VFMADD D3, U0, VALPHAI, D3 //presumably D3 += U0 * VALPHAI (macro defined outside this chunk)
  1844. vpackev.w D4, D3, D2 //0 1 2 3
  1845. vpackod.w D5, D3, D2 //4 5 6 7
  1846. vst D4, C0, 0x00 //c0: 0 1 2 3
  1847. vst D5, C1, 0x00 //c1: 0 1 2 3
  1848. addi.d C0, C0, 0x10
  1849. addi.d C1, C1, 0x10
  1850. //res02 res12 res03 res13 (same sequence for U2/U3)
  1851. vld D0, C0, 0x00 //c0: 0 1 2 3 (NOTE(review): unused on this path, see above)
  1852. vld D1, C1, 0x00 //c1: 0 1 2 3
  1853. vpackev.w D2, D1, D0 //0 4 2 6
  1854. vpackod.w D3, D1, D0 //1 5 3 7
  1855. vfmul.s D2, U2, VALPHAR
  1856. vfmul.s D3, U3, VALPHAR
  1857. VNMSUB D2, U3, VALPHAI, D2
  1858. VFMADD D3, U2, VALPHAI, D3
  1859. vpackev.w D4, D3, D2 //0 1 2 3
  1860. vpackod.w D5, D3, D2 //4 5 6 7
  1861. vst D4, C0, 0x00 //c0: 0 1 2 3
  1862. vst D5, C1, 0x00 //c1: 0 1 2 3
  1863. addi.d C0, C0, 0x10
  1864. addi.d C1, C1, 0x10
  1865. #else
  1866. //res00 res10 res01 res11 (GEMM path: the loaded C values are accumulated into)
  1867. vld D0, C0, 0x00 //c0: 0 1 2 3
  1868. vld D1, C1, 0x00 //c1: 0 1 2 3
  1869. vpackev.w D2, D1, D0 //0 4 2 6
  1870. vpackod.w D3, D1, D0 //1 5 3 7
  1871. VFMADD D2, U0, VALPHAR, D2 //presumably D2 += U0 * VALPHAR (macro defined outside this chunk)
  1872. VFMADD D3, U1, VALPHAR, D3
  1873. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI
  1874. VFMADD D3, U0, VALPHAI, D3
  1875. vpackev.w D4, D3, D2 //0 1 2 3
  1876. vpackod.w D5, D3, D2 //4 5 6 7
  1877. vst D4, C0, 0x00 //c0: 0 1 2 3
  1878. vst D5, C1, 0x00 //c1: 0 1 2 3
  1879. addi.d C0, C0, 0x10
  1880. addi.d C1, C1, 0x10
  1881. //res02 res12 res03 res13 (same sequence for U2/U3)
  1882. vld D0, C0, 0x00 //c0: 0 1 2 3
  1883. vld D1, C1, 0x00 //c1: 0 1 2 3
  1884. vpackev.w D2, D1, D0 //0 4 2 6
  1885. vpackod.w D3, D1, D0 //1 5 3 7
  1886. VFMADD D2, U2, VALPHAR, D2
  1887. VFMADD D3, U3, VALPHAR, D3
  1888. VNMSUB D2, U3, VALPHAI, D2
  1889. VFMADD D3, U2, VALPHAI, D3
  1890. vpackev.w D4, D3, D2 //0 1 2 3
  1891. vpackod.w D5, D3, D2 //4 5 6 7
  1892. vst D4, C0, 0x00 //c0: 0 1 2 3
  1893. vst D5, C1, 0x00 //c1: 0 1 2 3
  1894. addi.d C0, C0, 0x10
  1895. addi.d C1, C1, 0x10
  1896. #endif
  1897. #if defined(TRMMKERNEL)
  1898. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1899. sub.d TL, K, OFF
  1900. #ifdef LEFT
  1901. addi.d TL, TL, -4
  1902. #else
  1903. addi.d TL, TL, -2
  1904. #endif
  1905. slli.d T3, TL, 0x05 //TL k-steps of A at 32 bytes each
  1906. add.d A0, A0, T3
  1907. slli.d T3, TL, 0x04 //TL k-steps of B at 16 bytes each
  1908. add.d B0, B0, T3
  1909. #endif
  1910. #ifdef LEFT
  1911. addi.d OFF, OFF, 4
  1912. #endif
  1913. #endif // #if defined(TRMMKERNEL)
  1914. .L280: /* if ( bm & 2 ): one tile of 2 rows x 2 columns of C (columns C0 and C1) */
  1915. move I, $r0
  1916. andi T1, M, 2 //bm&2
  1917. beq I, T1, .L284 //bit 1 of M clear: go on to the bm&1 tile
  1918. .L281:
  1919. move B0, B //ptrbb
  1920. move TL, K /* TL = bk */
  1921. #if defined(TRMMKERNEL)
  1922. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1923. move B0, B //ptrbb
  1924. #else
  1925. slli.d T3, OFF, 0x04 //OFF * 16 bytes; A0 and B0 both advance 16 bytes per k in .L282, so T3 is shared
  1926. add.d A0, A0, T3
  1927. add.d B0, B, T3
  1928. #endif
  1929. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1930. sub.d TL, K, OFF //trip count = bk - off
  1931. #elif defined(LEFT)
  1932. addi.d TL, OFF, 2 //trip count = off + 2 (2 = rows in this tile)
  1933. #else
  1934. addi.d TL, OFF, 2 //trip count = off + 2 (2 = columns in this tile)
  1935. #endif
  1936. #endif // #if defined(TRMMKERNEL)
  1937. vxor.v U0, U0, U0 //clear the two accumulators (x ^ x = 0)
  1938. vxor.v U1, U1, U1
  1939. move L, $r0 //cycle param k
  1940. beq L, TL, .L283 //TL == 0: no k iterations
  1941. blt TL, L, .L283 //TL < 0: no k iterations
  1942. .L282: /* for (k=0; k<temp; k++) */
  1943. vld D0, A0, 0x00 // a0ri a1ri
  1944. vld D2, B0, 0x00 // b0ri b1ri
  1945. vshuf4i.w D4, D0, 0xa0 //a0rr a1rr
  1946. vshuf4i.w D5, D0, 0xf5 //a0ii a1ii
  1947. vshuf4i.w D6, D2, 0x88 //b0r b1r b0r b1r
  1948. vshuf4i.w D7, D2, 0xdd //b0i b1i b0i b1i
  1949. VMADD1 U0, D4, D6, U0 //00r 10r 01r 11r (VMADD1..4 are macros defined outside this chunk)
  1950. VMADD2 U1, D5, D6, U1 //00i 10i 01i 11i
  1951. VMADD3 U0, D5, D7, U0
  1952. VMADD4 U1, D4, D7, U1
  1953. addi.d A0, A0, 0x10 //A advances 16 bytes per k
  1954. addi.d B0, B0, 0x10 //B advances 16 bytes per k
  1955. addi.d L, L, 1
  1956. blt L, TL, .L282
  1957. .L283:
  1958. #if defined(TRMMKERNEL)
  1959. //res00 res10 res01 res11 (TRMM path: C is overwritten with alpha * acc)
  1960. vld D0, C0, 0x00 //c0: 0 1 2 3 (NOTE(review): the C values loaded here are never used; D2/D3 are overwritten by vfmul.s below)
  1961. vld D1, C1, 0x00 //c1: 0 1 2 3
  1962. vpackev.w D2, D1, D0 //0 4 2 6
  1963. vpackod.w D3, D1, D0 //1 5 3 7
  1964. vfmul.s D2, U0, VALPHAR //D2 = U0 * VALPHAR
  1965. vfmul.s D3, U1, VALPHAR //D3 = U1 * VALPHAR
  1966. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI (macro defined outside this chunk)
  1967. VFMADD D3, U0, VALPHAI, D3 //presumably D3 += U0 * VALPHAI (macro defined outside this chunk)
  1968. vpackev.w D4, D3, D2 //0 1 2 3
  1969. vpackod.w D5, D3, D2 //4 5 6 7
  1970. vst D4, C0, 0x00 //c0: 0 1 2 3
  1971. vst D5, C1, 0x00 //c1: 0 1 2 3
  1972. addi.d C0, C0, 0x10
  1973. addi.d C1, C1, 0x10
  1974. #else
  1975. //res00 res10 res01 res11 (GEMM path: the loaded C values are accumulated into)
  1976. vld D0, C0, 0x00 //c0: 0 1 2 3
  1977. vld D1, C1, 0x00 //c1: 0 1 2 3
  1978. vpackev.w D2, D1, D0 //0 4 2 6
  1979. vpackod.w D3, D1, D0 //1 5 3 7
  1980. VFMADD D2, U0, VALPHAR, D2 //presumably D2 += U0 * VALPHAR (macro defined outside this chunk)
  1981. VFMADD D3, U1, VALPHAR, D3
  1982. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI
  1983. VFMADD D3, U0, VALPHAI, D3
  1984. vpackev.w D4, D3, D2 //0 1 2 3
  1985. vpackod.w D5, D3, D2 //4 5 6 7
  1986. vst D4, C0, 0x00 //c0: 0 1 2 3
  1987. vst D5, C1, 0x00 //c1: 0 1 2 3
  1988. addi.d C0, C0, 0x10
  1989. addi.d C1, C1, 0x10
  1990. #endif
  1991. #if defined(TRMMKERNEL)
  1992. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1993. sub.d TL, K, OFF
  1994. #ifdef LEFT
  1995. addi.d TL, TL, -2
  1996. #else
  1997. addi.d TL, TL, -2
  1998. #endif
  1999. slli.d T3, TL, 0x04 //TL k-steps of A at 16 bytes each
  2000. add.d A0, A0, T3
  2001. slli.d T3, TL, 0x04 //TL k-steps of B at 16 bytes each
  2002. add.d B0, B0, T3
  2003. #endif
  2004. #ifdef LEFT
  2005. addi.d OFF, OFF, 2
  2006. #endif
  2007. #endif // #if defined(TRMMKERNEL)
  2008. .L284: /* if ( bm & 1 ): one tile of 1 row x 2 columns of C, computed with scalar FP registers */
  2009. move I, $r0
  2010. andi T1, M, 1 //bm&1
  2011. beq I, T1, .L288 //bit 0 of M clear: this column pair is finished
  2012. .L285:
  2013. move B0, B //ptrbb
  2014. move TL, K /* TL = bk */
  2015. #if defined(TRMMKERNEL)
  2016. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  2017. move B0, B //ptrbb
  2018. #else
  2019. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of A0 in .L286
  2020. add.d A0, A0, T3
  2021. slli.d T3, OFF, 0x04 //OFF * 16 bytes, the per-k stride of B0 in .L286
  2022. add.d B0, B, T3
  2023. #endif
  2024. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2025. sub.d TL, K, OFF //trip count = bk - off
  2026. #elif defined(LEFT)
  2027. addi.d TL, OFF, 1 //trip count = off + 1 (1 = rows in this tile)
  2028. #else
  2029. addi.d TL, OFF, 2 //trip count = off + 2 (2 = columns in this tile)
  2030. #endif
  2031. #endif // #if defined(TRMMKERNEL)
  2032. MTC c11, $r0 //set the scalar accumulators from the zero register (MTC is a macro defined outside this chunk)
  2033. MTC c12, $r0
  2034. MTC c21, $r0
  2035. MTC c22, $r0
  2036. move L, $r0 //cycle param k
  2037. beq L, TL, .L287 //TL == 0: no k iterations
  2038. blt TL, L, .L287 //TL < 0: no k iterations
  2039. .L286: /* for (k=0; k<temp; k++) */
  2040. LD a1, A0, 0x00 //a0r
  2041. LD a2, A0, 0x04 //a0i
  2042. LD b1, B0, 0x00 //b0r
  2043. LD b2, B0, 0x04 //b0i
  2044. LD b3, B0, 0x08 //b1r
  2045. LD b4, B0, 0x0c //b1i
  2046. MADD1 c11, a1, b1, c11 //res00r (a0 * b0; MADD1..4 are macros defined outside this chunk)
  2047. MADD2 c12, a2, b1, c12 //res00i
  2048. MADD3 c11, a2, b2, c11
  2049. MADD4 c12, a1, b2, c12
  2050. MADD1 c21, a1, b3, c21 //res10r (a0 * b1: the element stored to C1)
  2051. MADD2 c22, a2, b3, c22 //res10i
  2052. MADD3 c21, a2, b4, c21
  2053. MADD4 c22, a1, b4, c22
  2054. addi.d A0, A0, 0x08 //A advances 8 bytes per k
  2055. addi.d B0, B0, 0x10 //B advances 16 bytes per k
  2056. addi.d L, L, 1
  2057. blt L, TL, .L286
  2058. .L287:
  2059. #if defined(TRMMKERNEL)
  2060. //res00 res10 (TRMM path: C is overwritten with alpha * acc)
  2061. LD a5, C0, 0x00 //C0[0] (NOTE(review): a5..a8 are overwritten by MUL below before being read, so these four loads appear unused; the 1x1 TRMM path at .L315 has no such loads)
  2062. LD a6, C0, 0x04 //C0[1]
  2063. LD a7, C1, 0x00 //C1[0]
  2064. LD a8, C1, 0x04 //C1[1]
  2065. MUL a5, c11, ALPHA_R //presumably a5 = c11 * ALPHA_R (MUL/NMSUB/MADD are macros defined outside this chunk)
  2066. MUL a6, c12, ALPHA_R
  2067. NMSUB a5, c12, ALPHA_I, a5 //presumably a5 -= c12 * ALPHA_I
  2068. MADD a6, c11, ALPHA_I, a6 //presumably a6 += c11 * ALPHA_I
  2069. MUL a7, c21, ALPHA_R //same sequence for the C1 element
  2070. MUL a8, c22, ALPHA_R
  2071. NMSUB a7, c22, ALPHA_I, a7
  2072. MADD a8, c21, ALPHA_I, a8
  2073. ST a5, C0, 0x00
  2074. ST a6, C0, 0x04
  2075. ST a7, C1, 0x00
  2076. ST a8, C1, 0x04
  2077. addi.d C0, C0, 0x08
  2078. addi.d C1, C1, 0x08
  2079. #else
  2080. //res00 res10 (GEMM path: the loaded C values are accumulated into)
  2081. LD a5, C0, 0x00 //C0[0]
  2082. LD a6, C0, 0x04 //C0[1]
  2083. LD a7, C1, 0x00 //C1[0]
  2084. LD a8, C1, 0x04 //C1[1]
  2085. MADD a5, c11, ALPHA_R, a5 //presumably a5 += c11 * ALPHA_R (macro defined outside this chunk)
  2086. MADD a6, c12, ALPHA_R, a6
  2087. NMSUB a5, c12, ALPHA_I, a5 //presumably a5 -= c12 * ALPHA_I
  2088. MADD a6, c11, ALPHA_I, a6
  2089. MADD a7, c21, ALPHA_R, a7 //same sequence for the C1 element
  2090. MADD a8, c22, ALPHA_R, a8
  2091. NMSUB a7, c22, ALPHA_I, a7
  2092. MADD a8, c21, ALPHA_I, a8
  2093. ST a5, C0, 0x00
  2094. ST a6, C0, 0x04
  2095. ST a7, C1, 0x00
  2096. ST a8, C1, 0x04
  2097. addi.d C0, C0, 0x08
  2098. addi.d C1, C1, 0x08
  2099. #endif
  2100. #if defined(TRMMKERNEL)
  2101. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2102. sub.d TL, K, OFF
  2103. #ifdef LEFT
  2104. addi.d TL, TL, -1
  2105. #else
  2106. addi.d TL, TL, -2
  2107. #endif
  2108. slli.d T3, TL, 0x03 //TL k-steps of A at 8 bytes each
  2109. add.d A0, A0, T3
  2110. slli.d T3, TL, 0x04 //TL k-steps of B at 16 bytes each
  2111. add.d B0, B0, T3
  2112. #endif
  2113. #ifdef LEFT
  2114. addi.d OFF, OFF, 1
  2115. #endif
  2116. #endif // #if defined(TRMMKERNEL)
  2117. .L288: /* end of one 2-column pass: advance B and C, then test the column loop */
  2118. #if defined(TRMMKERNEL) && !defined(LEFT)
  2119. addi.d OFF, OFF, 2
  2120. #endif
  2121. slli.d L, K, 4 //K * 16 bytes, i.e. K k-steps at the 16-byte B stride used by the tiles above
  2122. add.d B, B, L
  2123. slli.d I, LDC, 2 //C advances by LDC << 2 (the scaling of LDC is set outside this chunk; TODO confirm units)
  2124. add.d C, C, I
  2125. addi.d J, J, 2
  2126. andi T0, N, 2
  2127. blt J, T0, .L20 //T0 is 0 or 2; .L20 and the initial value of J are outside this chunk
  2128. .L30: /* last single column of C, present only when bit 0 of N is set */
  2129. move J, $r0
  2130. andi T0, N, 1
  2131. beq J, T0, .L999 //N & 1 == 0: nothing left, go to the epilogue
  2132. .L300: /* for (j=0; j<(bn&1); j+=1) */
  2133. #if defined(TRMMKERNEL) && defined(LEFT)
  2134. move OFF, OFFSET //restart the offset for this column
  2135. #endif
  2136. move C0, C //only one C column pointer is used in this pass
  2137. move A0, A //ptrba
  2138. move I, $r0
  2139. srai.d T0, M, 3 //bm/8
  2140. beq I, T0, .L34 //no full 8-row tiles: go to the bm&4 tile
  2141. .L31: /* for (i=0; i<bm/8; i+=1): one tile of 8 rows x 1 column of C per iteration */
  2142. move B0, B //ptrbb
  2143. move TL, K /* TL = bk */
  2144. #if defined(TRMMKERNEL)
  2145. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  2146. move B0, B //ptrbb
  2147. #else
  2148. slli.d T3, OFF, 0x06 //OFF * 64 bytes, the per-k stride of A0 in .L32
  2149. add.d A0, A0, T3
  2150. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of B0 in .L32
  2151. add.d B0, B, T3
  2152. #endif
  2153. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2154. sub.d TL, K, OFF //trip count = bk - off
  2155. #elif defined(LEFT)
  2156. addi.d TL, OFF, 8 //trip count = off + 8 (8 = rows in this tile)
  2157. #else
  2158. addi.d TL, OFF, 1 //trip count = off + 1 (1 = columns in this tile)
  2159. #endif
  2160. #endif // #if defined(TRMMKERNEL)
  2161. vxor.v U0, U0, U0 //clear the four accumulators (x ^ x = 0)
  2162. vxor.v U1, U1, U1
  2163. vxor.v U2, U2, U2
  2164. vxor.v U3, U3, U3
  2165. move L, $r0 //cycle param k
  2166. beq L, TL, .L33 //TL == 0: no k iterations
  2167. blt TL, L, .L33 //TL < 0: no k iterations
  2168. .L32: /* for (k=0; k<temp; k++) */
  2169. vld D0, A0, 0x00 // a0ri a1ri
  2170. vld D1, A0, 0x10 // a2ri a3ri
  2171. vldrepl.w D2, B0, 0x00 //b0r
  2172. vldrepl.w D3, B0, 0x04 //b0i
  2173. vpackev.w D4, D1, D0 //a0r a2r a1r a3r
  2174. vshuf4i.w D4, D4, 0xd8 //a0r a1r a2r a3r
  2175. vpackod.w D5, D1, D0 //a0i a2i a1i a3i
  2176. vshuf4i.w D5, D5, 0xd8 //a0i a1i a2i a3i
  2177. VMADD1 U0, D4, D2, U0 //00r 01r 02r 03r (VMADD1..4 are macros defined outside this chunk)
  2178. VMADD2 U1, D5, D2, U1 //00i 01i 02i 03i
  2179. VMADD3 U0, D5, D3, U0
  2180. VMADD4 U1, D4, D3, U1
  2181. vld D0, A0, 0x20 // a4ri a5ri
  2182. vld D1, A0, 0x30 // a6ri a7ri
  2183. vpackev.w D4, D1, D0 //a4r a6r a5r a7r
  2184. vshuf4i.w D4, D4, 0xd8 //a4r a5r a6r a7r
  2185. vpackod.w D5, D1, D0 //a4i a6i a5i a7i
  2186. vshuf4i.w D5, D5, 0xd8 //a4i a5i a6i a7i
  2187. VMADD1 U2, D4, D2, U2 //04r 05r 06r 07r
  2188. VMADD2 U3, D5, D2, U3 //04i 05i 06i 07i
  2189. VMADD3 U2, D5, D3, U2
  2190. VMADD4 U3, D4, D3, U3
  2191. addi.d A0, A0, 0x40 //A advances 64 bytes per k
  2192. addi.d B0, B0, 0x08 //B advances 8 bytes per k
  2193. addi.d L, L, 1
  2194. blt L, TL, .L32
  2195. .L33:
  2196. #if defined(TRMMKERNEL)
  2197. //res00 res01 res02 res03 (TRMM path: C is overwritten with alpha * acc)
  2198. vld D0, C0, 0x00 //c0: 0 1 2 3 (NOTE(review): the C values loaded here are never used; D2/D3 are overwritten by vfmul.s below)
  2199. vld D1, C0, 0x10 //c0: 4 5 6 7
  2200. vpackev.w D2, D1, D0
  2201. vshuf4i.w D2, D2, 0xd8 //0 2 4 6
  2202. vpackod.w D3, D1, D0
  2203. vshuf4i.w D3, D3, 0xd8 //1 3 5 7
  2204. vfmul.s D2, U0, VALPHAR //D2 = U0 * VALPHAR
  2205. vfmul.s D3, U1, VALPHAR //D3 = U1 * VALPHAR
  2206. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI (macro defined outside this chunk)
  2207. VFMADD D3, U0, VALPHAI, D3 //presumably D3 += U0 * VALPHAI (macro defined outside this chunk)
  2208. vand.v D4, D3, D3 //1 3 5 7 (x & x = x: copies D3 into D4)
  2209. vpermi.w D4, D2, 0x44 //0 2 1 3
  2210. vshuf4i.w D4, D4, 0xd8 //0 1 2 3
  2211. vand.v D5, D3, D3 //1 3 5 7
  2212. vpermi.w D5, D2, 0xee //4 6 5 7
  2213. vshuf4i.w D5, D5, 0xd8 //4 5 6 7
  2214. vst D4, C0, 0x00
  2215. vst D5, C0, 0x10
  2216. //res04 res05 res06 res07 (same sequence for U2/U3)
  2217. vld D0, C0, 0x20 //c0: 8 9 10 11 (NOTE(review): unused on this path, see above)
  2218. vld D1, C0, 0x30 //c0: 12 13 14 15
  2219. vpackev.w D2, D1, D0
  2220. vshuf4i.w D2, D2, 0xd8 //8 10 12 14
  2221. vpackod.w D3, D1, D0
  2222. vshuf4i.w D3, D3, 0xd8 //9 11 13 15
  2223. vfmul.s D2, U2, VALPHAR
  2224. vfmul.s D3, U3, VALPHAR
  2225. VNMSUB D2, U3, VALPHAI, D2
  2226. VFMADD D3, U2, VALPHAI, D3
  2227. vand.v D4, D3, D3 //9 11 13 15
  2228. vpermi.w D4, D2, 0x44 //8 10 9 11
  2229. vshuf4i.w D4, D4, 0xd8 //8 9 10 11
  2230. vand.v D5, D3, D3 //9 11 13 15
  2231. vpermi.w D5, D2, 0xee //12 14 13 15
  2232. vshuf4i.w D5, D5, 0xd8 //12 13 14 15
  2233. vst D4, C0, 0x20
  2234. vst D5, C0, 0x30
  2235. addi.d C0, C0, 0x40
  2236. #else
  2237. //res00 res01 res02 res03 (GEMM path: the loaded C values are accumulated into)
  2238. vld D0, C0, 0x00 //c0: 0 1 2 3
  2239. vld D1, C0, 0x10 //c0: 4 5 6 7
  2240. vpackev.w D2, D1, D0
  2241. vshuf4i.w D2, D2, 0xd8 //0 2 4 6
  2242. vpackod.w D3, D1, D0
  2243. vshuf4i.w D3, D3, 0xd8 //1 3 5 7
  2244. VFMADD D2, U0, VALPHAR, D2 //presumably D2 += U0 * VALPHAR (macro defined outside this chunk)
  2245. VFMADD D3, U1, VALPHAR, D3
  2246. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI
  2247. VFMADD D3, U0, VALPHAI, D3
  2248. vand.v D4, D3, D3 //1 3 5 7 (x & x = x: copies D3 into D4)
  2249. vpermi.w D4, D2, 0x44 //0 2 1 3
  2250. vshuf4i.w D4, D4, 0xd8 //0 1 2 3
  2251. vand.v D5, D3, D3 //1 3 5 7
  2252. vpermi.w D5, D2, 0xee //4 6 5 7
  2253. vshuf4i.w D5, D5, 0xd8 //4 5 6 7
  2254. vst D4, C0, 0x00
  2255. vst D5, C0, 0x10
  2256. //res04 res05 res06 res07 (same sequence for U2/U3)
  2257. vld D0, C0, 0x20 //c0: 8 9 10 11
  2258. vld D1, C0, 0x30 //c0: 12 13 14 15
  2259. vpackev.w D2, D1, D0
  2260. vshuf4i.w D2, D2, 0xd8 //8 10 12 14
  2261. vpackod.w D3, D1, D0
  2262. vshuf4i.w D3, D3, 0xd8 //9 11 13 15
  2263. VFMADD D2, U2, VALPHAR, D2
  2264. VFMADD D3, U3, VALPHAR, D3
  2265. VNMSUB D2, U3, VALPHAI, D2
  2266. VFMADD D3, U2, VALPHAI, D3
  2267. vand.v D4, D3, D3 //9 11 13 15
  2268. vpermi.w D4, D2, 0x44 //8 10 9 11
  2269. vshuf4i.w D4, D4, 0xd8 //8 9 10 11
  2270. vand.v D5, D3, D3 //9 11 13 15
  2271. vpermi.w D5, D2, 0xee //12 14 13 15
  2272. vshuf4i.w D5, D5, 0xd8 //12 13 14 15
  2273. vst D4, C0, 0x20
  2274. vst D5, C0, 0x30
  2275. addi.d C0, C0, 0x40
  2276. #endif
  2277. #if defined(TRMMKERNEL)
  2278. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2279. sub.d TL, K, OFF
  2280. #ifdef LEFT
  2281. addi.d TL, TL, -8
  2282. #else
  2283. addi.d TL, TL, -1
  2284. #endif
  2285. slli.d T3, TL, 0x06 //TL k-steps of A at 64 bytes each
  2286. add.d A0, A0, T3
  2287. slli.d T3, TL, 0x03 //TL k-steps of B at 8 bytes each
  2288. add.d B0, B0, T3
  2289. #endif
  2290. #ifdef LEFT
  2291. addi.d OFF, OFF, 8
  2292. #endif
  2293. #endif // #if defined(TRMMKERNEL)
  2294. addi.d I, I, 1
  2295. blt I, T0, .L31 //T0 = bm/8, set before .L31
  2296. .L34: /* if ( bm & 4 ): one tile of 4 rows x 1 column of C */
  2297. move I, $r0
  2298. andi T1, M, 4 //bm&4
  2299. beq I, T1, .L38 //bit 2 of M clear: go on to the bm&2 tile
  2300. .L35:
  2301. move B0, B //ptrbb
  2302. move TL, K /* TL = bk */
  2303. #if defined(TRMMKERNEL)
  2304. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  2305. move B0, B //ptrbb
  2306. #else
  2307. slli.d T3, OFF, 0x05 //OFF * 32 bytes, the per-k stride of A0 in .L36
  2308. add.d A0, A0, T3
  2309. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of B0 in .L36
  2310. add.d B0, B, T3
  2311. #endif
  2312. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2313. sub.d TL, K, OFF //trip count = bk - off
  2314. #elif defined(LEFT)
  2315. addi.d TL, OFF, 4 //trip count = off + 4 (4 = rows in this tile)
  2316. #else
  2317. addi.d TL, OFF, 1 //trip count = off + 1 (1 = columns in this tile)
  2318. #endif
  2319. #endif // #if defined(TRMMKERNEL)
  2320. vxor.v U0, U0, U0 //clear the two accumulators (x ^ x = 0)
  2321. vxor.v U1, U1, U1
  2322. move L, $r0 //cycle param k
  2323. beq L, TL, .L37 //TL == 0: no k iterations
  2324. blt TL, L, .L37 //TL < 0: no k iterations
  2325. .L36: /* for (k=0; k<temp; k++) */
  2326. vld D0, A0, 0x00 // a0ri a1ri
  2327. vld D1, A0, 0x10 // a2ri a3ri
  2328. vldrepl.w D2, B0, 0x00 //b0r
  2329. vldrepl.w D3, B0, 0x04 //b0i
  2330. vpackev.w D4, D1, D0 //a0r a2r a1r a3r
  2331. vshuf4i.w D4, D4, 0xd8 //a0r a1r a2r a3r
  2332. vpackod.w D5, D1, D0 //a0i a2i a1i a3i
  2333. vshuf4i.w D5, D5, 0xd8 //a0i a1i a2i a3i
  2334. VMADD1 U0, D4, D2, U0 //00r 01r 02r 03r (VMADD1..4 are macros defined outside this chunk)
  2335. VMADD2 U1, D5, D2, U1 //00i 01i 02i 03i
  2336. VMADD3 U0, D5, D3, U0
  2337. VMADD4 U1, D4, D3, U1
  2338. addi.d A0, A0, 0x20 //A advances 32 bytes per k
  2339. addi.d B0, B0, 0x08 //B advances 8 bytes per k
  2340. addi.d L, L, 1
  2341. blt L, TL, .L36
  2342. .L37:
  2343. #if defined(TRMMKERNEL)
  2344. //res00 res01 res02 res03 (TRMM path: C is overwritten with alpha * acc)
  2345. vld D0, C0, 0x00 //c0: 0 1 2 3 (NOTE(review): the C values loaded here are never used; D2/D3 are overwritten by vfmul.s below)
  2346. vld D1, C0, 0x10 //c0: 4 5 6 7
  2347. vpackev.w D2, D1, D0
  2348. vshuf4i.w D2, D2, 0xd8 //0 2 4 6
  2349. vpackod.w D3, D1, D0
  2350. vshuf4i.w D3, D3, 0xd8 //1 3 5 7
  2351. vfmul.s D2, U0, VALPHAR //D2 = U0 * VALPHAR
  2352. vfmul.s D3, U1, VALPHAR //D3 = U1 * VALPHAR
  2353. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI (macro defined outside this chunk)
  2354. VFMADD D3, U0, VALPHAI, D3 //presumably D3 += U0 * VALPHAI (macro defined outside this chunk)
  2355. vand.v D4, D3, D3 //1 3 5 7 (x & x = x: copies D3 into D4)
  2356. vpermi.w D4, D2, 0x44 //0 2 1 3
  2357. vshuf4i.w D4, D4, 0xd8 //0 1 2 3
  2358. vand.v D5, D3, D3 //1 3 5 7
  2359. vpermi.w D5, D2, 0xee //4 6 5 7
  2360. vshuf4i.w D5, D5, 0xd8 //4 5 6 7
  2361. vst D4, C0, 0x00
  2362. vst D5, C0, 0x10
  2363. addi.d C0, C0, 0x20
  2364. #else
  2365. //res00 res01 res02 res03 (GEMM path: the loaded C values are accumulated into)
  2366. vld D0, C0, 0x00 //c0: 0 1 2 3
  2367. vld D1, C0, 0x10 //c0: 4 5 6 7
  2368. vpackev.w D2, D1, D0
  2369. vshuf4i.w D2, D2, 0xd8 //0 2 4 6
  2370. vpackod.w D3, D1, D0
  2371. vshuf4i.w D3, D3, 0xd8 //1 3 5 7
  2372. VFMADD D2, U0, VALPHAR, D2 //presumably D2 += U0 * VALPHAR (macro defined outside this chunk)
  2373. VFMADD D3, U1, VALPHAR, D3
  2374. VNMSUB D2, U1, VALPHAI, D2 //presumably D2 -= U1 * VALPHAI
  2375. VFMADD D3, U0, VALPHAI, D3
  2376. vand.v D4, D3, D3 //1 3 5 7 (x & x = x: copies D3 into D4)
  2377. vpermi.w D4, D2, 0x44 //0 2 1 3
  2378. vshuf4i.w D4, D4, 0xd8 //0 1 2 3
  2379. vand.v D5, D3, D3 //1 3 5 7
  2380. vpermi.w D5, D2, 0xee //4 6 5 7
  2381. vshuf4i.w D5, D5, 0xd8 //4 5 6 7
  2382. vst D4, C0, 0x00
  2383. vst D5, C0, 0x10
  2384. addi.d C0, C0, 0x20
  2385. #endif
  2386. #if defined(TRMMKERNEL)
  2387. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2388. sub.d TL, K, OFF
  2389. #ifdef LEFT
  2390. addi.d TL, TL, -4
  2391. #else
  2392. addi.d TL, TL, -1
  2393. #endif
  2394. slli.d T3, TL, 0x05 //TL k-steps of A at 32 bytes each
  2395. add.d A0, A0, T3
  2396. slli.d T3, TL, 0x03 //TL k-steps of B at 8 bytes each
  2397. add.d B0, B0, T3
  2398. #endif
  2399. #ifdef LEFT
  2400. addi.d OFF, OFF, 4
  2401. #endif
  2402. #endif // #if defined(TRMMKERNEL)
  2403. .L38: /* if ( bm & 2 ): one tile of 2 rows x 1 column of C, computed with scalar FP registers */
  2404. move I, $r0
  2405. andi T1, M, 2 //bm&2
  2406. beq I, T1, .L312 //bit 1 of M clear: go on to the bm&1 tile
  2407. .L39:
  2408. move B0, B //ptrbb
  2409. move TL, K /* TL = bk */
  2410. #if defined(TRMMKERNEL)
  2411. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  2412. move B0, B //ptrbb
  2413. #else
  2414. slli.d T3, OFF, 0x04 //OFF * 16 bytes, the per-k stride of A0 in .L310
  2415. add.d A0, A0, T3
  2416. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of B0 in .L310
  2417. add.d B0, B, T3
  2418. #endif
  2419. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2420. sub.d TL, K, OFF //trip count = bk - off
  2421. #elif defined(LEFT)
  2422. addi.d TL, OFF, 2 //trip count = off + 2 (2 = rows in this tile)
  2423. #else
  2424. addi.d TL, OFF, 1 //trip count = off + 1 (1 = columns in this tile)
  2425. #endif
  2426. #endif // #if defined(TRMMKERNEL)
  2427. MTC c11, $r0 //set the scalar accumulators from the zero register (MTC is a macro defined outside this chunk)
  2428. MTC c12, $r0
  2429. MTC c21, $r0
  2430. MTC c22, $r0
  2431. move L, $r0 //cycle param k
  2432. beq L, TL, .L311 //TL == 0: no k iterations
  2433. blt TL, L, .L311 //TL < 0: no k iterations
  2434. .L310: /* for (k=0; k<temp; k++) */
  2435. LD a1, A0, 0x00 //a0r
  2436. LD a2, A0, 0x04 //a0i
  2437. LD a3, A0, 0x08 //a1r
  2438. LD a4, A0, 0x0c //a1i
  2439. LD b1, B0, 0x00 //b0r
  2440. LD b2, B0, 0x04 //b0i
  2441. MADD1 c11, a1, b1, c11 //a0 * b0, real part (MADD1..4 are macros defined outside this chunk)
  2442. MADD2 c12, a2, b1, c12 //a0 * b0, imaginary part
  2443. MADD3 c11, a2, b2, c11
  2444. MADD4 c12, a1, b2, c12
  2445. MADD1 c21, a3, b1, c21 //a1 * b0, real part (second row of this column)
  2446. MADD2 c22, a4, b1, c22 //a1 * b0, imaginary part
  2447. MADD3 c21, a4, b2, c21
  2448. MADD4 c22, a3, b2, c22
  2449. addi.d A0, A0, 0x10 //A advances 16 bytes per k
  2450. addi.d B0, B0, 0x08 //B advances 8 bytes per k
  2451. addi.d L, L, 1
  2452. blt L, TL, .L310
  2453. .L311:
  2454. #if defined(TRMMKERNEL)
  2455. //two consecutive complex elements of column C0 (TRMM path: C is overwritten with alpha * acc)
  2456. LD a5, C0, 0x00 //C0[0] (NOTE(review): a5..a8 are overwritten by MUL below before being read, so these four loads appear unused; the 1x1 TRMM path at .L315 has no such loads)
  2457. LD a6, C0, 0x04 //C0[1]
  2458. LD a7, C0, 0x08 //C0[2]
  2459. LD a8, C0, 0x0c //C0[3]
  2460. MUL a5, c11, ALPHA_R //presumably a5 = c11 * ALPHA_R (MUL/NMSUB/MADD are macros defined outside this chunk)
  2461. MUL a6, c12, ALPHA_R
  2462. NMSUB a5, c12, ALPHA_I, a5 //presumably a5 -= c12 * ALPHA_I
  2463. MADD a6, c11, ALPHA_I, a6 //presumably a6 += c11 * ALPHA_I
  2464. MUL a7, c21, ALPHA_R //same sequence for the second element
  2465. MUL a8, c22, ALPHA_R
  2466. NMSUB a7, c22, ALPHA_I, a7
  2467. MADD a8, c21, ALPHA_I, a8
  2468. ST a5, C0, 0x00
  2469. ST a6, C0, 0x04
  2470. ST a7, C0, 0x08
  2471. ST a8, C0, 0x0c
  2472. addi.d C0, C0, 0x10
  2473. #else
  2474. //two consecutive complex elements of column C0 (GEMM path: the loaded C values are accumulated into)
  2475. LD a5, C0, 0x00 //C0[0]
  2476. LD a6, C0, 0x04 //C0[1]
  2477. LD a7, C0, 0x08 //C0[2]
  2478. LD a8, C0, 0x0c //C0[3]
  2479. MADD a5, c11, ALPHA_R, a5 //presumably a5 += c11 * ALPHA_R (macro defined outside this chunk)
  2480. MADD a6, c12, ALPHA_R, a6
  2481. NMSUB a5, c12, ALPHA_I, a5 //presumably a5 -= c12 * ALPHA_I
  2482. MADD a6, c11, ALPHA_I, a6
  2483. MADD a7, c21, ALPHA_R, a7 //same sequence for the second element
  2484. MADD a8, c22, ALPHA_R, a8
  2485. NMSUB a7, c22, ALPHA_I, a7
  2486. MADD a8, c21, ALPHA_I, a8
  2487. ST a5, C0, 0x00
  2488. ST a6, C0, 0x04
  2489. ST a7, C0, 0x08
  2490. ST a8, C0, 0x0c
  2491. addi.d C0, C0, 0x10
  2492. #endif
  2493. #if defined(TRMMKERNEL)
  2494. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2495. sub.d TL, K, OFF
  2496. #ifdef LEFT
  2497. addi.d TL, TL, -2
  2498. #else
  2499. addi.d TL, TL, -1
  2500. #endif
  2501. slli.d T3, TL, 0x04 //TL k-steps of A at 16 bytes each
  2502. add.d A0, A0, T3
  2503. slli.d T3, TL, 0x03 //TL k-steps of B at 8 bytes each
  2504. add.d B0, B0, T3
  2505. #endif
  2506. #ifdef LEFT
  2507. addi.d OFF, OFF, 2
  2508. #endif
  2509. #endif // #if defined(TRMMKERNEL)
  2510. .L312: /* if ( bm & 1 ): one single complex element of C, computed with scalar FP registers */
  2511. move I, $r0
  2512. andi T1, M, 1 //bm&1
  2513. beq I, T1, .L316 //bit 0 of M clear: this column is finished
  2514. .L313:
  2515. move B0, B //ptrbb
  2516. move TL, K /* TL = bk */
  2517. #if defined(TRMMKERNEL)
  2518. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  2519. move B0, B //ptrbb
  2520. #else
  2521. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of A0 in .L314
  2522. add.d A0, A0, T3
  2523. slli.d T3, OFF, 0x03 //OFF * 8 bytes, the per-k stride of B0 in .L314
  2524. add.d B0, B, T3
  2525. #endif
  2526. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2527. sub.d TL, K, OFF //trip count = bk - off
  2528. #elif defined(LEFT)
  2529. addi.d TL, OFF, 1 //trip count = off + 1
  2530. #else
  2531. addi.d TL, OFF, 1 //trip count = off + 1
  2532. #endif
  2533. #endif // #if defined(TRMMKERNEL)
  2534. MTC c11, $r0 //set the scalar accumulators from the zero register (MTC is a macro defined outside this chunk)
  2535. MTC c12, $r0
  2536. move L, $r0 //cycle param k
  2537. beq L, TL, .L315 //TL == 0: no k iterations
  2538. blt TL, L, .L315 //TL < 0: no k iterations
  2539. .L314: /* for (k=0; k<temp; k++) */
  2540. LD a1, A0, 0x00 //first float at A0 (a0r by the layout used in the tiles above)
  2541. LD a2, A0, 0x04 //second float at A0 (a0i)
  2542. LD b1, B0, 0x00 //first float at B0 (b0r)
  2543. LD b2, B0, 0x04 //second float at B0 (b0i)
  2544. MADD1 c11, a1, b1, c11 //MADD1..4 are macros defined outside this chunk
  2545. MADD2 c12, a2, b1, c12
  2546. MADD3 c11, a2, b2, c11
  2547. MADD4 c12, a1, b2, c12
  2548. addi.d A0, A0, 0x08 //A advances 8 bytes per k
  2549. addi.d B0, B0, 0x08 //B advances 8 bytes per k
  2550. addi.d L, L, 1
  2551. blt L, TL, .L314
  2552. .L315:
  2553. #if defined(TRMMKERNEL)
  2554. MUL a5, c11, ALPHA_R //TRMM path: C is not read here; presumably a5 = c11 * ALPHA_R (MUL/SUB/ADD are macros defined outside this chunk)
  2555. MUL a6, c12, ALPHA_I
  2556. SUB a5, a5, a6 //presumably a5 = c11 * ALPHA_R - c12 * ALPHA_I
  2557. ST a5, C0, 0x00
  2558. MUL a5, c12, ALPHA_R
  2559. MUL a6, c11, ALPHA_I
  2560. ADD a6, a5, a6 //presumably a6 = c12 * ALPHA_R + c11 * ALPHA_I
  2561. ST a6, C0, 0x04
  2562. addi.d C0, C0, 0x08
  2563. #else
  2564. LD a5, C0, 0x00 //C0[0] (GEMM path: the loaded C values are accumulated into)
  2565. LD a6, C0, 0x04 //C0[1]
  2566. MADD a5, c11, ALPHA_R, a5 //presumably a5 += c11 * ALPHA_R (macro defined outside this chunk)
  2567. MADD a6, c12, ALPHA_R, a6
  2568. NMSUB a5, c12, ALPHA_I, a5 //presumably a5 -= c12 * ALPHA_I
  2569. MADD a6, c11, ALPHA_I, a6
  2570. ST a5, C0, 0x00
  2571. ST a6, C0, 0x04
  2572. addi.d C0, C0, 0x08
  2573. #endif
  2574. #if defined(TRMMKERNEL)
  2575. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2576. sub.d TL, K, OFF
  2577. #ifdef LEFT
  2578. addi.d TL, TL, -1
  2579. #else
  2580. addi.d TL, TL, -1
  2581. #endif
  2582. slli.d T3, TL, 0x03 //TL k-steps at 8 bytes each; the same byte count applies to A0 and B0
  2583. add.d A0, A0, T3
  2584. add.d B0, B0, T3
  2585. #endif
  2586. #ifdef LEFT
  2587. addi.d OFF, OFF, 1
  2588. #endif
  2589. #endif // #if defined(TRMMKERNEL)
  2590. .L316: /* end of the single-column pass: advance B and C, then test the column loop */
  2591. slli.d L, K, 3 //K * 8 bytes, i.e. K k-steps at the 8-byte B stride used by the tiles above
  2592. add.d B, B, L
  2593. slli.d I, LDC, 1 //C advances by LDC << 1, half of the step taken after a 2-column pass at .L288
  2594. add.d C, C, I
  2595. addi.d J, J, 1
  2596. andi T0, N, 1
  2597. blt J, T0, .L300 //J was zeroed at .L30 and incremented once here, T0 is 0 or 1: as far as this chunk shows the branch is not taken
  2598. .L999: /* common exit: restore saved registers from the stack frame and return */
  2599. LDARG $r23, $sp, 0 //LDARG is a macro defined outside this chunk; five integer registers in 8-byte slots at offsets 0..32
  2600. LDARG $r24, $sp, 8
  2601. LDARG $r25, $sp, 16
  2602. LDARG $r26, $sp, 24
  2603. LDARG $r27, $sp, 32
  2604. LD $f23, $sp, 40 //nine FP registers in 8-byte slots at offsets 40..104
  2605. LD $f24, $sp, 48 //NOTE(review): LD is the same macro used for 4-byte element loads above; if it is a 32-bit load, only the low word of each FP register is restored - confirm against the prologue and the ABI
  2606. LD $f25, $sp, 56
  2607. LD $f26, $sp, 64
  2608. LD $f27, $sp, 72
  2609. LD $f28, $sp, 80
  2610. LD $f29, $sp, 88
  2611. LD $f30, $sp, 96
  2612. LD $f31, $sp, 104
  2613. addi.d $sp, $sp, 128 //release the 128-byte frame (the matching allocation is outside this chunk)
  2614. jirl $r0, $r1, 0x0 //return: jump to the address in $r1, link value discarded into $r0
  2615. EPILOGUE