
zgemm_kernel_4x4_lsx.S

  1. /*******************************************************************************
  2. Copyright (c) 2024, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. *******************************************************************************/
  27. #define ASSEMBLER
  28. #include "common.h"
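/* zgemm_kernel_4x4_lsx.S: double-precision complex GEMM/TRMM micro-kernel for
   LoongArch64 using the 128-bit LSX vector extension.  It computes a 4x4
   register-blocked tile of C = alpha*A*B (accumulating into C in the plain
   GEMM case), with dedicated tail paths for the M%4 and N%4 remainders.
   Each LSX register holds either one complex double (re, im) or a packed
   pair of reals/imaginaries taken from two elements. */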
  29. /* Function parameters */
  30. #define M $r4 // param 1: bm
  31. #define N $r5 // param 2: bn
  32. #define K $r6 // param 3: bk
  33. #define ALPHA_R $f0 // param 4: alphar
  34. #define ALPHA_I $f1 // param 5: alphai
  35. #define A $r7 // param 6: ba
  36. #define B $r8 // param 7: bb
  37. #define C $r9 // param 8: bc
  38. #define LDC $r10 // param 9: ldc
  39. #if defined (TRMMKERNEL)
  40. #define OFFSET $r11 // param 10: offset
  41. #endif
  42. #define OFF $r26
  43. #define I $r12
  44. #define J $r13
  45. #define L $r14
  46. #define TL $r15
  47. #define A0 $r16
  48. #define B0 $r17
  49. #define C0 $r18
  50. #define C1 $r19
  51. #define C2 $r20
  52. #define C3 $r23
  53. #define T0 $r24
  54. #define T1 $r25
  55. #define T2 $r26
  56. #define T3 $r27
  57. #define a1 $f2
  58. #define a2 $f3
  59. #define a3 $f4
  60. #define a4 $f5
  61. #define a5 $f6
  62. #define a6 $f7
  63. #define a7 $f8
  64. #define a8 $f9
  65. #define b1 $f10
  66. #define b2 $f11
  67. #define b3 $f12
  68. #define b4 $f13
  69. #define b5 $f14
  70. #define b6 $f15
  71. #define b7 $f16
  72. #define b8 $f17
  73. #define c11 $f18
  74. #define c12 $f19
  75. #define c21 $f20
  76. #define c22 $f21
  77. #define c31 $f22
  78. #define c32 $f23
  79. #define c41 $f24
  80. #define c42 $f25
  81. /* LSX vectors */
  82. #define U0 $vr30
  83. #define U1 $vr31
  84. #define U2 $vr2
  85. #define U3 $vr3
  86. #define U4 $vr4
  87. #define U5 $vr5
  88. #define U6 $vr6
  89. #define U7 $vr7
  90. #define U8 $vr8
  91. #define U9 $vr9
  92. #define U10 $vr10
  93. #define U11 $vr11
  94. #define U12 $vr12
  95. #define U13 $vr13
  96. #define U14 $vr14
  97. #define U15 $vr15
  98. #define D0 $vr16
  99. #define D1 $vr17
  100. #define D2 $vr18
  101. #define D3 $vr19
  102. #define D4 $vr20
  103. #define D5 $vr21
  104. #define D6 $vr22
  105. #define D7 $vr23
  106. #define D8 $vr24
  107. #define D9 $vr25
  108. #define D10 $vr26
  109. #define D11 $vr27
  110. #define D12 $vr28
  111. #define D13 $vr29
  112. #define VALPHAR $vr28
  113. #define VALPHAI $vr29
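/* The complex product (a_r + i*a_i)*(b_r + i*b_i) needs one sign-flipped cross
   term per conjugated operand, so the four conjugation groups below map the
   VMADD1..VMADD4 / MADD1..MADD4 helpers to the fused multiply-add (VFMADD/MADD)
   or multiply-subtract (VNMSUB/NMSUB) forms accordingly. */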
  114. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  115. #define VMADD1 VFMADD
  116. #define VMADD2 VFMADD
  117. #define VMADD3 VNMSUB
  118. #define VMADD4 VFMADD
  119. #define MADD1 MADD
  120. #define MADD2 MADD
  121. #define MADD3 NMSUB
  122. #define MADD4 MADD
  123. #endif
  124. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  125. #define VMADD1 VFMADD
  126. #define VMADD2 VFMADD
  127. #define VMADD3 VFMADD
  128. #define VMADD4 VNMSUB
  129. #define MADD1 MADD
  130. #define MADD2 MADD
  131. #define MADD3 MADD
  132. #define MADD4 NMSUB
  133. #endif
  134. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  135. #define VMADD1 VFMADD
  136. #define VMADD2 VNMSUB
  137. #define VMADD3 VFMADD
  138. #define VMADD4 VFMADD
  139. #define MADD1 MADD
  140. #define MADD2 NMSUB
  141. #define MADD3 MADD
  142. #define MADD4 MADD
  143. #endif
  144. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  145. #define VMADD1 VFMADD
  146. #define VMADD2 VNMSUB
  147. #define VMADD3 VNMSUB
  148. #define VMADD4 VNMSUB
  149. #define MADD1 MADD
  150. #define MADD2 NMSUB
  151. #define MADD3 NMSUB
  152. #define MADD4 NMSUB
  153. #endif
  154. PROLOGUE
  155. addi.d $sp, $sp, -128
  156. SDARG $r23, $sp, 0
  157. SDARG $r24, $sp, 8
  158. SDARG $r25, $sp, 16
  159. SDARG $r26, $sp, 24
  160. SDARG $r27, $sp, 32
  161. ST $f23, $sp, 40
  162. ST $f24, $sp, 48
  163. ST $f25, $sp, 56
  164. ST $f26, $sp, 64
  165. ST $f27, $sp, 72
  166. ST $f28, $sp, 80
  167. ST $f29, $sp, 88
  168. ST $f30, $sp, 96
  169. ST $f31, $sp, 104
  170. ST ALPHA_R, $sp, 112
  171. ST ALPHA_I, $sp, 120
  172. vldrepl.d VALPHAR, $sp, 112
  173. vldrepl.d VALPHAI, $sp, 120
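// alpha_r/alpha_i are spilled to the stack and then broadcast into both lanes
// of VALPHAR/VALPHAI so the epilogue can scale a packed pair of results at once.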
  174. #if defined (TRMMKERNEL) && !defined(LEFT)
  175. sub.d OFF, $r0, OFFSET
  176. #else
  177. xor OFF, OFF, OFF
  178. #endif
  179. slli.d LDC, LDC, BASE_SHIFT
  180. move J, $r0
  181. srai.d T0, N, 2 //bn/4
  182. beq J, T0, .L19
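/* Outer loop over column panels of four: C0..C3 point at four consecutive
   columns of C (one column of complex doubles apart) and A0 walks the packed
   A panel from the start for every panel. */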
  183. .L10: /* for(j=0; j<bn/4; j+=1) */
  184. move C0, C
  185. slli.d TL, LDC, 1
  186. add.d C1, C0, TL
  187. add.d C2, C1, TL
  188. add.d C3, C2, TL
  189. move A0, A //ptrba
  190. #if defined(TRMMKERNEL) && defined(LEFT)
  191. move OFF, OFFSET
  192. #endif
  193. move I, $r0
  194. srai.d T0, M, 2 //bm/4
  195. beq I, T0, .L18
  196. .L11: /* for(i=0; i<bm/4; i+=1) */
  197. move B0, B //ptrbb
  198. move TL, K /* TL = bk */
  199. #if defined(TRMMKERNEL)
  200. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  201. move B0, B //ptrbb
  202. #else
  203. slli.d T3, OFF, 0x06
  204. add.d A0, A0, T3
  205. slli.d T3, OFF, 0x06
  206. add.d B0, B, T3
  207. #endif
  208. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  209. sub.d TL, K, OFF //temp
  210. #elif defined(LEFT)
  211. addi.d TL, OFF, 4
  212. #else
  213. addi.d TL, OFF, 4
  214. #endif
  215. #endif // #if defined(TRMMKERNEL)
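// Clear the accumulators.  U0..U15 hold the 4x4 complex tile: each even/odd
// pair (U0/U1, U2/U3, ...) packs the real parts and the imaginary parts of two
// result entries that share the same A element.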
  216. vxor.v U0, U0, U0
  217. vxor.v U1, U1, U1
  218. vxor.v U2, U2, U2
  219. vxor.v U3, U3, U3
  220. vxor.v U4, U4, U4
  221. vxor.v U5, U5, U5
  222. vxor.v U6, U6, U6
  223. vxor.v U7, U7, U7
  224. vxor.v U8, U8, U8
  225. vxor.v U9, U9, U9
  226. vxor.v U10, U10, U10
  227. vxor.v U11, U11, U11
  228. vxor.v U12, U12, U12
  229. vxor.v U13, U13, U13
  230. vxor.v U14, U14, U14
  231. vxor.v U15, U15, U15
  232. move L, $r0 //cycle param k
  233. beq L, TL, .L13
  234. blt TL, L, .L13
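/* Main k-loop over the 4x4 tile.  Each iteration loads four B and four A
   complex values, splats a_r/a_i with vshuf4i.d, packs b_r/b_i pairs with
   vpackev/vpackod, and accumulates real parts a_r*b_r -/+ a_i*b_i and
   imaginary parts a_i*b_r +/- a_r*b_i through the VMADD1..VMADD4 macros
   selected above. */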
  235. .L12: /* for(k=0; k<temp; k+=1) */
  236. vld D1, B0, 0x00 // b0ri
  237. vld D2, B0, 0x10 // b1ri
  238. vld D3, B0, 0x20 // b2ri
  239. vld D4, B0, 0x30 // b3ri
  240. vld D0, A0, 0x00 // a0ri
  241. vand.v D5, D0, D0
  242. vand.v D6, D0, D0
  243. vshuf4i.d D5, D0, 0x00 //a0rr
  244. vshuf4i.d D6, D0, 0x55 //a0ii
  245. vpackev.d D7, D2, D1 //b0r b1r
  246. vpackod.d D8, D2, D1 //b0i b1i
  247. vpackev.d D9, D4, D3 //b2r b3r
  248. vpackod.d D10, D4, D3 //b2i b3i
  249. VMADD1 U0, D5, D7, U0 //00r 10r
  250. VMADD2 U1, D6, D7, U1 //00i 10i
  251. VMADD3 U0, D6, D8, U0
  252. VMADD4 U1, D5, D8, U1
  253. VMADD1 U2, D5, D9, U2 //20r 30r
  254. VMADD2 U3, D6, D9, U3 //20i 30i
  255. VMADD3 U2, D6, D10, U2
  256. VMADD4 U3, D5, D10, U3
  257. vld D0, A0, 0x10 // a1ri
  258. vand.v D5, D0, D0
  259. vand.v D6, D0, D0
  260. vshuf4i.d D5, D0, 0x00 //a1rr
  261. vshuf4i.d D6, D0, 0x55 //a1ii
  262. VMADD1 U4, D5, D7, U4 //01r 11r
  263. VMADD2 U5, D6, D7, U5 //01i 11i
  264. VMADD3 U4, D6, D8, U4
  265. VMADD4 U5, D5, D8, U5
  266. VMADD1 U6, D5, D9, U6 //21r 31r
  267. VMADD2 U7, D6, D9, U7 //21i 31i
  268. VMADD3 U6, D6, D10, U6
  269. VMADD4 U7, D5, D10, U7
  270. vld D0, A0, 0x20 // a2ri
  271. vand.v D5, D0, D0
  272. vand.v D6, D0, D0
  273. vshuf4i.d D5, D0, 0x00 //a2rr
  274. vshuf4i.d D6, D0, 0x55 //a2ii
  275. VMADD1 U8, D5, D7, U8 //02r 12r
  276. VMADD2 U9, D6, D7, U9 //02i 12i
  277. VMADD3 U8, D6, D8, U8
  278. VMADD4 U9, D5, D8, U9
  279. VMADD1 U10, D5, D9, U10 //22r 32r
  280. VMADD2 U11, D6, D9, U11 //22i 32i
  281. VMADD3 U10, D6, D10, U10
  282. VMADD4 U11, D5, D10, U11
  283. vld D0, A0, 0x30 // a3ri
  284. vand.v D5, D0, D0
  285. vand.v D6, D0, D0
  286. vshuf4i.d D5, D0, 0x00 //a3rr
  287. vshuf4i.d D6, D0, 0x55 //a3ii
  288. VMADD1 U12, D5, D7, U12 //03r 13r
  289. VMADD2 U13, D6, D7, U13 //03i 13i
  290. VMADD3 U12, D6, D8, U12
  291. VMADD4 U13, D5, D8, U13
  292. VMADD1 U14, D5, D9, U14 //23r 33r
  293. VMADD2 U15, D6, D9, U15 //23i 33i
  294. VMADD3 U14, D6, D10, U14
  295. VMADD4 U15, D5, D10, U15
  296. addi.d A0, A0, 0x40
  297. addi.d B0, B0, 0x40
  298. addi.d L, L, 1
  299. blt L, TL, .L12
  300. .L13:
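/* Write the 4x4 tile back.  The TRMM path stores alpha*(A*B) directly, while
   the GEMM path below it loads the existing C and accumulates alpha*(A*B) + C.
   vpackev/vpackod first de-interleave two C entries into an (re,re)/(im,im)
   pair for the vector scale by VALPHAR/VALPHAI, then re-interleave them into
   per-column (re,im) form before the stores. */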
  301. #if defined(TRMMKERNEL)
  302. //res00 res10
  303. vld D0, C0, 0x00 //c0: 0 1
  304. vld D1, C1, 0x00 //c1: 0 1
  305. vpackev.d D2, D1, D0 //c0[0] c1[0]
  306. vpackod.d D3, D1, D0 //c0[1] c1[1]
  307. vfmul.d D2, U0, VALPHAR
  308. vfmul.d D3, U1, VALPHAR
  309. VNMSUB D2, U1, VALPHAI, D2
  310. VFMADD D3, U0, VALPHAI, D3
  311. vpackev.d D4, D3, D2 //c0[0] c0[1]
  312. vpackod.d D5, D3, D2 //c1[0] c1[1]
  313. vst D4, C0, 0x00
  314. vst D5, C1, 0x00
  315. addi.d C0, C0, 0x10
  316. addi.d C1, C1, 0x10
  317. //res20 res30
  318. vld D0, C2, 0x00 //c2: 0 1
  319. vld D1, C3, 0x00 //c3: 0 1
  320. vpackev.d D2, D1, D0 //c2[0] c3[0]
  321. vpackod.d D3, D1, D0 //c2[1] c3[1]
  322. vfmul.d D2, U2, VALPHAR
  323. vfmul.d D3, U3, VALPHAR
  324. VNMSUB D2, U3, VALPHAI, D2
  325. VFMADD D3, U2, VALPHAI, D3
  326. vpackev.d D4, D3, D2 //c2[0] c2[1]
  327. vpackod.d D5, D3, D2 //c3[0] c3[1]
  328. vst D4, C2, 0x00
  329. vst D5, C3, 0x00
  330. addi.d C2, C2, 0x10
  331. addi.d C3, C3, 0x10
  332. //res01 res11
  333. vld D0, C0, 0x00 //c0: 0 1
  334. vld D1, C1, 0x00 //c1: 0 1
  335. vpackev.d D2, D1, D0 //c0[0] c1[0]
  336. vpackod.d D3, D1, D0 //c0[1] c1[1]
  337. vfmul.d D2, U4, VALPHAR
  338. vfmul.d D3, U5, VALPHAR
  339. VNMSUB D2, U5, VALPHAI, D2
  340. VFMADD D3, U4, VALPHAI, D3
  341. vpackev.d D4, D3, D2 //c0[0] c0[1]
  342. vpackod.d D5, D3, D2 //c1[0] c1[1]
  343. vst D4, C0, 0x00
  344. vst D5, C1, 0x00
  345. addi.d C0, C0, 0x10
  346. addi.d C1, C1, 0x10
  347. //res21 res31
  348. vld D0, C2, 0x00 //c2: 0 1
  349. vld D1, C3, 0x00 //c3: 0 1
  350. vpackev.d D2, D1, D0 //c2[0] c3[0]
  351. vpackod.d D3, D1, D0 //c2[1] c3[1]
  352. vfmul.d D2, U6, VALPHAR
  353. vfmul.d D3, U7, VALPHAR
  354. VNMSUB D2, U7, VALPHAI, D2
  355. VFMADD D3, U6, VALPHAI, D3
  356. vpackev.d D4, D3, D2 //c2[0] c2[1]
  357. vpackod.d D5, D3, D2 //c3[0] c3[1]
  358. vst D4, C2, 0x00
  359. vst D5, C3, 0x00
  360. addi.d C2, C2, 0x10
  361. addi.d C3, C3, 0x10
  362. //res02 res12
  363. vld D0, C0, 0x00 //c0: 0 1
  364. vld D1, C1, 0x00 //c1: 0 1
  365. vpackev.d D2, D1, D0 //c0[0] c1[0]
  366. vpackod.d D3, D1, D0 //c0[1] c1[1]
  367. vfmul.d D2, U8, VALPHAR
  368. vfmul.d D3, U9, VALPHAR
  369. VNMSUB D2, U9, VALPHAI, D2
  370. VFMADD D3, U8, VALPHAI, D3
  371. vpackev.d D4, D3, D2 //c0[0] c0[1]
  372. vpackod.d D5, D3, D2 //c1[0] c1[1]
  373. vst D4, C0, 0x00
  374. vst D5, C1, 0x00
  375. addi.d C0, C0, 0x10
  376. addi.d C1, C1, 0x10
  377. //res22 res32
  378. vld D0, C2, 0x00 //c2: 0 1
  379. vld D1, C3, 0x00 //c3: 0 1
  380. vpackev.d D2, D1, D0 //c2[0] c3[0]
  381. vpackod.d D3, D1, D0 //c2[1] c3[1]
  382. vfmul.d D2, U10, VALPHAR
  383. vfmul.d D3, U11, VALPHAR
  384. VNMSUB D2, U11, VALPHAI, D2
  385. VFMADD D3, U10, VALPHAI, D3
  386. vpackev.d D4, D3, D2 //c2[0] c2[1]
  387. vpackod.d D5, D3, D2 //c3[0] c3[1]
  388. vst D4, C2, 0x00
  389. vst D5, C3, 0x00
  390. addi.d C2, C2, 0x10
  391. addi.d C3, C3, 0x10
  392. //res03 res13
  393. vld D0, C0, 0x00 //c0: 0 1
  394. vld D1, C1, 0x00 //c1: 0 1
  395. vpackev.d D2, D1, D0 //c0[0] c1[0]
  396. vpackod.d D3, D1, D0 //c0[1] c1[1]
  397. vfmul.d D2, U12, VALPHAR
  398. vfmul.d D3, U13, VALPHAR
  399. VNMSUB D2, U13, VALPHAI, D2
  400. VFMADD D3, U12, VALPHAI, D3
  401. vpackev.d D4, D3, D2 //c0[0] c0[1]
  402. vpackod.d D5, D3, D2 //c1[0] c1[1]
  403. vst D4, C0, 0x00
  404. vst D5, C1, 0x00
  405. addi.d C0, C0, 0x10
  406. addi.d C1, C1, 0x10
  407. //res23 res33
  408. vld D0, C2, 0x00 //c2: 0 1
  409. vld D1, C3, 0x00 //c3: 0 1
  410. vpackev.d D2, D1, D0 //c2[0] c3[0]
  411. vpackod.d D3, D1, D0 //c2[1] c3[1]
  412. vfmul.d D2, U14, VALPHAR
  413. vfmul.d D3, U15, VALPHAR
  414. VNMSUB D2, U15, VALPHAI, D2
  415. VFMADD D3, U14, VALPHAI, D3
  416. vpackev.d D4, D3, D2 //c2[0] c2[1]
  417. vpackod.d D5, D3, D2 //c3[0] c3[1]
  418. vst D4, C2, 0x00
  419. vst D5, C3, 0x00
  420. addi.d C2, C2, 0x10
  421. addi.d C3, C3, 0x10
  422. #else
  423. //res00 res10
  424. vld D0, C0, 0x00 //c0: 0 1
  425. vld D1, C1, 0x00 //c1: 0 1
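// The vst/fld.d pairs below dump each accumulator U0..U15 through C0 into the
// scratch register $f27.  They do not affect the final result (the stores of
// D4/D5 further down overwrite that memory) and appear to be leftover debug probes.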
  426. vst U0, C0, 0x00
  427. fld.d $f27, C0, 0x00
  428. fld.d $f27, C0, 0x08
  429. vst U1, C0, 0x00
  430. fld.d $f27, C0, 0x00
  431. fld.d $f27, C0, 0x08
  432. vst U2, C0, 0x00
  433. fld.d $f27, C0, 0x00
  434. fld.d $f27, C0, 0x08
  435. vst U3, C0, 0x00
  436. fld.d $f27, C0, 0x00
  437. fld.d $f27, C0, 0x08
  438. vst U4, C0, 0x00
  439. fld.d $f27, C0, 0x00
  440. fld.d $f27, C0, 0x08
  441. vst U5, C0, 0x00
  442. fld.d $f27, C0, 0x00
  443. fld.d $f27, C0, 0x08
  444. vst U6, C0, 0x00
  445. fld.d $f27, C0, 0x00
  446. fld.d $f27, C0, 0x08
  447. vst U7, C0, 0x00
  448. fld.d $f27, C0, 0x00
  449. fld.d $f27, C0, 0x08
  450. vst U8, C0, 0x00
  451. fld.d $f27, C0, 0x00
  452. fld.d $f27, C0, 0x08
  453. vst U9, C0, 0x00
  454. fld.d $f27, C0, 0x00
  455. fld.d $f27, C0, 0x08
  456. vst U10, C0, 0x00
  457. fld.d $f27, C0, 0x00
  458. fld.d $f27, C0, 0x08
  459. vst U11, C0, 0x00
  460. fld.d $f27, C0, 0x00
  461. fld.d $f27, C0, 0x08
  462. vst U12, C0, 0x00
  463. fld.d $f27, C0, 0x00
  464. fld.d $f27, C0, 0x08
  465. vst U13, C0, 0x00
  466. fld.d $f27, C0, 0x00
  467. fld.d $f27, C0, 0x08
  468. vst U14, C0, 0x00
  469. fld.d $f27, C0, 0x00
  470. fld.d $f27, C0, 0x08
  471. vst U15, C0, 0x00
  472. fld.d $f27, C0, 0x00
  473. fld.d $f27, C0, 0x08
  474. vpackev.d D2, D1, D0 //c0[0] c1[0]
  475. vpackod.d D3, D1, D0 //c0[1] c1[1]
  476. VFMADD D2, U0, VALPHAR, D2
  477. VFMADD D3, U1, VALPHAR, D3
  478. VNMSUB D2, U1, VALPHAI, D2
  479. VFMADD D3, U0, VALPHAI, D3
  480. vpackev.d D4, D3, D2 //c0[0] c0[1]
  481. vpackod.d D5, D3, D2 //c1[0] c1[1]
  482. vst D4, C0, 0x00
  483. vst D5, C1, 0x00
  484. addi.d C0, C0, 0x10
  485. addi.d C1, C1, 0x10
  486. //res20 res30
  487. vld D0, C2, 0x00 //c2: 0 1
  488. vld D1, C3, 0x00 //c3: 0 1
  489. vpackev.d D2, D1, D0 //c2[0] c3[0]
  490. vpackod.d D3, D1, D0 //c2[1] c3[1]
  491. VFMADD D2, U2, VALPHAR, D2
  492. VFMADD D3, U3, VALPHAR, D3
  493. VNMSUB D2, U3, VALPHAI, D2
  494. VFMADD D3, U2, VALPHAI, D3
  495. vpackev.d D4, D3, D2 //c2[0] c2[1]
  496. vpackod.d D5, D3, D2 //c3[0] c3[1]
  497. vst D4, C2, 0x00
  498. vst D5, C3, 0x00
  499. addi.d C2, C2, 0x10
  500. addi.d C3, C3, 0x10
  501. //res01 res11
  502. vld D0, C0, 0x00 //c0: 0 1
  503. vld D1, C1, 0x00 //c1: 0 1
  504. vpackev.d D2, D1, D0 //c0[0] c1[0]
  505. vpackod.d D3, D1, D0 //c0[1] c1[1]
  506. VFMADD D2, U4, VALPHAR, D2
  507. VFMADD D3, U5, VALPHAR, D3
  508. VNMSUB D2, U5, VALPHAI, D2
  509. VFMADD D3, U4, VALPHAI, D3
  510. vpackev.d D4, D3, D2 //c0[0] c0[1]
  511. vpackod.d D5, D3, D2 //c1[0] c1[1]
  512. vst D4, C0, 0x00
  513. vst D5, C1, 0x00
  514. addi.d C0, C0, 0x10
  515. addi.d C1, C1, 0x10
  516. //res21 res31
  517. vld D0, C2, 0x00 //c2: 0 1
  518. vld D1, C3, 0x00 //c3: 0 1
  519. vpackev.d D2, D1, D0 //c2[0] c3[0]
  520. vpackod.d D3, D1, D0 //c2[1] c3[1]
  521. VFMADD D2, U6, VALPHAR, D2
  522. VFMADD D3, U7, VALPHAR, D3
  523. VNMSUB D2, U7, VALPHAI, D2
  524. VFMADD D3, U6, VALPHAI, D3
  525. vpackev.d D4, D3, D2 //c2[0] c2[1]
  526. vpackod.d D5, D3, D2 //c3[0] c3[1]
  527. vst D4, C2, 0x00
  528. vst D5, C3, 0x00
  529. addi.d C2, C2, 0x10
  530. addi.d C3, C3, 0x10
  531. //res02 res12
  532. vld D0, C0, 0x00 //c0: 0 1
  533. vld D1, C1, 0x00 //c1: 0 1
  534. vpackev.d D2, D1, D0 //c0[0] c1[0]
  535. vpackod.d D3, D1, D0 //c0[1] c1[1]
  536. VFMADD D2, U8, VALPHAR, D2
  537. VFMADD D3, U9, VALPHAR, D3
  538. VNMSUB D2, U9, VALPHAI, D2
  539. VFMADD D3, U8, VALPHAI, D3
  540. vpackev.d D4, D3, D2 //c0[0] c0[1]
  541. vpackod.d D5, D3, D2 //c1[0] c1[1]
  542. vst D4, C0, 0x00
  543. vst D5, C1, 0x00
  544. addi.d C0, C0, 0x10
  545. addi.d C1, C1, 0x10
  546. //res22 res32
  547. vld D0, C2, 0x00 //c2: 0 1
  548. vld D1, C3, 0x00 //c3: 0 1
  549. vpackev.d D2, D1, D0 //c2[0] c3[0]
  550. vpackod.d D3, D1, D0 //c2[1] c3[1]
  551. VFMADD D2, U10, VALPHAR, D2
  552. VFMADD D3, U11, VALPHAR, D3
  553. VNMSUB D2, U11, VALPHAI, D2
  554. VFMADD D3, U10, VALPHAI, D3
  555. vpackev.d D4, D3, D2 //c2[0] c2[1]
  556. vpackod.d D5, D3, D2 //c3[0] c3[1]
  557. vst D4, C2, 0x00
  558. vst D5, C3, 0x00
  559. addi.d C2, C2, 0x10
  560. addi.d C3, C3, 0x10
  561. //res03 res13
  562. vld D0, C0, 0x00 //c0: 0 1
  563. vld D1, C1, 0x00 //c1: 0 1
  564. vpackev.d D2, D1, D0 //c0[0] c1[0]
  565. vpackod.d D3, D1, D0 //c0[1] c1[1]
  566. VFMADD D2, U12, VALPHAR, D2
  567. VFMADD D3, U13, VALPHAR, D3
  568. VNMSUB D2, U13, VALPHAI, D2
  569. VFMADD D3, U12, VALPHAI, D3
  570. vpackev.d D4, D3, D2 //c0[0] c0[1]
  571. vpackod.d D5, D3, D2 //c1[0] c1[1]
  572. vst D4, C0, 0x00
  573. vst D5, C1, 0x00
  574. addi.d C0, C0, 0x10
  575. addi.d C1, C1, 0x10
  576. //res23 res33
  577. vld D0, C2, 0x00 //c2: 0 1
  578. vld D1, C3, 0x00 //c3: 0 1
  579. vpackev.d D2, D1, D0 //c2[0] c3[0]
  580. vpackod.d D3, D1, D0 //c2[1] c3[1]
  581. VFMADD D2, U14, VALPHAR, D2
  582. VFMADD D3, U15, VALPHAR, D3
  583. VNMSUB D2, U15, VALPHAI, D2
  584. VFMADD D3, U14, VALPHAI, D3
  585. vpackev.d D4, D3, D2 //c2[0] c2[1]
  586. vpackod.d D5, D3, D2 //c3[0] c3[1]
  587. vst D4, C2, 0x00
  588. vst D5, C3, 0x00
  589. addi.d C2, C2, 0x10
  590. addi.d C3, C3, 0x10
  591. #endif
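// TRMM bookkeeping (for the applicable LEFT/TRANSA combinations): step A0/B0
// past the K - OFF - 4 iterations this tile did not consume (64 bytes each for
// the 4-wide A and B panels) and, when LEFT is defined, advance OFF by the
// block height of 4.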
  592. #if defined(TRMMKERNEL)
  593. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  594. sub.d TL, K, OFF
  595. #ifdef LEFT
  596. addi.d TL, TL, -4
  597. #else
  598. addi.d TL, TL, -4
  599. #endif
  600. slli.d T3, TL, 0x06
  601. add.d A0, A0, T3
  602. slli.d T3, TL, 0x06
  603. add.d B0, B0, T3
  604. #endif
  605. #ifdef LEFT
  606. addi.d OFF, OFF, 4
  607. #endif
  608. #endif // #if defined(TRMMKERNEL)
  609. addi.d I, I, 1
  610. blt I, T0, .L11
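// M-tail: if two rows of A remain (bm & 2), compute a 2x4 block with
// accumulators U0..U7; A advances 32 bytes and B 64 bytes per k iteration.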
  611. .L18: /* if (bm & 2) */
  612. move I, $r0
  613. andi T0, M, 2
  614. beq I, T0, .L183
  615. move B0, B //ptrbb
  616. move TL, K /* TL = bk */
  617. #if defined(TRMMKERNEL)
  618. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  619. move B0, B //ptrbb
  620. #else
  621. slli.d T3, OFF, 0x05
  622. add.d A0, A0, T3
  623. slli.d T3, OFF, 0x06
  624. add.d B0, B, T3
  625. #endif
  626. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  627. sub.d TL, K, OFF
  628. #elif defined(LEFT)
  629. addi.d TL, OFF, 2
  630. #else
  631. addi.d TL, OFF, 4
  632. #endif
  633. #endif // #if defined(TRMMKERNEL)
  634. vxor.v U0, U0, U0
  635. vxor.v U1, U1, U1
  636. vxor.v U2, U2, U2
  637. vxor.v U3, U3, U3
  638. vxor.v U4, U4, U4
  639. vxor.v U5, U5, U5
  640. vxor.v U6, U6, U6
  641. vxor.v U7, U7, U7
  642. move L, $r0 //cycle param k
  643. beq L, TL, .L182
  644. blt TL, L, .L182
  645. .L181: /* for (k=0; k<temp; k++) */
  646. vld D1, B0, 0x00 // b0ri
  647. vld D2, B0, 0x10 // b1ri
  648. vld D3, B0, 0x20 // b2ri
  649. vld D4, B0, 0x30 // b3ri
  650. vld D0, A0, 0x00 // a0ri
  651. vand.v D5, D0, D0
  652. vand.v D6, D0, D0
  653. vshuf4i.d D5, D0, 0x00 //a0rr
  654. vshuf4i.d D6, D0, 0x55 //a0ii
  655. vpackev.d D7, D2, D1 //b0r b1r
  656. vpackod.d D8, D2, D1 //b0i b1i
  657. vpackev.d D9, D4, D3 //b2r b3r
  658. vpackod.d D10, D4, D3 //b2i b3i
  659. VMADD1 U0, D5, D7, U0 //00r 10r
  660. VMADD2 U1, D6, D7, U1 //00i 10i
  661. VMADD3 U0, D6, D8, U0
  662. VMADD4 U1, D5, D8, U1
  663. VMADD1 U2, D5, D9, U2 //20r 30r
  664. VMADD2 U3, D6, D9, U3 //20i 30i
  665. VMADD3 U2, D6, D10, U2
  666. VMADD4 U3, D5, D10, U3
  667. vld D0, A0, 0x10 // a1ri
  668. vand.v D5, D0, D0
  669. vand.v D6, D0, D0
  670. vshuf4i.d D5, D0, 0x00 //a1rr
  671. vshuf4i.d D6, D0, 0x55 //a1ii
  672. VMADD1 U4, D5, D7, U4 //01r 11r
  673. VMADD2 U5, D6, D7, U5 //01i 11i
  674. VMADD3 U4, D6, D8, U4
  675. VMADD4 U5, D5, D8, U5
  676. VMADD1 U6, D5, D9, U6 //21r 31r
  677. VMADD2 U7, D6, D9, U7 //21i 31i
  678. VMADD3 U6, D6, D10, U6
  679. VMADD4 U7, D5, D10, U7
  680. addi.d A0, A0, 0x20
  681. addi.d B0, B0, 0x40
  682. addi.d L, L, 1
  683. blt L, TL, .L181
  684. .L182:
  685. #if defined(TRMMKERNEL)
  686. //res00 res10
  687. vld D0, C0, 0x00 //c0: 0 1
  688. vld D1, C1, 0x00 //c1: 0 1
  689. vpackev.d D2, D1, D0 //c0[0] c1[0]
  690. vpackod.d D3, D1, D0 //c0[1] c1[1]
  691. vfmul.d D2, U0, VALPHAR
  692. vfmul.d D3, U1, VALPHAR
  693. VNMSUB D2, U1, VALPHAI, D2
  694. VFMADD D3, U0, VALPHAI, D3
  695. vpackev.d D4, D3, D2 //c0[0] c0[1]
  696. vpackod.d D5, D3, D2 //c1[0] c1[1]
  697. vst D4, C0, 0x00
  698. vst D5, C1, 0x00
  699. addi.d C0, C0, 0x10
  700. addi.d C1, C1, 0x10
  701. //res20 res30
  702. vld D0, C2, 0x00 //c2: 0 1
  703. vld D1, C3, 0x00 //c3: 0 1
  704. vpackev.d D2, D1, D0 //c2[0] c3[0]
  705. vpackod.d D3, D1, D0 //c2[1] c3[1]
  706. vfmul.d D2, U2, VALPHAR
  707. vfmul.d D3, U3, VALPHAR
  708. VNMSUB D2, U3, VALPHAI, D2
  709. VFMADD D3, U2, VALPHAI, D3
  710. vpackev.d D4, D3, D2 //c2[0] c2[1]
  711. vpackod.d D5, D3, D2 //c3[0] c3[1]
  712. vst D4, C2, 0x00
  713. vst D5, C3, 0x00
  714. addi.d C2, C2, 0x10
  715. addi.d C3, C3, 0x10
  716. //res01 res11
  717. vld D0, C0, 0x00 //c0: 0 1
  718. vld D1, C1, 0x00 //c1: 0 1
  719. vpackev.d D2, D1, D0 //c0[0] c1[0]
  720. vpackod.d D3, D1, D0 //c0[1] c1[1]
  721. vfmul.d D2, U4, VALPHAR
  722. vfmul.d D3, U5, VALPHAR
  723. VNMSUB D2, U5, VALPHAI, D2
  724. VFMADD D3, U4, VALPHAI, D3
  725. vpackev.d D4, D3, D2 //c0[0] c0[1]
  726. vpackod.d D5, D3, D2 //c1[0] c1[1]
  727. vst D4, C0, 0x00
  728. vst D5, C1, 0x00
  729. addi.d C0, C0, 0x10
  730. addi.d C1, C1, 0x10
  731. //res21 res31
  732. vld D0, C2, 0x00 //c2: 0 1
  733. vld D1, C3, 0x00 //c3: 0 1
  734. vpackev.d D2, D1, D0 //c2[0] c3[0]
  735. vpackod.d D3, D1, D0 //c2[1] c3[1]
  736. vfmul.d D2, U6, VALPHAR
  737. vfmul.d D3, U7, VALPHAR
  738. VNMSUB D2, U7, VALPHAI, D2
  739. VFMADD D3, U6, VALPHAI, D3
  740. vpackev.d D4, D3, D2 //c2[0] c2[1]
  741. vpackod.d D5, D3, D2 //c3[0] c3[1]
  742. vst D4, C2, 0x00
  743. vst D5, C3, 0x00
  744. addi.d C2, C2, 0x10
  745. addi.d C3, C3, 0x10
  746. #else
  747. //res00 res10
  748. vld D0, C0, 0x00 //c0: 0 1
  749. vld D1, C1, 0x00 //c1: 0 1
  750. vpackev.d D2, D1, D0 //c0[0] c1[0]
  751. vpackod.d D3, D1, D0 //c0[1] c1[1]
  752. VFMADD D2, U0, VALPHAR, D2
  753. VFMADD D3, U1, VALPHAR, D3
  754. VNMSUB D2, U1, VALPHAI, D2
  755. VFMADD D3, U0, VALPHAI, D3
  756. vpackev.d D4, D3, D2 //c0[0] c0[1]
  757. vpackod.d D5, D3, D2 //c1[0] c1[1]
  758. vst D4, C0, 0x00
  759. vst D5, C1, 0x00
  760. addi.d C0, C0, 0x10
  761. addi.d C1, C1, 0x10
  762. //res20 res30
  763. vld D0, C2, 0x00 //c2: 0 1
  764. vld D1, C3, 0x00 //c3: 0 1
  765. vpackev.d D2, D1, D0 //c2[0] c3[0]
  766. vpackod.d D3, D1, D0 //c2[1] c3[1]
  767. VFMADD D2, U2, VALPHAR, D2
  768. VFMADD D3, U3, VALPHAR, D3
  769. VNMSUB D2, U3, VALPHAI, D2
  770. VFMADD D3, U2, VALPHAI, D3
  771. vpackev.d D4, D3, D2 //c2[0] c2[1]
  772. vpackod.d D5, D3, D2 //c3[0] c3[1]
  773. vst D4, C2, 0x00
  774. vst D5, C3, 0x00
  775. addi.d C2, C2, 0x10
  776. addi.d C3, C3, 0x10
  777. //res01 res11
  778. vld D0, C0, 0x00 //c0: 0 1
  779. vld D1, C1, 0x00 //c1: 0 1
  780. vpackev.d D2, D1, D0 //c0[0] c1[0]
  781. vpackod.d D3, D1, D0 //c0[1] c1[1]
  782. VFMADD D2, U4, VALPHAR, D2
  783. VFMADD D3, U5, VALPHAR, D3
  784. VNMSUB D2, U5, VALPHAI, D2
  785. VFMADD D3, U4, VALPHAI, D3
  786. vpackev.d D4, D3, D2 //c0[0] c0[1]
  787. vpackod.d D5, D3, D2 //c1[0] c1[1]
  788. vst D4, C0, 0x00
  789. vst D5, C1, 0x00
  790. addi.d C0, C0, 0x10
  791. addi.d C1, C1, 0x10
  792. //res21 res31
  793. vld D0, C2, 0x00 //c2: 0 1
  794. vld D1, C3, 0x00 //c3: 0 1
  795. vpackev.d D2, D1, D0 //c2[0] c3[0]
  796. vpackod.d D3, D1, D0 //c2[1] c3[1]
  797. VFMADD D2, U6, VALPHAR, D2
  798. VFMADD D3, U7, VALPHAR, D3
  799. VNMSUB D2, U7, VALPHAI, D2
  800. VFMADD D3, U6, VALPHAI, D3
  801. vpackev.d D4, D3, D2 //c2[0] c2[1]
  802. vpackod.d D5, D3, D2 //c3[0] c3[1]
  803. vst D4, C2, 0x00
  804. vst D5, C3, 0x00
  805. addi.d C2, C2, 0x10
  806. addi.d C3, C3, 0x10
  807. #endif
  808. #if defined(TRMMKERNEL)
  809. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  810. sub.d TL, K, OFF
  811. #ifdef LEFT
  812. addi.d TL, TL, -2
  813. #else
  814. addi.d TL, TL, -4
  815. #endif
  816. slli.d T3, TL, 0x05
  817. add.d A0, A0, T3
  818. slli.d T3, TL, 0x06
  819. add.d B0, B0, T3
  820. #endif
  821. #ifdef LEFT
  822. addi.d OFF, OFF, 2
  823. #endif
  824. #endif // #if defined(TRMMKERNEL)
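// M-tail: if one row of A remains (bm & 1), compute a 1x4 block with
// accumulators U0..U3; A advances 16 bytes and B 64 bytes per k iteration.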
  825. .L183: /* if (bm & 1) */
  826. move I, $r0
  827. andi T0, M, 1
  828. beq I, T0, .L186
  829. move B0, B //ptrbb
  830. move TL, K /* TL = bk */
  831. #if defined(TRMMKERNEL)
  832. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  833. move B0, B //ptrbb
  834. #else
  835. slli.d T3, OFF, 0x04
  836. add.d A0, A0, T3
  837. slli.d T3, OFF, 0x06
  838. add.d B0, B, T3
  839. #endif
  840. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  841. sub.d TL, K, OFF
  842. #elif defined(LEFT)
  843. addi.d TL, OFF, 1
  844. #else
  845. addi.d TL, OFF, 4
  846. #endif
  847. #endif // #if defined(TRMMKERNEL)
  848. vxor.v U0, U0, U0
  849. vxor.v U1, U1, U1
  850. vxor.v U2, U2, U2
  851. vxor.v U3, U3, U3
  852. move L, $r0 //cycle param k
  853. beq L, TL, .L185
  854. blt TL, L, .L185
  855. .L184: /* for (k=0; k<temp; k++) */
  856. vld D1, B0, 0x00 // b0ri
  857. vld D2, B0, 0x10 // b1ri
  858. vld D3, B0, 0x20 // b2ri
  859. vld D4, B0, 0x30 // b3ri
  860. vld D0, A0, 0x00 // a0ri
  861. vand.v D5, D0, D0
  862. vand.v D6, D0, D0
  863. vshuf4i.d D5, D0, 0x00 //a0rr
  864. vshuf4i.d D6, D0, 0x55 //a0ii
  865. vpackev.d D7, D2, D1 //b0r b1r
  866. vpackod.d D8, D2, D1 //b0i b1i
  867. vpackev.d D9, D4, D3 //b2r b3r
  868. vpackod.d D10, D4, D3 //b2i b3i
  869. VMADD1 U0, D5, D7, U0 //00r 10r
  870. VMADD2 U1, D6, D7, U1 //00i 10i
  871. VMADD3 U0, D6, D8, U0
  872. VMADD4 U1, D5, D8, U1
  873. VMADD1 U2, D5, D9, U2 //20r 30r
  874. VMADD2 U3, D6, D9, U3 //20i 30i
  875. VMADD3 U2, D6, D10, U2
  876. VMADD4 U3, D5, D10, U3
  877. addi.d A0, A0, 0x10
  878. addi.d B0, B0, 0x40
  879. addi.d L, L, 1
  880. blt L, TL, .L184
  881. .L185:
  882. #if defined(TRMMKERNEL)
  883. //res00 res10
  884. vld D0, C0, 0x00 //c0: 0 1
  885. vld D1, C1, 0x00 //c1: 0 1
  886. vpackev.d D2, D1, D0 //c0[0] c1[0]
  887. vpackod.d D3, D1, D0 //c0[1] c1[1]
  888. vfmul.d D2, U0, VALPHAR
  889. vfmul.d D3, U1, VALPHAR
  890. VNMSUB D2, U1, VALPHAI, D2
  891. VFMADD D3, U0, VALPHAI, D3
  892. vpackev.d D4, D3, D2 //c0[0] c0[1]
  893. vpackod.d D5, D3, D2 //c1[0] c1[1]
  894. vst D4, C0, 0x00
  895. vst D5, C1, 0x00
  896. addi.d C0, C0, 0x10
  897. addi.d C1, C1, 0x10
  898. //res20 res30
  899. vld D0, C2, 0x00 //c2: 0 1
  900. vld D1, C3, 0x00 //c3: 0 1
  901. vpackev.d D2, D1, D0 //c2[0] c3[0]
  902. vpackod.d D3, D1, D0 //c2[1] c3[1]
  903. vfmul.d D2, U2, VALPHAR
  904. vfmul.d D3, U3, VALPHAR
  905. VNMSUB D2, U3, VALPHAI, D2
  906. VFMADD D3, U2, VALPHAI, D3
  907. vpackev.d D4, D3, D2 //c2[0] c2[1]
  908. vpackod.d D5, D3, D2 //c3[0] c3[1]
  909. vst D4, C2, 0x00
  910. vst D5, C3, 0x00
  911. addi.d C2, C2, 0x10
  912. addi.d C3, C3, 0x10
  913. #else
  914. //res00 res10
  915. vld D0, C0, 0x00 //c0: 0 1
  916. vld D1, C1, 0x00 //c1: 0 1
  917. vpackev.d D2, D1, D0 //c0[0] c1[0]
  918. vpackod.d D3, D1, D0 //c0[1] c1[1]
  919. VFMADD D2, U0, VALPHAR, D2
  920. VFMADD D3, U1, VALPHAR, D3
  921. VNMSUB D2, U1, VALPHAI, D2
  922. VFMADD D3, U0, VALPHAI, D3
  923. vpackev.d D4, D3, D2 //c0[0] c0[1]
  924. vpackod.d D5, D3, D2 //c1[0] c1[1]
  925. vst D4, C0, 0x00
  926. vst D5, C1, 0x00
  927. addi.d C0, C0, 0x10
  928. addi.d C1, C1, 0x10
  929. //res20 res30
  930. vld D0, C2, 0x00 //c2: 0 1
  931. vld D1, C3, 0x00 //c3: 0 1
  932. vpackev.d D2, D1, D0 //c2[0] c3[0]
  933. vpackod.d D3, D1, D0 //c2[1] c3[1]
  934. VFMADD D2, U2, VALPHAR, D2
  935. VFMADD D3, U3, VALPHAR, D3
  936. VNMSUB D2, U3, VALPHAI, D2
  937. VFMADD D3, U2, VALPHAI, D3
  938. vpackev.d D4, D3, D2 //c2[0] c2[1]
  939. vpackod.d D5, D3, D2 //c3[0] c3[1]
  940. vst D4, C2, 0x00
  941. vst D5, C3, 0x00
  942. addi.d C2, C2, 0x10
  943. addi.d C3, C3, 0x10
  944. #endif
  945. #if defined(TRMMKERNEL)
  946. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  947. sub.d TL, K, OFF
  948. #ifdef LEFT
  949. addi.d TL, TL, -1
  950. #else
  951. addi.d TL, TL, -4
  952. #endif
  953. slli.d T3, TL, 0x04
  954. add.d A0, A0, T3
  955. slli.d T3, TL, 0x06
  956. add.d B0, B0, T3
  957. #endif
  958. #ifdef LEFT
  959. addi.d OFF, OFF, 1
  960. #endif
  961. #endif // #if defined(TRMMKERNEL)
  962. .L186:
  963. #if defined(TRMMKERNEL) && !defined(LEFT)
  964. addi.d OFF, OFF, 4
  965. #endif
  966. slli.d L, K, 0x06
  967. add.d B, B, L
  968. slli.d I, LDC, 0x03
  969. add.d C, C, I
  970. addi.d J, J, 1
  971. srai.d T0, N, 2
  972. blt J, T0, .L10
  973. .L19:
  974. move J, $r0
  975. andi T0, N, 2
  976. beq J, T0, .L30
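/* N-tail: process a remaining pair of columns (bn & 2).  The structure mirrors
   the four-column loop above, but only C0/C1 are written and B advances
   32 bytes per k iteration. */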
  977. .L20: /* for (j=0; j<(bn&2); j+=2) */
  978. #if defined(TRMMKERNEL) && defined(LEFT)
  979. move OFF, OFFSET
  980. #endif
  981. move C0, C
  982. slli.d TL, LDC, 1
  983. add.d C1, C0, TL
  984. move A0, A //ptrba
  985. move I, $r0
  986. srai.d T0, M, 2 //bm/4
  987. beq I, T0, .L280
  988. .L21: /* for (i=0; i<bm/4; i+=1) */
  989. move B0, B //ptrbb
  990. move TL, K /* TL = bk */
  991. #if defined(TRMMKERNEL)
  992. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  993. move B0, B //ptrbb
  994. #else
  995. slli.d T3, OFF, 0x06
  996. add.d A0, A0, T3
  997. slli.d T3, OFF, 0x05
  998. add.d B0, B, T3
  999. #endif
  1000. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1001. sub.d TL, K, OFF
  1002. #elif defined(LEFT)
  1003. addi.d TL, OFF, 4
  1004. #else
  1005. addi.d TL, OFF, 2
  1006. #endif
  1007. #endif // #if defined(TRMMKERNEL)
  1008. vxor.v U0, U0, U0
  1009. vxor.v U1, U1, U1
  1010. vxor.v U2, U2, U2
  1011. vxor.v U3, U3, U3
  1012. vxor.v U4, U4, U4
  1013. vxor.v U5, U5, U5
  1014. vxor.v U6, U6, U6
  1015. vxor.v U7, U7, U7
  1016. move L, $r0 //cycle param k
  1017. beq L, TL, .L23
  1018. blt TL, L, .L23
  1019. .L22: /* for (k=0; k<temp; k++) */
  1020. vld D1, B0, 0x00 // b0ri
  1021. vld D2, B0, 0x10 // b1ri
  1022. vld D0, A0, 0x00 // a0ri
  1023. vand.v D5, D0, D0
  1024. vand.v D6, D0, D0
  1025. vshuf4i.d D5, D0, 0x00 //a0rr
  1026. vshuf4i.d D6, D0, 0x55 //a0ii
  1027. vpackev.d D7, D2, D1 //b0r b1r
  1028. vpackod.d D8, D2, D1 //b0i b1i
  1029. VMADD1 U0, D5, D7, U0 //00r 10r
  1030. VMADD2 U1, D6, D7, U1 //00i 10i
  1031. VMADD3 U0, D6, D8, U0
  1032. VMADD4 U1, D5, D8, U1
  1033. vld D0, A0, 0x10 // a1ri
  1034. vand.v D5, D0, D0
  1035. vand.v D6, D0, D0
  1036. vshuf4i.d D5, D0, 0x00 //a1rr
  1037. vshuf4i.d D6, D0, 0x55 //a1ii
  1038. VMADD1 U2, D5, D7, U2 //01r 11r
  1039. VMADD2 U3, D6, D7, U3 //01i 11i
  1040. VMADD3 U2, D6, D8, U2
  1041. VMADD4 U3, D5, D8, U3
  1042. vld D0, A0, 0x20 // a2ri
  1043. vand.v D5, D0, D0
  1044. vand.v D6, D0, D0
  1045. vshuf4i.d D5, D0, 0x00 //a2rr
  1046. vshuf4i.d D6, D0, 0x55 //a2ii
  1047. VMADD1 U4, D5, D7, U4 //02r 12r
  1048. VMADD2 U5, D6, D7, U5 //02i 12i
  1049. VMADD3 U4, D6, D8, U4
  1050. VMADD4 U5, D5, D8, U5
  1051. vld D0, A0, 0x30 // a3ri
  1052. vand.v D5, D0, D0
  1053. vand.v D6, D0, D0
  1054. vshuf4i.d D5, D0, 0x00 //a3rr
  1055. vshuf4i.d D6, D0, 0x55 //a3ii
  1056. VMADD1 U6, D5, D7, U6 //03r 13r
  1057. VMADD2 U7, D6, D7, U7 //03i 13i
  1058. VMADD3 U6, D6, D8, U6
  1059. VMADD4 U7, D5, D8, U7
  1060. addi.d A0, A0, 0x40
  1061. addi.d B0, B0, 0x20
  1062. addi.d L, L, 1
  1063. blt L, TL, .L22
  1064. .L23:
  1065. #if defined(TRMMKERNEL)
  1066. //res00 res10
  1067. vld D0, C0, 0x00 //c0: 0 1
  1068. vld D1, C1, 0x00 //c1: 0 1
  1069. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1070. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1071. vfmul.d D2, U0, VALPHAR
  1072. vfmul.d D3, U1, VALPHAR
  1073. VNMSUB D2, U1, VALPHAI, D2
  1074. VFMADD D3, U0, VALPHAI, D3
  1075. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1076. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1077. vst D4, C0, 0x00
  1078. vst D5, C1, 0x00
  1079. addi.d C0, C0, 0x10
  1080. addi.d C1, C1, 0x10
  1081. //res01 res11
  1082. vld D0, C0, 0x00 //c0: 0 1
  1083. vld D1, C1, 0x00 //c1: 0 1
  1084. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1085. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1086. vfmul.d D2, U2, VALPHAR
  1087. vfmul.d D3, U3, VALPHAR
  1088. VNMSUB D2, U3, VALPHAI, D2
  1089. VFMADD D3, U2, VALPHAI, D3
  1090. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1091. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1092. vst D4, C0, 0x00
  1093. vst D5, C1, 0x00
  1094. addi.d C0, C0, 0x10
  1095. addi.d C1, C1, 0x10
  1096. //res02 res12
  1097. vld D0, C0, 0x00 //c0: 0 1
  1098. vld D1, C1, 0x00 //c1: 0 1
  1099. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1100. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1101. vfmul.d D2, U4, VALPHAR
  1102. vfmul.d D3, U5, VALPHAR
  1103. VNMSUB D2, U5, VALPHAI, D2
  1104. VFMADD D3, U4, VALPHAI, D3
  1105. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1106. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1107. vst D4, C0, 0x00
  1108. vst D5, C1, 0x00
  1109. addi.d C0, C0, 0x10
  1110. addi.d C1, C1, 0x10
  1111. //res03 res13
  1112. vld D0, C0, 0x00 //c0: 0 1
  1113. vld D1, C1, 0x00 //c1: 0 1
  1114. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1115. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1116. vfmul.d D2, U6, VALPHAR
  1117. vfmul.d D3, U7, VALPHAR
  1118. VNMSUB D2, U7, VALPHAI, D2
  1119. VFMADD D3, U6, VALPHAI, D3
  1120. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1121. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1122. vst D4, C0, 0x00
  1123. vst D5, C1, 0x00
  1124. addi.d C0, C0, 0x10
  1125. addi.d C1, C1, 0x10
  1126. #else
  1127. //res00 res10
  1128. vld D0, C0, 0x00 //c0: 0 1
  1129. vld D1, C1, 0x00 //c1: 0 1
  1130. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1131. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1132. VFMADD D2, U0, VALPHAR, D2
  1133. VFMADD D3, U1, VALPHAR, D3
  1134. VNMSUB D2, U1, VALPHAI, D2
  1135. VFMADD D3, U0, VALPHAI, D3
  1136. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1137. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1138. vst D4, C0, 0x00
  1139. vst D5, C1, 0x00
  1140. addi.d C0, C0, 0x10
  1141. addi.d C1, C1, 0x10
  1142. //res01 res11
  1143. vld D0, C0, 0x00 //c0: 0 1
  1144. vld D1, C1, 0x00 //c1: 0 1
  1145. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1146. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1147. VFMADD D2, U2, VALPHAR, D2
  1148. VFMADD D3, U3, VALPHAR, D3
  1149. VNMSUB D2, U3, VALPHAI, D2
  1150. VFMADD D3, U2, VALPHAI, D3
  1151. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1152. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1153. vst D4, C0, 0x00
  1154. vst D5, C1, 0x00
  1155. addi.d C0, C0, 0x10
  1156. addi.d C1, C1, 0x10
  1157. //res02 res12
  1158. vld D0, C0, 0x00 //c0: 0 1
  1159. vld D1, C1, 0x00 //c1: 0 1
  1160. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1161. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1162. VFMADD D2, U4, VALPHAR, D2
  1163. VFMADD D3, U5, VALPHAR, D3
  1164. VNMSUB D2, U5, VALPHAI, D2
  1165. VFMADD D3, U4, VALPHAI, D3
  1166. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1167. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1168. vst D4, C0, 0x00
  1169. vst D5, C1, 0x00
  1170. addi.d C0, C0, 0x10
  1171. addi.d C1, C1, 0x10
  1172. //res03 res13
  1173. vld D0, C0, 0x00 //c0: 0 1
  1174. vld D1, C1, 0x00 //c1: 0 1
  1175. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1176. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1177. VFMADD D2, U6, VALPHAR, D2
  1178. VFMADD D3, U7, VALPHAR, D3
  1179. VNMSUB D2, U7, VALPHAI, D2
  1180. VFMADD D3, U6, VALPHAI, D3
  1181. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1182. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1183. vst D4, C0, 0x00
  1184. vst D5, C1, 0x00
  1185. addi.d C0, C0, 0x10
  1186. addi.d C1, C1, 0x10
  1187. #endif
  1188. #if defined(TRMMKERNEL)
  1189. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1190. sub.d TL, K, OFF
  1191. #ifdef LEFT
  1192. addi.d TL, TL, -4
  1193. #else
  1194. addi.d TL, TL, -2
  1195. #endif
  1196. slli.d T3, TL, 0x06
  1197. add.d A0, A0, T3
  1198. slli.d T3, TL, 0x05
  1199. add.d B0, B0, T3
  1200. #endif
  1201. #ifdef LEFT
  1202. addi.d OFF, OFF, 4
  1203. #endif
  1204. #endif // #if defined(TRMMKERNEL)
  1205. addi.d I, I, 1
  1206. blt I, T0, .L21
  1207. .L280: /* if ( bm & 2 )*/
  1208. move I, $r0
  1209. andi T1, M, 2 //bm&2
  1210. beq I, T1, .L284
  1211. .L281:
  1212. move B0, B //ptrbb
  1213. move TL, K /* TL = bk */
  1214. #if defined(TRMMKERNEL)
  1215. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1216. move B0, B //ptrbb
  1217. #else
  1218. slli.d T3, OFF, 0x05
  1219. add.d A0, A0, T3
  1220. add.d B0, B, T3
  1221. #endif
  1222. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1223. sub.d TL, K, OFF
  1224. #elif defined(LEFT)
  1225. addi.d TL, OFF, 2
  1226. #else
  1227. addi.d TL, OFF, 2
  1228. #endif
  1229. #endif // #if defined(TRMMKERNEL)
  1230. vxor.v U0, U0, U0
  1231. vxor.v U1, U1, U1
  1232. vxor.v U2, U2, U2
  1233. vxor.v U3, U3, U3
  1234. move L, $r0 //cycle param k
  1235. beq L, TL, .L283
  1236. blt TL, L, .L283
  1237. .L282: /* for (k=0; k<temp; k++) */
  1238. vld D1, B0, 0x00 // b0ri
  1239. vld D2, B0, 0x10 // b1ri
  1240. vld D0, A0, 0x00 // a0ri
  1241. vand.v D5, D0, D0
  1242. vand.v D6, D0, D0
  1243. vshuf4i.d D5, D0, 0x00 //a0rr
  1244. vshuf4i.d D6, D0, 0x55 //a0ii
  1245. vpackev.d D7, D2, D1 //b0r b1r
  1246. vpackod.d D8, D2, D1 //b0i b1i
  1247. VMADD1 U0, D5, D7, U0 //00r 10r
  1248. VMADD2 U1, D6, D7, U1 //00i 10i
  1249. VMADD3 U0, D6, D8, U0
  1250. VMADD4 U1, D5, D8, U1
  1251. vld D0, A0, 0x10 // a1ri
  1252. vand.v D5, D0, D0
  1253. vand.v D6, D0, D0
  1254. vshuf4i.d D5, D0, 0x00 //a1rr
  1255. vshuf4i.d D6, D0, 0x55 //a1ii
  1256. VMADD1 U2, D5, D7, U2 //01r 11r
  1257. VMADD2 U3, D6, D7, U3 //01i 11i
  1258. VMADD3 U2, D6, D8, U2
  1259. VMADD4 U3, D5, D8, U3
  1260. addi.d A0, A0, 0x20
  1261. addi.d B0, B0, 0x20
  1262. addi.d L, L, 1
  1263. blt L, TL, .L282
  1264. .L283:
  1265. #if defined(TRMMKERNEL)
  1266. //res00 res10
  1267. vld D0, C0, 0x00 //c0: 0 1
  1268. vld D1, C1, 0x00 //c1: 0 1
  1269. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1270. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1271. vfmul.d D2, U0, VALPHAR
  1272. vfmul.d D3, U1, VALPHAR
  1273. VNMSUB D2, U1, VALPHAI, D2
  1274. VFMADD D3, U0, VALPHAI, D3
  1275. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1276. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1277. vst D4, C0, 0x00
  1278. vst D5, C1, 0x00
  1279. addi.d C0, C0, 0x10
  1280. addi.d C1, C1, 0x10
  1281. //res01 res11
  1282. vld D0, C0, 0x00 //c0: 0 1
  1283. vld D1, C1, 0x00 //c1: 0 1
  1284. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1285. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1286. vfmul.d D2, U2, VALPHAR
  1287. vfmul.d D3, U3, VALPHAR
  1288. VNMSUB D2, U3, VALPHAI, D2
  1289. VFMADD D3, U2, VALPHAI, D3
  1290. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1291. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1292. vst D4, C0, 0x00
  1293. vst D5, C1, 0x00
  1294. addi.d C0, C0, 0x10
  1295. addi.d C1, C1, 0x10
  1296. #else
  1297. //res00 res10
  1298. vld D0, C0, 0x00 //c0: 0 1
  1299. vld D1, C1, 0x00 //c1: 0 1
  1300. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1301. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1302. VFMADD D2, U0, VALPHAR, D2
  1303. VFMADD D3, U1, VALPHAR, D3
  1304. VNMSUB D2, U1, VALPHAI, D2
  1305. VFMADD D3, U0, VALPHAI, D3
  1306. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1307. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1308. vst D4, C0, 0x00
  1309. vst D5, C1, 0x00
  1310. addi.d C0, C0, 0x10
  1311. addi.d C1, C1, 0x10
  1312. //res01 res11
  1313. vld D0, C0, 0x00 //c0: 0 1
  1314. vld D1, C1, 0x00 //c1: 0 1
  1315. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1316. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1317. VFMADD D2, U2, VALPHAR, D2
  1318. VFMADD D3, U3, VALPHAR, D3
  1319. VNMSUB D2, U3, VALPHAI, D2
  1320. VFMADD D3, U2, VALPHAI, D3
  1321. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1322. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1323. vst D4, C0, 0x00
  1324. vst D5, C1, 0x00
  1325. addi.d C0, C0, 0x10
  1326. addi.d C1, C1, 0x10
  1327. #endif
  1328. #if defined(TRMMKERNEL)
  1329. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1330. sub.d TL, K, OFF
  1331. #ifdef LEFT
  1332. addi.d TL, TL, -2
  1333. #else
  1334. addi.d TL, TL, -2
  1335. #endif
  1336. slli.d T3, TL, 0x05
  1337. add.d A0, A0, T3
  1338. slli.d T3, TL, 0x05
  1339. add.d B0, B0, T3
  1340. #endif
  1341. #ifdef LEFT
  1342. addi.d OFF, OFF, 2
  1343. #endif
  1344. #endif // #if defined(TRMMKERNEL)
  1345. .L284: /* if ( bm & 1 )*/
  1346. move I, $r0
  1347. andi T1, M, 1 //bm&1
  1348. beq I, T1, .L288
  1349. .L285:
  1350. move B0, B //ptrbb
  1351. move TL, K /* TL = bk */
  1352. #if defined(TRMMKERNEL)
  1353. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1354. move B0, B //ptrbb
  1355. #else
  1356. slli.d T3, OFF, 0x04
  1357. add.d A0, A0, T3
  1358. slli.d T3, OFF, 0x05
  1359. add.d B0, B, T3
  1360. #endif
  1361. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1362. sub.d TL, K, OFF
  1363. #elif defined(LEFT)
  1364. addi.d TL, OFF, 1
  1365. #else
  1366. addi.d TL, OFF, 2
  1367. #endif
  1368. #endif // #if defined(TRMMKERNEL)
  1369. vxor.v U0, U0, U0
  1370. vxor.v U1, U1, U1
  1371. move L, $r0 //cycle param k
  1372. beq L, TL, .L287
  1373. blt TL, L, .L287
  1374. .L286: /* for (k=0; k<temp; k++) */
  1375. vld D1, B0, 0x00 // b0ri
  1376. vld D2, B0, 0x10 // b1ri
  1377. vld D0, A0, 0x00 // a0ri
  1378. vand.v D5, D0, D0
  1379. vand.v D6, D0, D0
  1380. vshuf4i.d D5, D0, 0x00 //a0rr
  1381. vshuf4i.d D6, D0, 0x55 //a0ii
  1382. vpackev.d D7, D2, D1 //b0r b1r
  1383. vpackod.d D8, D2, D1 //b0i b1i
  1384. VMADD1 U0, D5, D7, U0 //00r 10r
  1385. VMADD2 U1, D6, D7, U1 //00i 10i
  1386. VMADD3 U0, D6, D8, U0
  1387. VMADD4 U1, D5, D8, U1
  1388. addi.d A0, A0, 0x10
  1389. addi.d B0, B0, 0x20
  1390. addi.d L, L, 1
  1391. blt L, TL, .L286
  1392. .L287:
  1393. #if defined(TRMMKERNEL)
  1394. //res00 res10
  1395. vld D0, C0, 0x00 //c0: 0 1
  1396. vld D1, C1, 0x00 //c1: 0 1
  1397. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1398. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1399. vfmul.d D2, U0, VALPHAR
  1400. vfmul.d D3, U1, VALPHAR
  1401. VNMSUB D2, U1, VALPHAI, D2
  1402. VFMADD D3, U0, VALPHAI, D3
  1403. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1404. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1405. vst D4, C0, 0x00
  1406. vst D5, C1, 0x00
  1407. addi.d C0, C0, 0x10
  1408. addi.d C1, C1, 0x10
  1409. #else
  1410. //res00 res10
  1411. vld D0, C0, 0x00 //c0: 0 1
  1412. vld D1, C1, 0x00 //c1: 0 1
  1413. vpackev.d D2, D1, D0 //c0[0] c1[0]
  1414. vpackod.d D3, D1, D0 //c0[1] c1[1]
  1415. VFMADD D2, U0, VALPHAR, D2
  1416. VFMADD D3, U1, VALPHAR, D3
  1417. VNMSUB D2, U1, VALPHAI, D2
  1418. VFMADD D3, U0, VALPHAI, D3
  1419. vpackev.d D4, D3, D2 //c0[0] c0[1]
  1420. vpackod.d D5, D3, D2 //c1[0] c1[1]
  1421. vst D4, C0, 0x00
  1422. vst D5, C1, 0x00
  1423. addi.d C0, C0, 0x10
  1424. addi.d C1, C1, 0x10
  1425. #endif
  1426. #if defined(TRMMKERNEL)
  1427. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1428. sub.d TL, K, OFF
  1429. #ifdef LEFT
  1430. addi.d TL, TL, -1
  1431. #else
  1432. addi.d TL, TL, -2
  1433. #endif
  1434. slli.d T3, TL, 0x04
  1435. add.d A0, A0, T3
  1436. slli.d T3, TL, 0x05
  1437. add.d B0, B0, T3
  1438. #endif
  1439. #ifdef LEFT
  1440. addi.d OFF, OFF, 1
  1441. #endif
  1442. #endif // #if defined(TRMMKERNEL)
  1443. .L288:
  1444. #if defined(TRMMKERNEL) && !defined(LEFT)
  1445. addi.d OFF, OFF, 2
  1446. #endif
  1447. slli.d L, K, 5
  1448. add.d B, B, L
  1449. slli.d I, LDC, 2
  1450. add.d C, C, I
  1451. addi.d J, J, 2
  1452. andi T0, N, 2
  1453. blt J, T0, .L20
  1454. .L30:
  1455. move J, $r0
  1456. andi T0, N, 1
  1457. beq J, T0, .L999
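/* N-tail: process the last single column (bn & 1).  The roles are swapped here:
   b_r/b_i are splatted with vshuf4i.d and pairs of A elements are packed with
   vpackev/vpackod, so each U register packs two result elements of that column. */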
  1458. .L300: /* for (j=0; j<(bn&1); j+=1) */
  1459. #if defined(TRMMKERNEL) && defined(LEFT)
  1460. move OFF, OFFSET
  1461. #endif
  1462. move C0, C
  1463. move A0, A //ptrba
  1464. move I, $r0
  1465. srai.d T0, M, 2 //bm/4
  1466. beq I, T0, .L38
  1467. .L31: /* for (i=0; i<bm/4; i+=1) */
  1468. move B0, B //ptrbb
  1469. move TL, K /* TL = bk */
  1470. #if defined(TRMMKERNEL)
  1471. #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1472. move B0, B //ptrbb
  1473. #else
  1474. slli.d T3, OFF, 0x06
  1475. add.d A0, A0, T3
  1476. slli.d T3, OFF, 0x04
  1477. add.d B0, B, T3
  1478. #endif
  1479. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1480. sub.d TL, K, OFF
  1481. #elif defined(LEFT)
  1482. addi.d TL, OFF, 4
  1483. #else
  1484. addi.d TL, OFF, 1
  1485. #endif
  1486. #endif // #if defined(TRMMKERNEL)
  1487. vxor.v U0, U0, U0
  1488. vxor.v U1, U1, U1
  1489. vxor.v U2, U2, U2
  1490. vxor.v U3, U3, U3
  1491. move L, $r0 //cycle param k
  1492. beq L, TL, .L33
  1493. blt TL, L, .L33
  1494. .L32: /* for (k=0; k<temp; k++) */
  1495. vld D1, B0, 0x00 // b0ri
  1496. vld D0, A0, 0x00 // a0ri
  1497. vld D2, A0, 0x10 // a1ri
  1498. vpackev.d D5, D2, D0 //a0r a1r
  1499. vpackod.d D6, D2, D0 //a0i a1i
  1500. vand.v D7, D1, D1
  1501. vand.v D8, D1, D1
  1502. vshuf4i.d D7, D1, 0x00 //b0rr
  1503. vshuf4i.d D8, D1, 0x55 //b0ii
  1504. VMADD1 U0, D5, D7, U0 //00r 01r
  1505. VMADD2 U1, D6, D7, U1 //00i 01i
  1506. VMADD3 U0, D6, D8, U0
  1507. VMADD4 U1, D5, D8, U1
  1508. vld D0, A0, 0x20 // a0ri
  1509. vld D2, A0, 0x30 // a1ri
  1510. vpackev.d D5, D2, D0 //a0r a1r
  1511. vpackod.d D6, D2, D0 //a0i a1i
  1512. VMADD1 U2, D5, D7, U2 //02r 03r
  1513. VMADD2 U3, D6, D7, U3 //02i 03i
  1514. VMADD3 U2, D6, D8, U2
  1515. VMADD4 U3, D5, D8, U3
  1516. addi.d A0, A0, 0x40
  1517. addi.d B0, B0, 0x10
  1518. addi.d L, L, 1
  1519. blt L, TL, .L32
.L33:
#if defined(TRMMKERNEL)
    //res00 res01
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    vfmul.d D2, U0, VALPHAR
    vfmul.d D3, U1, VALPHAR
    VNMSUB D2, U1, VALPHAI, D2
    VFMADD D3, U0, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
    //res02 res03
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    vfmul.d D2, U2, VALPHAR
    vfmul.d D3, U3, VALPHAR
    VNMSUB D2, U3, VALPHAI, D2
    VFMADD D3, U2, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
#else
    //res00 res01
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    VFMADD D2, U0, VALPHAR, D2
    VFMADD D3, U1, VALPHAR, D3
    VNMSUB D2, U1, VALPHAI, D2
    VFMADD D3, U0, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
    //res02 res03
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    VFMADD D2, U2, VALPHAR, D2
    VFMADD D3, U3, VALPHAR, D3
    VNMSUB D2, U3, VALPHAI, D2
    VFMADD D3, U2, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
#endif
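/* TRMM bookkeeping after the 4x1 block: recompute the number of k-iterations
 * not executed for this block (TL = K - OFF, minus 4 or 1 depending on the
 * side) and step A0/B0 past the untouched part of the packed panels, then
 * bump OFF for the LEFT case.  The shifts encode element sizes: 0x06 is
 * 4 complex doubles per k step in A, 0x04 is 1 complex double per k step in B. */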
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d TL, K, OFF
#ifdef LEFT
    addi.d TL, TL, -4
#else
    addi.d TL, TL, -1
#endif
    slli.d T3, TL, 0x06
    add.d A0, A0, T3
    slli.d T3, TL, 0x04
    add.d B0, B0, T3
#endif
#ifdef LEFT
    addi.d OFF, OFF, 4
#endif
#endif // #if defined(TRMMKERNEL)
    addi.d I, I, 1
    blt I, T0, .L31
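/* Tail for two remaining rows (bm & 2): the same complex FMA pattern as the
 * 4x1 block above, but only one pair of complex elements per k step, so a
 * single accumulator pair (U0/U1) and half-size pointer strides suffice. */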
.L38: /* if ( bm & 2 ) */
    move I, $r0
    andi T1, M, 2 //bm&2
    beq I, T1, .L312
.L39:
    move B0, B //ptrbb
    move TL, K /* TL = bk */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    move B0, B //ptrbb
#else
    slli.d T3, OFF, 0x05
    add.d A0, A0, T3
    slli.d T3, OFF, 0x04
    add.d B0, B, T3
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d TL, K, OFF
#elif defined(LEFT)
    addi.d TL, OFF, 2
#else
    addi.d TL, OFF, 1
#endif
#endif // #if defined(TRMMKERNEL)
    vxor.v U0, U0, U0
    vxor.v U1, U1, U1
    move L, $r0 // loop counter k
    beq L, TL, .L311
    blt TL, L, .L311
.L310: /* for (k=0; k<temp; k++) */
    vld D1, B0, 0x00 // b0ri
    vld D0, A0, 0x00 // a0ri
    vld D2, A0, 0x10 // a1ri
    vpackev.d D5, D2, D0 //a0r a1r
    vpackod.d D6, D2, D0 //a0i a1i
    vand.v D7, D1, D1
    vand.v D8, D1, D1
    vshuf4i.d D7, D1, 0x00 //b0rr
    vshuf4i.d D8, D1, 0x55 //b0ii
    VMADD1 U0, D5, D7, U0 //00r 01r
    VMADD2 U1, D6, D7, U1 //00i 01i
    VMADD3 U0, D6, D8, U0
    VMADD4 U1, D5, D8, U1
    addi.d A0, A0, 0x20
    addi.d B0, B0, 0x10
    addi.d L, L, 1
    blt L, TL, .L310
.L311:
#if defined(TRMMKERNEL)
    //res00 res01
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    vfmul.d D2, U0, VALPHAR
    vfmul.d D3, U1, VALPHAR
    VNMSUB D2, U1, VALPHAI, D2
    VFMADD D3, U0, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
#else
    //res00 res01
    vld D0, C0, 0x00 //c0: 0 1
    vld D1, C0, 0x10 //c0: 2 3
    vpackev.d D2, D1, D0 //c0: 0 2
    vpackod.d D3, D1, D0 //c0: 1 3
    VFMADD D2, U0, VALPHAR, D2
    VFMADD D3, U1, VALPHAR, D3
    VNMSUB D2, U1, VALPHAI, D2
    VFMADD D3, U0, VALPHAI, D3
    vpackev.d D4, D3, D2 //c0: 0 1
    vpackod.d D5, D3, D2 //c0: 2 3
    vst D4, C0, 0x00
    vst D5, C0, 0x10
    addi.d C0, C0, 0x20
#endif
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d TL, K, OFF
#ifdef LEFT
    addi.d TL, TL, -2
#else
    addi.d TL, TL, -1
#endif
    slli.d T3, TL, 0x05
    add.d A0, A0, T3
    slli.d T3, TL, 0x04
    add.d B0, B0, T3
#endif
#ifdef LEFT
    addi.d OFF, OFF, 2
#endif
#endif // #if defined(TRMMKERNEL)
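/* Tail for a single remaining row (bm & 1): falls back to scalar floating
 * point.  MTC zeroes the two accumulators from $r0, and MADD1..MADD4 expand
 * one complex multiply-accumulate per k step, mirroring the vector
 * VMADD1..VMADD4 used above. */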
.L312: /* if ( bm & 1 ) */
    move I, $r0
    andi T1, M, 1 //bm&1
    beq I, T1, .L316
.L313:
    move B0, B //ptrbb
    move TL, K /* TL = bk */
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    move B0, B //ptrbb
#else
    slli.d T3, OFF, 0x04
    add.d A0, A0, T3
    slli.d T3, OFF, 0x04
    add.d B0, B, T3
#endif
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    sub.d TL, K, OFF
#elif defined(LEFT)
    addi.d TL, OFF, 1
#else
    addi.d TL, OFF, 1
#endif
#endif // #if defined(TRMMKERNEL)
    MTC c11, $r0
    MTC c12, $r0
    move L, $r0 // loop counter k
    beq L, TL, .L315
    blt TL, L, .L315
.L314: /* for (k=0; k<temp; k++) */
    LD a1, A0, 0x00
    LD a2, A0, 0x08
    LD b1, B0, 0x00
    LD b2, B0, 0x08
    MADD1 c11, a1, b1, c11
    MADD2 c12, a2, b1, c12
    MADD3 c11, a2, b2, c11
    MADD4 c12, a1, b2, c12
    addi.d A0, A0, 0x10
    addi.d B0, B0, 0x10
    addi.d L, L, 1
    blt L, TL, .L314
.L315:
#if defined(TRMMKERNEL)
    MUL a5, c11, ALPHA_R
    MUL a6, c12, ALPHA_I
    SUB a5, a5, a6
    ST a5, C0, 0x00
    MUL a5, c12, ALPHA_R
    MUL a6, c11, ALPHA_I
    ADD a6, a5, a6
    ST a6, C0, 0x08
#else
    LD a5, C0, 0x00 //C0[0]
    LD a6, C0, 0x08 //C0[1]
    MADD a5, c11, ALPHA_R, a5
    MADD a6, c12, ALPHA_R, a6
    NMSUB a5, c12, ALPHA_I, a5
    MADD a6, c11, ALPHA_I, a6
    ST a5, C0, 0x00
    ST a6, C0, 0x08
    addi.d C0, C0, 0x10
#endif
#if defined(TRMMKERNEL)
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    sub.d TL, K, OFF
#ifdef LEFT
    addi.d TL, TL, -1
#else
    addi.d TL, TL, -1
#endif
    slli.d T3, TL, 0x04
    add.d A0, A0, T3
    add.d B0, B0, T3
#endif
#ifdef LEFT
    addi.d OFF, OFF, 1
#endif
#endif // #if defined(TRMMKERNEL)
.L316:
    slli.d L, K, 4
    add.d B, B, L
    slli.d I, LDC, 1
    add.d C, C, I
    addi.d J, J, 1
    andi T0, N, 1
    blt J, T0, .L300
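/* Function epilogue: restore the callee-saved GPRs ($r23-$r27) and FP
 * registers ($f23-$f31) spilled in the prologue, release the 128-byte
 * stack frame, and return through $r1 (ra). */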
.L999:
    LDARG $r23, $sp, 0
    LDARG $r24, $sp, 8
    LDARG $r25, $sp, 16
    LDARG $r26, $sp, 24
    LDARG $r27, $sp, 32
    LD $f23, $sp, 40
    LD $f24, $sp, 48
    LD $f25, $sp, 56
    LD $f26, $sp, 64
    LD $f27, $sp, 72
    LD $f28, $sp, 80
    LD $f29, $sp, 88
    LD $f30, $sp, 96
    LD $f31, $sp, 104
    addi.d $sp, $sp, 128
    jirl $r0, $r1, 0x0

    EPILOGUE