You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

gemm_kernel.S 40 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined ahead of the include; presumably selects the assembler view of common.h - TODO confirm */
  39. #include "common.h"
  /* Register map for the kernel that follows. Integer names are general-purpose registers, */
  /* a / b / c / ALPHA names are floating-point registers. */
  /* NOTE(review): $4..$11 and $f15 look like positional argument slots of the n64 ABI - confirm. */
  40. #define M $4 /* M extent: I = M >> 1 two-element tiles, then an M & 1 tail */
  41. #define N $5 /* N extent: J = N >> 3 eight-column panels, then an N & 4 panel */
  42. #define K $6 /* inner length: L = K >> 2 unrolled-by-4 passes, then K & 3 single steps */
  43. #define A $8 /* base of the A operand; AO is reset to it at the start of each column panel */
  44. #define B $9 /* current B panel; overwritten with BO when an eight-column panel finishes */
  45. #define C $10 /* current C panel; moved to CO8 + LDC while an eight-column panel is set up */
  46. #define LDC $11 /* stride between C columns; shifted left by BASE_SHIFT in the prologue */
  47. #define AO $12 /* running read pointer into A */
  48. #define BO $13 /* running read pointer into B */
  49. #define I $2 /* countdown over tiles along M */
  50. #define J $3 /* countdown over column panels along N */
  51. #define L $7 /* countdown over K; also used as address scratch in the TRMMKERNEL paths */
  52. #define PREFETCHSIZE (4 * 10) /* not referenced in the part of the file visible to this review */
  /* CO1..CO8: one pointer per C column of the current panel; CO1 = C, each next one = previous + LDC. */
  53. #define CO1 $14
  54. #define CO2 $15
  55. #define CO3 $16 /* $16..$22 are spilled to the stack frame by the prologue */
  56. #define CO4 $17
  57. #define CO5 $18
  58. #define CO6 $19
  59. #define CO7 $20
  60. #define CO8 $21
  61. #define BB $22 /* B + (K << (2 + BASE_SHIFT)); in the visible code only a `pref 0` address, advanced 16 * SIZE per tile */
  62. #if defined(TRMMKERNEL)
  63. #define OFFSET $23 /* loaded from 160($sp) right after the 160-byte frame is opened */
  64. #define KK $24 /* running offset: seeded with OFFSET when LEFT, with -OFFSET otherwise */
  65. #define TEMP $25 /* scratch: effective K trip count and pointer adjustments */
  66. #endif
  /* a1..a4: values loaded from AO. The write-back code also reuses $f0..$f7 by raw name for loaded C values. */
  67. #define a1 $f0
  68. #define a2 $f1
  69. #define a3 $f27 /* $f27 and $f28 are spilled by the prologue (80($sp), 88($sp)) */
  70. #define a4 $f28
  /* b1..b8: values loaded from BO. */
  71. #define b1 $f2
  72. #define b2 $f3
  73. #define b3 $f4
  74. #define b4 $f5
  75. #define b5 $f6
  76. #define b6 $f7
  77. #define b7 $f8
  78. #define b8 $f9 /* not referenced in the part of the file visible to this review */
  79. #define a5 b8 /* alias: a5 and b8 name the same register ($f9) */
  /* cXY: accumulators. X = C column (stored through CO1..CO8), Y = element 1 or 2 within that column. */
  80. #define c11 $f10
  81. #define c12 $f11
  82. #define c21 $f12
  83. #define c22 $f13
  84. #define c31 $f14
  85. #define c32 $f16 /* skips $f15, which is ALPHA */
  86. #define c41 $f17
  87. #define c42 $f18
  88. #define c51 $f19
  89. #define c52 $f20 /* $f20..$f23 are spilled only when __64BIT__ is not defined */
  90. #define c61 $f21
  91. #define c62 $f22
  92. #define c71 $f23
  93. #define c72 $f24 /* $f24..$f26 are always spilled by the prologue */
  94. #define c81 $f25
  95. #define c82 $f26
  96. #define ALPHA $f15 /* scale factor used in the write-back code: MADD with the loaded C value, MUL under TRMMKERNEL */
  97. PROLOGUE
  98. daddiu $sp, $sp, -160
  99. SDARG $16, 0($sp)
  100. SDARG $17, 8($sp)
  101. SDARG $18, 16($sp)
  102. SDARG $19, 24($sp)
  103. SDARG $20, 32($sp)
  104. SDARG $21, 40($sp)
  105. SDARG $22, 48($sp)
  106. sdc1 $f24, 56($sp)
  107. sdc1 $f25, 64($sp)
  108. sdc1 $f26, 72($sp)
  109. sdc1 $f27, 80($sp)
  110. sdc1 $f28, 88($sp)
  111. #if defined(TRMMKERNEL)
  112. SDARG $23, 96($sp)
  113. SDARG $24, 104($sp)
  114. SDARG $25, 112($sp)
  115. LDARG OFFSET, 160($sp)
  116. #endif
  117. #ifndef __64BIT__
  118. sdc1 $f20,120($sp)
  119. sdc1 $f21,128($sp)
  120. sdc1 $f22,136($sp)
  121. sdc1 $f23,144($sp)
  122. #endif
  123. dsll LDC, LDC, BASE_SHIFT
  124. #if defined(TRMMKERNEL) && !defined(LEFT)
  125. neg KK, OFFSET
  126. #endif
  127. dsra J, N, 3
  128. blez J, .L30
  129. nop
  130. .L10:
  131. move CO1, C
  132. MTC $0, c11
  133. daddu CO2, C, LDC
  134. move AO, A
  135. daddu CO3, CO2, LDC
  136. daddiu J, J, -1
  137. daddu CO4, CO3, LDC
  138. MOV c21, c11
  139. daddu CO5, CO4, LDC
  140. MOV c31, c11
  141. daddu CO6, CO5, LDC
  142. MOV c41, c11
  143. daddu CO7, CO6, LDC
  144. MOV c51, c11
  145. daddu CO8, CO7, LDC
  146. dsra I, M, 1
  147. daddu C, CO8, LDC
  148. dsll BB, K, 2 + BASE_SHIFT
  149. daddu BB, B, BB
  150. #if defined(TRMMKERNEL) && defined(LEFT)
  151. move KK, OFFSET
  152. #endif
  153. blez I, .L20
  154. MOV c61, c11
  155. .L11:
  156. #if defined(TRMMKERNEL)
  157. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  158. move BO, B
  159. #else
  160. dsll L, KK, 1 + BASE_SHIFT
  161. dsll TEMP, KK, 3 + BASE_SHIFT
  162. daddu AO, AO, L
  163. daddu BO, B, TEMP
  164. #endif
  165. LD a1, 0 * SIZE(AO)
  166. MOV c71, c11
  167. LD b1, 0 * SIZE(BO)
  168. MOV c81, c11
  169. LD a3, 4 * SIZE(AO)
  170. MOV c12, c11
  171. LD b2, 1 * SIZE(BO)
  172. MOV c22, c11
  173. MOV c32, c11
  174. LD b3, 2 * SIZE(BO)
  175. MOV c42, c11
  176. LD b4, 3 * SIZE(BO)
  177. MOV c52, c11
  178. LD b5, 4 * SIZE(BO)
  179. MOV c62, c11
  180. LD b6, 8 * SIZE(BO)
  181. MOV c72, c11
  182. LD b7, 12 * SIZE(BO)
  183. MOV c82, c11
  184. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  185. dsubu TEMP, K, KK
  186. #elif defined(LEFT)
  187. daddiu TEMP, KK, 2
  188. #else
  189. daddiu TEMP, KK, 8
  190. #endif
  191. dsra L, TEMP, 2
  192. blez L, .L15
  193. NOP
  194. #else
  195. LD a1, 0 * SIZE(AO)
  196. MOV c71, c11
  197. LD b1, 0 * SIZE(B)
  198. MOV c81, c11
  199. pref 1, 3 * SIZE(CO1)
  200. pref 1, 3 * SIZE(CO2)
  201. LD a3, 4 * SIZE(AO)
  202. MOV c12, c11
  203. LD b2, 1 * SIZE(B)
  204. MOV c22, c11
  205. dsra L, K, 2
  206. MOV c32, c11
  207. LD b3, 2 * SIZE(B)
  208. MOV c42, c11
  209. LD b4, 3 * SIZE(B)
  210. MOV c52, c11
  211. LD b5, 4 * SIZE(B)
  212. MOV c62, c11
  213. LD b6, 8 * SIZE(B)
  214. MOV c72, c11
  215. LD b7, 12 * SIZE(B)
  216. MOV c82, c11
  217. blez L, .L15
  218. move BO, B
  219. #endif
  220. MADD c11, c11, a1, b1
  221. LD a2, 1 * SIZE(AO)
  222. MADD c21, c21, a1, b2
  223. daddiu L, L, -1
  224. MADD c31, c31, a1, b3
  225. blez L, .L13
  226. MADD c41, c41, a1, b4
  227. pref 1, 2 * SIZE(CO3)
  228. .align 3
  229. .L12:
  230. MADD c12, c12, a2, b1
  231. LD b1, 16 * SIZE(BO)
  232. MADD c22, c22, a2, b2
  233. LD b2, 5 * SIZE(BO)
  234. MADD c32, c32, a2, b3
  235. LD b3, 6 * SIZE(BO)
  236. MADD c42, c42, a2, b4
  237. LD b4, 7 * SIZE(BO)
  238. MADD c51, c51, a1, b5
  239. LD a4, 2 * SIZE(AO)
  240. MADD c61, c61, a1, b2
  241. NOP
  242. MADD c71, c71, a1, b3
  243. NOP
  244. MADD c81, c81, a1, b4
  245. LD a1, 8 * SIZE(AO)
  246. MADD c52, c52, a2, b5
  247. LD b5, 20 * SIZE(BO)
  248. MADD c62, c62, a2, b2
  249. LD b2, 9 * SIZE(BO)
  250. MADD c72, c72, a2, b3
  251. LD b3, 10 * SIZE(BO)
  252. MADD c82, c82, a2, b4
  253. LD b4, 11 * SIZE(BO)
  254. MADD c11, c11, a4, b6
  255. LD a2, 3 * SIZE(AO)
  256. MADD c21, c21, a4, b2
  257. NOP
  258. MADD c31, c31, a4, b3
  259. NOP
  260. MADD c41, c41, a4, b4
  261. NOP
  262. MADD c12, c12, a2, b6
  263. LD b6, 24 * SIZE(BO)
  264. MADD c22, c22, a2, b2
  265. LD b2, 13 * SIZE(BO)
  266. MADD c32, c32, a2, b3
  267. LD b3, 14 * SIZE(BO)
  268. MADD c42, c42, a2, b4
  269. LD b4, 15 * SIZE(BO)
  270. MADD c51, c51, a4, b7
  271. NOP
  272. MADD c61, c61, a4, b2
  273. NOP
  274. MADD c71, c71, a4, b3
  275. NOP
  276. MADD c81, c81, a4, b4
  277. NOP
  278. MADD c52, c52, a2, b7
  279. LD b7, 28 * SIZE(BO)
  280. MADD c62, c62, a2, b2
  281. LD b2, 17 * SIZE(BO)
  282. MADD c72, c72, a2, b3
  283. LD b3, 18 * SIZE(BO)
  284. MADD c82, c82, a2, b4
  285. LD b4, 19 * SIZE(BO)
  286. MADD c11, c11, a3, b1
  287. LD a2, 5 * SIZE(AO)
  288. MADD c21, c21, a3, b2
  289. NOP
  290. MADD c31, c31, a3, b3
  291. NOP
  292. MADD c41, c41, a3, b4
  293. NOP
  294. MADD c12, c12, a2, b1
  295. LD b1, 32 * SIZE(BO)
  296. MADD c22, c22, a2, b2
  297. LD b2, 21 * SIZE(BO)
  298. MADD c32, c32, a2, b3
  299. LD b3, 22 * SIZE(BO)
  300. MADD c42, c42, a2, b4
  301. LD b4, 23 * SIZE(BO)
  302. MADD c51, c51, a3, b5
  303. LD a4, 6 * SIZE(AO)
  304. MADD c61, c61, a3, b2
  305. NOP
  306. MADD c71, c71, a3, b3
  307. NOP
  308. MADD c81, c81, a3, b4
  309. LD a3, 12 * SIZE(AO)
  310. MADD c52, c52, a2, b5
  311. LD b5, 36 * SIZE(BO)
  312. MADD c62, c62, a2, b2
  313. LD b2, 25 * SIZE(BO)
  314. MADD c72, c72, a2, b3
  315. LD b3, 26 * SIZE(BO)
  316. MADD c82, c82, a2, b4
  317. LD b4, 27 * SIZE(BO)
  318. MADD c11, c11, a4, b6
  319. LD a2, 7 * SIZE(AO)
  320. MADD c21, c21, a4, b2
  321. NOP
  322. MADD c31, c31, a4, b3
  323. NOP
  324. MADD c41, c41, a4, b4
  325. daddiu L, L, -1
  326. MADD c12, c12, a2, b6
  327. LD b6, 40 * SIZE(BO)
  328. MADD c22, c22, a2, b2
  329. LD b2, 29 * SIZE(BO)
  330. MADD c32, c32, a2, b3
  331. LD b3, 30 * SIZE(BO)
  332. MADD c42, c42, a2, b4
  333. LD b4, 31 * SIZE(BO)
  334. MADD c51, c51, a4, b7
  335. daddiu BO, BO, 32 * SIZE
  336. MADD c61, c61, a4, b2
  337. daddiu AO, AO, 8 * SIZE
  338. MADD c71, c71, a4, b3
  339. NOP
  340. MADD c81, c81, a4, b4
  341. NOP
  342. MADD c52, c52, a2, b7
  343. LD b7, 12 * SIZE(BO)
  344. MADD c62, c62, a2, b2
  345. LD b2, 1 * SIZE(BO)
  346. MADD c72, c72, a2, b3
  347. LD b3, 2 * SIZE(BO)
  348. MADD c82, c82, a2, b4
  349. LD b4, 3 * SIZE(BO)
  350. MADD c11, c11, a1, b1
  351. LD a2, 1 * SIZE(AO)
  352. MADD c21, c21, a1, b2
  353. NOP
  354. MADD c31, c31, a1, b3
  355. bgtz L, .L12
  356. MADD c41, c41, a1, b4
  357. NOP
  358. .align 3
  359. .L13:
  360. MADD c12, c12, a2, b1
  361. LD b1, 16 * SIZE(BO)
  362. MADD c22, c22, a2, b2
  363. LD b2, 5 * SIZE(BO)
  364. MADD c32, c32, a2, b3
  365. LD b3, 6 * SIZE(BO)
  366. MADD c42, c42, a2, b4
  367. LD b4, 7 * SIZE(BO)
  368. MADD c51, c51, a1, b5
  369. NOP
  370. MADD c61, c61, a1, b2
  371. LD a4, 2 * SIZE(AO)
  372. MADD c71, c71, a1, b3
  373. NOP
  374. MADD c81, c81, a1, b4
  375. LD a1, 8 * SIZE(AO)
  376. MADD c52, c52, a2, b5
  377. LD b5, 20 * SIZE(BO)
  378. MADD c62, c62, a2, b2
  379. LD b2, 9 * SIZE(BO)
  380. MADD c72, c72, a2, b3
  381. LD b3, 10 * SIZE(BO)
  382. MADD c82, c82, a2, b4
  383. LD b4, 11 * SIZE(BO)
  384. MADD c11, c11, a4, b6
  385. LD a2, 3 * SIZE(AO)
  386. MADD c21, c21, a4, b2
  387. NOP
  388. MADD c31, c31, a4, b3
  389. pref 1, 3 * SIZE(CO4)
  390. MADD c41, c41, a4, b4
  391. NOP
  392. MADD c12, c12, a2, b6
  393. LD b6, 24 * SIZE(BO)
  394. MADD c22, c22, a2, b2
  395. LD b2, 13 * SIZE(BO)
  396. MADD c32, c32, a2, b3
  397. LD b3, 14 * SIZE(BO)
  398. MADD c42, c42, a2, b4
  399. LD b4, 15 * SIZE(BO)
  400. MADD c51, c51, a4, b7
  401. pref 1, 3 * SIZE(CO5)
  402. MADD c61, c61, a4, b2
  403. NOP
  404. MADD c71, c71, a4, b3
  405. pref 1, 3 * SIZE(CO6)
  406. MADD c81, c81, a4, b4
  407. NOP
  408. MADD c52, c52, a2, b7
  409. LD b7, 28 * SIZE(BO)
  410. MADD c62, c62, a2, b2
  411. LD b2, 17 * SIZE(BO)
  412. MADD c72, c72, a2, b3
  413. LD b3, 18 * SIZE(BO)
  414. MADD c82, c82, a2, b4
  415. LD b4, 19 * SIZE(BO)
  416. MADD c11, c11, a3, b1
  417. LD a2, 5 * SIZE(AO)
  418. MADD c21, c21, a3, b2
  419. NOP
  420. MADD c31, c31, a3, b3
  421. pref 1, 3 * SIZE(CO7)
  422. MADD c41, c41, a3, b4
  423. NOP
  424. MADD c12, c12, a2, b1
  425. LD b1, 32 * SIZE(BO)
  426. MADD c22, c22, a2, b2
  427. LD b2, 21 * SIZE(BO)
  428. MADD c32, c32, a2, b3
  429. LD b3, 22 * SIZE(BO)
  430. MADD c42, c42, a2, b4
  431. LD b4, 23 * SIZE(BO)
  432. MADD c51, c51, a3, b5
  433. NOP
  434. MADD c61, c61, a3, b2
  435. LD a4, 6 * SIZE(AO)
  436. MADD c71, c71, a3, b3
  437. NOP
  438. MADD c81, c81, a3, b4
  439. NOP
  440. MADD c52, c52, a2, b5
  441. LD b5, 36 * SIZE(BO)
  442. MADD c62, c62, a2, b2
  443. LD b2, 25 * SIZE(BO)
  444. MADD c72, c72, a2, b3
  445. LD b3, 26 * SIZE(BO)
  446. MADD c82, c82, a2, b4
  447. LD b4, 27 * SIZE(BO)
  448. MADD c11, c11, a4, b6
  449. LD a2, 7 * SIZE(AO)
  450. MADD c21, c21, a4, b2
  451. NOP
  452. MADD c31, c31, a4, b3
  453. NOP
  454. MADD c41, c41, a4, b4
  455. NOP
  456. MADD c12, c12, a2, b6
  457. LD b6, 40 * SIZE(BO)
  458. MADD c22, c22, a2, b2
  459. LD b2, 29 * SIZE(BO)
  460. MADD c32, c32, a2, b3
  461. LD b3, 30 * SIZE(BO)
  462. MADD c42, c42, a2, b4
  463. LD b4, 31 * SIZE(BO)
  464. MADD c51, c51, a4, b7
  465. daddiu BO, BO, 32 * SIZE
  466. MADD c61, c61, a4, b2
  467. daddiu AO, AO, 8 * SIZE
  468. MADD c71, c71, a4, b3
  469. NOP
  470. MADD c81, c81, a4, b4
  471. NOP
  472. MADD c52, c52, a2, b7
  473. LD b7, 12 * SIZE(BO)
  474. MADD c62, c62, a2, b2
  475. LD b2, 1 * SIZE(BO)
  476. MADD c72, c72, a2, b3
  477. LD b3, 2 * SIZE(BO)
  478. MADD c82, c82, a2, b4
  479. LD b4, 3 * SIZE(BO)
  480. .align 3
  481. .L15:
  482. #ifndef TRMMKERNEL
  483. andi L, K, 3
  484. #else
  485. andi L, TEMP, 3
  486. #endif
  487. NOP
  488. blez L, .L18
  489. pref 1, 3 * SIZE(CO8)
  490. .align 3
  491. .L16:
  492. MADD c11, c11, a1, b1
  493. LD a2, 1 * SIZE(AO)
  494. MADD c21, c21, a1, b2
  495. NOP
  496. MADD c31, c31, a1, b3
  497. NOP
  498. MADD c41, c41, a1, b4
  499. NOP
  500. MADD c12, c12, a2, b1
  501. LD b1, 8 * SIZE(BO)
  502. MADD c22, c22, a2, b2
  503. LD b2, 5 * SIZE(BO)
  504. MADD c32, c32, a2, b3
  505. LD b3, 6 * SIZE(BO)
  506. MADD c42, c42, a2, b4
  507. LD b4, 7 * SIZE(BO)
  508. MADD c51, c51, a1, b5
  509. daddiu L, L, -1
  510. MADD c61, c61, a1, b2
  511. daddiu AO, AO, 2 * SIZE
  512. MADD c71, c71, a1, b3
  513. daddiu BO, BO, 8 * SIZE
  514. MADD c81, c81, a1, b4
  515. LD a1, 0 * SIZE(AO)
  516. MADD c52, c52, a2, b5
  517. LD b5, 4 * SIZE(BO)
  518. MADD c62, c62, a2, b2
  519. LD b2, 1 * SIZE(BO)
  520. MADD c72, c72, a2, b3
  521. LD b3, 2 * SIZE(BO)
  522. MADD c82, c82, a2, b4
  523. bgtz L, .L16
  524. LD b4, 3 * SIZE(BO)
  525. .L18:
  526. #ifndef TRMMKERNEL
  527. LD $f0, 0 * SIZE(CO1)
  528. daddiu CO3,CO3, 2 * SIZE
  529. LD $f1, 1 * SIZE(CO1)
  530. daddiu CO1,CO1, 2 * SIZE
  531. LD $f2, 0 * SIZE(CO2)
  532. daddiu CO4,CO4, 2 * SIZE
  533. LD $f3, 1 * SIZE(CO2)
  534. daddiu CO2,CO2, 2 * SIZE
  535. LD $f4, -2 * SIZE(CO3)
  536. daddiu CO5,CO5, 2 * SIZE
  537. LD $f5, -1 * SIZE(CO3)
  538. daddiu CO6,CO6, 2 * SIZE
  539. LD $f6, -2 * SIZE(CO4)
  540. daddiu CO7,CO7, 2 * SIZE
  541. LD $f7, -1 * SIZE(CO4)
  542. daddiu I, I, -1
  543. MADD c11, $f0, ALPHA, c11
  544. LD $f0,-2 * SIZE(CO5)
  545. MADD c12, $f1, ALPHA, c12
  546. LD $f1,-1 * SIZE(CO5)
  547. MADD c21, $f2, ALPHA, c21
  548. LD $f2,-2 * SIZE(CO6)
  549. MADD c22, $f3, ALPHA, c22
  550. LD $f3,-1 * SIZE(CO6)
  551. MADD c31, $f4, ALPHA, c31
  552. LD $f4,-2 * SIZE(CO7)
  553. MADD c32, $f5, ALPHA, c32
  554. LD $f5,-1 * SIZE(CO7)
  555. MADD c41, $f6, ALPHA, c41
  556. LD $f6, 0 * SIZE(CO8)
  557. MADD c42, $f7, ALPHA, c42
  558. LD $f7, 1 * SIZE(CO8)
  559. pref 0, 0 * SIZE(BB)
  560. pref 0, 8 * SIZE(BB)
  561. ST c11, -2 * SIZE(CO1)
  562. MTC $0, c11
  563. ST c12, -1 * SIZE(CO1)
  564. daddiu CO8,CO8, 2 * SIZE
  565. ST c21, -2 * SIZE(CO2)
  566. MOV c21, c11
  567. ST c22, -1 * SIZE(CO2)
  568. daddiu BB, BB, 16 * SIZE
  569. MADD c51, $f0, ALPHA, c51
  570. ST c31, -2 * SIZE(CO3)
  571. MADD c52, $f1, ALPHA, c52
  572. ST c32, -1 * SIZE(CO3)
  573. MADD c61, $f2, ALPHA, c61
  574. ST c41, -2 * SIZE(CO4)
  575. MADD c62, $f3, ALPHA, c62
  576. ST c42, -1 * SIZE(CO4)
  577. MADD c71, $f4, ALPHA, c71
  578. ST c51, -2 * SIZE(CO5)
  579. MADD c72, $f5, ALPHA, c72
  580. ST c52, -1 * SIZE(CO5)
  581. MADD c81, $f6, ALPHA, c81
  582. ST c61, -2 * SIZE(CO6)
  583. MADD c82, $f7, ALPHA, c82
  584. ST c62, -1 * SIZE(CO6)
  585. ST c71, -2 * SIZE(CO7)
  586. MOV c31, c11
  587. ST c72, -1 * SIZE(CO7)
  588. MOV c41, c11
  589. ST c81, -2 * SIZE(CO8)
  590. MOV c51, c11
  591. ST c82, -1 * SIZE(CO8)
  592. bgtz I, .L11
  593. MOV c61, c11
  594. #else
  595. daddiu CO4,CO4, 2 * SIZE
  596. daddiu CO5,CO5, 2 * SIZE
  597. daddiu CO6,CO6, 2 * SIZE
  598. daddiu CO7,CO7, 2 * SIZE
  599. pref 0, 0 * SIZE(BB)
  600. pref 0, 8 * SIZE(BB)
  601. MUL c11, ALPHA, c11
  602. daddiu CO1,CO1, 2 * SIZE
  603. MUL c12, ALPHA, c12
  604. MTC $0, a1
  605. MUL c21, ALPHA, c21
  606. daddiu CO2,CO2, 2 * SIZE
  607. MUL c22, ALPHA, c22
  608. daddiu CO3,CO3, 2 * SIZE
  609. ST c11, -2 * SIZE(CO1)
  610. MUL c31, ALPHA, c31
  611. ST c12, -1 * SIZE(CO1)
  612. MUL c32, ALPHA, c32
  613. ST c21, -2 * SIZE(CO2)
  614. MUL c41, ALPHA, c41
  615. ST c22, -1 * SIZE(CO2)
  616. MUL c42, ALPHA, c42
  617. ST c31, -2 * SIZE(CO3)
  618. MUL c51, ALPHA, c51
  619. ST c32, -1 * SIZE(CO3)
  620. MUL c52, ALPHA, c52
  621. ST c41, -2 * SIZE(CO4)
  622. MUL c61, ALPHA, c61
  623. ST c42, -1 * SIZE(CO4)
  624. MUL c62, ALPHA, c62
  625. ST c51, -2 * SIZE(CO5)
  626. MUL c71, ALPHA, c71
  627. ST c52, -1 * SIZE(CO5)
  628. MUL c72, ALPHA, c72
  629. ST c61, -2 * SIZE(CO6)
  630. MUL c81, ALPHA, c81
  631. ST c62, -1 * SIZE(CO6)
  632. MUL c82, ALPHA, c82
  633. ST c71, -2 * SIZE(CO7)
  634. MOV c11, a1
  635. ST c72, -1 * SIZE(CO7)
  636. MOV c21, a1
  637. daddiu CO8,CO8, 2 * SIZE
  638. daddiu BB, BB, 16 * SIZE
  639. ST c81, -2 * SIZE(CO8)
  640. MOV c31, a1
  641. ST c82, -1 * SIZE(CO8)
  642. MOV c41, a1
  643. daddiu I, I, -1
  644. MOV c51, a1
  645. #if ( defined(LEFT) && defined(TRANSA)) || \
  646. (!defined(LEFT) && !defined(TRANSA))
  647. dsubu TEMP, K, KK
  648. #ifdef LEFT
  649. daddiu TEMP, TEMP, -2
  650. #else
  651. daddiu TEMP, TEMP, -8
  652. #endif
  653. dsll L, TEMP, 1 + BASE_SHIFT
  654. dsll TEMP, TEMP, 3 + BASE_SHIFT
  655. daddu AO, AO, L
  656. daddu BO, BO, TEMP
  657. #endif
  658. #ifdef LEFT
  659. daddiu KK, KK, 2
  660. #endif
  661. bgtz I, .L11
  662. MOV c61, a1
  663. #endif
  664. .align 3
  665. .L20:
  666. andi I, M, 1
  667. MOV c61, c11
  668. blez I, .L29
  669. MOV c71, c11
  670. #if defined(TRMMKERNEL)
  671. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  672. move BO, B
  673. #else
  674. dsll L, KK, 0 + BASE_SHIFT
  675. dsll TEMP, KK, 3 + BASE_SHIFT
  676. daddu AO, AO, L
  677. daddu BO, B, TEMP
  678. #endif
  679. LD a1, 0 * SIZE(AO)
  680. LD a2, 1 * SIZE(AO)
  681. LD a3, 2 * SIZE(AO)
  682. LD a4, 3 * SIZE(AO)
  683. LD b1, 0 * SIZE(BO)
  684. LD b2, 1 * SIZE(BO)
  685. LD b3, 2 * SIZE(BO)
  686. LD b4, 3 * SIZE(BO)
  687. LD b5, 4 * SIZE(BO)
  688. LD b6, 8 * SIZE(BO)
  689. LD b7, 12 * SIZE(BO)
  690. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  691. dsubu TEMP, K, KK
  692. #elif defined(LEFT)
  693. daddiu TEMP, KK, 1
  694. #else
  695. daddiu TEMP, KK, 8
  696. #endif
  697. dsra L, TEMP, 2
  698. blez L, .L25
  699. MOV c81, c11
  700. #else
  701. LD a1, 0 * SIZE(AO)
  702. LD a2, 1 * SIZE(AO)
  703. LD a3, 2 * SIZE(AO)
  704. LD a4, 3 * SIZE(AO)
  705. LD b1, 0 * SIZE(B)
  706. LD b2, 1 * SIZE(B)
  707. LD b3, 2 * SIZE(B)
  708. LD b4, 3 * SIZE(B)
  709. LD b5, 4 * SIZE(B)
  710. LD b6, 8 * SIZE(B)
  711. LD b7, 12 * SIZE(B)
  712. dsra L, K, 2
  713. MOV c81, c11
  714. blez L, .L25
  715. move BO, B
  716. #endif
  717. .align 3
  718. .L22:
  719. MADD c11, c11, a1, b1
  720. LD b1, 16 * SIZE(BO)
  721. MADD c21, c21, a1, b2
  722. LD b2, 5 * SIZE(BO)
  723. MADD c31, c31, a1, b3
  724. LD b3, 6 * SIZE(BO)
  725. MADD c41, c41, a1, b4
  726. LD b4, 7 * SIZE(BO)
  727. MADD c51, c51, a1, b5
  728. LD b5, 20 * SIZE(BO)
  729. MADD c61, c61, a1, b2
  730. LD b2, 9 * SIZE(BO)
  731. MADD c71, c71, a1, b3
  732. LD b3, 10 * SIZE(BO)
  733. MADD c81, c81, a1, b4
  734. LD b4, 11 * SIZE(BO)
  735. LD a1, 4 * SIZE(AO)
  736. daddiu L, L, -1
  737. MADD c11, c11, a2, b6
  738. LD b6, 24 * SIZE(BO)
  739. MADD c21, c21, a2, b2
  740. LD b2, 13 * SIZE(BO)
  741. MADD c31, c31, a2, b3
  742. LD b3, 14 * SIZE(BO)
  743. MADD c41, c41, a2, b4
  744. LD b4, 15 * SIZE(BO)
  745. MADD c51, c51, a2, b7
  746. LD b7, 28 * SIZE(BO)
  747. MADD c61, c61, a2, b2
  748. LD b2, 17 * SIZE(BO)
  749. MADD c71, c71, a2, b3
  750. LD b3, 18 * SIZE(BO)
  751. MADD c81, c81, a2, b4
  752. LD b4, 19 * SIZE(BO)
  753. LD a2, 5 * SIZE(AO)
  754. daddiu AO, AO, 4 * SIZE
  755. MADD c11, c11, a3, b1
  756. LD b1, 32 * SIZE(BO)
  757. MADD c21, c21, a3, b2
  758. LD b2, 21 * SIZE(BO)
  759. MADD c31, c31, a3, b3
  760. LD b3, 22 * SIZE(BO)
  761. MADD c41, c41, a3, b4
  762. LD b4, 23 * SIZE(BO)
  763. MADD c51, c51, a3, b5
  764. LD b5, 36 * SIZE(BO)
  765. MADD c61, c61, a3, b2
  766. LD b2, 25 * SIZE(BO)
  767. MADD c71, c71, a3, b3
  768. LD b3, 26 * SIZE(BO)
  769. MADD c81, c81, a3, b4
  770. LD b4, 27 * SIZE(BO)
  771. LD a3, 2 * SIZE(AO)
  772. daddiu BO, BO, 32 * SIZE
  773. MADD c11, c11, a4, b6
  774. LD b6, 8 * SIZE(BO)
  775. MADD c21, c21, a4, b2
  776. LD b2, -3 * SIZE(BO)
  777. MADD c31, c31, a4, b3
  778. LD b3, -2 * SIZE(BO)
  779. MADD c41, c41, a4, b4
  780. LD b4, -1 * SIZE(BO)
  781. MADD c51, c51, a4, b7
  782. LD b7, 12 * SIZE(BO)
  783. MADD c61, c61, a4, b2
  784. LD b2, 1 * SIZE(BO)
  785. MADD c71, c71, a4, b3
  786. LD b3, 2 * SIZE(BO)
  787. MADD c81, c81, a4, b4
  788. LD b4, 3 * SIZE(BO)
  789. bgtz L, .L22
  790. LD a4, 3 * SIZE(AO)
  791. .align 3
  792. .L25:
  793. #ifndef TRMMKERNEL
  794. andi L, K, 3
  795. #else
  796. andi L, TEMP, 3
  797. #endif
  798. NOP
  799. blez L, .L28
  800. NOP
  801. .align 3
  802. .L26:
  803. MADD c11, c11, a1, b1
  804. LD b1, 8 * SIZE(BO)
  805. MADD c21, c21, a1, b2
  806. LD b2, 5 * SIZE(BO)
  807. MADD c31, c31, a1, b3
  808. LD b3, 6 * SIZE(BO)
  809. MADD c41, c41, a1, b4
  810. LD b4, 7 * SIZE(BO)
  811. daddiu L, L, -1
  812. MOV a2, a2
  813. daddiu AO, AO, 1 * SIZE
  814. daddiu BO, BO, 8 * SIZE
  815. MADD c51, c51, a1, b5
  816. LD b5, 4 * SIZE(BO)
  817. MADD c61, c61, a1, b2
  818. LD b2, 1 * SIZE(BO)
  819. MADD c71, c71, a1, b3
  820. LD b3, 2 * SIZE(BO)
  821. MADD c81, c81, a1, b4
  822. LD a1, 0 * SIZE(AO)
  823. bgtz L, .L26
  824. LD b4, 3 * SIZE(BO)
  /* .L28: write-back of the 1x8 tile. c11..c81 hold one accumulator per output
     column; CO1..CO8 address the matching C element of each column.
     LD/ST/MADD/MUL and ALPHA are macros/aliases defined outside this chunk. */
  825. .L28:
  826. #ifndef TRMMKERNEL
  /* GEMM build: read the existing C values into $f0..$f7, merge each with ALPHA and
     its accumulator through MADD (operand order reads as dst = C + ALPHA * acc --
     confirm against the MADD macro), then store. Loads, MADDs and stores are
     interleaved rather than grouped. */
  827. LD $f0, 0 * SIZE(CO1)
  828. LD $f1, 0 * SIZE(CO2)
  829. LD $f2, 0 * SIZE(CO3)
  830. LD $f3, 0 * SIZE(CO4)
  831. MADD c11, $f0, ALPHA, c11
  832. LD $f4, 0 * SIZE(CO5)
  833. MADD c21, $f1, ALPHA, c21
  834. LD $f5, 0 * SIZE(CO6)
  835. MADD c31, $f2, ALPHA, c31
  836. LD $f6, 0 * SIZE(CO7)
  837. MADD c41, $f3, ALPHA, c41
  838. LD $f7, 0 * SIZE(CO8)
  839. MADD c51, $f4, ALPHA, c51
  840. ST c11, 0 * SIZE(CO1)
  841. MADD c61, $f5, ALPHA, c61
  842. ST c21, 0 * SIZE(CO2)
  843. MADD c71, $f6, ALPHA, c71
  844. ST c31, 0 * SIZE(CO3)
  845. MADD c81, $f7, ALPHA, c81
  846. ST c41, 0 * SIZE(CO4)
  847. ST c51, 0 * SIZE(CO5)
  848. ST c61, 0 * SIZE(CO6)
  849. ST c71, 0 * SIZE(CO7)
  850. ST c81, 0 * SIZE(CO8)
  851. #else
  /* TRMM build: C is not read; each accumulator is scaled by ALPHA with MUL and
     stored directly. */
  852. MUL c11, ALPHA, c11
  853. MUL c21, ALPHA, c21
  854. MUL c31, ALPHA, c31
  855. MUL c41, ALPHA, c41
  856. ST c11, 0 * SIZE(CO1)
  857. MUL c51, ALPHA, c51
  858. ST c21, 0 * SIZE(CO2)
  859. MUL c61, ALPHA, c61
  860. ST c31, 0 * SIZE(CO3)
  861. MUL c71, ALPHA, c71
  862. ST c41, 0 * SIZE(CO4)
  863. MUL c81, ALPHA, c81
  864. ST c51, 0 * SIZE(CO5)
  865. ST c61, 0 * SIZE(CO6)
  866. ST c71, 0 * SIZE(CO7)
  867. ST c81, 0 * SIZE(CO8)
  /* For (LEFT && TRANSA) or (!LEFT && !TRANSA) only: TEMP = K - KK - (1 if LEFT,
     else 8), then AO += TEMP << BASE_SHIFT and BO += TEMP << (3 + BASE_SHIFT).
     Presumably this steps both pointers over the K steps this tile did not
     iterate -- verify against the tile prologue, which is outside this view. */
  868. #if ( defined(LEFT) && defined(TRANSA)) || \
  869. (!defined(LEFT) && !defined(TRANSA))
  870. dsubu TEMP, K, KK
  871. #ifdef LEFT
  872. daddiu TEMP, TEMP, -1
  873. #else
  874. daddiu TEMP, TEMP, -8
  875. #endif
  876. dsll L, TEMP, 0 + BASE_SHIFT
  877. dsll TEMP, TEMP, 3 + BASE_SHIFT
  878. daddu AO, AO, L
  879. daddu BO, BO, TEMP
  880. #endif
  881. #ifdef LEFT
  /* LEFT: one more row has been consumed, so KK moves on by 1. */
  882. daddiu KK, KK, 1
  883. #endif
  884. #endif
  885. .align 3
  /* .L29: bottom of one pass of the J loop over 8-column panels. */
  886. .L29:
  887. #if defined(TRMMKERNEL) && !defined(LEFT)
  /* TRMM with !LEFT: KK moves on by 8, the width of the panel just finished. */
  888. daddiu KK, KK, 8
  889. #endif
  /* Repeat from .L10 while J > 0 (J is updated outside this view). The move after
     the branch is written as its delay slot (assumes noreorder mode -- confirm),
     so B takes the current BO whether or not the branch is taken. */
  890. bgtz J, .L10
  891. move B, BO
  892. .align 3
  /* .L30: set up the 4-column panel, taken only when bit 2 of N is set.
     On entry to the tile code: AO = A, CO1..CO4 = C + {0,1,2,3} * LDC,
     C has been advanced by 4 * LDC, c11/c21/c31/c41 are cleared and
     I = M >> 1 (number of 2-row tiles). */
  893. .L30:
  894. andi J, N, 4
  /* Skip the whole panel when (N & 4) == 0. The move below is written as the
     branch delay slot (assumes noreorder mode -- confirm), so AO = A either way. */
  895. blez J, .L50
  896. move AO, A
  897. move CO1, C
  /* MTC $0, c11 writes integer register $0 (hard-wired zero on MIPS) into c11;
     MTC itself is a macro defined elsewhere. The MOVs then copy that zero. */
  898. MTC $0, c11
  899. daddu CO2, C, LDC
  900. daddu CO3, CO2, LDC
  901. daddu CO4, CO3, LDC
  902. MOV c21, c11
  903. daddu C, CO4, LDC
  904. MOV c31, c11
  905. #if defined(TRMMKERNEL) && defined(LEFT)
  /* TRMM/LEFT: restart KK from OFFSET for every column panel. */
  906. move KK, OFFSET
  907. #endif
  908. dsra I, M, 1
  /* No 2-row tiles: go straight to the single-row tail. c41 is cleared in the
     delay slot, i.e. on both paths. */
  909. blez I, .L40
  910. MOV c41, c11
  /* .L31: prologue of one 2x4 tile (2 elements of A per K step, 4 of B).
     Preloads the first A/B operands, clears the second-row accumulators
     c12..c42 by copying c11 (c11 is zeroed by MTC $0, c11 in .L30 / .L38 before
     control gets here) and sets L to the number of 4-step unrolled iterations. */
  911. .L31:
  912. #if defined(TRMMKERNEL)
  913. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  /* These variants start at the beginning of the packed B panel. */
  914. move BO, B
  915. #else
  /* Other variants: skip the first KK steps, i.e. 2 * KK elements of A and
     4 * KK elements of B (the BASE_SHIFT part of the shift presumably converts
     elements to bytes -- confirm where BASE_SHIFT is defined). */
  916. dsll L, KK, 1 + BASE_SHIFT
  917. dsll TEMP, KK, 2 + BASE_SHIFT
  918. daddu AO, AO, L
  919. daddu BO, B, TEMP
  920. #endif
  921. LD a1, 0 * SIZE(AO)
  922. LD a3, 4 * SIZE(AO)
  923. LD b1, 0 * SIZE(BO)
  924. MOV c12, c11
  925. LD b2, 1 * SIZE(BO)
  926. MOV c22, c11
  927. LD b3, 2 * SIZE(BO)
  928. MOV c32, c11
  929. LD b4, 3 * SIZE(BO)
  930. MOV c42, c11
  931. LD b5, 4 * SIZE(BO)
  932. LD b6, 8 * SIZE(BO)
  933. LD b7, 12 * SIZE(BO)
  /* TEMP = number of K steps this tile runs: K - KK, or KK + 2 (LEFT, tile has
     2 rows), or KK + 4 (!LEFT, tile has 4 columns). */
  934. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  935. dsubu TEMP, K, KK
  936. #elif defined(LEFT)
  937. daddiu TEMP, KK, 2
  938. #else
  939. daddiu TEMP, KK, 4
  940. #endif
  /* L = TEMP / 4 unrolled iterations; none -> go to the remainder code at .L35. */
  941. dsra L, TEMP, 2
  942. blez L, .L35
  943. NOP
  944. #else
  /* GEMM build: same preloads, reading B directly; BO = B is set in the slot
     after the branch (delay slot, assumes noreorder mode), so it holds on both
     the taken and fall-through paths. */
  945. LD a1, 0 * SIZE(AO)
  946. LD a3, 4 * SIZE(AO)
  947. LD b1, 0 * SIZE(B)
  948. MOV c12, c11
  949. LD b2, 1 * SIZE(B)
  950. MOV c22, c11
  951. LD b3, 2 * SIZE(B)
  952. MOV c32, c11
  953. LD b4, 3 * SIZE(B)
  954. MOV c42, c11
  955. LD b5, 4 * SIZE(B)
  956. dsra L, K, 2
  957. LD b6, 8 * SIZE(B)
  958. LD b7, 12 * SIZE(B)
  959. blez L, .L35
  960. move BO, B
  961. #endif
  962. .align 3
  /* .L32: main K loop of the 2x4 tile, four K steps per iteration.
     cN1 accumulates with a1/a3 and cN2 with a2; .L38 stores them at offsets 0
     and 1 of column N. Each iteration consumes 8 elements of AO and 16 of BO.
     Operand loads for later steps are interleaved with the MADDs of the current
     step, so the instruction order here is significant. MADD is a macro defined
     elsewhere; it is read here as dst = src + x * y (confirm). */
  963. .L32:
  /* K step 0: A = AO[0..1], B = BO[0..3]. */
  964. MADD c11, c11, a1, b1
  965. LD a2, 1 * SIZE(AO)
  966. MADD c21, c21, a1, b2
  967. daddiu L, L, -1
  968. MADD c31, c31, a1, b3
  969. NOP
  970. MADD c41, c41, a1, b4
  971. LD a1, 2 * SIZE(AO)
  972. MADD c12, c12, a2, b1
  /* b1 is reloaded 16 elements ahead: it becomes BO[0] of the next iteration. */
  973. LD b1, 16 * SIZE(BO)
  974. MADD c22, c22, a2, b2
  975. LD b2, 5 * SIZE(BO)
  976. MADD c32, c32, a2, b3
  977. LD b3, 6 * SIZE(BO)
  978. MADD c42, c42, a2, b4
  979. LD b4, 7 * SIZE(BO)
  /* K step 1: A = AO[2..3], B = BO[4..7] (b5 carries BO[4]). */
  980. MADD c11, c11, a1, b5
  981. LD a2, 3 * SIZE(AO)
  982. MADD c21, c21, a1, b2
  983. NOP
  984. MADD c31, c31, a1, b3
  985. NOP
  986. MADD c41, c41, a1, b4
  /* a1 is reloaded 8 elements ahead: AO[0] of the next iteration. */
  987. LD a1, 8 * SIZE(AO)
  988. MADD c12, c12, a2, b5
  989. LD b5, 20 * SIZE(BO)
  990. MADD c22, c22, a2, b2
  991. LD b2, 9 * SIZE(BO)
  992. MADD c32, c32, a2, b3
  993. LD b3, 10 * SIZE(BO)
  994. MADD c42, c42, a2, b4
  995. LD b4, 11 * SIZE(BO)
  /* K step 2: A = AO[4..5] (a3 was preloaded from AO[4]), B = BO[8..11]. */
  996. MADD c11, c11, a3, b6
  997. LD a2, 5 * SIZE(AO)
  998. MADD c21, c21, a3, b2
  999. NOP
  1000. MADD c31, c31, a3, b3
  1001. NOP
  1002. MADD c41, c41, a3, b4
  1003. LD a3, 6 * SIZE(AO)
  1004. MADD c12, c12, a2, b6
  1005. LD b6, 24 * SIZE(BO)
  1006. MADD c22, c22, a2, b2
  1007. LD b2, 13 * SIZE(BO)
  1008. MADD c32, c32, a2, b3
  1009. LD b3, 14 * SIZE(BO)
  1010. MADD c42, c42, a2, b4
  1011. LD b4, 15 * SIZE(BO)
  /* K step 3: A = AO[6..7], B = BO[12..15]. AO and BO are bumped in the middle
     of this step, so the loads that follow use offsets relative to the NEXT
     iteration's base pointers. */
  1012. MADD c11, c11, a3, b7
  1013. LD a2, 7 * SIZE(AO)
  1014. MADD c21, c21, a3, b2
  1015. daddiu AO, AO, 8 * SIZE
  1016. MADD c31, c31, a3, b3
  1017. daddiu BO, BO, 16 * SIZE
  1018. MADD c41, c41, a3, b4
  1019. LD a3, 4 * SIZE(AO)
  1020. MADD c12, c12, a2, b7
  1021. LD b7, 12 * SIZE(BO)
  1022. MADD c22, c22, a2, b2
  1023. LD b2, 1 * SIZE(BO)
  1024. MADD c32, c32, a2, b3
  1025. LD b3, 2 * SIZE(BO)
  1026. MADD c42, c42, a2, b4
  1027. NOP
  /* Loop while L > 0; the b4 reload is written as the branch delay slot
     (assumes noreorder mode -- confirm) and therefore runs on exit as well. */
  1028. bgtz L, .L32
  1029. LD b4, 3 * SIZE(BO)
  1030. .align 3
  /* .L35: K remainder of the 2x4 tile. L = (trip count) & 3, where the trip
     count is K in the GEMM build and TEMP (set in the tile prologue) in the
     TRMM build. Nothing left -> write-back at .L38. */
  1031. .L35:
  1032. #ifndef TRMMKERNEL
  1033. andi L, K, 3
  1034. #else
  1035. andi L, TEMP, 3
  1036. #endif
  1037. NOP
  1038. blez L, .L38
  1039. NOP
  1040. .align 3
  /* .L36: one K step per iteration. Uses a1 and b1..b4 already in registers,
     loads a2 = AO[1], then refills a1 and b1..b4 for the next step.
     AO advances 2 elements inside the body; BO advances 4 elements in the slot
     after the branch (delay slot, assumes noreorder mode), which is why the B
     reloads above it use offsets 4..7. */
  1041. .L36:
  1042. MADD c11, c11, a1, b1
  1043. LD a2, 1 * SIZE(AO)
  1044. MADD c21, c21, a1, b2
  1045. daddiu L, L, -1
  1046. MADD c31, c31, a1, b3
  1047. daddiu AO, AO, 2 * SIZE
  1048. MADD c41, c41, a1, b4
  1049. LD a1, 0 * SIZE(AO)
  1050. MADD c12, c12, a2, b1
  1051. LD b1, 4 * SIZE(BO)
  1052. MADD c22, c22, a2, b2
  1053. LD b2, 5 * SIZE(BO)
  1054. MADD c32, c32, a2, b3
  1055. LD b3, 6 * SIZE(BO)
  1056. MADD c42, c42, a2, b4
  1057. LD b4, 7 * SIZE(BO)
  1058. bgtz L, .L36
  1059. daddiu BO, BO, 4 * SIZE
  /* .L38: write-back of the 2x4 tile, then loop to the next 2-row tile.
     cN1 / cN2 go to offsets 0 / 1 of column N (CO1..CO4). The CO pointers are
     advanced by 2 elements early, interleaved with the loads, so accesses after
     a pointer's bump use offsets -2 / -1 to reach the same tile.
     Before looping, c11/c21/c31 are cleared again for the next tile and I is
     decremented. */
  1060. .L38:
  1061. #ifndef TRMMKERNEL
  /* GEMM build: C is read, merged with ALPHA and the accumulator through MADD
     (read as dst = C + ALPHA * acc -- confirm against the macro) and stored.
     CO1/CO2 are read at offsets 0/1 BEFORE their bump; CO3/CO4 are read at
     -2/-1 AFTER theirs. */
  1062. LD $f0, 0 * SIZE(CO1)
  1063. daddiu CO3,CO3, 2 * SIZE
  1064. LD $f1, 1 * SIZE(CO1)
  1065. daddiu CO1,CO1, 2 * SIZE
  1066. LD $f2, 0 * SIZE(CO2)
  1067. daddiu CO4,CO4, 2 * SIZE
  1068. LD $f3, 1 * SIZE(CO2)
  1069. daddiu CO2,CO2, 2 * SIZE
  1070. LD $f4, -2 * SIZE(CO3)
  1071. MADD c11, $f0, ALPHA, c11
  1072. LD $f5, -1 * SIZE(CO3)
  1073. MADD c12, $f1, ALPHA, c12
  1074. LD $f6, -2 * SIZE(CO4)
  1075. MADD c21, $f2, ALPHA, c21
  1076. LD $f7, -1 * SIZE(CO4)
  1077. MADD c22, $f3, ALPHA, c22
  1078. MADD c31, $f4, ALPHA, c31
  1079. ST c11, -2 * SIZE(CO1)
  1080. MADD c32, $f5, ALPHA, c32
  1081. ST c12, -1 * SIZE(CO1)
  1082. MADD c41, $f6, ALPHA, c41
  1083. ST c21, -2 * SIZE(CO2)
  1084. MADD c42, $f7, ALPHA, c42
  1085. ST c22, -1 * SIZE(CO2)
  1086. ST c31, -2 * SIZE(CO3)
  /* c11 has already been stored above, so it can be cleared here. */
  1087. MTC $0, c11
  1088. ST c32, -1 * SIZE(CO3)
  1089. daddiu I, I, -1
  1090. ST c41, -2 * SIZE(CO4)
  1091. MOV c21, c11
  1092. ST c42, -1 * SIZE(CO4)
  1093. MOV c31, c11
  1094. #else
  /* TRMM build: C is not read; accumulators are scaled by ALPHA and stored. */
  1095. MUL c11, ALPHA, c11
  1096. daddiu CO3,CO3, 2 * SIZE
  1097. MUL c12, ALPHA, c12
  1098. daddiu CO1,CO1, 2 * SIZE
  1099. MUL c21, ALPHA, c21
  1100. daddiu CO4,CO4, 2 * SIZE
  1101. MUL c22, ALPHA, c22
  1102. daddiu CO2,CO2, 2 * SIZE
  1103. ST c11, -2 * SIZE(CO1)
  1104. MUL c31, ALPHA, c31
  1105. ST c12, -1 * SIZE(CO1)
  1106. MUL c32, ALPHA, c32
  1107. ST c21, -2 * SIZE(CO2)
  1108. MUL c41, ALPHA, c41
  1109. ST c22, -1 * SIZE(CO2)
  1110. MUL c42, ALPHA, c42
  1111. ST c31, -2 * SIZE(CO3)
  1112. MTC $0, c11
  1113. ST c32, -1 * SIZE(CO3)
  1114. daddiu I, I, -1
  1115. ST c41, -2 * SIZE(CO4)
  1116. MOV c21, c11
  1117. ST c42, -1 * SIZE(CO4)
  1118. MOV c31, c11
  /* In the variants where .L31 ran KK + 2 (LEFT) or KK + 4 (!LEFT) steps from
     the start of B, step AO/BO over the remaining K - KK - {2,4} steps:
     2 elements of A and 4 of B per step. */
  1119. #if ( defined(LEFT) && defined(TRANSA)) || \
  1120. (!defined(LEFT) && !defined(TRANSA))
  1121. dsubu TEMP, K, KK
  1122. #ifdef LEFT
  1123. daddiu TEMP, TEMP, -2
  1124. #else
  1125. daddiu TEMP, TEMP, -4
  1126. #endif
  1127. dsll L, TEMP, 1 + BASE_SHIFT
  1128. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1129. daddu AO, AO, L
  1130. daddu BO, BO, TEMP
  1131. #endif
  1132. #ifdef LEFT
  /* LEFT: two more rows are done. */
  1133. daddiu KK, KK, 2
  1134. #endif
  1135. #endif
  /* Next 2-row tile while I > 0. c41 is cleared in the slot after the branch
     (delay slot, assumes noreorder mode -- confirm), so it is zero on both the
     loop-back and the fall-through into .L40. */
  1136. bgtz I, .L31
  1137. MOV c41, c11
  1138. .align 3
  /* .L40: leftover single row (M & 1) of the 4-column panel -- prologue of the
     1x4 tile. c11..c41 arrive cleared (see .L30 / end of .L38). */
  1139. .L40:
  1140. andi I, M, 1
  /* No odd row: finish the panel at .L49. The MOV is written as the branch
     delay slot (assumes noreorder mode -- confirm). */
  1141. blez I, .L49
  1142. MOV c61, c11
  /* NOTE(review): c61/c71/c81 are initialised in this prologue but are not read
     by the 1x4 loops (.L42/.L46) or the write-back (.L48) visible below. */
  1143. #if defined(TRMMKERNEL)
  1144. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1145. move BO, B
  1146. #else
  /* Skip the first KK steps: KK elements of A and 4 * KK elements of B. */
  1147. dsll L, KK, 0 + BASE_SHIFT
  1148. dsll TEMP, KK, 2 + BASE_SHIFT
  1149. daddu AO, AO, L
  1150. daddu BO, B, TEMP
  1151. #endif
  1152. LD a1, 0 * SIZE(AO)
  1153. MOV c71, c11
  1154. LD a2, 1 * SIZE(AO)
  1155. MOV c81, c11
  1156. LD b1, 0 * SIZE(BO)
  1157. LD b2, 1 * SIZE(BO)
  1158. LD b3, 2 * SIZE(BO)
  1159. LD b4, 3 * SIZE(BO)
  1160. LD b5, 4 * SIZE(BO)
  1161. LD b6, 8 * SIZE(BO)
  1162. LD b7, 12 * SIZE(BO)
  /* TEMP = K steps for this tile: K - KK, or KK + 1 (LEFT, one row), or
     KK + 4 (!LEFT, four columns). */
  1163. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1164. dsubu TEMP, K, KK
  1165. #elif defined(LEFT)
  1166. daddiu TEMP, KK, 1
  1167. #else
  1168. daddiu TEMP, KK, 4
  1169. #endif
  /* L = TEMP / 4 unrolled iterations; none -> remainder code at .L45. */
  1170. dsra L, TEMP, 2
  1171. blez L, .L45
  1172. NOP
  1173. #else
  /* GEMM build: same preloads straight from B; BO = B in the slot after the
     branch, so it is set on both paths. */
  1174. LD a1, 0 * SIZE(AO)
  1175. MOV c71, c11
  1176. LD a2, 1 * SIZE(AO)
  1177. MOV c81, c11
  1178. LD b1, 0 * SIZE(B)
  1179. LD b2, 1 * SIZE(B)
  1180. LD b3, 2 * SIZE(B)
  1181. LD b4, 3 * SIZE(B)
  1182. LD b5, 4 * SIZE(B)
  1183. LD b6, 8 * SIZE(B)
  1184. LD b7, 12 * SIZE(B)
  1185. dsra L, K, 2
  1186. blez L, .L45
  1187. move BO, B
  1188. #endif
  1189. .align 3
  /* .L42: main K loop of the 1x4 tile, four K steps per iteration.
     c11..c41 accumulate one A element times four B elements per step.
     Each iteration consumes 4 elements of AO and 16 of BO. a1 serves step 0;
     a2 is reloaded and reused for steps 1..3. Loads are interleaved with the
     MADDs, so instruction order matters. MADD is a macro defined elsewhere,
     read here as dst = src + x * y (confirm). */
  1190. .L42:
  /* K step 0: A = AO[0], B = BO[0..3]. */
  1191. MADD c11, c11, a1, b1
  1192. LD b1, 16 * SIZE(BO)
  1193. MADD c21, c21, a1, b2
  1194. LD b2, 5 * SIZE(BO)
  1195. MADD c31, c31, a1, b3
  1196. LD b3, 6 * SIZE(BO)
  1197. MADD c41, c41, a1, b4
  1198. LD b4, 7 * SIZE(BO)
  /* a1 is refilled with AO[4], i.e. the first A element of the next iteration. */
  1199. LD a1, 4 * SIZE(AO)
  1200. daddiu L, L, -1
  /* K step 1: A = AO[1] (in a2), B = BO[4..7]. */
  1201. MADD c11, c11, a2, b5
  1202. LD b5, 20 * SIZE(BO)
  1203. MADD c21, c21, a2, b2
  1204. LD b2, 9 * SIZE(BO)
  1205. MADD c31, c31, a2, b3
  1206. LD b3, 10 * SIZE(BO)
  1207. MADD c41, c41, a2, b4
  1208. LD b4, 11 * SIZE(BO)
  1209. LD a2, 2 * SIZE(AO)
  1210. daddiu AO, AO, 4 * SIZE
  /* K step 2: A = old AO[2], B = BO[8..11]. */
  1211. MADD c11, c11, a2, b6
  1212. LD b6, 24 * SIZE(BO)
  1213. MADD c21, c21, a2, b2
  1214. LD b2, 13 * SIZE(BO)
  1215. MADD c31, c31, a2, b3
  1216. LD b3, 14 * SIZE(BO)
  1217. MADD c41, c41, a2, b4
  1218. LD b4, 15 * SIZE(BO)
  /* AO was already advanced by 4, so offset -1 is the old AO[3]. */
  1219. LD a2, -1 * SIZE(AO)
  1220. daddiu BO, BO, 16 * SIZE
  /* K step 3: A = old AO[3], B = old BO[12..15]; the reloads below use the
     advanced BO and fetch operands for the next iteration. */
  1221. MADD c11, c11, a2, b7
  1222. LD b7, 12 * SIZE(BO)
  1223. MADD c21, c21, a2, b2
  1224. LD b2, 1 * SIZE(BO)
  1225. MADD c31, c31, a2, b3
  1226. LD b3, 2 * SIZE(BO)
  1227. MADD c41, c41, a2, b4
  1228. LD b4, 3 * SIZE(BO)
  /* Loop while L > 0; the a2 reload is written as the branch delay slot
     (assumes noreorder mode -- confirm). */
  1229. bgtz L, .L42
  1230. LD a2, 1 * SIZE(AO)
  1231. .align 3
  /* .L45: K remainder of the 1x4 tile. L = (trip count) & 3, with the trip
     count being K (GEMM build) or TEMP from the prologue (TRMM build).
     Nothing left -> write-back at .L48. */
  1232. .L45:
  1233. #ifndef TRMMKERNEL
  1234. andi L, K, 3
  1235. #else
  1236. andi L, TEMP, 3
  1237. #endif
  1238. NOP
  1239. blez L, .L48
  1240. NOP
  1241. .align 3
  /* .L46: one K step per iteration: c11..c41 += a1 * b1..b4, then a1 and
     b1..b4 are refilled for the next step. AO advances 1 element in the body;
     BO advances 4 elements in the slot after the branch (delay slot, assumes
     noreorder mode), hence the B reloads at offsets 4..7. */
  1242. .L46:
  1243. MADD c11, c11, a1, b1
  1244. LD b1, 4 * SIZE(BO)
  1245. MADD c21, c21, a1, b2
  1246. LD b2, 5 * SIZE(BO)
  1247. MADD c31, c31, a1, b3
  1248. LD b3, 6 * SIZE(BO)
  1249. MADD c41, c41, a1, b4
  1250. LD a1, 1 * SIZE(AO)
  1251. LD b4, 7 * SIZE(BO)
  1252. daddiu L, L, -1
  1253. daddiu AO, AO, 1 * SIZE
  /* NOTE(review): self-move of a2; it changes no value visible here --
     presumably a scheduling filler, confirm before removing. */
  1254. MOV a2, a2
  1255. bgtz L, .L46
  1256. daddiu BO, BO, 4 * SIZE
  /* .L48: write-back of the 1x4 tile: c11..c41 go to element 0 of columns
     CO1..CO4. */
  1257. .L48:
  1258. #ifndef TRMMKERNEL
  /* GEMM build: merge with the existing C values through MADD (read as
     dst = C + ALPHA * acc -- confirm against the macro). */
  1259. LD $f0, 0 * SIZE(CO1)
  1260. LD $f1, 0 * SIZE(CO2)
  1261. LD $f2, 0 * SIZE(CO3)
  1262. LD $f3, 0 * SIZE(CO4)
  1263. MADD c11, $f0, ALPHA, c11
  1264. MADD c21, $f1, ALPHA, c21
  1265. MADD c31, $f2, ALPHA, c31
  1266. MADD c41, $f3, ALPHA, c41
  1267. ST c11, 0 * SIZE(CO1)
  1268. ST c21, 0 * SIZE(CO2)
  1269. ST c31, 0 * SIZE(CO3)
  1270. ST c41, 0 * SIZE(CO4)
  1271. #else
  /* TRMM build: scale by ALPHA and store without reading C. */
  1272. MUL c11, ALPHA, c11
  1273. MUL c21, ALPHA, c21
  1274. MUL c31, ALPHA, c31
  1275. MUL c41, ALPHA, c41
  1276. ST c11, 0 * SIZE(CO1)
  1277. ST c21, 0 * SIZE(CO2)
  1278. ST c31, 0 * SIZE(CO3)
  1279. ST c41, 0 * SIZE(CO4)
  /* In the variants where .L40 ran KK + 1 (LEFT) or KK + 4 (!LEFT) steps from
     the start of B, step AO/BO over the remaining K - KK - {1,4} steps:
     1 element of A and 4 of B per step. */
  1280. #if ( defined(LEFT) && defined(TRANSA)) || \
  1281. (!defined(LEFT) && !defined(TRANSA))
  1282. dsubu TEMP, K, KK
  1283. #ifdef LEFT
  1284. daddiu TEMP, TEMP, -1
  1285. #else
  1286. daddiu TEMP, TEMP, -4
  1287. #endif
  1288. dsll L, TEMP, 0 + BASE_SHIFT
  1289. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1290. daddu AO, AO, L
  1291. daddu BO, BO, TEMP
  1292. #endif
  1293. #ifdef LEFT
  1294. daddiu KK, KK, 1
  1295. #endif
  1296. #endif
  1297. .align 3
  /* .L49: end of the 4-column panel. For TRMM with !LEFT, KK moves on by the
     4 columns just finished; B is then set to the current BO for the next
     panel. */
  1298. .L49:
  1299. #if defined(TRMMKERNEL) && !defined(LEFT)
  1300. daddiu KK, KK, 4
  1301. #endif
  1302. move B, BO
  1303. .align 3
  /* .L50: set up the 2-column panel, taken only when bit 1 of N is set.
     AO = A, CO1 = C, CO2 = C + LDC, C advanced by 2 * LDC,
     I = M >> 1 (number of 2-row tiles). */
  1304. .L50:
  1305. andi J, N, 2
  /* Skip the panel when (N & 2) == 0; `move AO, A` is written as the branch
     delay slot (assumes noreorder mode -- confirm) and runs either way. */
  1306. blez J, .L70
  1307. move AO, A
  1308. move CO1, C
  1309. daddu CO2, C, LDC
  1310. #if defined(TRMMKERNEL) && defined(LEFT)
  /* TRMM/LEFT: restart KK from OFFSET for this panel. */
  1311. move KK, OFFSET
  1312. #endif
  1313. dsra I, M, 1
  /* No 2-row tiles -> single-row tail at .L60. The C update sits in the slot
     after the branch, so C = CO2 + LDC on both paths. */
  1314. blez I, .L60
  1315. daddu C, CO2, LDC
  /* .L51: prologue of one 2x2 tile. Clears the four accumulators
     (c11, c21, c12, c22) via MTC $0 / MOV, preloads the first A and B operands
     and sets L to the number of 4-step unrolled iterations.
     NOTE(review): b6 and b7 are loaded here but are not read by the 2x2 loops
     (.L52/.L56) visible below. */
  1316. .L51:
  1317. #if defined(TRMMKERNEL)
  1318. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1319. move BO, B
  1320. #else
  /* Skip the first KK steps: 2 * KK elements of A and 2 * KK elements of B. */
  1321. dsll L, KK, 1 + BASE_SHIFT
  1322. dsll TEMP, KK, 1 + BASE_SHIFT
  1323. daddu AO, AO, L
  1324. daddu BO, B, TEMP
  1325. #endif
  1326. LD a1, 0 * SIZE(AO)
  1327. MTC $0, c11
  1328. LD a2, 1 * SIZE(AO)
  1329. MOV c21, c11
  1330. LD a5, 4 * SIZE(AO)
  1331. LD b1, 0 * SIZE(BO)
  1332. MOV c12, c11
  1333. LD b2, 1 * SIZE(BO)
  1334. MOV c22, c11
  1335. LD b3, 2 * SIZE(BO)
  1336. LD b5, 4 * SIZE(BO)
  1337. LD b6, 8 * SIZE(BO)
  1338. LD b7, 12 * SIZE(BO)
  /* TEMP = K steps for this tile: K - KK, or KK + 2. Both remaining branches
     add 2 because the tile is 2 rows by 2 columns. */
  1339. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1340. dsubu TEMP, K, KK
  1341. #elif defined(LEFT)
  1342. daddiu TEMP, KK, 2
  1343. #else
  1344. daddiu TEMP, KK, 2
  1345. #endif
  /* L = TEMP / 4 unrolled iterations; none -> remainder code at .L55. */
  1346. dsra L, TEMP, 2
  1347. blez L, .L55
  1348. NOP
  1349. #else
  /* GEMM build: same setup reading B directly; BO = B in the slot after the
     branch (delay slot, assumes noreorder mode), so it is set on both paths. */
  1350. LD a1, 0 * SIZE(AO)
  1351. MTC $0, c11
  1352. LD a2, 1 * SIZE(AO)
  1353. MOV c21, c11
  1354. LD a5, 4 * SIZE(AO)
  1355. LD b1, 0 * SIZE(B)
  1356. MOV c12, c11
  1357. LD b2, 1 * SIZE(B)
  1358. MOV c22, c11
  1359. LD b3, 2 * SIZE(B)
  1360. LD b5, 4 * SIZE(B)
  1361. dsra L, K, 2
  1362. LD b6, 8 * SIZE(B)
  1363. LD b7, 12 * SIZE(B)
  1364. blez L, .L55
  1365. move BO, B
  1366. #endif
  1367. .align 3
  /* .L52: main K loop of the 2x2 tile, four K steps per iteration.
     c11/c21 accumulate the first A element of each step against the two B
     elements; c12/c22 do the same for the second A element. Each iteration
     consumes 8 elements of AO and 8 of BO. Loads for later steps are
     interleaved with the MADDs, so instruction order matters. MADD is a macro
     defined elsewhere, read here as dst = src + x * y (confirm). */
  1368. .L52:
  /* K step 0: A = AO[0..1] (a1, a2), B = BO[0..1] (b1, b2). */
  1369. MADD c11, c11, a1, b1
  1370. LD a3, 2 * SIZE(AO)
  1371. MADD c21, c21, a1, b2
  1372. LD b4, 3 * SIZE(BO)
  1373. MADD c12, c12, a2, b1
  1374. LD a4, 3 * SIZE(AO)
  1375. MADD c22, c22, a2, b2
  1376. LD b1, 8 * SIZE(BO)
  /* K step 1: A = AO[2..3] (a3, a4), B = BO[2..3] (b3, b4). */
  1377. MADD c11, c11, a3, b3
  1378. LD a1, 8 * SIZE(AO)
  1379. MADD c21, c21, a3, b4
  1380. LD b2, 5 * SIZE(BO)
  1381. MADD c12, c12, a4, b3
  1382. LD a2, 5 * SIZE(AO)
  1383. MADD c22, c22, a4, b4
  1384. LD b3, 6 * SIZE(BO)
  /* K step 2: A = AO[4..5] (a5, a2), B = BO[4..5] (b5, b2). */
  1385. MADD c11, c11, a5, b5
  1386. LD a3, 6 * SIZE(AO)
  1387. MADD c21, c21, a5, b2
  1388. LD b4, 7 * SIZE(BO)
  1389. MADD c12, c12, a2, b5
  1390. LD a4, 7 * SIZE(AO)
  1391. MADD c22, c22, a2, b2
  1392. LD b5, 12 * SIZE(BO)
  /* K step 3: A = AO[6..7] (a3, a4), B = BO[6..7] (b3, b4). The reloads of
     a5, b2, a2, b3 fetch operands for the next iteration (offsets are taken
     before the pointer bump below). */
  1393. MADD c11, c11, a3, b3
  1394. LD a5, 12 * SIZE(AO)
  1395. MADD c21, c21, a3, b4
  1396. LD b2, 9 * SIZE(BO)
  1397. MADD c12, c12, a4, b3
  1398. LD a2, 9 * SIZE(AO)
  1399. MADD c22, c22, a4, b4
  1400. LD b3, 10 * SIZE(BO)
  1401. daddiu AO, AO, 8 * SIZE
  1402. daddiu L, L, -1
  /* Loop while L > 0; BO is advanced in the slot after the branch (delay slot,
     assumes noreorder mode -- confirm), so it also advances on exit. */
  1403. bgtz L, .L52
  1404. daddiu BO, BO, 8 * SIZE
  1405. .align 3
  /* .L55: K remainder of the 2x2 tile. L = (trip count) & 3, with the trip
     count being K (GEMM build) or TEMP from the prologue (TRMM build).
     Nothing left -> write-back at .L58. */
  1406. .L55:
  1407. #ifndef TRMMKERNEL
  1408. andi L, K, 3
  1409. #else
  1410. andi L, TEMP, 3
  1411. #endif
  1412. NOP
  1413. blez L, .L58
  1414. NOP
  1415. .align 3
  /* .L56: one K step per iteration. Uses a1, b1, b2 already in registers,
     loads a2 = AO[1], and refills a1/b1/b2 from the next step (offsets 2, 2, 3
     relative to the not-yet-advanced pointers). AO advances 2 elements in the
     body; BO advances 2 elements in the slot after the branch (delay slot,
     assumes noreorder mode). */
  1416. .L56:
  1417. MADD c11, c11, a1, b1
  1418. LD a2, 1 * SIZE(AO)
  1419. MADD c21, c21, a1, b2
  1420. LD a1, 2 * SIZE(AO)
  1421. MADD c12, c12, a2, b1
  1422. LD b1, 2 * SIZE(BO)
  1423. MADD c22, c22, a2, b2
  1424. LD b2, 3 * SIZE(BO)
  1425. daddiu L, L, -1
  1426. daddiu AO, AO, 2 * SIZE
  1427. bgtz L, .L56
  1428. daddiu BO, BO, 2 * SIZE
  /* .L58: write-back of the 2x2 tile and loop to the next 2-row tile.
     cN1 / cN2 go to offsets 0 / 1 of column N (CO1, CO2). CO1/CO2 are advanced
     by 2 elements before the stores, so the stores use offsets -2 / -1.
     I is decremented here; the loop-back branch to .L51 lives inside each
     preprocessor branch. */
  1429. .L58:
  1430. #ifndef TRMMKERNEL
  /* GEMM build: merge with the existing C values through MADD (read as
     dst = C + ALPHA * acc -- confirm against the macro). Each C load is issued
     before the bump of the pointer it uses. */
  1431. LD $f0, 0 * SIZE(CO1)
  1432. daddiu I, I, -1
  1433. LD $f1, 1 * SIZE(CO1)
  1434. daddiu CO1,CO1, 2 * SIZE
  1435. LD $f2, 0 * SIZE(CO2)
  1436. NOP
  1437. LD $f3, 1 * SIZE(CO2)
  1438. daddiu CO2,CO2, 2 * SIZE
  1439. MADD c11, $f0, ALPHA, c11
  1440. MADD c12, $f1, ALPHA, c12
  1441. MADD c21, $f2, ALPHA, c21
  1442. MADD c22, $f3, ALPHA, c22
  1443. ST c11, -2 * SIZE(CO1)
  1444. ST c12, -1 * SIZE(CO1)
  1445. ST c21, -2 * SIZE(CO2)
  1446. NOP
  /* The last store is written as the branch delay slot (assumes noreorder
     mode -- confirm), so it executes whether or not the loop continues. */
  1447. bgtz I, .L51
  1448. ST c22, -1 * SIZE(CO2)
  1449. #else
  /* TRMM build: scale by ALPHA and store without reading C. */
  1450. daddiu I, I, -1
  1451. daddiu CO1,CO1, 2 * SIZE
  1452. daddiu CO2,CO2, 2 * SIZE
  1453. MUL c11, ALPHA, c11
  1454. MUL c12, ALPHA, c12
  1455. MUL c21, ALPHA, c21
  1456. MUL c22, ALPHA, c22
  1457. ST c11, -2 * SIZE(CO1)
  1458. ST c12, -1 * SIZE(CO1)
  1459. ST c21, -2 * SIZE(CO2)
  1460. ST c22, -1 * SIZE(CO2)
  /* In the variants where .L51 ran KK + 2 steps from the start of B, step
     AO/BO over the remaining K - KK - 2 steps: 2 elements of A and 2 of B per
     step. */
  1461. #if ( defined(LEFT) && defined(TRANSA)) || \
  1462. (!defined(LEFT) && !defined(TRANSA))
  1463. dsubu TEMP, K, KK
  1464. #ifdef LEFT
  1465. daddiu TEMP, TEMP, -2
  1466. #else
  1467. daddiu TEMP, TEMP, -2
  1468. #endif
  1469. dsll L, TEMP, 1 + BASE_SHIFT
  1470. dsll TEMP, TEMP, 1 + BASE_SHIFT
  1471. daddu AO, AO, L
  1472. daddu BO, BO, TEMP
  1473. #endif
  1474. #ifdef LEFT
  /* LEFT: two more rows are done. */
  1475. daddiu KK, KK, 2
  1476. #endif
  1477. bgtz I, .L51
  1478. NOP
  1479. #endif
  1480. .align 3
  /* .L60: leftover single row (M & 1) of the 2-column panel -- prologue of the
     1x2 tile. Clears c11, c21, c31, c41 (c31/c41 serve as a second pair of
     partial sums in .L62 and are folded into c11/c21 at .L68), preloads
     a1..a4 = AO[0..3] and the first B operands, and sets L to the number of
     4-step unrolled iterations. */
  1481. .L60:
  1482. andi I, M, 1
  /* No odd row: finish the panel at .L69. */
  1483. blez I, .L69
  1484. NOP
  1485. #if defined(TRMMKERNEL)
  1486. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1487. move BO, B
  1488. #else
  /* Skip the first KK steps: KK elements of A and 2 * KK elements of B. */
  1489. dsll L, KK, 0 + BASE_SHIFT
  1490. dsll TEMP, KK, 1 + BASE_SHIFT
  1491. daddu AO, AO, L
  1492. daddu BO, B, TEMP
  1493. #endif
  1494. LD a1, 0 * SIZE(AO)
  1495. MTC $0, c11
  1496. LD a2, 1 * SIZE(AO)
  1497. MOV c21, c11
  1498. LD a3, 2 * SIZE(AO)
  1499. MOV c31, c11
  1500. LD a4, 3 * SIZE(AO)
  1501. MOV c41, c11
  1502. LD b1, 0 * SIZE(BO)
  1503. LD b2, 1 * SIZE(BO)
  1504. LD b3, 2 * SIZE(BO)
  1505. LD b4, 3 * SIZE(BO)
  1506. LD b5, 4 * SIZE(BO)
  1507. LD b6, 8 * SIZE(BO)
  1508. LD b7, 12 * SIZE(BO)
  /* TEMP = K steps for this tile: K - KK, or KK + 1 (LEFT, one row), or
     KK + 2 (!LEFT, two columns). */
  1509. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1510. dsubu TEMP, K, KK
  1511. #elif defined(LEFT)
  1512. daddiu TEMP, KK, 1
  1513. #else
  1514. daddiu TEMP, KK, 2
  1515. #endif
  /* L = TEMP / 4 unrolled iterations; none -> remainder code at .L65. */
  1516. dsra L, TEMP, 2
  1517. blez L, .L65
  1518. NOP
  1519. #else
  /* GEMM build: same setup reading B directly; BO = B in the slot after the
     branch (delay slot, assumes noreorder mode -- confirm). */
  1520. dsra L, K, 2
  1521. LD a1, 0 * SIZE(AO)
  1522. MTC $0, c11
  1523. LD a2, 1 * SIZE(AO)
  1524. MOV c21, c11
  1525. LD a3, 2 * SIZE(AO)
  1526. MOV c31, c11
  1527. LD a4, 3 * SIZE(AO)
  1528. MOV c41, c11
  1529. LD b1, 0 * SIZE(B)
  1530. LD b2, 1 * SIZE(B)
  1531. LD b3, 2 * SIZE(B)
  1532. LD b4, 3 * SIZE(B)
  1533. LD b5, 4 * SIZE(B)
  1534. LD b6, 8 * SIZE(B)
  1535. LD b7, 12 * SIZE(B)
  1536. blez L, .L65
  1537. move BO, B
  1538. #endif
  1539. .align 3
  /* .L62: main K loop of the 1x2 tile, four K steps per iteration.
     Steps 0 and 2 accumulate into c11/c21, steps 1 and 3 into c31/c41; the two
     pairs are added together at .L68. Each iteration consumes 4 elements of AO
     and 8 of BO. MADD is a macro defined elsewhere, read here as
     dst = src + x * y (confirm). */
  1540. .L62:
  /* Step 0: a1 (AO[0]) x BO[0..1]; step 1: a2 (AO[1]) x BO[2..3]. */
  1541. MADD c11, c11, a1, b1
  1542. LD b1, 4 * SIZE(BO)
  1543. MADD c21, c21, a1, b2
  1544. LD b2, 5 * SIZE(BO)
  1545. MADD c31, c31, a2, b3
  1546. LD b3, 6 * SIZE(BO)
  1547. MADD c41, c41, a2, b4
  1548. LD b4, 7 * SIZE(BO)
  /* Refill a1/a2 with AO[4..5], the first two A elements of the next iteration. */
  1549. LD a1, 4 * SIZE(AO)
  1550. LD a2, 5 * SIZE(AO)
  /* Step 2: a3 (AO[2]) x BO[4..5]; step 3: a4 (AO[3]) x BO[6..7]. */
  1551. MADD c11, c11, a3, b1
  1552. LD b1, 8 * SIZE(BO)
  1553. MADD c21, c21, a3, b2
  1554. LD b2, 9 * SIZE(BO)
  1555. MADD c31, c31, a4, b3
  1556. LD b3, 10 * SIZE(BO)
  1557. MADD c41, c41, a4, b4
  1558. LD b4, 11 * SIZE(BO)
  1559. LD a3, 6 * SIZE(AO)
  1560. LD a4, 7 * SIZE(AO)
  1561. daddiu L, L, -1
  1562. daddiu AO, AO, 4 * SIZE
  /* Loop while L > 0; BO is advanced in the slot after the branch (delay slot,
     assumes noreorder mode -- confirm). */
  1563. bgtz L, .L62
  1564. daddiu BO, BO, 8 * SIZE
  1565. .align 3
  /* .L65: K remainder of the 1x2 tile. L = (trip count) & 3, with the trip
     count being K (GEMM build) or TEMP from the prologue (TRMM build).
     Nothing left -> write-back at .L68. */
  1566. .L65:
  1567. #ifndef TRMMKERNEL
  1568. andi L, K, 3
  1569. #else
  1570. andi L, TEMP, 3
  1571. #endif
  1572. NOP
  1573. blez L, .L68
  1574. NOP
  1575. .align 3
  /* .L66: one K step per iteration: c11 += a1 * b1, c21 += a1 * b2, then
     a1/b1/b2 are refilled from the next step. AO advances 1 element in the
     body; BO advances 2 elements in the slot after the branch (delay slot,
     assumes noreorder mode). */
  1576. .L66:
  1577. MADD c11, c11, a1, b1
  1578. LD b1, 2 * SIZE(BO)
  1579. MADD c21, c21, a1, b2
  1580. LD b2, 3 * SIZE(BO)
  1581. LD a1, 1 * SIZE(AO)
  1582. daddiu L, L, -1
  1583. daddiu AO, AO, 1 * SIZE
  1584. bgtz L, .L66
  1585. daddiu BO, BO, 2 * SIZE
  /* .L68: write-back of the 1x2 tile. The secondary partial sums c31/c41
     (filled by .L62) are first added into c11/c21, which then go to element 0
     of CO1 / CO2. */
  1586. .L68:
  1587. #ifndef TRMMKERNEL
  /* GEMM build: merge with the existing C values through MADD (read as
     dst = C + ALPHA * acc -- confirm against the macro). */
  1588. LD $f0, 0 * SIZE(CO1)
  1589. LD $f1, 0 * SIZE(CO2)
  1590. ADD c11, c11, c31
  1591. ADD c21, c21, c41
  1592. MADD c11, $f0, ALPHA, c11
  1593. MADD c21, $f1, ALPHA, c21
  1594. ST c11, 0 * SIZE(CO1)
  1595. ST c21, 0 * SIZE(CO2)
  1596. #else
  /* TRMM build: scale by ALPHA and store without reading C. */
  1597. ADD c11, c11, c31
  1598. ADD c21, c21, c41
  1599. MUL c11, ALPHA, c11
  1600. MUL c21, ALPHA, c21
  1601. ST c11, 0 * SIZE(CO1)
  1602. ST c21, 0 * SIZE(CO2)
  /* In the variants where .L60 ran KK + 1 (LEFT) or KK + 2 (!LEFT) steps from
     the start of B, step AO/BO over the remaining K - KK - {1,2} steps:
     1 element of A and 2 of B per step. */
  1603. #if ( defined(LEFT) && defined(TRANSA)) || \
  1604. (!defined(LEFT) && !defined(TRANSA))
  1605. dsubu TEMP, K, KK
  1606. #ifdef LEFT
  1607. daddiu TEMP, TEMP, -1
  1608. #else
  1609. daddiu TEMP, TEMP, -2
  1610. #endif
  1611. dsll L, TEMP, 0 + BASE_SHIFT
  1612. dsll TEMP, TEMP, 1 + BASE_SHIFT
  1613. daddu AO, AO, L
  1614. daddu BO, BO, TEMP
  1615. #endif
  1616. #ifdef LEFT
  1617. daddiu KK, KK, 1
  1618. #endif
  1619. #endif
  1620. .align 3
  /* .L69: end of the 2-column panel. For TRMM with !LEFT, KK moves on by the
     2 columns just finished; B is then set to the current BO for the next
     panel. */
  1621. .L69:
  1622. #if defined(TRMMKERNEL) && !defined(LEFT)
  1623. daddiu KK, KK, 2
  1624. #endif
  1625. move B, BO
  1626. .align 3
  /* .L70: set up the final single-column panel, taken only when bit 0 of N is
     set. AO = A, CO1 = C, C advanced by LDC, I = M >> 1 (number of 2-row
     tiles). */
  1627. .L70:
  1628. andi J, N, 1
  /* N even: nothing left, go to the epilogue at .L999. `move AO, A` is written
     as the branch delay slot (assumes noreorder mode -- confirm). */
  1629. blez J, .L999
  1630. move AO, A
  1631. move CO1, C
  1632. #if defined(TRMMKERNEL) && defined(LEFT)
  /* TRMM/LEFT: restart KK from OFFSET for this panel. */
  1633. move KK, OFFSET
  1634. #endif
  1635. dsra I, M, 1
  /* No 2-row tiles -> single-row tail at .L80. The C update sits in the slot
     after the branch, so C = CO1 + LDC on both paths. */
  1636. blez I, .L80
  1637. daddu C, CO1, LDC
  /* .L71: prologue of one 2x1 tile. Clears c11, c21, c12, c22 and sets L to the
     number of 4-step unrolled iterations.
     NOTE(review): the A/B preloads done here do not feed the loops below --
     .L72 and .L76 reload a1, a2 and b1 at the top of every iteration. */
  1638. .L71:
  1639. #if defined(TRMMKERNEL)
  1640. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1641. move BO, B
  1642. #else
  /* Skip the first KK steps: 2 * KK elements of A and KK elements of B. */
  1643. dsll L, KK, 1 + BASE_SHIFT
  1644. dsll TEMP, KK, 0 + BASE_SHIFT
  1645. daddu AO, AO, L
  1646. daddu BO, B, TEMP
  1647. #endif
  1648. LD a1, 0 * SIZE(AO)
  1649. MTC $0, c11
  1650. LD a2, 1 * SIZE(AO)
  1651. MOV c21, c11
  1652. LD a5, 4 * SIZE(AO)
  1653. LD b1, 0 * SIZE(BO)
  1654. MOV c12, c11
  1655. LD b2, 1 * SIZE(BO)
  1656. MOV c22, c11
  1657. LD b3, 2 * SIZE(BO)
  1658. LD b5, 4 * SIZE(BO)
  1659. LD b6, 8 * SIZE(BO)
  1660. LD b7, 12 * SIZE(BO)
  /* TEMP = K steps for this tile: K - KK, or KK + 2 (LEFT, two rows), or
     KK + 1 (!LEFT, one column). */
  1661. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1662. dsubu TEMP, K, KK
  1663. #elif defined(LEFT)
  1664. daddiu TEMP, KK, 2
  1665. #else
  1666. daddiu TEMP, KK, 1
  1667. #endif
  /* L = TEMP / 4 unrolled iterations; none -> remainder code at .L75. */
  1668. dsra L, TEMP, 2
  1669. blez L, .L75
  1670. NOP
  1671. #else
  /* GEMM build: same setup reading B directly; BO = B in the slot after the
     branch (delay slot, assumes noreorder mode -- confirm). */
  1672. LD a1, 0 * SIZE(AO)
  1673. MTC $0, c11
  1674. LD a2, 1 * SIZE(AO)
  1675. MOV c21, c11
  1676. LD a5, 4 * SIZE(AO)
  1677. LD b1, 0 * SIZE(B)
  1678. MOV c12, c11
  1679. LD b2, 1 * SIZE(B)
  1680. MOV c22, c11
  1681. LD b3, 2 * SIZE(B)
  1682. LD b5, 4 * SIZE(B)
  1683. dsra L, K, 2
  1684. LD b6, 8 * SIZE(B)
  1685. LD b7, 12 * SIZE(B)
  1686. blez L, .L75
  1687. move BO, B
  1688. #endif
  1689. .align 3
  /* .L72: main K loop of the 2x1 tile, four K steps per iteration.
     Unlike the wider tiles, operands are loaded immediately before use rather
     than carried across steps. Per step: c11 += AO[2k] * BO[k] and
     c12 += AO[2k+1] * BO[k]. Each iteration consumes 8 elements of AO and 4 of
     BO. MADD is a macro defined elsewhere, read here as dst = src + x * y
     (confirm). */
  1690. .L72:
  /* K step 0. */
  1691. LD a1, 0 * SIZE(AO)
  1692. LD a2, 1 * SIZE(AO)
  1693. LD b1, 0 * SIZE(BO)
  1694. MADD c11, c11, a1, b1
  1695. MADD c12, c12, a2, b1
  /* K step 1. */
  1696. LD a1, 2 * SIZE(AO)
  1697. LD a2, 3 * SIZE(AO)
  1698. LD b1, 1 * SIZE(BO)
  1699. MADD c11, c11, a1, b1
  1700. MADD c12, c12, a2, b1
  /* K step 2. */
  1701. LD a1, 4 * SIZE(AO)
  1702. LD a2, 5 * SIZE(AO)
  1703. LD b1, 2 * SIZE(BO)
  1704. MADD c11, c11, a1, b1
  1705. MADD c12, c12, a2, b1
  /* K step 3. */
  1706. LD a1, 6 * SIZE(AO)
  1707. LD a2, 7 * SIZE(AO)
  1708. LD b1, 3 * SIZE(BO)
  1709. MADD c11, c11, a1, b1
  1710. MADD c12, c12, a2, b1
  1711. daddiu L, L, -1
  1712. daddiu AO, AO, 8 * SIZE
  /* Loop while L > 0; BO is advanced in the slot after the branch (delay slot,
     assumes noreorder mode -- confirm). */
  1713. bgtz L, .L72
  1714. daddiu BO, BO, 4 * SIZE
  1715. .align 3
  /* .L75: K remainder of the 2x1 tile. L = (trip count) & 3, with the trip
     count being K (GEMM build) or TEMP from the prologue (TRMM build).
     Nothing left -> write-back at .L78. */
  1716. .L75:
  1717. #ifndef TRMMKERNEL
  1718. andi L, K, 3
  1719. #else
  1720. andi L, TEMP, 3
  1721. #endif
  1722. NOP
  1723. blez L, .L78
  1724. NOP
  1725. .align 3
  /* .L76: one K step per iteration: load AO[0], AO[1], BO[0], then
     c11 += a1 * b1 and c12 += a2 * b1. AO advances 2 elements in the body;
     BO advances 1 element in the slot after the branch (delay slot, assumes
     noreorder mode). */
  1726. .L76:
  1727. LD a1, 0 * SIZE(AO)
  1728. LD a2, 1 * SIZE(AO)
  1729. LD b1, 0 * SIZE(BO)
  1730. MADD c11, c11, a1, b1
  1731. MADD c12, c12, a2, b1
  1732. daddiu L, L, -1
  1733. daddiu AO, AO, 2 * SIZE
  1734. bgtz L, .L76
  1735. daddiu BO, BO, 1 * SIZE
  /* .L78: write-back of the 2x1 tile and loop to the next 2-row tile.
     c11 / c12 go to offsets 0 / 1 of CO1; CO1 is advanced by 2 elements before
     the stores, so they use offsets -2 / -1.
     NOTE(review): c21 and c22 are added into c11 / c12 first, but the 2x1
     loops (.L72/.L76) never write them after .L71 clears them, so these ADDs
     appear to add zero. */
  1736. .L78:
  1737. #ifndef TRMMKERNEL
  /* GEMM build: merge with the existing C values through MADD (read as
     dst = C + ALPHA * acc -- confirm against the macro). */
  1738. LD $f0, 0 * SIZE(CO1)
  1739. daddiu I, I, -1
  1740. LD $f1, 1 * SIZE(CO1)
  1741. daddiu CO1,CO1, 2 * SIZE
  1742. ADD c11, c11, c21
  1743. ADD c12, c12, c22
  1744. MADD c11, $f0, ALPHA, c11
  1745. MADD c12, $f1, ALPHA, c12
  1746. ST c11, -2 * SIZE(CO1)
  /* The last store is written as the branch delay slot (assumes noreorder
     mode -- confirm), so it executes whether or not the loop continues. */
  1747. bgtz I, .L71
  1748. ST c12, -1 * SIZE(CO1)
  1749. #else
  /* TRMM build: scale by ALPHA and store without reading C. */
  1750. ADD c11, c11, c21
  1751. daddiu I, I, -1
  1752. ADD c12, c12, c22
  1753. daddiu CO1,CO1, 2 * SIZE
  1754. MUL c11, ALPHA, c11
  1755. MUL c12, ALPHA, c12
  1756. ST c11, -2 * SIZE(CO1)
  1757. ST c12, -1 * SIZE(CO1)
  /* In the variants where .L71 ran KK + 2 (LEFT) or KK + 1 (!LEFT) steps from
     the start of B, step AO/BO over the remaining K - KK - {2,1} steps:
     2 elements of A and 1 of B per step. */
  1758. #if ( defined(LEFT) && defined(TRANSA)) || \
  1759. (!defined(LEFT) && !defined(TRANSA))
  1760. dsubu TEMP, K, KK
  1761. #ifdef LEFT
  1762. daddiu TEMP, TEMP, -2
  1763. #else
  1764. daddiu TEMP, TEMP, -1
  1765. #endif
  1766. dsll L, TEMP, 1 + BASE_SHIFT
  1767. dsll TEMP, TEMP, 0 + BASE_SHIFT
  1768. daddu AO, AO, L
  1769. daddu BO, BO, TEMP
  1770. #endif
  1771. #ifdef LEFT
  /* LEFT: two more rows are done. */
  1772. daddiu KK, KK, 2
  1773. #endif
  1774. bgtz I, .L71
  1775. NOP
  1776. #endif
  1777. .align 3
  /* .L80: leftover single row (M & 1) of the single-column panel -- prologue of
     the 1x1 tile. Clears c11 and c21 (two partial sums, combined at .L88) and
     sets L to the number of 4-step unrolled iterations.
     NOTE(review): the A/B preloads done here do not feed the loops below --
     .L82 and .L86 reload a1 and b1 at the top of every step. */
  1778. .L80:
  1779. andi I, M, 1
  /* No odd row: finish the panel at .L89. */
  1780. blez I, .L89
  1781. NOP
  1782. #if defined(TRMMKERNEL)
  1783. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1784. move BO, B
  1785. #else
  /* Skip the first KK steps: KK elements of A and KK elements of B. */
  1786. dsll L, KK, 0 + BASE_SHIFT
  1787. dsll TEMP, KK, 0 + BASE_SHIFT
  1788. daddu AO, AO, L
  1789. daddu BO, B, TEMP
  1790. #endif
  1791. LD a1, 0 * SIZE(AO)
  1792. MTC $0, c11
  1793. LD a2, 1 * SIZE(AO)
  1794. MOV c21, c11
  1795. LD a3, 2 * SIZE(AO)
  1796. LD a4, 3 * SIZE(AO)
  1797. LD b1, 0 * SIZE(BO)
  1798. LD b2, 1 * SIZE(BO)
  1799. LD b3, 2 * SIZE(BO)
  1800. LD b4, 3 * SIZE(BO)
  1801. LD b5, 4 * SIZE(BO)
  1802. LD b6, 8 * SIZE(BO)
  1803. LD b7, 12 * SIZE(BO)
  /* TEMP = K steps for this tile: K - KK, or KK + 1 (the tile is 1x1, so both
     remaining branches add 1). */
  1804. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1805. dsubu TEMP, K, KK
  1806. #elif defined(LEFT)
  1807. daddiu TEMP, KK, 1
  1808. #else
  1809. daddiu TEMP, KK, 1
  1810. #endif
  /* L = TEMP / 4 unrolled iterations; none -> remainder code at .L85. */
  1811. dsra L, TEMP, 2
  1812. blez L, .L85
  1813. NOP
  1814. #else
  /* GEMM build: same setup reading B directly; BO = B in the slot after the
     branch (delay slot, assumes noreorder mode -- confirm). */
  1815. LD a1, 0 * SIZE(AO)
  1816. MTC $0, c11
  1817. LD a2, 1 * SIZE(AO)
  1818. MOV c21, c11
  1819. LD a3, 2 * SIZE(AO)
  1820. LD a4, 3 * SIZE(AO)
  1821. LD b1, 0 * SIZE(B)
  1822. LD b2, 1 * SIZE(B)
  1823. LD b3, 2 * SIZE(B)
  1824. LD b4, 3 * SIZE(B)
  1825. LD b5, 4 * SIZE(B)
  1826. LD b6, 8 * SIZE(B)
  1827. LD b7, 12 * SIZE(B)
  1828. dsra L, K, 2
  1829. blez L, .L85
  1830. move BO, B
  1831. #endif
  1832. .align 3
  /* .L82: main K loop of the 1x1 tile (a dot product), four K steps per
     iteration. Even steps accumulate into c11 and odd steps into c21; the two
     partial sums are added at .L88. Operands are loaded right before use.
     Each iteration consumes 4 elements of AO and 4 of BO. MADD is a macro
     defined elsewhere, read here as dst = src + x * y (confirm). */
  1833. .L82:
  1834. LD a1, 0 * SIZE(AO)
  1835. LD b1, 0 * SIZE(BO)
  1836. MADD c11, c11, a1, b1
  1837. LD a1, 1 * SIZE(AO)
  1838. LD b1, 1 * SIZE(BO)
  1839. MADD c21, c21, a1, b1
  1840. LD a1, 2 * SIZE(AO)
  1841. LD b1, 2 * SIZE(BO)
  1842. MADD c11, c11, a1, b1
  1843. LD a1, 3 * SIZE(AO)
  1844. LD b1, 3 * SIZE(BO)
  1845. MADD c21, c21, a1, b1
  1846. daddiu L, L, -1
  1847. daddiu AO, AO, 4 * SIZE
  /* Loop while L > 0; BO is advanced in the slot after the branch (delay slot,
     assumes noreorder mode -- confirm). */
  1848. bgtz L, .L82
  1849. daddiu BO, BO, 4 * SIZE
  1850. .align 3
  /* .L85: K remainder of the 1x1 tile. L = (trip count) & 3, with the trip
     count being K (GEMM build) or TEMP from the prologue (TRMM build).
     Nothing left -> write-back at .L88. */
  1851. .L85:
  1852. #ifndef TRMMKERNEL
  1853. andi L, K, 3
  1854. #else
  1855. andi L, TEMP, 3
  1856. #endif
  1857. NOP
  1858. blez L, .L88
  1859. NOP
  1860. .align 3
  /* .L86: one K step per iteration: c11 += AO[0] * BO[0]. AO advances 1 element
     in the body; BO advances 1 element in the slot after the branch (delay
     slot, assumes noreorder mode). */
  1861. .L86:
  1862. LD a1, 0 * SIZE(AO)
  1863. LD b1, 0 * SIZE(BO)
  1864. MADD c11, c11, a1, b1
  1865. daddiu L, L, -1
  1866. daddiu AO, AO, 1 * SIZE
  1867. bgtz L, .L86
  1868. daddiu BO, BO, 1 * SIZE
  /* .L88: write-back of the 1x1 tile. The odd-step partial sum c21 is added
     into c11, which is then written to element 0 of CO1. No AO/BO/KK fix-up
     follows in the TRMM branch here, unlike the other tiles; this is the last
     tile the kernel processes before .L999. */
  1869. .L88:
  1870. #ifndef TRMMKERNEL
  /* GEMM build: merge with the existing C value through MADD (read as
     dst = C + ALPHA * acc -- confirm against the macro). */
  1871. LD $f0, 0 * SIZE(CO1)
  1872. ADD c11, c11, c21
  1873. MADD c11, $f0, ALPHA, c11
  1874. ST c11, 0 * SIZE(CO1)
  1875. #else
  /* TRMM build: scale by ALPHA and store without reading C. */
  1876. ADD c11, c11, c21
  1877. MUL c11, ALPHA, c11
  1878. ST c11, 0 * SIZE(CO1)
  1879. #endif
  1880. .align 3
  /* .L89: end of the single-column panel. For TRMM with !LEFT, KK moves on by
     1; B is set to the current BO. */
  1881. .L89:
  1882. #if defined(TRMMKERNEL) && !defined(LEFT)
  1883. daddiu KK, KK, 1
  1884. #endif
  1885. move B, BO
  1886. .align 3
  /* .L999: common exit. Reloads registers from the 160-byte stack frame,
     releases the frame and returns. The matching stores are presumably in the
     function prologue, which is outside this view. LDARG is a macro defined
     elsewhere (presumably a pointer-width integer load -- confirm). */
  1887. .L999:
  /* Integer registers $16..$22 from offsets 0..48. */
  1888. LDARG $16, 0($sp)
  1889. LDARG $17, 8($sp)
  1890. LDARG $18, 16($sp)
  1891. LDARG $19, 24($sp)
  1892. LDARG $20, 32($sp)
  1893. LDARG $21, 40($sp)
  1894. LDARG $22, 48($sp)
  /* Floating-point registers $f24..$f28 from offsets 56..88. */
  1895. ldc1 $f24, 56($sp)
  1896. ldc1 $f25, 64($sp)
  1897. ldc1 $f26, 72($sp)
  1898. ldc1 $f27, 80($sp)
  1899. ldc1 $f28, 88($sp)
  1900. #if defined(TRMMKERNEL)
  /* TRMM build additionally reloads $23..$25 from offsets 96..112. */
  1901. LDARG $23, 96($sp)
  1902. LDARG $24, 104($sp)
  1903. LDARG $25, 112($sp)
  1904. #endif
  1905. #ifndef __64BIT__
  /* Non-64-bit builds additionally reload $f20..$f23 from offsets 120..144. */
  1906. ldc1 $f20,120($sp)
  1907. ldc1 $f21,128($sp)
  1908. ldc1 $f22,136($sp)
  1909. ldc1 $f23,144($sp)
  1910. #endif
  /* Return through $31. The stack-pointer adjustment is written as the jump
     delay slot (assumes noreorder mode -- confirm), so the frame is released
     as part of the return. */
  1911. j $31
  1912. daddiu $sp, $sp, 160
  /* EPILOGUE is a macro defined elsewhere; presumably it closes the function
     symbol opened by the matching prologue macro. */
  1913. EPILOGUE