You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zgemm3m_kernel.S 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Register aliases used by the kernel body below. Integer registers hold */
  /* the problem dimensions, matrix pointers and loop counters; FP registers */
  /* hold operands loaded from A and B, the accumulators, and the two alpha */
  /* multipliers. */
  40. #define M $4 /* M dimension: walked two at a time (I = M >> 1), then an M & 1 tail */
  41. #define N $5 /* N dimension: walked in groups of 8 (J = N >> 3), then an N & 4 group */
  42. #define K $6 /* K dimension: inner loops run K >> 2 unrolled passes, then K & 3 single steps */
  43. #define A $9 /* base of the A operand; copied into AO at the start of each column group */
  44. #define B $10 /* current position in the B operand; replaced by BO once a column group is done */
  45. #define C $11 /* current position in the output; advanced past each column group */
  46. #define LDC $8 /* loaded from 128($sp) after the 128-byte frame is pushed, then shifted left by ZBASE_SHIFT */
  47. #define AO $12 /* running pointer into A */
  48. #define BO $13 /* running pointer into B */
  49. #define I $2 /* loop counter over M */
  50. #define J $3 /* loop counter over N */
  51. #define L $7 /* loop counter over K */
  52. #define CO1 $14 /* CO1 = C; each following COn = CO(n-1) + LDC */
  53. #define CO2 $15
  54. #define CO3 $16 /* $16..$21 are spilled to the stack in the prologue before being used as CO3..CO8 */
  55. #define CO4 $17
  56. #define CO5 $18
  57. #define CO6 $19
  58. #define CO7 $20
  59. #define CO8 $21
  60. #if defined(TRMMKERNEL) /* NOTE(review): these three aliases are not referenced in the portion of the file reviewed here */
  61. #define OFFSET $22
  62. #define KK $23
  63. #define TEMP $24
  64. #endif
  65. #define a1 $f0 /* a1..a4: values loaded from AO */
  66. #define a2 $f1
  67. #define a3 $f28 /* $f28/$f29 are saved to the stack in the prologue */
  68. #define a4 $f29
  69. #define b1 $f2 /* b1..b7: values loaded from B / BO */
  70. #define b2 $f3
  71. #define b3 $f4
  72. #define b4 $f5
  73. #define b5 $f6
  74. #define b6 $f7
  75. #define b7 $f8
  76. #define b8 $f9 /* NOTE(review): no load into b8 appears in the portion of the file reviewed here */
  77. #define a5 b8 /* a5 shares the register of b8 */
  78. #define c11 $f10 /* cXY accumulators: X picks the COn pointer written back to, Y picks the first (1) or second (2) element pair at that pointer */
  79. #define c12 $f11
  80. #define c21 $f12
  81. #define c22 $f13
  82. #define c31 $f14
  83. #define c32 $f17 /* skips $f15/$f16, which are ALPHA_R/ALPHA_I */
  84. #define c41 $f18
  85. #define c42 $f19
  86. #define c51 $f20
  87. #define c52 $f21
  88. #define c61 $f22
  89. #define c62 $f23
  90. #define c71 $f24 /* $f24..$f27 are saved to the stack in the prologue */
  91. #define c72 $f25
  92. #define c81 $f26
  93. #define c82 $f27
  94. #define ALPHA_R $f15 /* multiplier operand of the MADD that updates the even-offset (0, 2) elements of C; never written in the visible code, presumably set by the caller - confirm */
  95. #define ALPHA_I $f16 /* multiplier operand of the MADD that updates the odd-offset (1, 3) elements of C; same caveat as ALPHA_R */
  /* Note: the C write-back sequences address $f0..$f7 directly as scratch, */
  /* so a1, a2 and b1..b6 do not survive a write-back and are reloaded */
  /* before the next inner loop. */
  96. PROLOGUE
  97. daddiu $sp, $sp, -128
  98. SDARG $16, 0($sp)
  99. SDARG $17, 8($sp)
  100. SDARG $18, 16($sp)
  101. SDARG $19, 24($sp)
  102. SDARG $20, 32($sp)
  103. SDARG $21, 40($sp)
  104. sdc1 $f24, 48($sp)
  105. sdc1 $f25, 56($sp)
  106. sdc1 $f26, 64($sp)
  107. sdc1 $f27, 72($sp)
  108. sdc1 $f28, 80($sp)
  109. sdc1 $f29, 88($sp)
  110. LDARG LDC, 128($sp)
  111. dsll LDC, LDC, ZBASE_SHIFT
  112. dsra J, N, 3
  113. blez J, .L30
  114. nop
  115. .L10:
  116. move CO1, C
  117. MTC $0, c11
  118. daddu CO2, C, LDC
  119. move AO, A
  120. daddu CO3, CO2, LDC
  121. daddiu J, J, -1
  122. daddu CO4, CO3, LDC
  123. MOV c21, c11
  124. daddu CO5, CO4, LDC
  125. MOV c31, c11
  126. daddu CO6, CO5, LDC
  127. MOV c41, c11
  128. daddu CO7, CO6, LDC
  129. MOV c51, c11
  130. daddu CO8, CO7, LDC
  131. dsra I, M, 1
  132. daddu C, CO8, LDC
  133. blez I, .L20
  134. MOV c61, c11
  135. .L11:
  136. LD a1, 0 * SIZE(AO)
  137. MOV c71, c11
  138. LD b1, 0 * SIZE(B)
  139. MOV c81, c11
  140. LD a3, 4 * SIZE(AO)
  141. MOV c12, c11
  142. LD b2, 1 * SIZE(B)
  143. MOV c22, c11
  144. dsra L, K, 2
  145. MOV c32, c11
  146. LD b3, 2 * SIZE(B)
  147. MOV c42, c11
  148. LD b4, 3 * SIZE(B)
  149. MOV c52, c11
  150. LD b5, 4 * SIZE(B)
  151. MOV c62, c11
  152. LD b6, 8 * SIZE(B)
  153. MOV c72, c11
  154. LD b7, 12 * SIZE(B)
  155. MOV c82, c11
  156. blez L, .L15
  157. move BO, B
  158. MADD c11, c11, a1, b1
  159. LD a2, 1 * SIZE(AO)
  160. MADD c21, c21, a1, b2
  161. daddiu L, L, -1
  162. MADD c31, c31, a1, b3
  163. blez L, .L13
  164. MADD c41, c41, a1, b4
  165. NOP
  166. .align 3
  167. .L12:
  168. MADD c12, c12, a2, b1
  169. LD b1, 16 * SIZE(BO)
  170. MADD c22, c22, a2, b2
  171. LD b2, 5 * SIZE(BO)
  172. MADD c32, c32, a2, b3
  173. LD b3, 6 * SIZE(BO)
  174. MADD c42, c42, a2, b4
  175. LD b4, 7 * SIZE(BO)
  176. MADD c51, c51, a1, b5
  177. LD a4, 2 * SIZE(AO)
  178. MADD c61, c61, a1, b2
  179. NOP
  180. MADD c71, c71, a1, b3
  181. NOP
  182. MADD c81, c81, a1, b4
  183. LD a1, 8 * SIZE(AO)
  184. MADD c52, c52, a2, b5
  185. LD b5, 20 * SIZE(BO)
  186. MADD c62, c62, a2, b2
  187. LD b2, 9 * SIZE(BO)
  188. MADD c72, c72, a2, b3
  189. LD b3, 10 * SIZE(BO)
  190. MADD c82, c82, a2, b4
  191. LD b4, 11 * SIZE(BO)
  192. MADD c11, c11, a4, b6
  193. LD a2, 3 * SIZE(AO)
  194. MADD c21, c21, a4, b2
  195. NOP
  196. MADD c31, c31, a4, b3
  197. NOP
  198. MADD c41, c41, a4, b4
  199. NOP
  200. MADD c12, c12, a2, b6
  201. LD b6, 24 * SIZE(BO)
  202. MADD c22, c22, a2, b2
  203. LD b2, 13 * SIZE(BO)
  204. MADD c32, c32, a2, b3
  205. LD b3, 14 * SIZE(BO)
  206. MADD c42, c42, a2, b4
  207. LD b4, 15 * SIZE(BO)
  208. MADD c51, c51, a4, b7
  209. NOP
  210. MADD c61, c61, a4, b2
  211. NOP
  212. MADD c71, c71, a4, b3
  213. NOP
  214. MADD c81, c81, a4, b4
  215. NOP
  216. MADD c52, c52, a2, b7
  217. LD b7, 28 * SIZE(BO)
  218. MADD c62, c62, a2, b2
  219. LD b2, 17 * SIZE(BO)
  220. MADD c72, c72, a2, b3
  221. LD b3, 18 * SIZE(BO)
  222. MADD c82, c82, a2, b4
  223. LD b4, 19 * SIZE(BO)
  224. MADD c11, c11, a3, b1
  225. LD a2, 5 * SIZE(AO)
  226. MADD c21, c21, a3, b2
  227. NOP
  228. MADD c31, c31, a3, b3
  229. NOP
  230. MADD c41, c41, a3, b4
  231. NOP
  232. MADD c12, c12, a2, b1
  233. LD b1, 32 * SIZE(BO)
  234. MADD c22, c22, a2, b2
  235. LD b2, 21 * SIZE(BO)
  236. MADD c32, c32, a2, b3
  237. LD b3, 22 * SIZE(BO)
  238. MADD c42, c42, a2, b4
  239. LD b4, 23 * SIZE(BO)
  240. MADD c51, c51, a3, b5
  241. LD a4, 6 * SIZE(AO)
  242. MADD c61, c61, a3, b2
  243. NOP
  244. MADD c71, c71, a3, b3
  245. NOP
  246. MADD c81, c81, a3, b4
  247. LD a3, 12 * SIZE(AO)
  248. MADD c52, c52, a2, b5
  249. LD b5, 36 * SIZE(BO)
  250. MADD c62, c62, a2, b2
  251. LD b2, 25 * SIZE(BO)
  252. MADD c72, c72, a2, b3
  253. LD b3, 26 * SIZE(BO)
  254. MADD c82, c82, a2, b4
  255. LD b4, 27 * SIZE(BO)
  256. MADD c11, c11, a4, b6
  257. LD a2, 7 * SIZE(AO)
  258. MADD c21, c21, a4, b2
  259. NOP
  260. MADD c31, c31, a4, b3
  261. NOP
  262. MADD c41, c41, a4, b4
  263. daddiu L, L, -1
  264. MADD c12, c12, a2, b6
  265. LD b6, 40 * SIZE(BO)
  266. MADD c22, c22, a2, b2
  267. LD b2, 29 * SIZE(BO)
  268. MADD c32, c32, a2, b3
  269. LD b3, 30 * SIZE(BO)
  270. MADD c42, c42, a2, b4
  271. LD b4, 31 * SIZE(BO)
  272. MADD c51, c51, a4, b7
  273. daddiu BO, BO, 32 * SIZE
  274. MADD c61, c61, a4, b2
  275. daddiu AO, AO, 8 * SIZE
  276. MADD c71, c71, a4, b3
  277. NOP
  278. MADD c81, c81, a4, b4
  279. NOP
  280. MADD c52, c52, a2, b7
  281. LD b7, 12 * SIZE(BO)
  282. MADD c62, c62, a2, b2
  283. LD b2, 1 * SIZE(BO)
  284. MADD c72, c72, a2, b3
  285. LD b3, 2 * SIZE(BO)
  286. MADD c82, c82, a2, b4
  287. LD b4, 3 * SIZE(BO)
  288. MADD c11, c11, a1, b1
  289. LD a2, 1 * SIZE(AO)
  290. MADD c21, c21, a1, b2
  291. NOP
  292. MADD c31, c31, a1, b3
  293. bgtz L, .L12
  294. MADD c41, c41, a1, b4
  295. NOP
  296. .align 3
  297. .L13:
  298. MADD c12, c12, a2, b1
  299. LD b1, 16 * SIZE(BO)
  300. MADD c22, c22, a2, b2
  301. LD b2, 5 * SIZE(BO)
  302. MADD c32, c32, a2, b3
  303. LD b3, 6 * SIZE(BO)
  304. MADD c42, c42, a2, b4
  305. LD b4, 7 * SIZE(BO)
  306. MADD c51, c51, a1, b5
  307. NOP
  308. MADD c61, c61, a1, b2
  309. LD a4, 2 * SIZE(AO)
  310. MADD c71, c71, a1, b3
  311. NOP
  312. MADD c81, c81, a1, b4
  313. LD a1, 8 * SIZE(AO)
  314. MADD c52, c52, a2, b5
  315. LD b5, 20 * SIZE(BO)
  316. MADD c62, c62, a2, b2
  317. LD b2, 9 * SIZE(BO)
  318. MADD c72, c72, a2, b3
  319. LD b3, 10 * SIZE(BO)
  320. MADD c82, c82, a2, b4
  321. LD b4, 11 * SIZE(BO)
  322. MADD c11, c11, a4, b6
  323. LD a2, 3 * SIZE(AO)
  324. MADD c21, c21, a4, b2
  325. NOP
  326. MADD c31, c31, a4, b3
  327. NOP
  328. MADD c41, c41, a4, b4
  329. NOP
  330. MADD c12, c12, a2, b6
  331. LD b6, 24 * SIZE(BO)
  332. MADD c22, c22, a2, b2
  333. LD b2, 13 * SIZE(BO)
  334. MADD c32, c32, a2, b3
  335. LD b3, 14 * SIZE(BO)
  336. MADD c42, c42, a2, b4
  337. LD b4, 15 * SIZE(BO)
  338. MADD c51, c51, a4, b7
  339. NOP
  340. MADD c61, c61, a4, b2
  341. NOP
  342. MADD c71, c71, a4, b3
  343. NOP
  344. MADD c81, c81, a4, b4
  345. NOP
  346. MADD c52, c52, a2, b7
  347. LD b7, 28 * SIZE(BO)
  348. MADD c62, c62, a2, b2
  349. LD b2, 17 * SIZE(BO)
  350. MADD c72, c72, a2, b3
  351. LD b3, 18 * SIZE(BO)
  352. MADD c82, c82, a2, b4
  353. LD b4, 19 * SIZE(BO)
  354. MADD c11, c11, a3, b1
  355. LD a2, 5 * SIZE(AO)
  356. MADD c21, c21, a3, b2
  357. NOP
  358. MADD c31, c31, a3, b3
  359. NOP
  360. MADD c41, c41, a3, b4
  361. NOP
  362. MADD c12, c12, a2, b1
  363. LD b1, 32 * SIZE(BO)
  364. MADD c22, c22, a2, b2
  365. LD b2, 21 * SIZE(BO)
  366. MADD c32, c32, a2, b3
  367. LD b3, 22 * SIZE(BO)
  368. MADD c42, c42, a2, b4
  369. LD b4, 23 * SIZE(BO)
  370. MADD c51, c51, a3, b5
  371. NOP
  372. MADD c61, c61, a3, b2
  373. LD a4, 6 * SIZE(AO)
  374. MADD c71, c71, a3, b3
  375. NOP
  376. MADD c81, c81, a3, b4
  377. LD a3, 12 * SIZE(AO)
  378. MADD c52, c52, a2, b5
  379. LD b5, 36 * SIZE(BO)
  380. MADD c62, c62, a2, b2
  381. LD b2, 25 * SIZE(BO)
  382. MADD c72, c72, a2, b3
  383. LD b3, 26 * SIZE(BO)
  384. MADD c82, c82, a2, b4
  385. LD b4, 27 * SIZE(BO)
  386. MADD c11, c11, a4, b6
  387. LD a2, 7 * SIZE(AO)
  388. MADD c21, c21, a4, b2
  389. NOP
  390. MADD c31, c31, a4, b3
  391. NOP
  392. MADD c41, c41, a4, b4
  393. NOP
  394. MADD c12, c12, a2, b6
  395. LD b6, 40 * SIZE(BO)
  396. MADD c22, c22, a2, b2
  397. LD b2, 29 * SIZE(BO)
  398. MADD c32, c32, a2, b3
  399. LD b3, 30 * SIZE(BO)
  400. MADD c42, c42, a2, b4
  401. LD b4, 31 * SIZE(BO)
  402. MADD c51, c51, a4, b7
  403. daddiu BO, BO, 32 * SIZE
  404. MADD c61, c61, a4, b2
  405. daddiu AO, AO, 8 * SIZE
  406. MADD c71, c71, a4, b3
  407. NOP
  408. MADD c81, c81, a4, b4
  409. NOP
  410. MADD c52, c52, a2, b7
  411. LD b7, 12 * SIZE(BO)
  412. MADD c62, c62, a2, b2
  413. LD b2, 1 * SIZE(BO)
  414. MADD c72, c72, a2, b3
  415. LD b3, 2 * SIZE(BO)
  416. MADD c82, c82, a2, b4
  417. LD b4, 3 * SIZE(BO)
  418. .align 3
  419. .L15:
  420. andi L, K, 3
  421. NOP
  422. blez L, .L18
  423. NOP
  424. .align 3
  425. .L16:
  426. MADD c11, c11, a1, b1
  427. LD a2, 1 * SIZE(AO)
  428. MADD c21, c21, a1, b2
  429. NOP
  430. MADD c31, c31, a1, b3
  431. NOP
  432. MADD c41, c41, a1, b4
  433. NOP
  434. MADD c12, c12, a2, b1
  435. LD b1, 8 * SIZE(BO)
  436. MADD c22, c22, a2, b2
  437. LD b2, 5 * SIZE(BO)
  438. MADD c32, c32, a2, b3
  439. LD b3, 6 * SIZE(BO)
  440. MADD c42, c42, a2, b4
  441. LD b4, 7 * SIZE(BO)
  442. MADD c51, c51, a1, b5
  443. daddiu L, L, -1
  444. MADD c61, c61, a1, b2
  445. daddiu AO, AO, 2 * SIZE
  446. MADD c71, c71, a1, b3
  447. daddiu BO, BO, 8 * SIZE
  448. MADD c81, c81, a1, b4
  449. LD a1, 0 * SIZE(AO)
  450. MADD c52, c52, a2, b5
  451. LD b5, 4 * SIZE(BO)
  452. MADD c62, c62, a2, b2
  453. LD b2, 1 * SIZE(BO)
  454. MADD c72, c72, a2, b3
  455. LD b3, 2 * SIZE(BO)
  456. MADD c82, c82, a2, b4
  457. bgtz L, .L16
  458. LD b4, 3 * SIZE(BO)
  459. .L18:
  460. LD $f0, 0 * SIZE(CO1)
  461. LD $f1, 1 * SIZE(CO1)
  462. LD $f2, 2 * SIZE(CO1)
  463. LD $f3, 3 * SIZE(CO1)
  464. LD $f4, 0 * SIZE(CO2)
  465. MADD $f0, $f0, ALPHA_R, c11
  466. LD $f5, 1 * SIZE(CO2)
  467. MADD $f1, $f1, ALPHA_I, c11
  468. LD $f6, 2 * SIZE(CO2)
  469. MADD $f2, $f2, ALPHA_R, c12
  470. LD $f7, 3 * SIZE(CO2)
  471. MADD $f3, $f3, ALPHA_I, c12
  472. MADD $f4, $f4, ALPHA_R, c21
  473. ST $f0, 0 * SIZE(CO1)
  474. MADD $f5, $f5, ALPHA_I, c21
  475. ST $f1, 1 * SIZE(CO1)
  476. MADD $f6, $f6, ALPHA_R, c22
  477. ST $f2, 2 * SIZE(CO1)
  478. MADD $f7, $f7, ALPHA_I, c22
  479. ST $f3, 3 * SIZE(CO1)
  480. LD $f0, 0 * SIZE(CO3)
  481. LD $f1, 1 * SIZE(CO3)
  482. LD $f2, 2 * SIZE(CO3)
  483. LD $f3, 3 * SIZE(CO3)
  484. ST $f4, 0 * SIZE(CO2)
  485. ST $f5, 1 * SIZE(CO2)
  486. ST $f6, 2 * SIZE(CO2)
  487. ST $f7, 3 * SIZE(CO2)
  488. LD $f4, 0 * SIZE(CO4)
  489. LD $f5, 1 * SIZE(CO4)
  490. LD $f6, 2 * SIZE(CO4)
  491. LD $f7, 3 * SIZE(CO4)
  492. MADD $f0, $f0, ALPHA_R, c31
  493. MADD $f1, $f1, ALPHA_I, c31
  494. MADD $f2, $f2, ALPHA_R, c32
  495. MADD $f3, $f3, ALPHA_I, c32
  496. MADD $f4, $f4, ALPHA_R, c41
  497. ST $f0, 0 * SIZE(CO3)
  498. MADD $f5, $f5, ALPHA_I, c41
  499. ST $f1, 1 * SIZE(CO3)
  500. MADD $f6, $f6, ALPHA_R, c42
  501. ST $f2, 2 * SIZE(CO3)
  502. MADD $f7, $f7, ALPHA_I, c42
  503. ST $f3, 3 * SIZE(CO3)
  504. LD $f0, 0 * SIZE(CO5)
  505. LD $f1, 1 * SIZE(CO5)
  506. LD $f2, 2 * SIZE(CO5)
  507. LD $f3, 3 * SIZE(CO5)
  508. ST $f4, 0 * SIZE(CO4)
  509. ST $f5, 1 * SIZE(CO4)
  510. ST $f6, 2 * SIZE(CO4)
  511. ST $f7, 3 * SIZE(CO4)
  512. LD $f4, 0 * SIZE(CO6)
  513. LD $f5, 1 * SIZE(CO6)
  514. LD $f6, 2 * SIZE(CO6)
  515. LD $f7, 3 * SIZE(CO6)
  516. MADD $f0, $f0, ALPHA_R, c51
  517. daddiu CO1,CO1, 4 * SIZE
  518. MADD $f1, $f1, ALPHA_I, c51
  519. daddiu CO2,CO2, 4 * SIZE
  520. MADD $f2, $f2, ALPHA_R, c52
  521. daddiu CO3,CO3, 4 * SIZE
  522. MADD $f3, $f3, ALPHA_I, c52
  523. daddiu CO4,CO4, 4 * SIZE
  524. MADD $f4, $f4, ALPHA_R, c61
  525. ST $f0, 0 * SIZE(CO5)
  526. MADD $f5, $f5, ALPHA_I, c61
  527. ST $f1, 1 * SIZE(CO5)
  528. MADD $f6, $f6, ALPHA_R, c62
  529. ST $f2, 2 * SIZE(CO5)
  530. MADD $f7, $f7, ALPHA_I, c62
  531. ST $f3, 3 * SIZE(CO5)
  532. LD $f0, 0 * SIZE(CO7)
  533. LD $f1, 1 * SIZE(CO7)
  534. LD $f2, 2 * SIZE(CO7)
  535. LD $f3, 3 * SIZE(CO7)
  536. ST $f4, 0 * SIZE(CO6)
  537. ST $f5, 1 * SIZE(CO6)
  538. ST $f6, 2 * SIZE(CO6)
  539. ST $f7, 3 * SIZE(CO6)
  540. LD $f4, 0 * SIZE(CO8)
  541. daddiu I, I, -1
  542. LD $f5, 1 * SIZE(CO8)
  543. MTC $0, c11
  544. LD $f6, 2 * SIZE(CO8)
  545. LD $f7, 3 * SIZE(CO8)
  546. MADD $f0, $f0, ALPHA_R, c71
  547. daddiu CO5,CO5, 4 * SIZE
  548. MADD $f1, $f1, ALPHA_I, c71
  549. daddiu CO6,CO6, 4 * SIZE
  550. MADD $f2, $f2, ALPHA_R, c72
  551. daddiu CO7,CO7, 4 * SIZE
  552. MADD $f3, $f3, ALPHA_I, c72
  553. daddiu CO8,CO8, 4 * SIZE
  554. MADD $f4, $f4, ALPHA_R, c81
  555. ST $f0, -4 * SIZE(CO7)
  556. MADD $f5, $f5, ALPHA_I, c81
  557. ST $f1, -3 * SIZE(CO7)
  558. MADD $f6, $f6, ALPHA_R, c82
  559. ST $f2, -2 * SIZE(CO7)
  560. MADD $f7, $f7, ALPHA_I, c82
  561. ST $f3, -1 * SIZE(CO7)
  562. ST $f4, -4 * SIZE(CO8)
  563. MOV c21, c11
  564. ST $f5, -3 * SIZE(CO8)
  565. MOV c31, c11
  566. ST $f6, -2 * SIZE(CO8)
  567. MOV c41, c11
  568. ST $f7, -1 * SIZE(CO8)
  569. MOV c51, c11
  570. bgtz I, .L11
  571. MOV c61, c11
  572. .align 3
  573. .L20:
  574. andi I, M, 1
  575. MOV c61, c11
  576. blez I, .L29
  577. MOV c71, c11
  578. LD a1, 0 * SIZE(AO)
  579. LD a2, 1 * SIZE(AO)
  580. LD a3, 2 * SIZE(AO)
  581. LD a4, 3 * SIZE(AO)
  582. LD b1, 0 * SIZE(B)
  583. LD b2, 1 * SIZE(B)
  584. LD b3, 2 * SIZE(B)
  585. LD b4, 3 * SIZE(B)
  586. LD b5, 4 * SIZE(B)
  587. LD b6, 8 * SIZE(B)
  588. LD b7, 12 * SIZE(B)
  589. dsra L, K, 2
  590. MOV c81, c11
  591. blez L, .L25
  592. move BO, B
  593. .align 3
  594. .L22:
  595. MADD c11, c11, a1, b1
  596. LD b1, 16 * SIZE(BO)
  597. MADD c21, c21, a1, b2
  598. LD b2, 5 * SIZE(BO)
  599. MADD c31, c31, a1, b3
  600. LD b3, 6 * SIZE(BO)
  601. MADD c41, c41, a1, b4
  602. LD b4, 7 * SIZE(BO)
  603. MADD c51, c51, a1, b5
  604. LD b5, 20 * SIZE(BO)
  605. MADD c61, c61, a1, b2
  606. LD b2, 9 * SIZE(BO)
  607. MADD c71, c71, a1, b3
  608. LD b3, 10 * SIZE(BO)
  609. MADD c81, c81, a1, b4
  610. LD b4, 11 * SIZE(BO)
  611. LD a1, 4 * SIZE(AO)
  612. daddiu L, L, -1
  613. MADD c11, c11, a2, b6
  614. LD b6, 24 * SIZE(BO)
  615. MADD c21, c21, a2, b2
  616. LD b2, 13 * SIZE(BO)
  617. MADD c31, c31, a2, b3
  618. LD b3, 14 * SIZE(BO)
  619. MADD c41, c41, a2, b4
  620. LD b4, 15 * SIZE(BO)
  621. MADD c51, c51, a2, b7
  622. LD b7, 28 * SIZE(BO)
  623. MADD c61, c61, a2, b2
  624. LD b2, 17 * SIZE(BO)
  625. MADD c71, c71, a2, b3
  626. LD b3, 18 * SIZE(BO)
  627. MADD c81, c81, a2, b4
  628. LD b4, 19 * SIZE(BO)
  629. LD a2, 5 * SIZE(AO)
  630. daddiu AO, AO, 4 * SIZE
  631. MADD c11, c11, a3, b1
  632. LD b1, 32 * SIZE(BO)
  633. MADD c21, c21, a3, b2
  634. LD b2, 21 * SIZE(BO)
  635. MADD c31, c31, a3, b3
  636. LD b3, 22 * SIZE(BO)
  637. MADD c41, c41, a3, b4
  638. LD b4, 23 * SIZE(BO)
  639. MADD c51, c51, a3, b5
  640. LD b5, 36 * SIZE(BO)
  641. MADD c61, c61, a3, b2
  642. LD b2, 25 * SIZE(BO)
  643. MADD c71, c71, a3, b3
  644. LD b3, 26 * SIZE(BO)
  645. MADD c81, c81, a3, b4
  646. LD b4, 27 * SIZE(BO)
  647. LD a3, 2 * SIZE(AO)
  648. daddiu BO, BO, 32 * SIZE
  649. MADD c11, c11, a4, b6
  650. LD b6, 8 * SIZE(BO)
  651. MADD c21, c21, a4, b2
  652. LD b2, -3 * SIZE(BO)
  653. MADD c31, c31, a4, b3
  654. LD b3, -2 * SIZE(BO)
  655. MADD c41, c41, a4, b4
  656. LD b4, -1 * SIZE(BO)
  657. MADD c51, c51, a4, b7
  658. LD b7, 12 * SIZE(BO)
  659. MADD c61, c61, a4, b2
  660. LD b2, 1 * SIZE(BO)
  661. MADD c71, c71, a4, b3
  662. LD b3, 2 * SIZE(BO)
  663. MADD c81, c81, a4, b4
  664. LD b4, 3 * SIZE(BO)
  665. bgtz L, .L22
  666. LD a4, 3 * SIZE(AO)
  667. .align 3
  668. .L25:
  669. andi L, K, 3
  670. NOP
  671. blez L, .L28
  672. NOP
  673. .align 3
  674. .L26:
  675. MADD c11, c11, a1, b1
  676. LD b1, 8 * SIZE(BO)
  677. MADD c21, c21, a1, b2
  678. LD b2, 5 * SIZE(BO)
  679. MADD c31, c31, a1, b3
  680. LD b3, 6 * SIZE(BO)
  681. MADD c41, c41, a1, b4
  682. LD b4, 7 * SIZE(BO)
  683. daddiu L, L, -1
  684. MOV a2, a2
  685. daddiu AO, AO, 1 * SIZE
  686. daddiu BO, BO, 8 * SIZE
  687. MADD c51, c51, a1, b5
  688. LD b5, 4 * SIZE(BO)
  689. MADD c61, c61, a1, b2
  690. LD b2, 1 * SIZE(BO)
  691. MADD c71, c71, a1, b3
  692. LD b3, 2 * SIZE(BO)
  693. MADD c81, c81, a1, b4
  694. LD a1, 0 * SIZE(AO)
  695. bgtz L, .L26
  696. LD b4, 3 * SIZE(BO)
  697. .L28:
  698. LD $f0, 0 * SIZE(CO1)
  699. LD $f1, 1 * SIZE(CO1)
  700. LD $f2, 0 * SIZE(CO2)
  701. LD $f3, 1 * SIZE(CO2)
  702. LD $f4, 0 * SIZE(CO3)
  703. MADD $f0, $f0, ALPHA_R, c11
  704. LD $f5, 1 * SIZE(CO3)
  705. MADD $f1, $f1, ALPHA_I, c11
  706. LD $f6, 0 * SIZE(CO4)
  707. MADD $f2, $f2, ALPHA_R, c21
  708. LD $f7, 1 * SIZE(CO4)
  709. MADD $f3, $f3, ALPHA_I, c21
  710. MADD $f4, $f4, ALPHA_R, c31
  711. ST $f0, 0 * SIZE(CO1)
  712. MADD $f5, $f5, ALPHA_I, c31
  713. ST $f1, 1 * SIZE(CO1)
  714. MADD $f6, $f6, ALPHA_R, c41
  715. ST $f2, 0 * SIZE(CO2)
  716. MADD $f7, $f7, ALPHA_I, c41
  717. ST $f3, 1 * SIZE(CO2)
  718. LD $f0, 0 * SIZE(CO5)
  719. LD $f1, 1 * SIZE(CO5)
  720. LD $f2, 0 * SIZE(CO6)
  721. LD $f3, 1 * SIZE(CO6)
  722. ST $f4, 0 * SIZE(CO3)
  723. ST $f5, 1 * SIZE(CO3)
  724. ST $f6, 0 * SIZE(CO4)
  725. ST $f7, 1 * SIZE(CO4)
  726. LD $f4, 0 * SIZE(CO7)
  727. MADD $f0, $f0, ALPHA_R, c51
  728. LD $f5, 1 * SIZE(CO7)
  729. MADD $f1, $f1, ALPHA_I, c51
  730. LD $f6, 0 * SIZE(CO8)
  731. MADD $f2, $f2, ALPHA_R, c61
  732. LD $f7, 1 * SIZE(CO8)
  733. MADD $f3, $f3, ALPHA_I, c61
  734. MADD $f4, $f4, ALPHA_R, c71
  735. ST $f0, 0 * SIZE(CO5)
  736. MADD $f5, $f5, ALPHA_I, c71
  737. ST $f1, 1 * SIZE(CO5)
  738. MADD $f6, $f6, ALPHA_R, c81
  739. ST $f2, 0 * SIZE(CO6)
  740. MADD $f7, $f7, ALPHA_I, c81
  741. ST $f3, 1 * SIZE(CO6)
  742. ST $f4, 0 * SIZE(CO7)
  743. ST $f5, 1 * SIZE(CO7)
  744. ST $f6, 0 * SIZE(CO8)
  745. ST $f7, 1 * SIZE(CO8)
  746. .align 3
  747. .L29:
  748. bgtz J, .L10
  749. move B, BO
  750. .align 3
  751. .L30:
  752. andi J, N, 4
  753. blez J, .L50
  754. move AO, A
  755. move CO1, C
  756. MTC $0, c11
  757. daddu CO2, C, LDC
  758. daddu CO3, CO2, LDC
  759. daddu CO4, CO3, LDC
  760. MOV c21, c11
  761. daddu C, CO4, LDC
  762. MOV c31, c11
  763. dsra I, M, 1
  764. blez I, .L40
  765. MOV c41, c11
  766. .L31:
  767. LD a1, 0 * SIZE(AO)
  768. LD a3, 4 * SIZE(AO)
  769. LD b1, 0 * SIZE(B)
  770. MOV c12, c11
  771. LD b2, 1 * SIZE(B)
  772. MOV c22, c11
  773. LD b3, 2 * SIZE(B)
  774. MOV c32, c11
  775. LD b4, 3 * SIZE(B)
  776. MOV c42, c11
  777. LD b5, 4 * SIZE(B)
  778. dsra L, K, 2
  779. LD b6, 8 * SIZE(B)
  780. LD b7, 12 * SIZE(B)
  781. blez L, .L35
  782. move BO, B
  783. .align 3
  784. .L32:
  785. MADD c11, c11, a1, b1
  786. LD a2, 1 * SIZE(AO)
  787. MADD c21, c21, a1, b2
  788. daddiu L, L, -1
  789. MADD c31, c31, a1, b3
  790. NOP
  791. MADD c41, c41, a1, b4
  792. LD a1, 2 * SIZE(AO)
  793. MADD c12, c12, a2, b1
  794. LD b1, 16 * SIZE(BO)
  795. MADD c22, c22, a2, b2
  796. LD b2, 5 * SIZE(BO)
  797. MADD c32, c32, a2, b3
  798. LD b3, 6 * SIZE(BO)
  799. MADD c42, c42, a2, b4
  800. LD b4, 7 * SIZE(BO)
  801. MADD c11, c11, a1, b5
  802. LD a2, 3 * SIZE(AO)
  803. MADD c21, c21, a1, b2
  804. NOP
  805. MADD c31, c31, a1, b3
  806. NOP
  807. MADD c41, c41, a1, b4
  808. LD a1, 8 * SIZE(AO)
  809. MADD c12, c12, a2, b5
  810. LD b5, 20 * SIZE(BO)
  811. MADD c22, c22, a2, b2
  812. LD b2, 9 * SIZE(BO)
  813. MADD c32, c32, a2, b3
  814. LD b3, 10 * SIZE(BO)
  815. MADD c42, c42, a2, b4
  816. LD b4, 11 * SIZE(BO)
  817. MADD c11, c11, a3, b6
  818. LD a2, 5 * SIZE(AO)
  819. MADD c21, c21, a3, b2
  820. NOP
  821. MADD c31, c31, a3, b3
  822. NOP
  823. MADD c41, c41, a3, b4
  824. LD a3, 6 * SIZE(AO)
  825. MADD c12, c12, a2, b6
  826. LD b6, 24 * SIZE(BO)
  827. MADD c22, c22, a2, b2
  828. LD b2, 13 * SIZE(BO)
  829. MADD c32, c32, a2, b3
  830. LD b3, 14 * SIZE(BO)
  831. MADD c42, c42, a2, b4
  832. LD b4, 15 * SIZE(BO)
  833. MADD c11, c11, a3, b7
  834. LD a2, 7 * SIZE(AO)
  835. MADD c21, c21, a3, b2
  836. daddiu AO, AO, 8 * SIZE
  837. MADD c31, c31, a3, b3
  838. daddiu BO, BO, 16 * SIZE
  839. MADD c41, c41, a3, b4
  840. LD a3, 4 * SIZE(AO)
  841. MADD c12, c12, a2, b7
  842. LD b7, 12 * SIZE(BO)
  843. MADD c22, c22, a2, b2
  844. LD b2, 1 * SIZE(BO)
  845. MADD c32, c32, a2, b3
  846. LD b3, 2 * SIZE(BO)
  847. MADD c42, c42, a2, b4
  848. NOP
  849. bgtz L, .L32
  850. LD b4, 3 * SIZE(BO)
  851. .align 3
  852. .L35:
  853. andi L, K, 3
  854. NOP
  855. blez L, .L38
  856. NOP
  857. .align 3
  858. .L36:
  859. MADD c11, c11, a1, b1
  860. LD a2, 1 * SIZE(AO)
  861. MADD c21, c21, a1, b2
  862. daddiu L, L, -1
  863. MADD c31, c31, a1, b3
  864. daddiu AO, AO, 2 * SIZE
  865. MADD c41, c41, a1, b4
  866. LD a1, 0 * SIZE(AO)
  867. MADD c12, c12, a2, b1
  868. LD b1, 4 * SIZE(BO)
  869. MADD c22, c22, a2, b2
  870. LD b2, 5 * SIZE(BO)
  871. MADD c32, c32, a2, b3
  872. LD b3, 6 * SIZE(BO)
  873. MADD c42, c42, a2, b4
  874. LD b4, 7 * SIZE(BO)
  875. bgtz L, .L36
  876. daddiu BO, BO, 4 * SIZE
  877. .L38:
  878. LD $f0, 0 * SIZE(CO1)
  879. LD $f1, 1 * SIZE(CO1)
  880. LD $f2, 2 * SIZE(CO1)
  881. LD $f3, 3 * SIZE(CO1)
  882. LD $f4, 0 * SIZE(CO2)
  883. LD $f5, 1 * SIZE(CO2)
  884. LD $f6, 2 * SIZE(CO2)
  885. LD $f7, 3 * SIZE(CO2)
  886. MADD $f0, $f0, ALPHA_R, c11
  887. MADD $f1, $f1, ALPHA_I, c11
  888. MADD $f2, $f2, ALPHA_R, c12
  889. MADD $f3, $f3, ALPHA_I, c12
  890. MADD $f4, $f4, ALPHA_R, c21
  891. ST $f0, 0 * SIZE(CO1)
  892. MADD $f5, $f5, ALPHA_I, c21
  893. ST $f1, 1 * SIZE(CO1)
  894. MADD $f6, $f6, ALPHA_R, c22
  895. ST $f2, 2 * SIZE(CO1)
  896. MADD $f7, $f7, ALPHA_I, c22
  897. ST $f3, 3 * SIZE(CO1)
  898. LD $f0, 0 * SIZE(CO3)
  899. LD $f1, 1 * SIZE(CO3)
  900. LD $f2, 2 * SIZE(CO3)
  901. LD $f3, 3 * SIZE(CO3)
  902. ST $f4, 0 * SIZE(CO2)
  903. MADD $f0, $f0, ALPHA_R, c31
  904. ST $f5, 1 * SIZE(CO2)
  905. MADD $f1, $f1, ALPHA_I, c31
  906. ST $f6, 2 * SIZE(CO2)
  907. MADD $f2, $f2, ALPHA_R, c32
  908. ST $f7, 3 * SIZE(CO2)
  909. MADD $f3, $f3, ALPHA_I, c32
  910. LD $f4, 0 * SIZE(CO4)
  911. LD $f5, 1 * SIZE(CO4)
  912. LD $f6, 2 * SIZE(CO4)
  913. LD $f7, 3 * SIZE(CO4)
  914. MADD $f4, $f4, ALPHA_R, c41
  915. daddiu CO1,CO1, 4 * SIZE
  916. MADD $f5, $f5, ALPHA_I, c41
  917. daddiu CO2,CO2, 4 * SIZE
  918. MADD $f6, $f6, ALPHA_R, c42
  919. daddiu CO3,CO3, 4 * SIZE
  920. MADD $f7, $f7, ALPHA_I, c42
  921. daddiu CO4,CO4, 4 * SIZE
  922. ST $f0, -4 * SIZE(CO3)
  923. daddiu I, I, -1
  924. ST $f1, -3 * SIZE(CO3)
  925. ST $f2, -2 * SIZE(CO3)
  926. ST $f3, -1 * SIZE(CO3)
  927. ST $f4, -4 * SIZE(CO4)
  928. MTC $0, c11
  929. ST $f5, -3 * SIZE(CO4)
  930. MOV c21, c11
  931. ST $f6, -2 * SIZE(CO4)
  932. MOV c31, c11
  933. ST $f7, -1 * SIZE(CO4)
  934. bgtz I, .L31
  935. MOV c41, c11
  936. .align 3
  /*
   * .L40 - odd-row remainder of the current four-column panel.
   *
   * Runs only when bit 0 of M is set.  Accumulates four sums over K
   * (c11, c21, c31, c41 - one per output column CO1..CO4) from a single
   * stream of AO values and four BO values per k-step, then adds each sum,
   * scaled by ALPHA_R and by ALPHA_I, to elements 0 and 1 at CO1..CO4.
   *
   * NOTE(review): the register aliases (c11, a1, b1, CO1, ...) and the
   * MADD/LD/ST/MOV macros are defined outside this chunk.  The comments
   * below assume "MADD d, s, x, y" computes d = s + x * y - confirm.
   * NOTE(review): the instruction following each branch looks like a MIPS
   * branch delay slot (explicit NOPs are used elsewhere for that purpose);
   * confirm that .set noreorder is in effect.
   * NOTE(review): c11/c21/c31/c41 are expected to be cleared on entry; the
   * code just above this label does so, other entry paths are not visible.
   */
  937. .L40:
  938. andi I, M, 1
  939. blez I, .L49
  /* Slot after the branch.  c61/c71/c81 are written here but never read
     in this section. */
  940. MOV c61, c11
  /* Preload the first A values and the B values the unrolled loop expects. */
  941. LD a1, 0 * SIZE(AO)
  942. MOV c71, c11
  943. LD a2, 1 * SIZE(AO)
  944. MOV c81, c11
  945. LD b1, 0 * SIZE(B)
  946. LD b2, 1 * SIZE(B)
  947. LD b3, 2 * SIZE(B)
  948. LD b4, 3 * SIZE(B)
  949. LD b5, 4 * SIZE(B)
  950. LD b6, 8 * SIZE(B)
  951. LD b7, 12 * SIZE(B)
  /* L = K / 4 passes of the unrolled loop; skip it when that is zero. */
  952. dsra L, K, 2
  953. blez L, .L45
  954. move BO, B
  955. .align 3
  /*
   * .L42 - K loop unrolled four times.  Each pass advances AO by 4 * SIZE
   * and BO by 16 * SIZE.  Loads for later steps are interleaved with the
   * multiply-adds of the current step, so statement order matters.
   */
  956. .L42:
  /* step 0: a1 with b1..b4 */
  957. MADD c11, c11, a1, b1
  958. LD b1, 16 * SIZE(BO)
  959. MADD c21, c21, a1, b2
  960. LD b2, 5 * SIZE(BO)
  961. MADD c31, c31, a1, b3
  962. LD b3, 6 * SIZE(BO)
  963. MADD c41, c41, a1, b4
  964. LD b4, 7 * SIZE(BO)
  965. LD a1, 4 * SIZE(AO)
  966. daddiu L, L, -1
  /* step 1: a2 with b5 and the freshly loaded b2..b4 */
  967. MADD c11, c11, a2, b5
  968. LD b5, 20 * SIZE(BO)
  969. MADD c21, c21, a2, b2
  970. LD b2, 9 * SIZE(BO)
  971. MADD c31, c31, a2, b3
  972. LD b3, 10 * SIZE(BO)
  973. MADD c41, c41, a2, b4
  974. LD b4, 11 * SIZE(BO)
  975. LD a2, 2 * SIZE(AO)
  976. daddiu AO, AO, 4 * SIZE
  /* step 2: a2 (reloaded) with b6 and b2..b4 */
  977. MADD c11, c11, a2, b6
  978. LD b6, 24 * SIZE(BO)
  979. MADD c21, c21, a2, b2
  980. LD b2, 13 * SIZE(BO)
  981. MADD c31, c31, a2, b3
  982. LD b3, 14 * SIZE(BO)
  983. MADD c41, c41, a2, b4
  984. LD b4, 15 * SIZE(BO)
  /* AO has already been advanced, hence the negative offset. */
  985. LD a2, -1 * SIZE(AO)
  986. daddiu BO, BO, 16 * SIZE
  /* step 3: a2 (reloaded) with b7 and b2..b4; BO already points at the
     next group, so the loads below fetch values for the next pass. */
  987. MADD c11, c11, a2, b7
  988. LD b7, 12 * SIZE(BO)
  989. MADD c21, c21, a2, b2
  990. LD b2, 1 * SIZE(BO)
  991. MADD c31, c31, a2, b3
  992. LD b3, 2 * SIZE(BO)
  993. MADD c41, c41, a2, b4
  994. LD b4, 3 * SIZE(BO)
  995. bgtz L, .L42
  996. LD a2, 1 * SIZE(AO)
  997. .align 3
  /* .L45 - handle the K & 3 leftover steps, if any. */
  998. .L45:
  999. andi L, K, 3
  1000. NOP
  1001. blez L, .L48
  1002. NOP
  1003. .align 3
  /* .L46 - one k-step per pass: AO advances by 1 * SIZE, BO by 4 * SIZE. */
  1004. .L46:
  1005. MADD c11, c11, a1, b1
  1006. LD b1, 4 * SIZE(BO)
  1007. MADD c21, c21, a1, b2
  1008. LD b2, 5 * SIZE(BO)
  1009. MADD c31, c31, a1, b3
  1010. LD b3, 6 * SIZE(BO)
  1011. MADD c41, c41, a1, b4
  1012. LD a1, 1 * SIZE(AO)
  1013. LD b4, 7 * SIZE(BO)
  1014. daddiu L, L, -1
  1015. daddiu AO, AO, 1 * SIZE
  /* Self-move of a2; it does not feed any result here (appears to be
     scheduling filler - confirm). */
  1016. MOV a2, a2
  1017. bgtz L, .L46
  1018. daddiu BO, BO, 4 * SIZE
  /*
   * .L48 - write-back.  For each column n: element 0 at COn gets
   * ALPHA_R * sum added, element 1 gets ALPHA_I * sum added.  Loads,
   * multiply-adds and stores are interleaved.
   */
  1019. .L48:
  1020. LD $f0, 0 * SIZE(CO1)
  1021. LD $f1, 1 * SIZE(CO1)
  1022. LD $f2, 0 * SIZE(CO2)
  1023. LD $f3, 1 * SIZE(CO2)
  1024. LD $f4, 0 * SIZE(CO3)
  1025. MADD $f0, $f0, ALPHA_R, c11
  1026. LD $f5, 1 * SIZE(CO3)
  1027. MADD $f1, $f1, ALPHA_I, c11
  1028. LD $f6, 0 * SIZE(CO4)
  1029. MADD $f2, $f2, ALPHA_R, c21
  1030. LD $f7, 1 * SIZE(CO4)
  1031. MADD $f3, $f3, ALPHA_I, c21
  1032. MADD $f4, $f4, ALPHA_R, c31
  1033. ST $f0, 0 * SIZE(CO1)
  1034. MADD $f5, $f5, ALPHA_I, c31
  1035. ST $f1, 1 * SIZE(CO1)
  1036. MADD $f6, $f6, ALPHA_R, c41
  1037. ST $f2, 0 * SIZE(CO2)
  1038. MADD $f7, $f7, ALPHA_I, c41
  1039. ST $f3, 1 * SIZE(CO2)
  1040. ST $f4, 0 * SIZE(CO3)
  1041. ST $f5, 1 * SIZE(CO3)
  1042. ST $f6, 0 * SIZE(CO4)
  1043. ST $f7, 1 * SIZE(CO4)
  1044. .align 3
  /* .L49 - copy BO into B (presumably so the next panel starts where this
     one stopped reading - confirm). */
  1045. .L49:
  1046. move B, BO
  1047. .align 3
  /*
   * .L50 - two-column panel, executed when bit 1 of N is set; otherwise
   * control goes to .L70.
   *
   * Sets CO1 = C and CO2 = C + LDC, then walks the rows two at a time
   * (I = M >> 1 tiles).  Each tile accumulates four sums over K:
   *   c11 -> CO1 elements 0,1     c12 -> CO1 elements 2,3
   *   c21 -> CO2 elements 0,1     c22 -> CO2 elements 2,3
   * and adds each one scaled by ALPHA_R / ALPHA_I to the pair shown.
   *
   * NOTE(review): register aliases and the MADD/LD/ST/MOV/MTC macros are
   * defined outside this chunk; comments assume "MADD d, s, x, y" computes
   * d = s + x * y, and that the instruction after each branch is a branch
   * delay slot - confirm.
   */
  1048. .L50:
  1049. andi J, N, 2
  1050. blez J, .L70
  /* Slot after the branch: restart the A pointer for this panel. */
  1051. move AO, A
  1052. move CO1, C
  1053. daddu CO2, C, LDC
  1054. dsra I, M, 1
  1055. blez I, .L60
  /* Slot after the branch: C now points past the two columns of this panel. */
  1056. daddu C, CO2, LDC
  /* .L51 - start of one 2x2 tile: clear the accumulators and preload.
     b6 and b7 are loaded here but not read anywhere in this section. */
  1057. .L51:
  1058. LD a1, 0 * SIZE(AO)
  1059. MTC $0, c11
  1060. LD a2, 1 * SIZE(AO)
  1061. MOV c21, c11
  1062. LD a5, 4 * SIZE(AO)
  1063. LD b1, 0 * SIZE(B)
  1064. MOV c12, c11
  1065. LD b2, 1 * SIZE(B)
  1066. MOV c22, c11
  1067. LD b3, 2 * SIZE(B)
  1068. LD b5, 4 * SIZE(B)
  /* L = K / 4 passes of the unrolled loop. */
  1069. dsra L, K, 2
  1070. LD b6, 8 * SIZE(B)
  1071. LD b7, 12 * SIZE(B)
  1072. blez L, .L55
  /* Slot after the branch: every tile re-reads the B panel from its start. */
  1073. move BO, B
  1074. .align 3
  /*
   * .L52 - K loop unrolled four times.  Each pass advances both AO and BO
   * by 8 * SIZE.  Loads for later steps are interleaved with the
   * multiply-adds, so statement order matters.
   */
  1075. .L52:
  /* step 0: a1,a2 with b1,b2 */
  1076. MADD c11, c11, a1, b1
  1077. LD a3, 2 * SIZE(AO)
  1078. MADD c21, c21, a1, b2
  1079. LD b4, 3 * SIZE(BO)
  1080. MADD c12, c12, a2, b1
  1081. LD a4, 3 * SIZE(AO)
  1082. MADD c22, c22, a2, b2
  1083. LD b1, 8 * SIZE(BO)
  /* step 1: a3,a4 with b3,b4 */
  1084. MADD c11, c11, a3, b3
  1085. LD a1, 8 * SIZE(AO)
  1086. MADD c21, c21, a3, b4
  1087. LD b2, 5 * SIZE(BO)
  1088. MADD c12, c12, a4, b3
  1089. LD a2, 5 * SIZE(AO)
  1090. MADD c22, c22, a4, b4
  1091. LD b3, 6 * SIZE(BO)
  /* step 2: a5,a2 with b5,b2 */
  1092. MADD c11, c11, a5, b5
  1093. LD a3, 6 * SIZE(AO)
  1094. MADD c21, c21, a5, b2
  1095. LD b4, 7 * SIZE(BO)
  1096. MADD c12, c12, a2, b5
  1097. LD a4, 7 * SIZE(AO)
  1098. MADD c22, c22, a2, b2
  1099. LD b5, 12 * SIZE(BO)
  /* step 3: a3,a4 with b3,b4; the loads below fetch values for the next pass */
  1100. MADD c11, c11, a3, b3
  1101. LD a5, 12 * SIZE(AO)
  1102. MADD c21, c21, a3, b4
  1103. LD b2, 9 * SIZE(BO)
  1104. MADD c12, c12, a4, b3
  1105. LD a2, 9 * SIZE(AO)
  1106. MADD c22, c22, a4, b4
  1107. LD b3, 10 * SIZE(BO)
  1108. daddiu AO, AO, 8 * SIZE
  1109. daddiu L, L, -1
  1110. bgtz L, .L52
  1111. daddiu BO, BO, 8 * SIZE
  1112. .align 3
  /* .L55 - handle the K & 3 leftover steps, if any. */
  1113. .L55:
  1114. andi L, K, 3
  1115. NOP
  1116. blez L, .L58
  1117. NOP
  1118. .align 3
  /* .L56 - one k-step per pass: AO and BO each advance by 2 * SIZE. */
  1119. .L56:
  1120. MADD c11, c11, a1, b1
  1121. LD a2, 1 * SIZE(AO)
  1122. MADD c21, c21, a1, b2
  1123. LD a1, 2 * SIZE(AO)
  1124. MADD c12, c12, a2, b1
  1125. LD b1, 2 * SIZE(BO)
  1126. MADD c22, c22, a2, b2
  1127. LD b2, 3 * SIZE(BO)
  1128. daddiu L, L, -1
  1129. daddiu AO, AO, 2 * SIZE
  1130. bgtz L, .L56
  1131. daddiu BO, BO, 2 * SIZE
  /*
   * .L58 - write-back of the 2x2 tile.  CO1/CO2 are advanced by 4 * SIZE
   * between the loads and the stores, which is why the stores use
   * negative offsets.
   */
  1132. .L58:
  1133. LD $f0, 0 * SIZE(CO1)
  1134. LD $f1, 1 * SIZE(CO1)
  1135. LD $f2, 2 * SIZE(CO1)
  1136. LD $f3, 3 * SIZE(CO1)
  1137. LD $f4, 0 * SIZE(CO2)
  1138. LD $f5, 1 * SIZE(CO2)
  1139. LD $f6, 2 * SIZE(CO2)
  1140. LD $f7, 3 * SIZE(CO2)
  1141. MADD $f0, $f0, ALPHA_R, c11
  1142. daddiu I, I, -1
  1143. MADD $f1, $f1, ALPHA_I, c11
  1144. daddiu CO1,CO1, 4 * SIZE
  1145. MADD $f2, $f2, ALPHA_R, c12
  1146. daddiu CO2,CO2, 4 * SIZE
  1147. MADD $f3, $f3, ALPHA_I, c12
  1148. MADD $f4, $f4, ALPHA_R, c21
  1149. MADD $f5, $f5, ALPHA_I, c21
  1150. MADD $f6, $f6, ALPHA_R, c22
  1151. MADD $f7, $f7, ALPHA_I, c22
  1152. ST $f0, -4 * SIZE(CO1)
  1153. ST $f1, -3 * SIZE(CO1)
  1154. ST $f2, -2 * SIZE(CO1)
  1155. ST $f3, -1 * SIZE(CO1)
  1156. ST $f4, -4 * SIZE(CO2)
  1157. ST $f5, -3 * SIZE(CO2)
  1158. ST $f6, -2 * SIZE(CO2)
  /* Next tile while I > 0; the final store sits in the slot after the branch. */
  1159. bgtz I, .L51
  1160. ST $f7, -1 * SIZE(CO2)
  1161. .align 3
  /*
   * .L60 - odd-row remainder of the two-column panel (bit 0 of M set).
   *
   * Four accumulators are used so consecutive k-steps go to different
   * registers: c11/c21 collect the steps multiplied by a1 and a3, c31/c41
   * the steps multiplied by a2 and a4.  They are folded together at .L68
   * (c11 += c31, c21 += c41) before being scaled by ALPHA_R / ALPHA_I and
   * added to elements 0 and 1 at CO1 and CO2.
   *
   * NOTE(review): register aliases and the MADD/ADD/LD/ST/MOV/MTC macros
   * are defined outside this chunk; comments assume "MADD d, s, x, y"
   * computes d = s + x * y, and that the instruction after each branch is
   * a branch delay slot - confirm.
   */
  1162. .L60:
  1163. andi I, M, 1
  1164. blez I, .L69
  1165. NOP
  /* L = K / 4 passes of the unrolled loop. */
  1166. dsra L, K, 2
  /* Clear the accumulators and preload.  b5, b6 and b7 are loaded here but
     not read anywhere in this section. */
  1167. LD a1, 0 * SIZE(AO)
  1168. MTC $0, c11
  1169. LD a2, 1 * SIZE(AO)
  1170. MOV c21, c11
  1171. LD a3, 2 * SIZE(AO)
  1172. MOV c31, c11
  1173. LD a4, 3 * SIZE(AO)
  1174. MOV c41, c11
  1175. LD b1, 0 * SIZE(B)
  1176. LD b2, 1 * SIZE(B)
  1177. LD b3, 2 * SIZE(B)
  1178. LD b4, 3 * SIZE(B)
  1179. LD b5, 4 * SIZE(B)
  1180. LD b6, 8 * SIZE(B)
  1181. LD b7, 12 * SIZE(B)
  1182. blez L, .L65
  1183. move BO, B
  1184. .align 3
  /*
   * .L62 - K loop unrolled four times: AO advances by 4 * SIZE and BO by
   * 8 * SIZE per pass.
   */
  1185. .L62:
  /* steps 0 and 1: a1 with b1,b2 and a2 with b3,b4 */
  1186. MADD c11, c11, a1, b1
  1187. LD b1, 4 * SIZE(BO)
  1188. MADD c21, c21, a1, b2
  1189. LD b2, 5 * SIZE(BO)
  1190. MADD c31, c31, a2, b3
  1191. LD b3, 6 * SIZE(BO)
  1192. MADD c41, c41, a2, b4
  1193. LD b4, 7 * SIZE(BO)
  1194. LD a1, 4 * SIZE(AO)
  1195. LD a2, 5 * SIZE(AO)
  /* steps 2 and 3: a3 with b1,b2 and a4 with b3,b4 (just reloaded) */
  1196. MADD c11, c11, a3, b1
  1197. LD b1, 8 * SIZE(BO)
  1198. MADD c21, c21, a3, b2
  1199. LD b2, 9 * SIZE(BO)
  1200. MADD c31, c31, a4, b3
  1201. LD b3, 10 * SIZE(BO)
  1202. MADD c41, c41, a4, b4
  1203. LD b4, 11 * SIZE(BO)
  1204. LD a3, 6 * SIZE(AO)
  1205. LD a4, 7 * SIZE(AO)
  1206. daddiu L, L, -1
  1207. daddiu AO, AO, 4 * SIZE
  1208. bgtz L, .L62
  1209. daddiu BO, BO, 8 * SIZE
  1210. .align 3
  /* .L65 - handle the K & 3 leftover steps, if any. */
  1211. .L65:
  1212. andi L, K, 3
  1213. NOP
  1214. blez L, .L68
  1215. NOP
  1216. .align 3
  /* .L66 - one k-step per pass: AO advances by 1 * SIZE, BO by 2 * SIZE. */
  1217. .L66:
  1218. MADD c11, c11, a1, b1
  1219. LD b1, 2 * SIZE(BO)
  1220. MADD c21, c21, a1, b2
  1221. LD b2, 3 * SIZE(BO)
  1222. LD a1, 1 * SIZE(AO)
  1223. daddiu L, L, -1
  1224. daddiu AO, AO, 1 * SIZE
  1225. bgtz L, .L66
  1226. daddiu BO, BO, 2 * SIZE
  /* .L68 - fold the split accumulators, then write back to CO1 and CO2. */
  1227. .L68:
  1228. LD $f0, 0 * SIZE(CO1)
  1229. LD $f1, 1 * SIZE(CO1)
  1230. LD $f2, 0 * SIZE(CO2)
  1231. LD $f3, 1 * SIZE(CO2)
  1232. ADD c11, c11, c31
  1233. ADD c21, c21, c41
  1234. MADD $f0, $f0, ALPHA_R, c11
  1235. MADD $f1, $f1, ALPHA_I, c11
  1236. MADD $f2, $f2, ALPHA_R, c21
  1237. MADD $f3, $f3, ALPHA_I, c21
  1238. ST $f0, 0 * SIZE(CO1)
  1239. ST $f1, 1 * SIZE(CO1)
  1240. ST $f2, 0 * SIZE(CO2)
  1241. ST $f3, 1 * SIZE(CO2)
  1242. .align 3
  /* .L69 - copy BO into B (presumably so the next panel starts where this
     one stopped reading - confirm). */
  1243. .L69:
  1244. move B, BO
  1245. .align 3
  /*
   * .L70 - single-column panel, executed when bit 0 of N is set; otherwise
   * control goes straight to the epilogue at .L999.
   *
   * Sets CO1 = C and walks the rows two at a time (I = M >> 1).  Each pass
   * accumulates c11 and c12 over K and adds them, scaled by ALPHA_R and
   * ALPHA_I, to elements 0,1 and 2,3 at CO1.
   *
   * NOTE(review): register aliases and the MADD/ADD/LD/ST/MOV/MTC macros
   * are defined outside this chunk; comments assume "MADD d, s, x, y"
   * computes d = s + x * y, and that the instruction after each branch is
   * a branch delay slot - confirm.
   */
  1246. .L70:
  1247. andi J, N, 1
  1248. blez J, .L999
  /* Slot after the branch: restart the A pointer for this panel. */
  1249. move AO, A
  1250. move CO1, C
  1251. dsra I, M, 1
  1252. blez I, .L80
  /* Slot after the branch: C now points past this column. */
  1253. daddu C, CO1, LDC
  /*
   * .L71 - start of one two-row pass.  The accumulators are cleared here.
   * The loops below reload a1, a2 and b1 from memory before using them, so
   * the values preloaded in this block (a1, a2, a5, b1..b7) are not
   * consumed by this section.
   */
  1254. .L71:
  1255. LD a1, 0 * SIZE(AO)
  1256. MTC $0, c11
  1257. LD a2, 1 * SIZE(AO)
  1258. MOV c21, c11
  1259. LD a5, 4 * SIZE(AO)
  1260. LD b1, 0 * SIZE(B)
  1261. MOV c12, c11
  1262. LD b2, 1 * SIZE(B)
  1263. MOV c22, c11
  1264. LD b3, 2 * SIZE(B)
  1265. LD b5, 4 * SIZE(B)
  /* L = K / 4 passes of the unrolled loop. */
  1266. dsra L, K, 2
  1267. LD b6, 8 * SIZE(B)
  1268. LD b7, 12 * SIZE(B)
  1269. blez L, .L75
  1270. move BO, B
  1271. .align 3
  /*
   * .L72 - K loop unrolled four times, written as plain load/multiply-add
   * groups: AO advances by 8 * SIZE and BO by 4 * SIZE per pass.
   */
  1272. .L72:
  1273. LD a1, 0 * SIZE(AO)
  1274. LD a2, 1 * SIZE(AO)
  1275. LD b1, 0 * SIZE(BO)
  1276. MADD c11, c11, a1, b1
  1277. MADD c12, c12, a2, b1
  1278. LD a1, 2 * SIZE(AO)
  1279. LD a2, 3 * SIZE(AO)
  1280. LD b1, 1 * SIZE(BO)
  1281. MADD c11, c11, a1, b1
  1282. MADD c12, c12, a2, b1
  1283. LD a1, 4 * SIZE(AO)
  1284. LD a2, 5 * SIZE(AO)
  1285. LD b1, 2 * SIZE(BO)
  1286. MADD c11, c11, a1, b1
  1287. MADD c12, c12, a2, b1
  1288. LD a1, 6 * SIZE(AO)
  1289. LD a2, 7 * SIZE(AO)
  1290. LD b1, 3 * SIZE(BO)
  1291. MADD c11, c11, a1, b1
  1292. MADD c12, c12, a2, b1
  1293. daddiu L, L, -1
  1294. daddiu AO, AO, 8 * SIZE
  1295. bgtz L, .L72
  1296. daddiu BO, BO, 4 * SIZE
  1297. .align 3
  /* .L75 - handle the K & 3 leftover steps, if any. */
  1298. .L75:
  1299. andi L, K, 3
  1300. NOP
  1301. blez L, .L78
  1302. NOP
  1303. .align 3
  /* .L76 - one k-step per pass: AO advances by 2 * SIZE, BO by 1 * SIZE. */
  1304. .L76:
  1305. LD a1, 0 * SIZE(AO)
  1306. LD a2, 1 * SIZE(AO)
  1307. LD b1, 0 * SIZE(BO)
  1308. MADD c11, c11, a1, b1
  1309. MADD c12, c12, a2, b1
  1310. daddiu L, L, -1
  1311. daddiu AO, AO, 2 * SIZE
  1312. bgtz L, .L76
  1313. daddiu BO, BO, 1 * SIZE
  /*
   * .L78 - write-back.  c21 and c22 are cleared at .L71 and not written
   * again in this section, so the two ADDs fold in the cleared values.
   * CO1 is advanced by 4 * SIZE before the stores, hence the negative
   * offsets.
   */
  1314. .L78:
  1315. LD $f0, 0 * SIZE(CO1)
  1316. LD $f1, 1 * SIZE(CO1)
  1317. LD $f2, 2 * SIZE(CO1)
  1318. LD $f3, 3 * SIZE(CO1)
  1319. ADD c11, c11, c21
  1320. daddiu I, I, -1
  1321. ADD c12, c12, c22
  1322. daddiu CO1,CO1, 4 * SIZE
  1323. MADD $f0, $f0, ALPHA_R, c11
  1324. MADD $f1, $f1, ALPHA_I, c11
  1325. MADD $f2, $f2, ALPHA_R, c12
  1326. MADD $f3, $f3, ALPHA_I, c12
  1327. ST $f0, -4 * SIZE(CO1)
  1328. ST $f1, -3 * SIZE(CO1)
  1329. ST $f2, -2 * SIZE(CO1)
  /* Next pass while I > 0; the final store sits in the slot after the branch. */
  1330. bgtz I, .L71
  1331. ST $f3, -1 * SIZE(CO1)
  1332. .align 3
  /*
   * .L80 - odd-row remainder of the single-column panel (bit 0 of M set):
   * a single sum over K of AO[k] * BO[k].
   *
   * The unrolled loop alternates between two accumulators (c11 and c21),
   * which are folded together at .L88 before the result is scaled by
   * ALPHA_R / ALPHA_I and added to elements 0 and 1 at CO1.
   *
   * NOTE(review): register aliases and the MADD/ADD/LD/ST/MOV/MTC macros
   * are defined outside this chunk; comments assume "MADD d, s, x, y"
   * computes d = s + x * y, and that the instruction after each branch is
   * a branch delay slot - confirm.
   */
  1333. .L80:
  1334. andi I, M, 1
  1335. blez I, .L89
  1336. NOP
  /* Clear the accumulators.  The loops below reload a1 and b1 before each
     use, so the values preloaded here are not consumed by this section. */
  1337. LD a1, 0 * SIZE(AO)
  1338. MTC $0, c11
  1339. LD a2, 1 * SIZE(AO)
  1340. MOV c21, c11
  1341. LD a3, 2 * SIZE(AO)
  1342. LD a4, 3 * SIZE(AO)
  1343. LD b1, 0 * SIZE(B)
  1344. LD b2, 1 * SIZE(B)
  1345. LD b3, 2 * SIZE(B)
  1346. LD b4, 3 * SIZE(B)
  1347. LD b5, 4 * SIZE(B)
  1348. LD b6, 8 * SIZE(B)
  1349. LD b7, 12 * SIZE(B)
  /* L = K / 4 passes of the unrolled loop. */
  1350. dsra L, K, 2
  1351. blez L, .L85
  1352. move BO, B
  1353. .align 3
  /* .L82 - K loop unrolled four times: AO and BO each advance by 4 * SIZE
     per pass; even steps go to c11, odd steps to c21. */
  1354. .L82:
  1355. LD a1, 0 * SIZE(AO)
  1356. LD b1, 0 * SIZE(BO)
  1357. MADD c11, c11, a1, b1
  1358. LD a1, 1 * SIZE(AO)
  1359. LD b1, 1 * SIZE(BO)
  1360. MADD c21, c21, a1, b1
  1361. LD a1, 2 * SIZE(AO)
  1362. LD b1, 2 * SIZE(BO)
  1363. MADD c11, c11, a1, b1
  1364. LD a1, 3 * SIZE(AO)
  1365. LD b1, 3 * SIZE(BO)
  1366. MADD c21, c21, a1, b1
  1367. daddiu L, L, -1
  1368. daddiu AO, AO, 4 * SIZE
  1369. bgtz L, .L82
  1370. daddiu BO, BO, 4 * SIZE
  1371. .align 3
  /* .L85 - handle the K & 3 leftover steps, if any. */
  1372. .L85:
  1373. andi L, K, 3
  1374. NOP
  1375. blez L, .L88
  1376. NOP
  1377. .align 3
  /* .L86 - one k-step per pass: AO and BO each advance by 1 * SIZE. */
  1378. .L86:
  1379. LD a1, 0 * SIZE(AO)
  1380. LD b1, 0 * SIZE(BO)
  1381. MADD c11, c11, a1, b1
  1382. daddiu L, L, -1
  1383. daddiu AO, AO, 1 * SIZE
  1384. bgtz L, .L86
  1385. daddiu BO, BO, 1 * SIZE
  /* .L88 - fold c21 into c11, then write back to CO1. */
  1386. .L88:
  1387. LD $f0, 0 * SIZE(CO1)
  1388. LD $f1, 1 * SIZE(CO1)
  1389. ADD c11, c11, c21
  1390. MADD $f0, $f0, ALPHA_R, c11
  1391. MADD $f1, $f1, ALPHA_I, c11
  1392. ST $f0, 0 * SIZE(CO1)
  1393. ST $f1, 1 * SIZE(CO1)
  1394. .align 3
  /* .L89 - copy BO into B before falling through to the epilogue. */
  1395. .L89:
  1396. move B, BO
  1397. .align 3
  /*
   * .L999 - common exit: reload the saved registers from the stack frame
   * and return.
   *
   * Integer registers $16..$21 come from offsets 0..40 and floating-point
   * registers $f24..$f29 from offsets 48..88.  The matching stores are in
   * the prologue, which is outside this chunk.
   * NOTE(review): LDARG is a macro defined elsewhere; the 8-byte spacing
   * suggests a 64-bit load - confirm.
   */
  1398. .L999:
  1399. LDARG $16, 0($sp)
  1400. LDARG $17, 8($sp)
  1401. LDARG $18, 16($sp)
  1402. LDARG $19, 24($sp)
  1403. LDARG $20, 32($sp)
  1404. LDARG $21, 40($sp)
  1405. ldc1 $f24, 48($sp)
  1406. ldc1 $f25, 56($sp)
  1407. ldc1 $f26, 64($sp)
  1408. ldc1 $f27, 72($sp)
  1409. ldc1 $f28, 80($sp)
  1410. ldc1 $f29, 88($sp)
  /* Return through $31; the 128-byte frame is released by the instruction
     that follows the jump (its delay slot, if .set noreorder is in effect). */
  1411. j $31
  1412. daddiu $sp, $sp, 128
  1413. EPILOGUE