You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_LN_loongson3a.S 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938
  # Symbol name and assembler-mode switch, defined before common.h is
  # pulled in (presumably consumed by common.h; that header is not visible
  # here -- TODO confirm).
  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h"
  # Integer registers carrying the sizes and base pointers used by the loop
  # setup below: the kernel iterates N in groups of 4 then 2 columns, M in
  # groups of 1, 2 and 4 rows, and K-KK steps per accumulation.
  4. #define M $4
  5. #define N $5
  6. #define K $6
  # Base pointers: packed panel of A, packed panel of B, output matrix C.
  7. #define A $8
  8. #define B $9
  9. #define C $10
  # Column stride of C; scaled to bytes with BASE_SHIFT at function entry.
  10. #define LDC $11
  # Running read pointers into the current A panel and B panel.
  11. #define AO $12
  12. #define BO $13
  # Loop counters: I counts row blocks, J counts column blocks, L counts
  # k steps (L is also reused as a scratch byte offset).
  13. #define I $2
  14. #define J $3
  15. #define L $7
  # Write pointers for up to four consecutive columns of C (each is the
  # previous one plus LDC).
  16. #define CO1 $14
  17. #define CO2 $15
  18. #define CO3 $16
  19. #define CO4 $17
  # OFFSET is loaded from the stack at entry; KK starts as M + OFFSET and is
  # decreased after every solved row block.
  20. #define OFFSET $22
  21. #define KK $23
  # General scratch register.
  22. #define TEMP $24
  # Start address of the A panel currently being processed.
  23. #define AORIG $25
  # Values loaded from the A panel through AO (two sets so that loads for
  # the next k step can overlap the multiply-adds of the current one).
  24. #define a1 $f0
  25. #define a2 $f1
  26. #define a3 $f2
  27. #define a4 $f3
  28. #define a5 $f4
  29. #define a6 $f5
  30. #define a7 $f6
  31. #define a8 $f7
  # Values loaded from the B panel through BO; the same registers are reused
  # as temporaries for A entries during the triangular solve.
  32. #define b1 $f8
  33. #define b2 $f9
  34. #define b3 $f10
  35. #define b4 $f11
  36. #define b5 $f12
  37. #define b6 $f13
  38. #define b7 $f14
  39. #define b8 $f15
  # Accumulators tRC: R is the index of the A value (row within the block),
  # C is the index of the B value (column within the block), e.g. t21
  # accumulates a2 * b1.
  40. #define t11 $f16
  41. #define t21 $f17
  42. #define t31 $f18
  43. #define t41 $f19
  44. #define t12 $f20
  45. #define t22 $f21
  46. #define t32 $f22
  47. #define t42 $f23
  48. #define t13 $f24
  49. #define t23 $f25
  50. #define t33 $f26
  51. #define t43 $f27
  52. #define t14 $f28
  53. #define t24 $f29
  54. #define t34 $f30
  55. #define t44 $f31
  # NOTE(review): ALPHA names the same register as b8 ($f15) and is not
  # referenced in the part of the kernel visible in this listing.
  56. #define ALPHA $f15
  # Function entry: reserve a 144-byte frame and spill the registers that
  # are overwritten below ($16-$25, $f24-$f28, and $f20-$f23 when
  # __64BIT__ is not defined). The matching restores are outside this view.
  57. PROLOGUE
  58. daddiu $sp, $sp, -144
  59. SDARG $16, 0($sp)
  60. SDARG $17, 8($sp)
  61. SDARG $18, 16($sp)
  62. SDARG $19, 24($sp)
  63. SDARG $20, 32($sp)
  64. SDARG $21, 40($sp)
  # NOTE(review): only $f24-$f28 are saved here, yet $f29-$f31 are written
  # later through the t24, t34 and t44 aliases. Verify against the target
  # ABI whether those three must be preserved as well.
  65. sdc1 $f24, 48($sp)
  66. sdc1 $f25, 56($sp)
  67. sdc1 $f26, 64($sp)
  68. sdc1 $f27, 72($sp)
  69. sdc1 $f28, 80($sp)
  70. SDARG $22, 88($sp)
  71. SDARG $23, 96($sp)
  72. SDARG $24, 104($sp)
  73. SDARG $25, 112($sp)
  # NOTE(review): the store of $f20 below uses offset 112($sp), the same
  # slot as the SDARG of $25 just above, so in a build without __64BIT__
  # the saved $25 appears to be overwritten. Confirm against the epilogue
  # before changing the frame layout; slots 120-136 are the only ones left
  # free in the 144-byte frame.
  74. #ifndef __64BIT__
  75. sdc1 $f20,112($sp)
  76. sdc1 $f21,120($sp)
  77. sdc1 $f22,128($sp)
  78. sdc1 $f23,136($sp)
  79. #endif
  80. # LN compute from bottom to top
  81. LDARG OFFSET, 144($sp) # read from just above the 144-byte frame (presumably a stack-passed argument)
  82. dsll LDC, LDC, BASE_SHIFT # ldc, converted from elements to bytes
  # NOTE(review): mult/mflo keep only the low half of the product; confirm
  # that M*K stays within that range for every supported problem size.
  83. mult M, K
  84. mflo TEMP # TEMP=MC*KC
  85. dsll TEMP, TEMP, BASE_SHIFT # element count to byte count
  86. daddu A, A, TEMP # A move to the end of sa
  87. dsll TEMP, M, BASE_SHIFT # byte size of M elements
  88. daddu C, C, TEMP # C+=MC
  89. dsra J, N, 2 # j = nc/4
  90. blez J, .L30 # no group of 4 columns: continue with the nr=2 case
  91. nop # fills the slot after the branch
  92. .L10: # nr=4
  93. daddiu J, J, -1
  94. move CO1, C
  95. daddu CO2, C, LDC
  96. daddu CO3, CO2, LDC
  97. daddu CO4, CO3, LDC
  98. MTC $0, t11 # clear result registers
  99. MOV t21, t11
  100. MOV t31, t11
  101. MOV t41, t11
  102. MOV t12, t11
  103. MOV t22, t11
  104. MOV t32, t11
  105. MOV t42, t11
  106. daddu KK, M, OFFSET # kc - kk is the length of the rectangular data part of panel Ai
  107. move AORIG, A # reset A
  108. daddu C, CO4, LDC # fixed pointer C, the write back address
  109. andi I, M, 1 # mr=1,nr=4: the single leftover row of M is handled first
  110. blez I, .L50 # bit 0 of M is clear: skip to the mr=2 case
  111. nop
  112. dsll TEMP, K, BASE_SHIFT # mr=1: byte size of one K-long row panel
  113. dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
  114. dsll L, KK, BASE_SHIFT # mr=1: byte offset of KK elements inside Ai
  115. dsll TEMP, KK, 2 + BASE_SHIFT # nr=4: byte offset of KK groups of 4 inside B
  116. daddu AO, AORIG, L # AO point to the rectangular data part
  117. daddu BO, B, TEMP # BO starts KK groups of 4 values into B
  118. dsubu TEMP, K, KK # TEMP = K - KK, the number of k steps to accumulate
  119. MOV t13, t11 # clear the remaining accumulators (t11 was zeroed at .L10 or after the previous block)
  120. MOV t23, t11
  121. MOV t33, t11
  122. MOV t43, t11
  123. MOV t14, t11
  124. MOV t24, t11
  125. MOV t34, t11
  126. MOV t44, t11
  127. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  128. LD b1, 0 * SIZE(BO) # get 4b
  129. LD b2, 1 * SIZE(BO)
  130. LD b3, 2 * SIZE(BO)
  131. LD b4, 3 * SIZE(BO)
  132. dsra L, TEMP, 2 # L = (K - KK) / 4 iterations of the 4x unrolled loop
  133. blez L, .L55 # fewer than 4 steps: go straight to the remainder loop
  134. nop
  135. .align 3
  136. .L52:
  137. LD a5, 1 * SIZE(AO)
  138. LD b5, 4 * SIZE(BO)
  139. LD b6, 5 * SIZE(BO)
  140. LD b7, 6 * SIZE(BO)
  141. LD b8, 7 * SIZE(BO)
  142. MADD t11, t11, a1, b1 # 1st compute
  143. MADD t12, t12, a1, b2
  144. MADD t13, t13, a1, b3
  145. MADD t14, t14, a1, b4
  146. LD a3, 2 * SIZE(AO)
  147. LD b1, 8 * SIZE(BO)
  148. LD b2, 9 * SIZE(BO)
  149. LD b3, 10 * SIZE(BO)
  150. LD b4, 11 * SIZE(BO)
  151. MADD t11, t11, a5, b5 # 2ed compute
  152. MADD t12, t12, a5, b6
  153. MADD t13, t13, a5, b7
  154. MADD t14, t14, a5, b8
  155. LD a7, 3 * SIZE(AO)
  156. LD b5, 12 * SIZE(BO)
  157. LD b6, 13 * SIZE(BO)
  158. LD b7, 14 * SIZE(BO)
  159. LD b8, 15 * SIZE(BO)
  160. MADD t11, t11, a3, b1 # 3rd compute
  161. MADD t12, t12, a3, b2
  162. MADD t13, t13, a3, b3
  163. MADD t14, t14, a3, b4
  164. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  165. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  166. LD a1, 0 * SIZE(AO) # next
  167. LD b1, 0 * SIZE(BO)
  168. LD b2, 1 * SIZE(BO)
  169. LD b3, 2 * SIZE(BO)
  170. LD b4, 3 * SIZE(BO)
  171. MADD t11, t11, a7, b5 # 4th compute
  172. MADD t12, t12, a7, b6
  173. MADD t13, t13, a7, b7
  174. MADD t14, t14, a7, b8
  175. daddiu L, L, -1
  176. bgtz L, .L52
  177. nop
  178. .align 3
  179. .L55:
  180. andi L, TEMP, 3
  181. blez L, .L58
  182. nop
  183. .align 3
  184. .L56:
  185. MADD t11, t11, a1, b1 # 3rd compute
  186. MADD t12, t12, a1, b2
  187. MADD t13, t13, a1, b3
  188. MADD t14, t14, a1, b4
  189. daddiu AO, AO, 1 * SIZE # AO += 1mr
  190. daddiu BO, BO, 4 * SIZE # BO += 4nr
  191. LD a1, 0 * SIZE(AO) # next
  192. LD b1, 0 * SIZE(BO)
  193. LD b2, 1 * SIZE(BO)
  194. LD b3, 2 * SIZE(BO)
  195. LD b4, 3 * SIZE(BO)
  196. daddiu L, L, -1
  197. bgtz L, .L56
  198. nop
  199. .L58: # deal with the triangular part
  # mr=1, nr=4: one row of the block is solved here. The accumulators
  # t11..t14 hold the sum of the rectangular part for the four columns.
  200. daddiu TEMP, KK, -1 # index of the row being solved
  201. dsll L, TEMP, BASE_SHIFT # mr=1
  202. dsll TEMP, TEMP, 2 + BASE_SHIFT # nr=4: byte offset of that row in packed B
  203. daddu AO, AORIG, L # Ao point to the triangular data part
  204. daddu BO, B, TEMP # BO points to the 4 right-hand-side values of that row
  205. LD b1, 0 * SIZE(BO)
  206. LD b2, 1 * SIZE(BO)
  207. LD b3, 2 * SIZE(BO)
  208. LD b4, 3 * SIZE(BO)
  209. SUB t11, b1, t11 # B - rectangular_part, one value per column
  210. SUB t12, b2, t12
  211. SUB t13, b3, t13
  212. SUB t14, b4, t14
  # NOTE(review): the value read from the A panel is applied with MUL, not
  # a divide, so the packed A presumably stores the reciprocal of the
  # diagonal entry -- confirm against the packing routine.
  213. LD b3, 0 * SIZE(AO)
  214. MUL t11, b3, t11
  215. MUL t12, b3, t12
  216. MUL t13, b3, t13
  217. MUL t14, b3, t14
  218. daddiu CO1, CO1, -1 * SIZE # step the four C column pointers back by one element
  219. daddiu CO2, CO2, -1 * SIZE
  220. daddiu CO3, CO3, -1 * SIZE
  221. daddiu CO4, CO4, -1 * SIZE
  222. ST t11, 0 * SIZE(BO) # write the solution back into packed B
  223. ST t12, 1 * SIZE(BO)
  224. ST t13, 2 * SIZE(BO)
  225. ST t14, 3 * SIZE(BO)
  226. ST t11, 0 * SIZE(CO1) # and into C, one element per column
  227. ST t12, 0 * SIZE(CO2)
  228. ST t13, 0 * SIZE(CO3)
  229. ST t14, 0 * SIZE(CO4)
  230. daddiu KK, KK, -1 # the length of rectangular data part increases by 1
  231. MTC $0, t11 # clear result registers
  232. MOV t21, t11
  233. MOV t31, t11
  234. MOV t41, t11
  235. MOV t12, t11
  236. MOV t22, t11
  237. MOV t32, t11
  238. MOV t42, t11
  239. .L50:
  240. andi I, M, 2 # mr=2,nr=4
  241. blez I, .L20
  242. nop
  243. dsll TEMP, K, 1 + BASE_SHIFT
  244. dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of Ai
  245. dsll L, KK, 1 + BASE_SHIFT
  246. dsll TEMP, KK, 2 + BASE_SHIFT
  247. daddu AO, AORIG, L # AO point to the rectangular data part
  248. daddu BO, B, TEMP
  249. dsubu TEMP, K, KK
  250. MOV t13, t11 # mr=2
  251. MOV t23, t11
  252. MOV t33, t11
  253. MOV t43, t11
  254. MOV t14, t11
  255. MOV t24, t11
  256. MOV t34, t11
  257. MOV t44, t11
  258. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  259. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  260. LD b1, 0 * SIZE(BO) # get 4b
  261. LD b2, 1 * SIZE(BO)
  262. LD b3, 2 * SIZE(BO)
  263. LD b4, 3 * SIZE(BO)
  264. dsra L, TEMP, 2
  265. blez L, .L25
  266. nop
  267. .align 3
  268. .L22:
  269. LD a5, 2 * SIZE(AO)
  270. LD a6, 3 * SIZE(AO)
  271. LD b5, 4 * SIZE(BO)
  272. LD b6, 5 * SIZE(BO)
  273. LD b7, 6 * SIZE(BO)
  274. LD b8, 7 * SIZE(BO)
  275. MADD t11, t11, a1, b1 # 1st compute
  276. MADD t21, t21, a2, b1
  277. MADD t12, t12, a1, b2
  278. MADD t22, t22, a2, b2
  279. MADD t13, t13, a1, b3
  280. MADD t23, t23, a2, b3
  281. MADD t14, t14, a1, b4
  282. MADD t24, t24, a2, b4
  283. LD a3, 4 * SIZE(AO)
  284. LD a4, 5 * SIZE(AO)
  285. LD b1, 8 * SIZE(BO)
  286. LD b2, 9 * SIZE(BO)
  287. LD b3, 10 * SIZE(BO)
  288. LD b4, 11 * SIZE(BO)
  289. MADD t11, t11, a5, b5 # 2ed compute
  290. MADD t21, t21, a6, b5
  291. MADD t12, t12, a5, b6
  292. MADD t22, t22, a6, b6
  293. MADD t13, t13, a5, b7
  294. MADD t23, t23, a6, b7
  295. MADD t14, t14, a5, b8
  296. MADD t24, t24, a6, b8
  297. LD a7, 6 * SIZE(AO)
  298. LD a8, 7 * SIZE(AO)
  299. LD b5, 12 * SIZE(BO)
  300. LD b6, 13 * SIZE(BO)
  301. LD b7, 14 * SIZE(BO)
  302. LD b8, 15 * SIZE(BO)
  303. MADD t11, t11, a3, b1 # 3rd compute
  304. MADD t21, t21, a4, b1
  305. MADD t12, t12, a3, b2
  306. MADD t22, t22, a4, b2
  307. MADD t13, t13, a3, b3
  308. MADD t23, t23, a4, b3
  309. MADD t14, t14, a3, b4
  310. MADD t24, t24, a4, b4
  311. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  312. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  313. LD a1, 0 * SIZE(AO) # next
  314. LD a2, 1 * SIZE(AO)
  315. LD b1, 0 * SIZE(BO)
  316. LD b2, 1 * SIZE(BO)
  317. LD b3, 2 * SIZE(BO)
  318. LD b4, 3 * SIZE(BO)
  319. MADD t11, t11, a7, b5 # 4th compute
  320. MADD t21, t21, a8, b5
  321. MADD t12, t12, a7, b6
  322. MADD t22, t22, a8, b6
  323. MADD t13, t13, a7, b7
  324. MADD t23, t23, a8, b7
  325. MADD t14, t14, a7, b8
  326. MADD t24, t24, a8, b8
  327. daddiu L, L, -1
  328. bgtz L, .L22
  329. nop
  330. .align 3
  331. .L25:
  332. andi L, TEMP, 3
  333. blez L, .L28
  334. nop
  335. .align 3
  336. .L26:
  337. MADD t11, t11, a1, b1 # 3rd compute
  338. MADD t21, t21, a2, b1
  339. MADD t12, t12, a1, b2
  340. MADD t22, t22, a2, b2
  341. MADD t13, t13, a1, b3
  342. MADD t23, t23, a2, b3
  343. MADD t14, t14, a1, b4
  344. MADD t24, t24, a2, b4
  345. daddiu AO, AO, 2 * SIZE # AO += 2mr
  346. daddiu BO, BO, 4 * SIZE # BO += 4nr
  347. LD a1, 0 * SIZE(AO) # next
  348. LD a2, 1 * SIZE(AO)
  349. LD b1, 0 * SIZE(BO)
  350. LD b2, 1 * SIZE(BO)
  351. LD b3, 2 * SIZE(BO)
  352. LD b4, 3 * SIZE(BO)
  353. daddiu L, L, -1
  354. bgtz L, .L26
  355. nop
  356. .L28: # deal with the triangular part
  357. daddiu TEMP, KK, -2
  358. dsll L, TEMP, 1 + BASE_SHIFT
  359. dsll TEMP, TEMP, 2 + BASE_SHIFT
  360. daddu AO, AORIG, L # Ao point to the triangular data part
  361. daddu BO, B, TEMP
  362. LD b1, 0 * SIZE(BO)
  363. LD b2, 1 * SIZE(BO)
  364. LD b3, 2 * SIZE(BO)
  365. LD b4, 3 * SIZE(BO)
  366. LD b5, 4 * SIZE(BO)
  367. LD b6, 5 * SIZE(BO)
  368. LD b7, 6 * SIZE(BO)
  369. LD b8, 7 * SIZE(BO)
  370. SUB t11, b1, t11
  371. SUB t12, b2, t12
  372. SUB t13, b3, t13
  373. SUB t14, b4, t14
  374. SUB t21, b5, t21
  375. SUB t22, b6, t22
  376. SUB t23, b7, t23
  377. SUB t24, b8, t24
  378. LD b1, 3 * SIZE(AO) # computes the triangular_part
  379. LD b2, 2 * SIZE(AO)
  380. MUL t21, b1, t21
  381. MUL t22, b1, t22
  382. MUL t23, b1, t23
  383. MUL t24, b1, t24
  384. NMSUB t11, t11, b2, t21
  385. NMSUB t12, t12, b2, t22
  386. NMSUB t13, t13, b2, t23
  387. NMSUB t14, t14, b2, t24
  388. LD b3, 0 * SIZE(AO)
  389. MUL t11, b3, t11
  390. MUL t12, b3, t12
  391. MUL t13, b3, t13
  392. MUL t14, b3, t14
  393. daddiu CO1, CO1, -2 * SIZE
  394. daddiu CO2, CO2, -2 * SIZE
  395. daddiu CO3, CO3, -2 * SIZE
  396. daddiu CO4, CO4, -2 * SIZE
  397. ST t11, 0 * SIZE(BO)
  398. ST t12, 1 * SIZE(BO)
  399. ST t13, 2 * SIZE(BO)
  400. ST t14, 3 * SIZE(BO)
  401. ST t21, 4 * SIZE(BO)
  402. ST t22, 5 * SIZE(BO)
  403. ST t23, 6 * SIZE(BO)
  404. ST t24, 7 * SIZE(BO)
  405. ST t11, 0 * SIZE(CO1)
  406. ST t21, 1 * SIZE(CO1)
  407. ST t12, 0 * SIZE(CO2)
  408. ST t22, 1 * SIZE(CO2)
  409. ST t13, 0 * SIZE(CO3)
  410. ST t23, 1 * SIZE(CO3)
  411. ST t14, 0 * SIZE(CO4)
  412. ST t24, 1 * SIZE(CO4)
  413. daddiu KK, KK, -2 # the length of rectangular data part increases by 2
  414. MTC $0, t11 # clear result registers
  415. MOV t21, t11
  416. MOV t31, t11
  417. MOV t41, t11
  418. MOV t12, t11
  419. MOV t22, t11
  420. MOV t32, t11
  421. MOV t42, t11
  422. .L20:
  423. dsra I, M, 2 # I=MC/4
  424. blez I, .L29
  425. nop
  426. .L11: # mr=4
  427. dsll TEMP, K, 2 + BASE_SHIFT # TEMP=KC*MR*data_Byte
  428. dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of panel Ai
  429. dsll L, KK, 2 + BASE_SHIFT # KC-KK is the length of the rectangular data part of Ai
  430. dsll TEMP, KK, 2 + BASE_SHIFT # KK*NR*data_Byte
  431. daddu AO, AORIG, L # AO point to the rectangular data part
  432. daddu BO, B, TEMP
  433. dsubu TEMP, K, KK
  434. LD a1, 0 * SIZE(AO) # this part compute the rectangular data part of Ai
  435. LD a2, 1 * SIZE(AO) # mr*KK with nr*KK
  436. LD a3, 2 * SIZE(AO)
  437. LD a4, 3 * SIZE(AO) # get 4a
  438. LD b1, 0 * SIZE(BO) # get 4b
  439. LD b2, 1 * SIZE(BO)
  440. LD b3, 2 * SIZE(BO)
  441. LD b4, 3 * SIZE(BO)
  442. MOV t13, t11 # clear result registers
  443. MOV t23, t11
  444. MOV t33, t11
  445. MOV t43, t11
  446. MOV t14, t11
  447. MOV t24, t11
  448. MOV t34, t11
  449. MOV t44, t11
  450. dsra L, TEMP, 2 # L=(KC-offset)/4
  451. blez L, .L15
  452. nop
  453. .align 3
  454. .L12:
  455. LD a5, 4 * SIZE(AO)
  456. LD a6, 5 * SIZE(AO)
  457. LD a7, 6 * SIZE(AO)
  458. LD a8, 7 * SIZE(AO)
  459. LD b5, 4 * SIZE(BO)
  460. LD b6, 5 * SIZE(BO)
  461. LD b7, 6 * SIZE(BO)
  462. LD b8, 7 * SIZE(BO)
  463. MADD t11, t11, a1, b1 # 1st compute
  464. MADD t21, t21, a2, b1
  465. MADD t31, t31, a3, b1
  466. MADD t41, t41, a4, b1
  467. MADD t12, t12, a1, b2
  468. MADD t22, t22, a2, b2
  469. MADD t32, t32, a3, b2
  470. MADD t42, t42, a4, b2
  471. MADD t13, t13, a1, b3
  472. MADD t23, t23, a2, b3
  473. MADD t33, t33, a3, b3
  474. MADD t43, t43, a4, b3
  475. MADD t14, t14, a1, b4
  476. MADD t24, t24, a2, b4
  477. MADD t34, t34, a3, b4
  478. MADD t44, t44, a4, b4
  479. LD a1, 8 * SIZE(AO)
  480. LD a2, 9 * SIZE(AO)
  481. LD a3, 10 * SIZE(AO)
  482. LD a4, 11 * SIZE(AO)
  483. LD b1, 8 * SIZE(BO)
  484. LD b2, 9 * SIZE(BO)
  485. LD b3, 10 * SIZE(BO)
  486. LD b4, 11 * SIZE(BO)
  487. MADD t11, t11, a5, b5 # 2ed compute
  488. MADD t21, t21, a6, b5
  489. MADD t31, t31, a7, b5
  490. MADD t41, t41, a8, b5
  491. MADD t12, t12, a5, b6
  492. MADD t22, t22, a6, b6
  493. MADD t32, t32, a7, b6
  494. MADD t42, t42, a8, b6
  495. MADD t13, t13, a5, b7
  496. MADD t23, t23, a6, b7
  497. MADD t33, t33, a7, b7
  498. MADD t43, t43, a8, b7
  499. MADD t14, t14, a5, b8
  500. MADD t24, t24, a6, b8
  501. MADD t34, t34, a7, b8
  502. MADD t44, t44, a8, b8
  503. LD a5, 12 * SIZE(AO)
  504. LD a6, 13 * SIZE(AO)
  505. LD a7, 14 * SIZE(AO)
  506. LD a8, 15 * SIZE(AO)
  507. LD b5, 12 * SIZE(BO)
  508. LD b6, 13 * SIZE(BO)
  509. LD b7, 14 * SIZE(BO)
  510. LD b8, 15 * SIZE(BO)
  511. MADD t11, t11, a1, b1 # 3rd compute
  512. MADD t21, t21, a2, b1
  513. MADD t31, t31, a3, b1
  514. MADD t41, t41, a4, b1
  515. MADD t12, t12, a1, b2
  516. MADD t22, t22, a2, b2
  517. MADD t32, t32, a3, b2
  518. MADD t42, t42, a4, b2
  519. MADD t13, t13, a1, b3
  520. MADD t23, t23, a2, b3
  521. MADD t33, t33, a3, b3
  522. MADD t43, t43, a4, b3
  523. MADD t14, t14, a1, b4
  524. MADD t24, t24, a2, b4
  525. MADD t34, t34, a3, b4
  526. MADD t44, t44, a4, b4
  527. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  528. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  529. LD a1, 0 * SIZE(AO) # next
  530. LD a2, 1 * SIZE(AO)
  531. LD a3, 2 * SIZE(AO)
  532. LD a4, 3 * SIZE(AO)
  533. LD b1, 0 * SIZE(BO)
  534. LD b2, 1 * SIZE(BO)
  535. LD b3, 2 * SIZE(BO)
  536. LD b4, 3 * SIZE(BO)
  537. MADD t11, t11, a5, b5 # 4th compute
  538. MADD t21, t21, a6, b5
  539. MADD t31, t31, a7, b5
  540. MADD t41, t41, a8, b5
  541. MADD t12, t12, a5, b6
  542. MADD t22, t22, a6, b6
  543. MADD t32, t32, a7, b6
  544. MADD t42, t42, a8, b6
  545. MADD t13, t13, a5, b7
  546. MADD t23, t23, a6, b7
  547. MADD t33, t33, a7, b7
  548. MADD t43, t43, a8, b7
  549. MADD t14, t14, a5, b8
  550. MADD t24, t24, a6, b8
  551. MADD t34, t34, a7, b8
  552. MADD t44, t44, a8, b8
  553. daddiu L, L, -1
  554. bgtz L, .L12
  555. nop
  556. .align 3
  557. .L15:
  558. andi L, TEMP, 3
  559. blez L, .L18
  560. nop
  561. .align 3
  562. .L16:
  563. MADD t11, t11, a1, b1
  564. MADD t21, t21, a2, b1
  565. MADD t31, t31, a3, b1
  566. MADD t41, t41, a4, b1
  567. MADD t12, t12, a1, b2
  568. MADD t22, t22, a2, b2
  569. MADD t32, t32, a3, b2
  570. MADD t42, t42, a4, b2
  571. MADD t13, t13, a1, b3
  572. MADD t23, t23, a2, b3
  573. MADD t33, t33, a3, b3
  574. MADD t43, t43, a4, b3
  575. MADD t14, t14, a1, b4
  576. MADD t24, t24, a2, b4
  577. MADD t34, t34, a3, b4
  578. MADD t44, t44, a4, b4
  579. daddiu AO, AO, 4 * SIZE # AO += 4mr
  580. daddiu BO, BO, 4 * SIZE # BO += 4nr
  581. LD a1, 0 * SIZE(AO) # next
  582. LD a2, 1 * SIZE(AO)
  583. LD a3, 2 * SIZE(AO)
  584. LD a4, 3 * SIZE(AO)
  585. LD b1, 0 * SIZE(BO)
  586. LD b2, 1 * SIZE(BO)
  587. LD b3, 2 * SIZE(BO)
  588. LD b4, 3 * SIZE(BO)
  589. daddiu L, L, -1
  590. bgtz L, .L16
  591. nop
  592. .L18: # deal with the triangular data part of panel Ai
  593. daddiu TEMP, KK, -4 #
  594. dsll L, TEMP, 2 + BASE_SHIFT
  595. dsll TEMP, TEMP, 2 + BASE_SHIFT
  596. daddu AO, AORIG, L # AO point to the triangular data part
  597. daddu BO, B, TEMP
  598. LD b1, 0 * SIZE(BO) # triangular_part*X + rectangular_part = B
  599. LD b2, 1 * SIZE(BO) # triangular_part*X = B - rectangular_part
  600. LD b3, 2 * SIZE(BO)
  601. LD b4, 3 * SIZE(BO)
  602. SUB t11, b1, t11
  603. SUB t12, b2, t12
  604. SUB t13, b3, t13
  605. SUB t14, b4, t14
  606. LD b5, 4 * SIZE(BO) # sb store in row major
  607. LD b6, 5 * SIZE(BO)
  608. LD b7, 6 * SIZE(BO)
  609. LD b8, 7 * SIZE(BO)
  610. SUB t21, b5, t21
  611. SUB t22, b6, t22
  612. SUB t23, b7, t23
  613. SUB t24, b8, t24
  614. LD b1, 8 * SIZE(BO)
  615. LD b2, 9 * SIZE(BO)
  616. LD b3, 10 * SIZE(BO)
  617. LD b4, 11 * SIZE(BO)
  618. SUB t31, b1, t31
  619. SUB t32, b2, t32
  620. SUB t33, b3, t33
  621. SUB t34, b4, t34
  622. LD b5, 12 * SIZE(BO)
  623. LD b6, 13 * SIZE(BO)
  624. LD b7, 14 * SIZE(BO)
  625. LD b8, 15 * SIZE(BO)
  626. SUB t41, b5, t41
  627. SUB t42, b6, t42
  628. SUB t43, b7, t43
  629. SUB t44, b8, t44
  630. LD b1, 15 * SIZE(AO)
  631. LD b2, 14 * SIZE(AO)
  632. LD b4, 13 * SIZE(AO)
  633. LD b7, 12 * SIZE(AO)
  634. MUL t41, b1, t41
  635. MUL t42, b1, t42
  636. MUL t43, b1, t43
  637. MUL t44, b1, t44
  638. NMSUB t31, t31, b2, t41
  639. NMSUB t32, t32, b2, t42
  640. NMSUB t33, t33, b2, t43
  641. NMSUB t34, t34, b2, t44
  642. NMSUB t21, t21, b4, t41
  643. NMSUB t22, t22, b4, t42
  644. NMSUB t23, t23, b4, t43
  645. NMSUB t24, t24, b4, t44
  646. NMSUB t11, t11, b7, t41
  647. NMSUB t12, t12, b7, t42
  648. NMSUB t13, t13, b7, t43
  649. NMSUB t14, t14, b7, t44
  650. LD b3, 10 * SIZE(AO)
  651. LD b5, 9 * SIZE(AO)
  652. LD b8, 8 * SIZE(AO)
  653. MUL t31, b3, t31
  654. MUL t32, b3, t32
  655. MUL t33, b3, t33
  656. MUL t34, b3, t34
  657. NMSUB t21, t21, b5, t31
  658. NMSUB t22, t22, b5, t32
  659. NMSUB t23, t23, b5, t33
  660. NMSUB t24, t24, b5, t34
  661. NMSUB t11, t11, b8, t31
  662. NMSUB t12, t12, b8, t32
  663. NMSUB t13, t13, b8, t33
  664. NMSUB t14, t14, b8, t34
  665. LD b6, 5 * SIZE(AO)
  666. LD b1, 4 * SIZE(AO)
  667. MUL t21, b6, t21
  668. MUL t22, b6, t22
  669. MUL t23, b6, t23
  670. MUL t24, b6, t24
  671. NMSUB t11, t11, b1, t21
  672. NMSUB t12, t12, b1, t22
  673. NMSUB t13, t13, b1, t23
  674. NMSUB t14, t14, b1, t24
  675. LD b2, 0 * SIZE(AO)
  676. MUL t11, b2, t11
  677. MUL t12, b2, t12
  678. MUL t13, b2, t13
  679. MUL t14, b2, t14
  680. daddiu CO1, CO1, -4 * SIZE # modify
  681. daddiu CO2, CO2, -4 * SIZE
  682. daddiu CO3, CO3, -4 * SIZE
  683. daddiu CO4, CO4, -4 * SIZE
  684. ST t11, 0 * SIZE(BO) # update packed B
  685. ST t12, 1 * SIZE(BO)
  686. ST t13, 2 * SIZE(BO)
  687. ST t14, 3 * SIZE(BO)
  688. ST t21, 4 * SIZE(BO)
  689. ST t22, 5 * SIZE(BO)
  690. ST t23, 6 * SIZE(BO)
  691. ST t24, 7 * SIZE(BO)
  692. ST t31, 8 * SIZE(BO)
  693. ST t32, 9 * SIZE(BO)
  694. ST t33, 10 * SIZE(BO)
  695. ST t34, 11 * SIZE(BO)
  696. ST t41, 12 * SIZE(BO)
  697. ST t42, 13 * SIZE(BO)
  698. ST t43, 14 * SIZE(BO)
  699. ST t44, 15 * SIZE(BO)
  700. ST t11, 0 * SIZE(CO1) # write back
  701. ST t21, 1 * SIZE(CO1)
  702. ST t31, 2 * SIZE(CO1)
  703. ST t41, 3 * SIZE(CO1)
  704. ST t12, 0 * SIZE(CO2)
  705. ST t22, 1 * SIZE(CO2)
  706. ST t32, 2 * SIZE(CO2)
  707. ST t42, 3 * SIZE(CO2)
  708. ST t13, 0 * SIZE(CO3)
  709. ST t23, 1 * SIZE(CO3)
  710. ST t33, 2 * SIZE(CO3)
  711. ST t43, 3 * SIZE(CO3)
  712. ST t14, 0 * SIZE(CO4)
  713. ST t24, 1 * SIZE(CO4)
  714. ST t34, 2 * SIZE(CO4)
  715. ST t44, 3 * SIZE(CO4)
  716. daddiu KK, KK, -4 # KC-KK is the length of the rectangular data part, LN compute from bottom to top so KK-=4
  717. daddiu I, I, -1
  718. MTC $0, a1
  719. MOV t11, a1
  720. MOV t21, a1
  721. MOV t31, a1
  722. MOV t41, a1
  723. MOV t12, a1
  724. MOV t22, a1
  725. MOV t32, a1
  726. MOV t42, a1
  727. bgtz I, .L11
  728. nop
  729. .align 3
  730. .L29:
  731. dsll TEMP, K, 2 + BASE_SHIFT
  732. daddu B, B, TEMP # B point to next Bj
  733. bgtz J, .L10
  734. nop
  735. .align 3
  736. .L30:
  737. andi J, N, 2 # nr=2
  738. blez J, .L70
  739. nop
  740. move CO1, C
  741. daddu CO2, C, LDC
  742. MTC $0, t11 # clear result regusters
  743. MOV t21, t11
  744. MOV t31, t11
  745. MOV t41, t11
  746. daddu KK, M, OFFSET
  747. move AORIG, A # reset A
  748. daddu C, CO2, LDC # fixed
  749. andi I, M, 1 # mr=1
  750. blez I, .L60
  751. nop
  752. dsll TEMP, K, BASE_SHIFT
  753. dsubu AORIG, AORIG, TEMP # AORIG point to the beginning address of everypanel of Ai
  754. dsll L, KK, BASE_SHIFT # mr=1
  755. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
  756. daddu AO, AORIG, L # AO point to rectangular data part
  757. daddu BO, B, TEMP
  758. dsubu TEMP, K, KK
  759. MOV t12, t11 # clear result registers
  760. MOV t22, t11
  761. MOV t32, t11
  762. MOV t42, t11
  763. LD a1, 0 * SIZE(AO)
  764. LD b1, 0 * SIZE(BO)
  765. LD b2, 1 * SIZE(BO)
  766. dsra L, TEMP, 2
  767. blez L, .L65
  768. nop
  769. .align 3
  770. .L62:
  771. LD a5, 1 * SIZE(AO)
  772. LD b5, 2 * SIZE(BO)
  773. LD b6, 3 * SIZE(BO)
  774. MADD t11, t11, a1, b1 # 1st compute
  775. MADD t12, t12, a1, b2
  776. LD a3, 2 * SIZE(AO)
  777. LD b3, 4 * SIZE(BO)
  778. LD b4, 5 * SIZE(BO)
  779. MADD t11, t11, a5, b5 # 2ed compute
  780. MADD t12, t12, a5, b6
  781. LD a7, 3 * SIZE(AO)
  782. LD b7, 6 * SIZE(BO)
  783. LD b8, 7 * SIZE(BO)
  784. MADD t11, t11, a3, b3 # 3rd compute
  785. MADD t12, t12, a3, b4
  786. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  787. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  788. LD a1, 0 * SIZE(AO) # next
  789. LD b1, 0 * SIZE(BO)
  790. LD b2, 1 * SIZE(BO)
  791. MADD t11, t11, a7, b7 # 4th compute
  792. MADD t12, t12, a7, b8
  793. daddiu L, L, -1
  794. bgtz L, .L62
  795. nop
  796. .align 3
  797. .L65:
  798. andi L, TEMP, 3
  799. blez L, .L68
  800. nop
  801. .align 3
  802. .L66:
  # Remainder loop for mr=1, nr=2: runs (K - KK) mod 4 times, one k step
  # per iteration, using the a1/b1/b2 values loaded ahead of time.
  803. MADD t11, t11, a1, b1 # one k step: t11 += a1 * b1
  # NOTE(review): a2 is never loaded on this mr=1 path and t21/t22 are not
  # stored at .L68 (only t11 and t12 are), so the two MADDs that use a2
  # look like leftovers from the mr=2 loop. Confirm before removing them.
  804. MADD t21, t21, a2, b1
  805. MADD t12, t12, a1, b2
  806. MADD t22, t22, a2, b2
  807. daddiu AO, AO, 1 * SIZE # AO += mr
  808. daddiu BO, BO, 2 * SIZE # BO += 2nr
  809. LD a1, 0 * SIZE(AO) # next
  810. LD b1, 0 * SIZE(BO)
  811. LD b2, 1 * SIZE(BO)
  812. daddiu L, L, -1
  813. bgtz L, .L66
  814. nop
  815. .L68:
  816. daddiu TEMP, KK, -1 # mr=1
  817. dsll L, TEMP, BASE_SHIFT
  818. dsll TEMP, TEMP, 1 + BASE_SHIFT
  819. daddu AO, AORIG, L # Ao point to the triangular data part
  820. daddu BO, B, TEMP
  821. LD b1, 0 * SIZE(BO)
  822. LD b2, 1 * SIZE(BO)
  823. SUB t11, b1, t11
  824. SUB t12, b2, t12
  825. LD b3, 0 * SIZE(AO)
  826. MUL t11, b3, t11
  827. MUL t12, b3, t12
  828. daddiu CO1, CO1, -1 * SIZE
  829. daddiu CO2, CO2, -1 * SIZE
  830. ST t11, 0 * SIZE(BO)
  831. ST t12, 1 * SIZE(BO)
  832. ST t11, 0 * SIZE(CO1)
  833. ST t12, 0 * SIZE(CO2)
  834. daddiu KK, KK, -1
  835. MTC $0, t11 # clear result regusters
  836. MOV t21, t11
  837. MOV t31, t11
  838. MOV t41, t11
  # .L60: column pair (nr=2), 2-row (mr=2) tile. Runs only when bit 1 of M is set.
  # Accumulator tij collects row i of the tile against column j of the pair
  # (t11/t21 end up in CO1, t12/t22 in CO2).
  # t11 and t21 are expected to be zero on entry; t12/t22 are cleared by copying
  # t11, and the MTC/MOV group at the bottom re-zeroes t11..t41 for later code.
  # NOTE(review): LD/ST/MADD/NMSUB/MUL/SUB/MTC/MOV are macros defined outside this
  # chunk. Comments below assume MADD d,c,a,b = c + a*b, NMSUB d,c,a,b = c - a*b,
  # SUB d,a,b = a - b, and SIZE = one element -- confirm against the definitions.
  839. .L60:
  840. andi I, M, 2 # I = M & 2
  841. blez I, .L40 # no 2-row remainder: continue with the 4-row tiles
  842. nop
  843. dsll TEMP, K, 1 + BASE_SHIFT # TEMP = K << (1 + BASE_SHIFT): size of a 2-row panel
  844. dsubu AORIG, AORIG, TEMP # move AORIG back to the beginning of this panel of A
  845. dsll L, KK, 1 + BASE_SHIFT # mr=2: offset of KK k-steps into the A panel
  846. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2: offset of KK k-steps into the B panel
  847. daddu AO, AORIG, L # AO points to the rectangular data part
  848. daddu BO, B, TEMP # BO points to the matching position in B
  849. dsubu TEMP, K, KK # TEMP = K - KK: k-steps in the rectangular part
  850. MOV t12, t11 # clear result registers (t11 is expected to be zero here)
  851. MOV t22, t11
  852. MOV t32, t11
  853. MOV t42, t11
  854. LD a1, 0 * SIZE(AO) # preload the first k-step: 2 values of A, 2 values of B
  855. LD a2, 1 * SIZE(AO)
  856. LD b1, 0 * SIZE(BO)
  857. LD b2, 1 * SIZE(BO)
  858. dsra L, TEMP, 2 # L = (K - KK) / 4: iterations of the 4x unrolled loop
  859. blez L, .L45
  860. nop
  861. .align 3
  # .L42: 4 k-steps per iteration; loads for the next step are issued ahead of use.
  862. .L42:
  863. LD a5, 2 * SIZE(AO)
  864. LD a6, 3 * SIZE(AO)
  865. LD b5, 2 * SIZE(BO)
  866. LD b6, 3 * SIZE(BO)
  867. MADD t11, t11, a1, b1 # 1st compute
  868. MADD t21, t21, a2, b1
  869. MADD t12, t12, a1, b2
  870. MADD t22, t22, a2, b2
  871. LD a3, 4 * SIZE(AO)
  872. LD a4, 5 * SIZE(AO)
  873. LD b3, 4 * SIZE(BO)
  874. LD b4, 5 * SIZE(BO)
  875. MADD t11, t11, a5, b5 # 2nd compute
  876. MADD t21, t21, a6, b5
  877. MADD t12, t12, a5, b6
  878. MADD t22, t22, a6, b6
  879. LD a7, 6 * SIZE(AO)
  880. LD a8, 7 * SIZE(AO)
  881. LD b7, 6 * SIZE(BO)
  882. LD b8, 7 * SIZE(BO)
  883. MADD t11, t11, a3, b3 # 3rd compute
  884. MADD t21, t21, a4, b3
  885. MADD t12, t12, a3, b4
  886. MADD t22, t22, a4, b4
  887. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  888. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  889. LD a1, 0 * SIZE(AO) # preload the next iteration's first k-step
  890. LD a2, 1 * SIZE(AO)
  891. LD b1, 0 * SIZE(BO)
  892. LD b2, 1 * SIZE(BO)
  893. MADD t11, t11, a7, b7 # 4th compute
  894. MADD t21, t21, a8, b7
  895. MADD t12, t12, a7, b8
  896. MADD t22, t22, a8, b8
  897. daddiu L, L, -1
  898. bgtz L, .L42
  899. nop
  900. .align 3
  901. .L45:
  902. andi L, TEMP, 3 # L = (K - KK) % 4: leftover k-steps
  903. blez L, .L48
  904. nop
  905. .align 3
  # .L46: one leftover k-step per iteration.
  906. .L46:
  907. MADD t11, t11, a1, b1 # single k-step update
  908. MADD t21, t21, a2, b1
  909. MADD t12, t12, a1, b2
  910. MADD t22, t22, a2, b2
  911. daddiu AO, AO, 2 * SIZE # AO += 2mr
  912. daddiu BO, BO, 2 * SIZE # BO += 2nr
  913. LD a1, 0 * SIZE(AO) # preload the next k-step
  914. LD a2, 1 * SIZE(AO)
  915. LD b1, 0 * SIZE(BO)
  916. LD b2, 1 * SIZE(BO)
  917. daddiu L, L, -1
  918. bgtz L, .L46
  919. nop
  # .L48: subtract the accumulated products from B, solve against the 2x2
  # triangular block of A, then store the result to packed B and to C.
  920. .L48:
  921. daddiu TEMP, KK, -2 # TEMP = KK - 2 (mr=2)
  922. dsll L, TEMP, 1 + BASE_SHIFT # mr=2
  923. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  924. daddu AO, AORIG, L # AO points to the triangular data part
  925. daddu BO, B, TEMP # BO points to the 4 B values read and rewritten below
  926. LD b1, 0 * SIZE(BO)
  927. LD b2, 1 * SIZE(BO)
  928. LD b3, 2 * SIZE(BO)
  929. LD b4, 3 * SIZE(BO)
  930. SUB t11, b1, t11 # tij = B value - accumulated product
  931. SUB t12, b2, t12
  932. SUB t21, b3, t21
  933. SUB t22, b4, t22
  934. LD b1, 3 * SIZE(AO) # triangular part: factor for row 2
  935. LD b2, 2 * SIZE(AO) # coupling of row 2 into row 1
  936. MUL t21, b1, t21 # row 2 is finished first (multiplied, not divided)
  937. MUL t22, b1, t22
  938. NMSUB t11, t11, b2, t21 # remove row 2's contribution from row 1
  939. NMSUB t12, t12, b2, t22
  940. LD b3, 0 * SIZE(AO) # factor for row 1
  941. MUL t11, b3, t11
  942. MUL t12, b3, t12
  943. daddiu CO1, CO1, -2 * SIZE # C pointers step back 2 elements before the stores
  944. daddiu CO2, CO2, -2 * SIZE
  945. ST t11, 0 * SIZE(BO) # write the results back over the B values
  946. ST t12, 1 * SIZE(BO)
  947. ST t21, 2 * SIZE(BO)
  948. ST t22, 3 * SIZE(BO)
  949. ST t11, 0 * SIZE(CO1) # and into C
  950. ST t21, 1 * SIZE(CO1)
  951. ST t12, 0 * SIZE(CO2)
  952. ST t22, 1 * SIZE(CO2)
  953. daddiu KK, KK, -2 # KK -= mr
  954. MTC $0, t11 # clear result registers
  955. MOV t21, t11
  956. MOV t31, t11
  957. MOV t41, t11
  # .L40/.L31: column pair (nr=2), 4-row (mr=4) tiles. .L31 repeats I = M >> 2 times.
  # Accumulator tij collects row i of the tile against column j of the pair
  # (ti1 is stored through CO1, ti2 through CO2).
  # t11..t41 are expected to be zero at .L31; t12..t42 are cleared by copying t11,
  # and the MTC/MOV group before the loop branch re-zeroes t11..t41 each iteration.
  # NOTE(review): LD/ST/MADD/NMSUB/MUL/SUB/MTC/MOV are macros defined outside this
  # chunk. Comments below assume MADD d,c,a,b = c + a*b, NMSUB d,c,a,b = c - a*b,
  # SUB d,a,b = a - b, and SIZE = one element -- confirm against the definitions.
  958. .L40:
  959. dsra I, M, 2 # I = mc/4
  960. blez I, .L49 # no 4-row tiles: finish this column pair
  961. nop
  962. .L31:
  963. dsll TEMP, K, 2 + BASE_SHIFT # TEMP = K << (2 + BASE_SHIFT): size of a 4-row panel
  964. dsubu AORIG, AORIG, TEMP # move AORIG back to the beginning of panel Ai
  965. dsll L, KK, 2 + BASE_SHIFT # mr=4
  966. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
  967. daddu AO, AORIG, L # AO points to the rectangular data part
  968. daddu BO, B, TEMP # BO points to the matching position in B
  969. dsubu TEMP, K, KK # TEMP = K - KK: k-steps in the rectangular part
  970. MOV t12, t11 # clear the second-column accumulators (t11 is expected to be zero)
  971. MOV t22, t11
  972. MOV t32, t11
  973. MOV t42, t11
  974. LD a1, 0 * SIZE(AO) # rectangular part of Ai: preload the first k-step
  975. LD a2, 1 * SIZE(AO) # mr x (K-KK) of A against (K-KK) x nr of B
  976. LD a3, 2 * SIZE(AO)
  977. LD a4, 3 * SIZE(AO) # 4 values of A
  978. LD b1, 0 * SIZE(BO) # 2 values of B
  979. LD b2, 1 * SIZE(BO)
  980. dsra L, TEMP, 2 # L = (K - KK) / 4: iterations of the 4x unrolled loop
  981. blez L, .L35
  982. nop
  983. .align 3
  # .L32: 4 k-steps per iteration; a1..a4 and a5..a8 alternate as the A operands.
  984. .L32:
  985. LD a5, 4 * SIZE(AO)
  986. LD a6, 5 * SIZE(AO)
  987. LD a7, 6 * SIZE(AO)
  988. LD a8, 7 * SIZE(AO)
  989. LD b5, 2 * SIZE(BO)
  990. LD b6, 3 * SIZE(BO)
  991. MADD t11, t11, a1, b1 # 1st compute
  992. MADD t21, t21, a2, b1
  993. MADD t31, t31, a3, b1
  994. MADD t41, t41, a4, b1
  995. MADD t12, t12, a1, b2
  996. MADD t22, t22, a2, b2
  997. MADD t32, t32, a3, b2
  998. MADD t42, t42, a4, b2
  999. LD a1, 8 * SIZE(AO)
  1000. LD a2, 9 * SIZE(AO)
  1001. LD a3, 10 * SIZE(AO)
  1002. LD a4, 11 * SIZE(AO)
  1003. LD b3, 4 * SIZE(BO)
  1004. LD b4, 5 * SIZE(BO)
  1005. MADD t11, t11, a5, b5 # 2nd compute
  1006. MADD t21, t21, a6, b5
  1007. MADD t31, t31, a7, b5
  1008. MADD t41, t41, a8, b5
  1009. MADD t12, t12, a5, b6
  1010. MADD t22, t22, a6, b6
  1011. MADD t32, t32, a7, b6
  1012. MADD t42, t42, a8, b6
  1013. LD a5, 12 * SIZE(AO)
  1014. LD a6, 13 * SIZE(AO)
  1015. LD a7, 14 * SIZE(AO)
  1016. LD a8, 15 * SIZE(AO)
  1017. LD b7, 6 * SIZE(BO)
  1018. LD b8, 7 * SIZE(BO)
  1019. MADD t11, t11, a1, b3 # 3rd compute
  1020. MADD t21, t21, a2, b3
  1021. MADD t31, t31, a3, b3
  1022. MADD t41, t41, a4, b3
  1023. MADD t12, t12, a1, b4
  1024. MADD t22, t22, a2, b4
  1025. MADD t32, t32, a3, b4
  1026. MADD t42, t42, a4, b4
  1027. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  1028. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  1029. LD a1, 0 * SIZE(AO) # preload the next iteration's first k-step
  1030. LD a2, 1 * SIZE(AO)
  1031. LD a3, 2 * SIZE(AO)
  1032. LD a4, 3 * SIZE(AO)
  1033. LD b1, 0 * SIZE(BO)
  1034. LD b2, 1 * SIZE(BO)
  1035. MADD t11, t11, a5, b7 # 4th compute
  1036. MADD t21, t21, a6, b7
  1037. MADD t31, t31, a7, b7
  1038. MADD t41, t41, a8, b7
  1039. MADD t12, t12, a5, b8
  1040. MADD t22, t22, a6, b8
  1041. MADD t32, t32, a7, b8
  1042. MADD t42, t42, a8, b8
  1043. daddiu L, L, -1
  1044. bgtz L, .L32
  1045. nop
  1046. .align 3
  1047. .L35:
  1048. andi L, TEMP, 3 # L = (K - KK) % 4: leftover k-steps
  1049. blez L, .L38
  1050. nop
  1051. .align 3
  # .L36: one leftover k-step per iteration.
  1052. .L36:
  1053. MADD t11, t11, a1, b1 # single k-step update
  1054. MADD t21, t21, a2, b1
  1055. MADD t31, t31, a3, b1
  1056. MADD t41, t41, a4, b1
  1057. MADD t12, t12, a1, b2
  1058. MADD t22, t22, a2, b2
  1059. MADD t32, t32, a3, b2
  1060. MADD t42, t42, a4, b2
  1061. daddiu AO, AO, 4 * SIZE # AO += 4mr
  1062. daddiu BO, BO, 2 * SIZE # BO += 2nr
  1063. LD a1, 0 * SIZE(AO) # preload the next k-step
  1064. LD a2, 1 * SIZE(AO)
  1065. LD a3, 2 * SIZE(AO)
  1066. LD a4, 3 * SIZE(AO)
  1067. LD b1, 0 * SIZE(BO)
  1068. LD b2, 1 * SIZE(BO)
  1069. daddiu L, L, -1
  1070. bgtz L, .L36
  1071. nop
  # .L38: subtract the accumulated products from B, solve against the 4x4
  # triangular block of A from row 4 up to row 1, then store to packed B and C.
  1072. .L38: #
  1073. daddiu TEMP, KK, -4 # TEMP = KK - 4 (mr=4)
  1074. dsll L, TEMP, 2 + BASE_SHIFT # mr=4
  1075. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  1076. daddu AO, AORIG, L # AO points to the triangular data part
  1077. daddu BO, B, TEMP # BO points to the 8 B values read and rewritten below
  1078. LD b1, 0 * SIZE(BO)
  1079. LD b2, 1 * SIZE(BO)
  1080. LD b3, 2 * SIZE(BO)
  1081. LD b4, 3 * SIZE(BO)
  1082. LD b5, 4 * SIZE(BO)
  1083. LD b6, 5 * SIZE(BO)
  1084. LD b7, 6 * SIZE(BO)
  1085. LD b8, 7 * SIZE(BO)
  1086. SUB t11, b1, t11 # tij = B value - accumulated product
  1087. SUB t12, b2, t12
  1088. SUB t21, b3, t21
  1089. SUB t22, b4, t22
  1090. SUB t31, b5, t31
  1091. SUB t32, b6, t32
  1092. SUB t41, b7, t41
  1093. SUB t42, b8, t42
  1094. LD b1, 15 * SIZE(AO) # factor for row 4
  1095. LD b2, 14 * SIZE(AO) # coupling of row 4 into rows 3, 2, 1
  1096. LD b4, 13 * SIZE(AO)
  1097. LD b7, 12 * SIZE(AO)
  1098. MUL t41, b1, t41 # row 4 is finished first (multiplied, not divided)
  1099. MUL t42, b1, t42
  1100. NMSUB t31, t31, b2, t41 # remove row 4's contribution from rows 3, 2, 1
  1101. NMSUB t32, t32, b2, t42
  1102. NMSUB t21, t21, b4, t41
  1103. NMSUB t22, t22, b4, t42
  1104. NMSUB t11, t11, b7, t41
  1105. NMSUB t12, t12, b7, t42
  1106. LD b3, 10 * SIZE(AO) # factor for row 3
  1107. LD b5, 9 * SIZE(AO) # coupling of row 3 into rows 2, 1
  1108. LD b8, 8 * SIZE(AO)
  1109. MUL t31, b3, t31
  1110. MUL t32, b3, t32
  1111. NMSUB t21, t21, b5, t31
  1112. NMSUB t22, t22, b5, t32
  1113. NMSUB t11, t11, b8, t31
  1114. NMSUB t12, t12, b8, t32
  1115. LD b6, 5 * SIZE(AO) # factor for row 2
  1116. LD b1, 4 * SIZE(AO) # coupling of row 2 into row 1
  1117. MUL t21, b6, t21
  1118. MUL t22, b6, t22
  1119. NMSUB t11, t11, b1, t21
  1120. NMSUB t12, t12, b1, t22
  1121. LD b2, 0 * SIZE(AO) # factor for row 1
  1122. MUL t11, b2, t11
  1123. MUL t12, b2, t12
  1124. daddiu CO1, CO1, -4 * SIZE # C pointers step back 4 elements before the stores
  1125. daddiu CO2, CO2, -4 * SIZE
  1126. ST t11, 0 * SIZE(BO) # write the results back over the B values
  1127. ST t12, 1 * SIZE(BO)
  1128. ST t21, 2 * SIZE(BO)
  1129. ST t22, 3 * SIZE(BO)
  1130. ST t31, 4 * SIZE(BO)
  1131. ST t32, 5 * SIZE(BO)
  1132. ST t41, 6 * SIZE(BO)
  1133. ST t42, 7 * SIZE(BO)
  1134. ST t11, 0 * SIZE(CO1) # and into C
  1135. ST t21, 1 * SIZE(CO1)
  1136. ST t31, 2 * SIZE(CO1)
  1137. ST t41, 3 * SIZE(CO1)
  1138. ST t12, 0 * SIZE(CO2)
  1139. ST t22, 1 * SIZE(CO2)
  1140. ST t32, 2 * SIZE(CO2)
  1141. ST t42, 3 * SIZE(CO2)
  1142. daddiu KK, KK, -4 # KK -= mr
  1143. MTC $0, t11 # clear result registers for the next tile
  1144. MOV t21, t11
  1145. MOV t31, t11
  1146. MOV t41, t11
  1147. daddiu I, I, -1
  1148. bgtz I, .L31 # next 4-row tile
  1149. nop
  # .L49: end of the column-pair pass; advance B by K << (1 + BASE_SHIFT),
  # i.e. past the K k-steps of the nr=2 panel just processed.
  1150. .align 3
  1151. .L49:
  1152. dsll TEMP, K, 1 + BASE_SHIFT # nr=2
  1153. daddu B, B, TEMP
  # .L70: single remaining column (nr=1); skipped entirely when N is even.
  # Resets CO1 = C, KK = M + OFFSET and AORIG = A, then handles the 1-row (mr=1)
  # tile when M is odd. Only t11 is used as accumulator here.
  # NOTE(review): LD/ST/MADD/MUL/SUB/MTC are macros defined outside this chunk.
  # Comments below assume MADD d,c,a,b = c + a*b, SUB d,a,b = a - b, and
  # SIZE = one element -- confirm against the definitions.
  1154. .align 3
  1155. .L70:
  1156. andi J, N, 1 # nr=1
  1157. blez J, .L999 # END
  1158. nop
  1159. move CO1, C
  1160. daddu KK, M, OFFSET # KK = M + OFFSET
  1161. move AORIG, A # reset A
  1162. andi I, M, 1 # mr=1
  1163. blez I, .L90 # M is even: go to the 2-row tile
  1164. NOP
  1165. MTC $0, t11 # clear the accumulator
  1166. dsll TEMP, K, BASE_SHIFT # mr=1
  1167. dsubu AORIG, AORIG, TEMP # move AORIG back by one 1-row panel
  1168. dsll L, KK, BASE_SHIFT # same offset serves A and B since mr = nr = 1
  1169. daddu AO, AORIG, L # AO points to the rectangular data part
  1170. daddu BO, B, L
  1171. dsubu TEMP, K, KK # TEMP = K - KK: k-steps in the rectangular part
  1172. LD a1, 0 * SIZE(AO) # preload the first k-step
  1173. LD b1, 0 * SIZE(BO)
  1174. dsra L, TEMP, 2 # L = (K - KK) / 4: iterations of the 4x unrolled loop
  1175. blez L, .L95
  1176. nop
  1177. .align 3
  # .L92: 4 k-steps per iteration.
  1178. .L92:
  1179. LD a5, 1 * SIZE(AO)
  1180. LD b5, 1 * SIZE(BO)
  1181. MADD t11, t11, a1, b1 # 1st compute
  1182. LD a3, 2 * SIZE(AO)
  1183. LD b3, 2 * SIZE(BO)
  1184. MADD t11, t11, a5, b5 # 2nd compute
  1185. LD a7, 3 * SIZE(AO)
  1186. LD b7, 3 * SIZE(BO)
  1187. MADD t11, t11, a3, b3 # 3rd compute
  1188. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  1189. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1190. LD a1, 0 * SIZE(AO) # preload the next iteration's first k-step
  1191. LD b1, 0 * SIZE(BO)
  1192. MADD t11, t11, a7, b7 # 4th compute
  1193. daddiu L, L, -1
  1194. bgtz L, .L92
  1195. nop
  1196. .align 3
  1197. .L95:
  1198. andi L, TEMP, 3 # L = (K - KK) % 4: leftover k-steps
  1199. blez L, .L98
  1200. nop
  1201. .align 3
  # .L96: one leftover k-step per iteration.
  1202. .L96:
  1203. MADD t11, t11, a1, b1 # single k-step update
  1204. daddiu AO, AO, 1 * SIZE # AO += 1mr
  1205. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1206. LD a1, 0 * SIZE(AO) # preload the next k-step
  1207. LD b1, 0 * SIZE(BO)
  1208. daddiu L, L, -1
  1209. bgtz L, .L96
  1210. nop
  # .L98: subtract the accumulated product from B, scale by the single
  # triangular entry of A, then store to packed B and C.
  1211. .L98:
  1212. daddiu TEMP, KK, -1 # TEMP = KK - 1 (mr=1)
  1213. dsll TEMP, TEMP, BASE_SHIFT
  1214. daddu AO, AORIG, TEMP # AO points to the triangular data part
  1215. daddu BO, B, TEMP
  1216. LD b1, 0 * SIZE(BO)
  1217. SUB t11, b1, t11 # t11 = B value - accumulated product
  1218. LD b3, 0 * SIZE(AO)
  1219. MUL t11, b3, t11 # multiplied, not divided, by the A entry
  1220. daddiu CO1, CO1, -1 * SIZE # C pointer steps back 1 element before the store
  1221. ST t11, 0 * SIZE(BO)
  1222. ST t11, 0 * SIZE(CO1)
  1223. daddiu KK, KK, -1 # KK -= mr
  # .L90: single column (nr=1), 2-row (mr=2) tile. Runs only when bit 1 of M is set.
  # t11 and t21 accumulate rows 1 and 2 of the tile; both are zeroed here.
  # NOTE(review): LD/ST/MADD/NMSUB/MUL/SUB/MTC/MOV are macros defined outside this
  # chunk. Comments below assume MADD d,c,a,b = c + a*b, NMSUB d,c,a,b = c - a*b,
  # SUB d,a,b = a - b, and SIZE = one element -- confirm against the definitions.
  1224. .L90:
  1225. andi I, M, 2 # I = M & 2
  1226. blez I, .L80 # no 2-row remainder: continue with the 4-row tiles
  1227. NOP
  1228. MTC $0, t11
  1229. MOV t21, t11 # clear result registers
  1230. dsll TEMP, K, 1+BASE_SHIFT # mr=2
  1231. dsubu AORIG, AORIG, TEMP # move AORIG back by one 2-row panel
  1232. dsll L, KK, 1 + BASE_SHIFT # mr=2
  1233. dsll TEMP, KK, 0 + BASE_SHIFT # nr=1
  1234. daddu AO, AORIG, L # AO points to the rectangular data part
  1235. daddu BO, B, TEMP
  1236. dsubu TEMP, K, KK # TEMP = K - KK: k-steps in the rectangular part
  1237. LD a1, 0 * SIZE(AO) # preload the first k-step: 2 values of A, 1 value of B
  1238. LD a2, 1 * SIZE(AO)
  1239. LD b1, 0 * SIZE(BO)
  1240. dsra L, TEMP, 2 # L = (K - KK) / 4: iterations of the 4x unrolled loop
  1241. blez L, .L85
  1242. nop
  1243. .align 3
  # .L82: 4 k-steps per iteration.
  1244. .L82:
  1245. LD a5, 2 * SIZE(AO)
  1246. LD a6, 3 * SIZE(AO)
  1247. LD b5, 1 * SIZE(BO)
  1248. MADD t11, t11, a1, b1 # 1st compute
  1249. MADD t21, t21, a2, b1
  1250. LD a3, 4 * SIZE(AO)
  1251. LD a4, 5 * SIZE(AO)
  1252. LD b3, 2 * SIZE(BO)
  1253. MADD t11, t11, a5, b5 # 2nd compute
  1254. MADD t21, t21, a6, b5
  1255. LD a7, 6 * SIZE(AO)
  1256. LD a8, 7 * SIZE(AO)
  1257. LD b7, 3 * SIZE(BO)
  1258. MADD t11, t11, a3, b3 # 3rd compute
  1259. MADD t21, t21, a4, b3
  1260. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  1261. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1262. LD a1, 0 * SIZE(AO) # preload the next iteration's first k-step
  1263. LD a2, 1 * SIZE(AO)
  1264. LD b1, 0 * SIZE(BO)
  1265. MADD t11, t11, a7, b7 # 4th compute
  1266. MADD t21, t21, a8, b7
  1267. daddiu L, L, -1
  1268. bgtz L, .L82
  1269. nop
  1270. .align 3
  1271. .L85:
  1272. andi L, TEMP, 3 # L = (K - KK) % 4: leftover k-steps
  1273. blez L, .L88
  1274. nop
  1275. .align 3
  # .L86: one leftover k-step per iteration.
  1276. .L86:
  1277. MADD t11, t11, a1, b1 # single k-step update
  1278. MADD t21, t21, a2, b1
  1279. daddiu AO, AO, 2 * SIZE # AO += 2mr
  1280. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1281. LD a1, 0 * SIZE(AO) # preload the next k-step
  1282. LD a2, 1 * SIZE(AO)
  1283. LD b1, 0 * SIZE(BO)
  1284. daddiu L, L, -1
  1285. bgtz L, .L86
  1286. nop
  # .L88: subtract the accumulated products from B, solve against the 2x2
  # triangular block of A, then store the result to packed B and to C.
  1287. .L88:
  1288. daddiu TEMP, KK, -2 # TEMP = KK - 2 (mr=2)
  1289. dsll L, TEMP, 1 + BASE_SHIFT # mr=2
  1290. dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1
  1291. daddu AO, AORIG, L # AO points to the triangular data part
  1292. daddu BO, B, TEMP # BO points to the 2 B values read and rewritten below
  1293. LD b1, 0 * SIZE(BO)
  1294. LD b2, 1 * SIZE(BO)
  1295. SUB t11, b1, t11 # t = B value - accumulated product
  1296. SUB t21, b2, t21
  1297. LD b1, 3 * SIZE(AO) # triangular part: factor for row 2
  1298. LD b2, 2 * SIZE(AO) # coupling of row 2 into row 1
  1299. MUL t21, b1, t21 # row 2 is finished first (multiplied, not divided)
  1300. NMSUB t11, t11, b2, t21 # remove row 2's contribution from row 1
  1301. LD b3, 0 * SIZE(AO) # factor for row 1
  1302. MUL t11, b3, t11
  1303. daddiu CO1, CO1, -2 * SIZE # C pointer steps back 2 elements before the stores
  1304. ST t11, 0 * SIZE(BO) # write the results back over the B values
  1305. ST t21, 1 * SIZE(BO)
  1306. ST t11, 0 * SIZE(CO1) # and into C
  1307. ST t21, 1 * SIZE(CO1)
  1308. daddiu KK, KK, -2 # KK -= mr
  # .L80/.L71: single column (nr=1), 4-row (mr=4) tiles. .L71 repeats I = M >> 2 times.
  # t11..t41 accumulate rows 1..4 of the tile and are zeroed at the top of each
  # iteration.
  # NOTE(review): LD/ST/MADD/NMSUB/MUL/SUB/MTC/MOV are macros defined outside this
  # chunk. Comments below assume MADD d,c,a,b = c + a*b, NMSUB d,c,a,b = c - a*b,
  # SUB d,a,b = a - b, and SIZE = one element -- confirm against the definitions.
  1309. .align 3
  1310. .L80:
  1311. dsra I, M, 2 # I = M / 4: number of 4-row tiles
  1312. blez I, .L89 # none: finish this column
  1313. nop
  1314. .L71:
  1315. dsll TEMP, K, 2 + BASE_SHIFT # mr=4
  1316. dsubu AORIG, AORIG, TEMP # move AORIG back by one 4-row panel
  1317. dsll L, KK, 2 + BASE_SHIFT # mr=4
  1318. dsll TEMP, KK, 0 + BASE_SHIFT # nr=1
  1319. daddu AO, AORIG, L # AO points to the rectangular data part
  1320. daddu BO, B, TEMP
  1321. dsubu TEMP, K, KK # TEMP = K - KK: k-steps in the rectangular part
  1322. MTC $0, t11 # clear result registers
  1323. MOV t21, t11
  1324. MOV t31, t11
  1325. MOV t41, t11
  1326. LD a1, 0 * SIZE(AO) # rectangular part of Ai: preload the first k-step
  1327. LD a2, 1 * SIZE(AO) # mr x (K-KK) of A against (K-KK) x nr of B
  1328. LD a3, 2 * SIZE(AO)
  1329. LD a4, 3 * SIZE(AO) # 4 values of A
  1330. LD b1, 0 * SIZE(BO) # 1 value of B
  1331. dsra L, TEMP, 2 # L = (K - KK) / 4: iterations of the 4x unrolled loop
  1332. blez L, .L75
  1333. nop # filler after the branch; nothing is reset here
  1334. .align 3
  # .L72: 4 k-steps per iteration; a1..a4 and a5..a8 alternate as the A operands.
  1335. .L72:
  1336. LD a5, 4 * SIZE(AO)
  1337. LD a6, 5 * SIZE(AO)
  1338. LD a7, 6 * SIZE(AO)
  1339. LD a8, 7 * SIZE(AO)
  1340. LD b5, 1 * SIZE(BO)
  1341. MADD t11, t11, a1, b1 # 1st compute
  1342. MADD t21, t21, a2, b1
  1343. MADD t31, t31, a3, b1
  1344. MADD t41, t41, a4, b1
  1345. LD a1, 8 * SIZE(AO)
  1346. LD a2, 9 * SIZE(AO)
  1347. LD a3, 10 * SIZE(AO)
  1348. LD a4, 11 * SIZE(AO)
  1349. LD b3, 2 * SIZE(BO)
  1350. MADD t11, t11, a5, b5 # 2nd compute
  1351. MADD t21, t21, a6, b5
  1352. MADD t31, t31, a7, b5
  1353. MADD t41, t41, a8, b5
  1354. LD a5, 12 * SIZE(AO)
  1355. LD a6, 13 * SIZE(AO)
  1356. LD a7, 14 * SIZE(AO)
  1357. LD a8, 15 * SIZE(AO)
  1358. LD b7, 3 * SIZE(BO)
  1359. MADD t11, t11, a1, b3 # 3rd compute
  1360. MADD t21, t21, a2, b3
  1361. MADD t31, t31, a3, b3
  1362. MADD t41, t41, a4, b3
  1363. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  1364. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  1365. LD a1, 0 * SIZE(AO) # preload the next iteration's first k-step
  1366. LD a2, 1 * SIZE(AO)
  1367. LD a3, 2 * SIZE(AO)
  1368. LD a4, 3 * SIZE(AO)
  1369. LD b1, 0 * SIZE(BO)
  1370. MADD t11, t11, a5, b7 # 4th compute
  1371. MADD t21, t21, a6, b7
  1372. MADD t31, t31, a7, b7
  1373. MADD t41, t41, a8, b7
  1374. daddiu L, L, -1
  1375. bgtz L, .L72
  1376. nop
  1377. .align 3
  1378. .L75:
  1379. andi L, TEMP, 3 # L = (K - KK) % 4: leftover k-steps
  1380. blez L, .L78
  1381. nop
  1382. .align 3
  # .L76: one leftover k-step per iteration.
  1383. .L76:
  1384. MADD t11, t11, a1, b1 # single k-step update
  1385. MADD t21, t21, a2, b1
  1386. MADD t31, t31, a3, b1
  1387. MADD t41, t41, a4, b1
  1388. daddiu AO, AO, 4 * SIZE # AO += 4mr
  1389. daddiu BO, BO, 1 * SIZE # BO += 1nr
  1390. LD a1, 0 * SIZE(AO) # preload the next k-step
  1391. LD a2, 1 * SIZE(AO)
  1392. LD a3, 2 * SIZE(AO)
  1393. LD a4, 3 * SIZE(AO)
  1394. LD b1, 0 * SIZE(BO)
  1395. daddiu L, L, -1
  1396. bgtz L, .L76
  1397. nop
  # .L78: subtract the accumulated products from B, solve against the 4x4
  # triangular block of A from row 4 up to row 1, then store to packed B and C.
  1398. .L78:
  1399. daddiu TEMP, KK, -4 # mr=4
  1400. dsll L, TEMP, 2 + BASE_SHIFT # mr=4
  1401. dsll TEMP, TEMP, 0 + BASE_SHIFT # nr=1
  1402. daddu AO, AORIG, L # AO points to the triangular data part
  1403. daddu BO, B, TEMP # BO points to the 4 B values read and rewritten below
  1404. LD b1, 0 * SIZE(BO)
  1405. LD b2, 1 * SIZE(BO)
  1406. LD b3, 2 * SIZE(BO)
  1407. LD b4, 3 * SIZE(BO)
  1408. SUB t11, b1, t11 # t = B value - accumulated product
  1409. SUB t21, b2, t21
  1410. SUB t31, b3, t31
  1411. SUB t41, b4, t41
  1412. LD b1, 15 * SIZE(AO) # factor for row 4
  1413. LD b2, 14 * SIZE(AO) # coupling of row 4 into rows 3, 2, 1
  1414. LD b4, 13 * SIZE(AO)
  1415. LD b7, 12 * SIZE(AO)
  1416. MUL t41, b1, t41 # row 4 is finished first (multiplied, not divided)
  1417. NMSUB t31, t31, b2, t41
  1418. NMSUB t21, t21, b4, t41
  1419. NMSUB t11, t11, b7, t41
  1420. LD b3, 10 * SIZE(AO) # factor for row 3
  1421. LD b5, 9 * SIZE(AO) # coupling of row 3 into rows 2, 1
  1422. LD b8, 8 * SIZE(AO)
  1423. MUL t31, b3, t31
  1424. NMSUB t21, t21, b5, t31
  1425. NMSUB t11, t11, b8, t31
  1426. LD b6, 5 * SIZE(AO) # factor for row 2
  1427. LD b1, 4 * SIZE(AO) # coupling of row 2 into row 1
  1428. MUL t21, b6, t21
  1429. NMSUB t11, t11, b1, t21
  1430. LD b2, 0 * SIZE(AO) # factor for row 1
  1431. MUL t11, b2, t11
  1432. daddiu CO1, CO1, -4 * SIZE # C pointer steps back 4 elements before the stores
  1433. ST t11, 0 * SIZE(BO) # write the results back over the B values
  1434. ST t21, 1 * SIZE(BO)
  1435. ST t31, 2 * SIZE(BO)
  1436. ST t41, 3 * SIZE(BO)
  1437. ST t11, 0 * SIZE(CO1) # and into C
  1438. ST t21, 1 * SIZE(CO1)
  1439. ST t31, 2 * SIZE(CO1)
  1440. ST t41, 3 * SIZE(CO1)
  1441. daddiu KK, KK, -4 # KK -= mr
  1442. daddiu I, I, -1
  1443. bgtz I, .L71 # next 4-row tile
  1444. nop
  # .L89: end of the single-column pass; advance B by K << BASE_SHIFT,
  # i.e. past the K k-steps of the nr=1 panel just processed.
  1445. .align 3
  1446. .L89:
  1447. dsll TEMP, K, BASE_SHIFT # nr=1
  1448. daddu B, B, TEMP
  # .L999: common exit. Reloads integer registers $16..$25 and FP registers
  # $f24..$f28 (plus $f20..$f23 when __64BIT__ is not defined) from the stack
  # frame, then returns through $31 and releases the 144-byte frame.
  # NOTE(review): the matching saves are in the prologue, outside this chunk.
  # When __64BIT__ is not defined, $f20 is reloaded from 112($sp), the same slot
  # $25 was just reloaded from -- verify the prologue does not store both there.
  1449. .align 3
  1450. .L999:
  1451. LDARG $16, 0($sp)
  1452. LDARG $17, 8($sp)
  1453. LDARG $18, 16($sp)
  1454. LDARG $19, 24($sp)
  1455. LDARG $20, 32($sp)
  1456. LDARG $21, 40($sp)
  1457. ldc1 $f24, 48($sp)
  1458. ldc1 $f25, 56($sp)
  1459. ldc1 $f26, 64($sp)
  1460. ldc1 $f27, 72($sp)
  1461. ldc1 $f28, 80($sp)
  1462. LDARG $22, 88($sp)
  1463. LDARG $23, 96($sp)
  1464. LDARG $24, 104($sp)
  1465. LDARG $25, 112($sp)
  1466. #ifndef __64BIT__
  1467. ldc1 $f20,112($sp) # same offset as the $25 reload above
  1468. ldc1 $f21,120($sp)
  1469. ldc1 $f22,128($sp)
  1470. ldc1 $f23,136($sp)
  1471. #endif
  1472. j $31 # return to the caller
  1473. daddiu $sp, $sp, 144 # pop the frame (presumably in the jump's delay slot, assuming .set noreorder)
  1474. EPILOGUE