You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT_loongson3a.S 34 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958
  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h"
  4. #define M $4 /* row count; the I loops walk M in panels of 4, then 2, then 1 rows */
  5. #define N $5 /* column count; handled as N&1, then N&2, then N>>2 groups of 4 */
  6. #define K $6 /* depth of the packed panels (panel strides are multiples of K) */
  7. #define A $8 /* base of packed A; copied into AORIG for every column group */
  8. #define B $9 /* packed B; moved to its end on entry, then stepped backwards per group */
  9. #define C $10 /* output block; moved past its last column on entry, then stepped backwards */
  10. #define LDC $11 /* column stride of C; converted to bytes on entry */
  11. #define AO $12 /* running pointer into the current A panel */
  12. #define BO $13 /* running pointer into the current B panel */
  13. #define I $2 /* row-panel counter / M bit test */
  14. #define J $3 /* column-group counter / N bit test */
  15. #define L $7 /* depth-loop counter; also scratch for byte offsets */
  16. #define CO1 $14 /* CO1-CO4: write pointers for the columns of the current group */
  17. #define CO2 $15
  18. #define CO3 $16
  19. #define CO4 $17
  20. #define OFFSET $22 /* last argument, loaded from 144($sp) once the frame is set up */
  21. #define KK $23 /* starts at K - OFFSET; reduced by the group width after each column group */
  22. #define TEMP $24 /* general scratch */
  23. #define AORIG $25 /* start of the current A row panel */
  24. #define a1 $f0 /* a1-a8: values loaded from AO in the multiply loops */
  25. #define a2 $f1
  26. #define a3 $f26
  27. #define a4 $f27
  28. #define a5 $f28
  29. #define a6 $f29
  30. #define a7 $f30
  31. #define a8 $f31
  32. #define b1 $f2 /* b1-b8: values loaded from BO in the multiply loops; reused as temporaries in the solve steps */
  33. #define b2 $f3
  34. #define b3 $f4
  35. #define b4 $f5
  36. #define b5 $f6
  37. #define b6 $f7
  38. #define b7 $f8
  39. #define b8 $f9
  40. #define t11 $f10 /* tRC accumulators: R = row inside the panel, C = column inside the group */
  41. #define t21 $f11
  42. #define t31 $f12
  43. #define t41 $f13
  44. #define t12 $f14
  45. #define t22 $f15
  46. #define t32 $f16
  47. #define t42 $f17
  48. #define t13 $f18
  49. #define t23 $f19
  50. #define t33 $f20
  51. #define t43 $f21
  52. #define t14 $f22
  53. #define t24 $f23
  54. #define t34 $f24
  55. #define t44 $f25
  56. PROLOGUE
  57. daddiu $sp, $sp, -144 # reserve a 144-byte frame for the registers saved below
  58. SDARG $16, 0($sp) # save integer registers $16-$21
  59. SDARG $17, 8($sp)
  60. SDARG $18, 16($sp)
  61. SDARG $19, 24($sp)
  62. SDARG $20, 32($sp)
  63. SDARG $21, 40($sp)
  64. sdc1 $f24, 48($sp) # save FP registers $f24-$f28
  65. sdc1 $f25, 56($sp)
  66. sdc1 $f26, 64($sp)
  67. sdc1 $f27, 72($sp)
  68. sdc1 $f28, 80($sp) # NOTE(review): a6-a8 alias $f29-$f31, which are not stored here - confirm whether the ABI in use requires preserving them
  69. SDARG $22, 88($sp) # save $22-$25 (aliased as OFFSET, KK, TEMP, AORIG)
  70. SDARG $23, 96($sp)
  71. SDARG $24, 104($sp)
  72. SDARG $25, 112($sp)
  73. #ifndef __64BIT__
  74. sdc1 $f20,112($sp) # NOTE(review): same offset as the $25 store above, so this overwrites that slot - confirm against the restore sequence (not in view)
  75. sdc1 $f21,120($sp)
  76. sdc1 $f22,128($sp)
  77. sdc1 $f23,136($sp)
  78. #endif
  79. .align 3 # RT variant: column groups are processed from right to left
  80. LDARG OFFSET, 144($sp) # load the last argument from just above the new frame
  81. dsll LDC, LDC, BASE_SHIFT # LDC = LDC << BASE_SHIFT (stride in bytes)
  82. mult N, K
  83. mflo TEMP # TEMP = N * K
  84. dsll TEMP, TEMP, BASE_SHIFT # byte size of the packed B block (holds the triangular matrix)
  85. daddu B, B, TEMP # B now points to the end of packed B
  86. # Note: the B addressing above does not depend on M
  87. mult N, LDC
  88. mflo TEMP # TEMP = N * LDC (LDC is already in bytes)
  89. daddu C, C, TEMP # C now points just past the last column of the C block
  90. dsubu KK, K, OFFSET # KK = K - OFFSET; K - KK is the length of the rectangular part of Bj
  91. andi J, N, 1 # is there a single leftover column (nr=1)?
  92. blez J, .L30 # no: go on to the nr=2 group
  93. nop
  94. dsll TEMP, K, BASE_SHIFT # byte size of one packed B column
  95. dsubu B, B, TEMP # move B back to the beginning of Bj
  96. dsubu C, C, LDC # move C back by one column
  97. move CO1, C
  98. move AORIG, A # start from the first A panel
  99. dsra I, M, 2 # I = number of 4-row panels
  100. blez I, .L80 # none: go on to the 2-row remainder
  101. NOP
  102. .L31: # mr=4,nr=1
  103. dsll L, KK, 2 + BASE_SHIFT # mr=4
  104. dsll TEMP, KK, BASE_SHIFT # nr=1
  105. daddu AO, AORIG, L
  106. daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
  107. dsubu TEMP, K, KK # temp = the length of rectangular data part
  108. MTC $0, t11 # clear 4 results registers
  109. MOV t21, t11
  110. MOV t31, t11
  111. MOV t41, t11
  112. LD a1, 0 * SIZE(AO)
  113. LD a2, 1 * SIZE(AO)
  114. LD a3, 2 * SIZE(AO)
  115. LD a4, 3 * SIZE(AO)
  116. LD b1, 0 * SIZE(BO)
  117. dsra L, TEMP, 2 # L=(KC-offset)/4
  118. blez L, .L35
  119. NOP
  120. .align 3
  121. .L32:
  122. LD a5, 4 * SIZE(AO)
  123. LD a6, 5 * SIZE(AO)
  124. LD a7, 6 * SIZE(AO)
  125. LD a8, 7 * SIZE(AO)
  126. LD b5, 1 * SIZE(BO)
  127. MADD t11, t11, a1, b1
  128. MADD t21, t21, a2, b1
  129. MADD t31, t31, a3, b1
  130. MADD t41, t41, a4, b1
  131. LD a1, 8 * SIZE(AO)
  132. LD a2, 9 * SIZE(AO)
  133. LD a3, 10 * SIZE(AO)
  134. LD a4, 11 * SIZE(AO)
  135. LD b3, 2 * SIZE(BO)
  136. MADD t11, t11, a5, b5
  137. MADD t21, t21, a6, b5
  138. MADD t31, t31, a7, b5
  139. MADD t41, t41, a8, b5
  140. LD a5, 12 * SIZE(AO)
  141. LD a6, 13 * SIZE(AO)
  142. LD a7, 14 * SIZE(AO)
  143. LD a8, 15 * SIZE(AO)
  144. LD b7, 3 * SIZE(BO)
  145. MADD t11, t11, a1, b3
  146. MADD t21, t21, a2, b3
  147. MADD t31, t31, a3, b3
  148. MADD t41, t41, a4, b3
  149. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  150. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  151. LD a1, 0 * SIZE(AO)
  152. LD a2, 1 * SIZE(AO)
  153. LD a3, 2 * SIZE(AO)
  154. LD a4, 3 * SIZE(AO)
  155. LD b1, 0 * SIZE(BO)
  156. MADD t11, t11, a5, b7
  157. MADD t21, t21, a6, b7
  158. MADD t31, t31, a7, b7
  159. MADD t41, t41, a8, b7
  160. daddiu L, L, -1
  161. bgtz L, .L32
  162. NOP
  163. .align 3
  164. .L35:
  165. andi L, TEMP, 3
  166. blez L, .L38
  167. NOP
  168. .align 3
  169. .L36: # mr=4, nr=1 tail loop: one depth step per pass, (K - KK) & 3 passes
  170. MADD t11, t11, a1, b1
  171. MADD t21, t21, a2, b1
  172. MADD t31, t31, a3, b1
  173. MADD t41, t41, a4, b1
  174. daddiu AO, AO, 4 * SIZE # AO += 4mr
  175. daddiu BO, BO, 1 * SIZE # BO += 1nr
  176. LD a1, 0 * SIZE(AO) # preload operands for the next pass
  177. LD a2, 1 * SIZE(AO)
  178. LD a3, 2 * SIZE(AO)
  179. LD a4, 3 * SIZE(AO)
  180. LD b1, 0 * SIZE(BO)
  181. daddiu L, L, -1
  182. bgtz L, .L36
  183. NOP
  184. .align
  185. .L38:
  186. daddiu TEMP, KK, -1 # deal with the triangular data part
  187. dsll L, TEMP, 2 + BASE_SHIFT
  188. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  189. daddu AO, AORIG, L
  190. daddu BO, B, TEMP # BO point to the trigular data part
  191. LD b1, 0 * SIZE(AO) # fixed results
  192. LD b2, 1 * SIZE(AO)
  193. LD b3, 2 * SIZE(AO)
  194. LD b4, 3 * SIZE(AO)
  195. SUB t11, b1, t11
  196. SUB t21, b2, t21
  197. SUB t31, b3, t31
  198. SUB t41, b4, t41
  199. LD b2, 0 * SIZE(BO)
  200. MUL t11, b2, t11
  201. MUL t21, b2, t21
  202. MUL t31, b2, t31
  203. MUL t41, b2, t41
  204. ST t11, 0 * SIZE(AO) # updata packed A
  205. ST t21, 1 * SIZE(AO)
  206. ST t31, 2 * SIZE(AO)
  207. ST t41, 3 * SIZE(AO)
  208. ST t11, 0 * SIZE(CO1) # write back
  209. ST t21, 1 * SIZE(CO1)
  210. ST t31, 2 * SIZE(CO1)
  211. ST t41, 3 * SIZE(CO1)
  212. daddiu CO1, CO1, 4 * SIZE # fixed pointer
  213. dsll TEMP, K, 2 + BASE_SHIFT
  214. daddu AORIG, AORIG, TEMP # move to next panel Ai
  215. daddiu I, I, -1
  216. bgtz I, .L31
  217. NOP
  218. .align 3
  219. .L80: # mr=2, nr=1 remainder: taken when bit 1 of M is set
  220. andi I, M, 2
  221. blez I, .L90
  222. nop
  223. dsll L, KK, 1 + BASE_SHIFT # mr=2: byte offset of depth index KK in the A panel
  224. dsll TEMP, KK, BASE_SHIFT # nr=1: byte offset of depth index KK in the B panel
  225. daddu AO, AORIG, L
  226. daddu BO, B, TEMP # BO points to the rectangular data part (also resets BO)
  227. dsubu TEMP, K, KK # TEMP = length of the rectangular data part
  228. MTC $0, t11 # clear the 2 result registers
  229. MOV t21, t11
  230. LD a1, 0 * SIZE(AO) # preload the first operands
  231. LD a2, 1 * SIZE(AO)
  232. LD b1, 0 * SIZE(BO)
  233. dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
  234. blez L, .L85
  235. NOP
  236. .align 3
  237. .L82: # mr=2, nr=1 depth loop unrolled 4x: consumes A[0..7] and B[0..3] per pass
  238. LD a5, 2 * SIZE(AO) # operands for depth step 2
  239. LD a6, 3 * SIZE(AO)
  240. LD b5, 1 * SIZE(BO)
  241. MADD t11, t11, a1, b1 # depth step 1 (operands preloaded before the pass)
  242. MADD t21, t21, a2, b1
  243. LD a3, 4 * SIZE(AO) # operands for depth step 3
  244. LD a4, 5 * SIZE(AO)
  245. LD b3, 2 * SIZE(BO)
  246. MADD t11, t11, a5, b5 # depth step 2
  247. MADD t21, t21, a6, b5
  248. LD a7, 6 * SIZE(AO) # operands for depth step 4
  249. LD a8, 7 * SIZE(AO)
  250. LD b7, 3 * SIZE(BO)
  251. MADD t11, t11, a3, b3 # depth step 3
  252. MADD t21, t21, a4, b3
  253. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr (was 16 * SIZE, which skipped 8 unread elements of the 2-row panel)
  254. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  255. LD a1, 0 * SIZE(AO) # preload operands for the next pass or the tail loop
  256. LD a2, 1 * SIZE(AO)
  257. LD b1, 0 * SIZE(BO)
  258. MADD t11, t11, a7, b7 # depth step 4
  259. MADD t21, t21, a8, b7
  260. daddiu L, L, -1
  261. bgtz L, .L82
  262. NOP
  263. .align 3
  264. .L85:
  265. andi L, TEMP, 3
  266. blez L, .L88
  267. NOP
  268. .align 3
  269. .L86:
  270. MADD t11, t11, a1, b1
  271. MADD t21, t21, a2, b1
  272. daddiu AO, AO, 2 * SIZE # AO += 2mr
  273. daddiu BO, BO, 1 * SIZE # BO += 1nr
  274. LD a1, 0 * SIZE(AO)
  275. LD a2, 1 * SIZE(AO)
  276. LD b1, 0 * SIZE(BO)
  277. daddiu L, L, -1
  278. bgtz L, .L86
  279. NOP
  280. .align
  281. .L88:
  282. daddiu TEMP, KK, -1 # deal with the triangular data part
  283. dsll L, TEMP, 1 + BASE_SHIFT
  284. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  285. daddu AO, AORIG, L
  286. daddu BO, B, TEMP # BO point to the trigular data part
  287. LD b1, 0 * SIZE(AO) # fixed results
  288. LD b2, 1 * SIZE(AO)
  289. SUB t11, b1, t11
  290. SUB t21, b2, t21
  291. LD b2, 0 * SIZE(BO)
  292. MUL t11, b2, t11
  293. MUL t21, b2, t21
  294. ST t11, 0 * SIZE(AO) # updata packed A
  295. ST t21, 1 * SIZE(AO)
  296. ST t11, 0 * SIZE(CO1) # write back
  297. ST t21, 1 * SIZE(CO1)
  298. daddiu CO1, CO1, 2 * SIZE # fixed pointer
  299. dsll TEMP, K, 1 + BASE_SHIFT
  300. daddu AORIG, AORIG, TEMP # move to next panel Ai
  301. .align 3
  302. .L90: # mr=1, nr=1 remainder: taken when bit 0 of M is set
  303. andi I, M, 1
  304. blez I, .L39
  305. nop
  306. dsll L, KK, BASE_SHIFT # mr=1: byte offset of depth index KK in the A panel
  307. dsll TEMP, KK, BASE_SHIFT # nr=1: byte offset of depth index KK in the B panel
  308. daddu AO, AORIG, L
  309. daddu BO, B, TEMP # BO points to the rectangular data part (also resets BO)
  310. dsubu TEMP, K, KK # TEMP = length of the rectangular data part
  311. MTC $0, t11 # clear the single result register
  312. LD a1, 0 * SIZE(AO) # preload the first operands
  313. LD b1, 0 * SIZE(BO)
  314. dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
  315. blez L, .L95
  316. NOP
  317. .align 3
  318. .L92:
  319. LD a5, 1 * SIZE(AO)
  320. LD b5, 1 * SIZE(BO)
  321. MADD t11, t11, a1, b1
  322. LD a3, 2 * SIZE(AO)
  323. LD b3, 2 * SIZE(BO)
  324. MADD t11, t11, a5, b5
  325. LD a7, 3 * SIZE(AO)
  326. LD b7, 3 * SIZE(BO)
  327. MADD t11, t11, a3, b3
  328. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  329. daddiu BO, BO, 4 * SIZE # BO += 1nr*4kr
  330. LD a1, 0 * SIZE(AO)
  331. LD b1, 0 * SIZE(BO)
  332. MADD t11, t11, a7, b7
  333. daddiu L, L, -1
  334. bgtz L, .L92
  335. NOP
  336. .align 3
  337. .L95:
  338. andi L, TEMP, 3
  339. blez L, .L98
  340. NOP
  341. .align 3
  342. .L96: # mr=1, nr=1 tail loop: one depth step per pass, (K - KK) & 3 passes
  343. MADD t11, t11, a1, b1
  344. daddiu AO, AO, 1 * SIZE # AO += 1mr
  345. daddiu BO, BO, 1 * SIZE # BO += 1nr
  346. LD a1, 0 * SIZE(AO) # preload operands for the next pass
  347. LD b1, 0 * SIZE(BO)
  348. daddiu L, L, -1
  349. bgtz L, .L96
  350. NOP
  351. .align
  352. .L98:
  353. daddiu TEMP, KK, -1 # deal with the triangular data part
  354. dsll L, TEMP, BASE_SHIFT
  355. dsll TEMP, TEMP, BASE_SHIFT # nr=1
  356. daddu AO, AORIG, L
  357. daddu BO, B, TEMP # BO point to the trigular data part
  358. LD b1, 0 * SIZE(AO) # fixed results
  359. SUB t11, b1, t11
  360. LD b2, 0 * SIZE(BO)
  361. MUL t11, b2, t11
  362. ST t11, 0 * SIZE(AO) # updata packed A
  363. ST t11, 0 * SIZE(CO1) # write back
  364. daddiu CO1, CO1, 1 * SIZE # fixed pointer
  365. dsll TEMP, K, BASE_SHIFT
  366. daddu AORIG, AORIG, TEMP # move to next panel Ai
  367. .L39:
  368. daddiu KK, KK, -1 # rectangular data length increased by 1
  369. .align 3
  370. .L30: # nr=2
  371. andi J, N, 2
  372. blez J, .L50
  373. nop
  374. dsll TEMP, K, 1 + BASE_SHIFT # Kc*2nr move B to the beginning address of Bj
  375. dsubu B, B, TEMP
  376. dsll TEMP, LDC, 1 # C
  377. dsubu C, C, TEMP
  378. move CO1, C
  379. daddu CO2, C, LDC
  380. move AORIG, A
  381. dsra I, M, 2
  382. blez I, .L60
  383. NOP
  384. .L51: # mr=4,nr=2
  385. dsll L, KK, 2 + BASE_SHIFT # mr=4
  386. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2
  387. daddu AO, AORIG, L
  388. daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
  389. dsubu TEMP, K, KK # temp = the length of rectangular data part
  390. MTC $0, t11 # clear 8 results registers
  391. MOV t21, t11
  392. MOV t31, t11
  393. MOV t41, t11
  394. MOV t12, t11
  395. MOV t22, t11
  396. MOV t32, t11
  397. MOV t42, t11
  398. LD a1, 0 * SIZE(AO)
  399. LD a2, 1 * SIZE(AO)
  400. LD a3, 2 * SIZE(AO)
  401. LD a4, 3 * SIZE(AO)
  402. LD b1, 0 * SIZE(BO)
  403. LD b2, 1 * SIZE(BO)
  404. dsra L, TEMP, 2 # L=(KC-offset)/4
  405. blez L, .L55
  406. NOP
  407. .align 3
  408. .L52:
  409. LD a5, 4 * SIZE(AO)
  410. LD a6, 5 * SIZE(AO)
  411. LD a7, 6 * SIZE(AO)
  412. LD a8, 7 * SIZE(AO)
  413. LD b5, 2 * SIZE(BO)
  414. LD b6, 3 * SIZE(BO)
  415. MADD t11, t11, a1, b1
  416. MADD t21, t21, a2, b1
  417. MADD t31, t31, a3, b1
  418. MADD t41, t41, a4, b1
  419. MADD t12, t12, a1, b2
  420. MADD t22, t22, a2, b2
  421. MADD t32, t32, a3, b2
  422. MADD t42, t42, a4, b2
  423. LD a1, 8 * SIZE(AO)
  424. LD a2, 9 * SIZE(AO)
  425. LD a3, 10 * SIZE(AO)
  426. LD a4, 11 * SIZE(AO)
  427. LD b3, 4 * SIZE(BO)
  428. LD b4, 5 * SIZE(BO)
  429. MADD t11, t11, a5, b5
  430. MADD t21, t21, a6, b5
  431. MADD t31, t31, a7, b5
  432. MADD t41, t41, a8, b5
  433. MADD t12, t12, a5, b6
  434. MADD t22, t22, a6, b6
  435. MADD t32, t32, a7, b6
  436. MADD t42, t42, a8, b6
  437. LD a5, 12 * SIZE(AO)
  438. LD a6, 13 * SIZE(AO)
  439. LD a7, 14 * SIZE(AO)
  440. LD a8, 15 * SIZE(AO)
  441. LD b7, 6 * SIZE(BO)
  442. LD b8, 7 * SIZE(BO)
  443. MADD t11, t11, a1, b3
  444. MADD t21, t21, a2, b3
  445. MADD t31, t31, a3, b3
  446. MADD t41, t41, a4, b3
  447. MADD t12, t12, a1, b4
  448. MADD t22, t22, a2, b4
  449. MADD t32, t32, a3, b4
  450. MADD t42, t42, a4, b4
  451. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  452. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  453. LD a1, 0 * SIZE(AO)
  454. LD a2, 1 * SIZE(AO)
  455. LD a3, 2 * SIZE(AO)
  456. LD a4, 3 * SIZE(AO)
  457. LD b1, 0 * SIZE(BO)
  458. LD b2, 1 * SIZE(BO)
  459. MADD t11, t11, a5, b7
  460. MADD t21, t21, a6, b7
  461. MADD t31, t31, a7, b7
  462. MADD t41, t41, a8, b7
  463. MADD t12, t12, a5, b8
  464. MADD t22, t22, a6, b8
  465. MADD t32, t32, a7, b8
  466. MADD t42, t42, a8, b8
  467. daddiu L, L, -1
  468. bgtz L, .L52
  469. NOP
  470. .align 3
  471. .L55:
  472. andi L, TEMP, 3
  473. blez L, .L58
  474. NOP
  475. .align 3
  476. .L56:
  477. MADD t11, t11, a1, b1
  478. MADD t21, t21, a2, b1
  479. MADD t31, t31, a3, b1
  480. MADD t41, t41, a4, b1
  481. MADD t12, t12, a1, b2
  482. MADD t22, t22, a2, b2
  483. MADD t32, t32, a3, b2
  484. MADD t42, t42, a4, b2
  485. daddiu AO, AO, 4 * SIZE # AO += 4mr
  486. daddiu BO, BO, 2 * SIZE # BO += 2nr
  487. LD a1, 0 * SIZE(AO)
  488. LD a2, 1 * SIZE(AO)
  489. LD a3, 2 * SIZE(AO)
  490. LD a4, 3 * SIZE(AO)
  491. LD b1, 0 * SIZE(BO)
  492. LD b2, 1 * SIZE(BO)
  493. daddiu L, L, -1
  494. bgtz L, .L56
  495. NOP
  496. .align
  497. .L58:
  498. daddiu TEMP, KK, -2 # deal with the triangular data part
  499. dsll L, TEMP, 2 + BASE_SHIFT
  500. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  501. daddu AO, AORIG, L
  502. daddu BO, B, TEMP # BO point to the trigular data part
  503. LD b1, 0 * SIZE(AO) # fixed results
  504. LD b2, 1 * SIZE(AO)
  505. LD b3, 2 * SIZE(AO)
  506. LD b4, 3 * SIZE(AO)
  507. SUB t11, b1, t11
  508. SUB t21, b2, t21
  509. SUB t31, b3, t31
  510. SUB t41, b4, t41
  511. LD b5, 4 * SIZE(AO)
  512. LD b6, 5 * SIZE(AO)
  513. LD b7, 6 * SIZE(AO)
  514. LD b8, 7 * SIZE(AO)
  515. SUB t12, b5, t12
  516. SUB t22, b6, t22
  517. SUB t32, b7, t32
  518. SUB t42, b8, t42
  519. LD b8, 3 * SIZE(BO)
  520. LD b1, 2 * SIZE(BO)
  521. MUL t12, b8, t12
  522. MUL t22, b8, t22
  523. MUL t32, b8, t32
  524. MUL t42, b8, t42
  525. NMSUB t11, t11, b1, t12
  526. NMSUB t21, t21, b1, t22
  527. NMSUB t31, t31, b1, t32
  528. NMSUB t41, t41, b1, t42
  529. LD b2, 0 * SIZE(BO)
  530. MUL t11, b2, t11
  531. MUL t21, b2, t21
  532. MUL t31, b2, t31
  533. MUL t41, b2, t41
  534. ST t11, 0 * SIZE(AO) # updata packed A
  535. ST t21, 1 * SIZE(AO)
  536. ST t31, 2 * SIZE(AO)
  537. ST t41, 3 * SIZE(AO)
  538. ST t12, 4 * SIZE(AO)
  539. ST t22, 5 * SIZE(AO)
  540. ST t32, 6 * SIZE(AO)
  541. ST t42, 7 * SIZE(AO)
  542. ST t11, 0 * SIZE(CO1) # write back
  543. ST t21, 1 * SIZE(CO1)
  544. ST t31, 2 * SIZE(CO1)
  545. ST t41, 3 * SIZE(CO1)
  546. ST t12, 0 * SIZE(CO2)
  547. ST t22, 1 * SIZE(CO2)
  548. ST t32, 2 * SIZE(CO2)
  549. ST t42, 3 * SIZE(CO2)
  550. daddiu CO1, CO1, 4 * SIZE # fixed pointer
  551. daddiu CO2, CO2, 4 * SIZE
  552. dsll TEMP, K, 2 + BASE_SHIFT
  553. daddu AORIG, AORIG, TEMP # move to next panel Ai
  554. daddiu I, I, -1
  555. bgtz I, .L51
  556. NOP
  557. .align 3
  558. .L60: # mr=2, nr=2 remainder: taken when bit 1 of M is set
  559. andi I, M, 2 # mr=2
  560. blez I, .L70
  561. nop
  562. dsll L, KK, 1 + BASE_SHIFT # mr=2: byte offset of depth index KK in the A panel
  563. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2: byte offset of depth index KK in the B panel
  564. daddu AO, AORIG, L
  565. daddu BO, B, TEMP # BO points to the rectangular data part (also resets BO)
  566. dsubu TEMP, K, KK # TEMP = length of the rectangular data part
  567. MTC $0, t11 # clear the 4 result registers
  568. MOV t21, t11
  569. MOV t12, t11
  570. MOV t22, t11
  571. LD a1, 0 * SIZE(AO) # preload the first operands
  572. LD a2, 1 * SIZE(AO)
  573. LD b1, 0 * SIZE(BO)
  574. LD b2, 1 * SIZE(BO)
  575. dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
  576. blez L, .L65
  577. NOP
  578. .align 3
  579. .L62:
  580. LD a5, 2 * SIZE(AO)
  581. LD a6, 3 * SIZE(AO)
  582. LD b5, 2 * SIZE(BO)
  583. LD b6, 3 * SIZE(BO)
  584. MADD t11, t11, a1, b1
  585. MADD t21, t21, a2, b1
  586. MADD t12, t12, a1, b2
  587. MADD t22, t22, a2, b2
  588. LD a3, 4 * SIZE(AO)
  589. LD a4, 5 * SIZE(AO)
  590. LD b3, 4 * SIZE(BO)
  591. LD b4, 5 * SIZE(BO)
  592. MADD t11, t11, a5, b5
  593. MADD t21, t21, a6, b5
  594. MADD t12, t12, a5, b6
  595. MADD t22, t22, a6, b6
  596. LD a7, 6 * SIZE(AO)
  597. LD a8, 7 * SIZE(AO)
  598. LD b7, 6 * SIZE(BO)
  599. LD b8, 7 * SIZE(BO)
  600. MADD t11, t11, a3, b3
  601. MADD t21, t21, a4, b3
  602. MADD t12, t12, a3, b4
  603. MADD t22, t22, a4, b4
  604. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  605. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  606. LD a1, 0 * SIZE(AO)
  607. LD a2, 1 * SIZE(AO)
  608. LD b1, 0 * SIZE(BO)
  609. LD b2, 1 * SIZE(BO)
  610. MADD t11, t11, a7, b7
  611. MADD t21, t21, a8, b7
  612. MADD t12, t12, a7, b8
  613. MADD t22, t22, a8, b8
  614. daddiu L, L, -1
  615. bgtz L, .L62
  616. NOP
  617. .align 3
  618. .L65:
  619. andi L, TEMP, 3
  620. blez L, .L68
  621. NOP
  622. .align 3
  623. .L66:
  624. MADD t11, t11, a1, b1
  625. MADD t21, t21, a2, b1
  626. MADD t12, t12, a1, b2
  627. MADD t22, t22, a2, b2
  628. daddiu AO, AO, 2 * SIZE # AO += 2mr
  629. daddiu BO, BO, 2 * SIZE # BO += 2nr
  630. LD a1, 0 * SIZE(AO)
  631. LD a2, 1 * SIZE(AO)
  632. LD b1, 0 * SIZE(BO)
  633. LD b2, 1 * SIZE(BO)
  634. daddiu L, L, -1
  635. bgtz L, .L66
  636. NOP
  637. .align
  638. .L68:
  639. daddiu TEMP, KK, -2 # deal with the triangular data part
  640. dsll L, TEMP, 1 + BASE_SHIFT
  641. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2
  642. daddu AO, AORIG, L
  643. daddu BO, B, TEMP # BO point to the trigular data part
  644. LD b1, 0 * SIZE(AO) # fixed results
  645. LD b2, 1 * SIZE(AO)
  646. LD b3, 2 * SIZE(AO)
  647. LD b4, 3 * SIZE(AO)
  648. SUB t11, b1, t11
  649. SUB t21, b2, t21
  650. SUB t12, b3, t12
  651. SUB t22, b4, t22
  652. LD b8, 3 * SIZE(BO)
  653. LD b7, 2 * SIZE(BO)
  654. MUL t12, b8, t12
  655. MUL t22, b8, t22
  656. NMSUB t11, t11, b7, t12
  657. NMSUB t21, t21, b7, t22
  658. LD b6, 0 * SIZE(BO)
  659. MUL t11, b6, t11
  660. MUL t21, b6, t21
  661. ST t11, 0 * SIZE(AO) # updata packed A
  662. ST t21, 1 * SIZE(AO)
  663. ST t12, 2 * SIZE(AO)
  664. ST t22, 3 * SIZE(AO)
  665. ST t11, 0 * SIZE(CO1) # write back
  666. ST t21, 1 * SIZE(CO1)
  667. ST t12, 0 * SIZE(CO2)
  668. ST t22, 1 * SIZE(CO2)
  669. daddiu CO1, CO1, 2 * SIZE # fixed pointer
  670. daddiu CO2, CO2, 2 * SIZE
  671. dsll TEMP, K, 1 + BASE_SHIFT # mr=2
  672. daddu AORIG, AORIG, TEMP # move to next panel Ai
  673. .align 3
  674. .L70: # mr=1, nr=2 remainder: taken when bit 0 of M is set
  675. andi I, M, 1 # mr=1
  676. blez I, .L59
  677. nop
  678. dsll L, KK, BASE_SHIFT # mr=1: byte offset of depth index KK in the A panel
  679. dsll TEMP, KK, 1 + BASE_SHIFT # nr=2: byte offset of depth index KK in the B panel
  680. daddu AO, AORIG, L
  681. daddu BO, B, TEMP # BO points to the rectangular data part (also resets BO)
  682. dsubu TEMP, K, KK # TEMP = length of the rectangular data part
  683. MTC $0, t11 # clear the 2 result registers
  684. MOV t12, t11
  685. LD a1, 0 * SIZE(AO) # preload the first operands
  686. LD b1, 0 * SIZE(BO)
  687. LD b2, 1 * SIZE(BO)
  688. dsra L, TEMP, 2 # L = (K - KK) / 4 unrolled passes
  689. blez L, .L75
  690. NOP
  691. .align 3
  692. .L72:
  693. LD a5, 1 * SIZE(AO)
  694. LD b5, 2 * SIZE(BO)
  695. LD b6, 3 * SIZE(BO)
  696. MADD t11, t11, a1, b1
  697. MADD t12, t12, a1, b2
  698. LD a3, 2 * SIZE(AO)
  699. LD b3, 4 * SIZE(BO)
  700. LD b4, 5 * SIZE(BO)
  701. MADD t11, t11, a5, b5
  702. MADD t12, t12, a5, b6
  703. LD a7, 3 * SIZE(AO)
  704. LD b7, 6 * SIZE(BO)
  705. LD b8, 7 * SIZE(BO)
  706. MADD t11, t11, a3, b3
  707. MADD t12, t12, a3, b4
  708. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  709. daddiu BO, BO, 8 * SIZE # BO += 2nr*4kr
  710. LD a1, 0 * SIZE(AO)
  711. LD b1, 0 * SIZE(BO)
  712. LD b2, 1 * SIZE(BO)
  713. MADD t11, t11, a7, b7
  714. MADD t12, t12, a7, b8
  715. daddiu L, L, -1
  716. bgtz L, .L72
  717. NOP
  718. .align 3
  719. .L75:
  720. andi L, TEMP, 3
  721. blez L, .L78
  722. NOP
  723. .align 3
  724. .L76:
  725. MADD t11, t11, a1, b1
  726. MADD t12, t12, a1, b2
  727. daddiu AO, AO, 1 * SIZE # AO += 1mr
  728. daddiu BO, BO, 2 * SIZE # BO += 2nr
  729. LD a1, 0 * SIZE(AO)
  730. LD b1, 0 * SIZE(BO)
  731. LD b2, 1 * SIZE(BO)
  732. daddiu L, L, -1
  733. bgtz L, .L76
  734. NOP
  735. .align
  736. .L78: # mr=1, nr=2 solve step on the triangular part
  737. daddiu TEMP, KK, -2 # the triangular block starts at depth index KK - 2
  738. dsll L, TEMP, BASE_SHIFT # mr=1: byte offset into the A panel
  739. dsll TEMP, TEMP, 1 + BASE_SHIFT # nr=2: byte offset into the B panel
  740. daddu AO, AORIG, L
  741. daddu BO, B, TEMP # BO points to the triangular data part
  742. LD b1, 0 * SIZE(AO) # load the two stored values for this row
  743. LD b2, 1 * SIZE(AO)
  744. SUB t11, b1, t11 # stored value combined with the accumulated product (presumably t11 = b1 - t11; SUB comes from common.h)
  745. SUB t12, b2, t12
  746. LD b8, 3 * SIZE(BO) # element 3 of the 2x2 block at BO; presumably a pre-inverted diagonal entry since it is multiplied, not divided - confirm against the packing code
  747. LD b7, 2 * SIZE(BO) # element 2 of the 2x2 block (off-diagonal coupling term)
  748. MUL t12, b8, t12 # finish the second column first
  749. NMSUB t11, t11, b7, t12 # fold the second-column result into the first (presumably t11 -= b7 * t12; NMSUB comes from common.h)
  750. LD b6, 0 * SIZE(BO) # element 0 of the 2x2 block
  751. MUL t11, b6, t11
  752. ST t11, 0 * SIZE(AO) # update packed A with the results
  753. ST t12, 1 * SIZE(AO)
  754. ST t11, 0 * SIZE(CO1) # write back to C
  755. ST t12, 0 * SIZE(CO2)
  756. daddiu CO1, CO1, 1 * SIZE # advance the C pointers by one row
  757. daddiu CO2, CO2, 1 * SIZE
  758. dsll TEMP, K, BASE_SHIFT # mr=1: one row of K elements
  759. daddu AORIG, AORIG, TEMP # move to next panel Ai
  760. .L59:
  761. daddiu KK, KK, -2 # rectangular data length increased by 2
  762. .align 3
  763. .L50:
  764. dsra J, N, 2 # J = NC/4
  765. blez J, .L999
  766. NOP
  767. .L10:
  768. dsll TEMP, K, 2 + BASE_SHIFT
  769. dsubu B, B, TEMP # move B to the beginning address of Bj
  770. dsll TEMP, LDC, 2
  771. dsubu C, C, TEMP # move C to the beginning address of Cj
  772. daddiu J, J, -1
  773. move CO1, C
  774. daddu CO2, C, LDC
  775. daddu CO3, CO2, LDC
  776. daddu CO4, CO3, LDC
  777. move AORIG, A # reset A
  778. dsra I, M, 2 # I=MC/4
  779. blez I, .L20
  780. NOP
  781. .align 3
  782. .L11:
  783. dsll L, KK, 2 + BASE_SHIFT # mr=4
  784. dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
  785. daddu AO, AORIG, L
  786. daddu BO, B, TEMP # BO point to the retangular data part,also reset BO
  787. dsubu TEMP, K, KK # temp = the length of rectangular data part
  788. MTC $0, t11 # clear 16 results registers
  789. MOV t21, t11
  790. MOV t31, t11
  791. MOV t41, t11
  792. MOV t12, t11
  793. MOV t22, t11
  794. MOV t32, t11
  795. MOV t42, t11
  796. MOV t13, t11
  797. MOV t23, t11
  798. MOV t33, t11
  799. MOV t43, t11
  800. MOV t14, t11
  801. MOV t24, t11
  802. MOV t34, t11
  803. MOV t44, t11
  804. LD a1, 0 * SIZE(AO)
  805. LD a2, 1 * SIZE(AO)
  806. LD a3, 2 * SIZE(AO)
  807. LD a4, 3 * SIZE(AO)
  808. LD b1, 0 * SIZE(BO)
  809. LD b2, 1 * SIZE(BO)
  810. LD b3, 2 * SIZE(BO)
  811. LD b4, 3 * SIZE(BO)
  812. dsra L, TEMP, 2 # L=(KC-offset)/4
  813. blez L, .L15
  814. NOP
  815. .align 3
  816. .L12:
  817. LD a5, 4 * SIZE(AO)
  818. LD a6, 5 * SIZE(AO)
  819. LD a7, 6 * SIZE(AO)
  820. LD a8, 7 * SIZE(AO)
  821. LD b5, 4 * SIZE(BO)
  822. LD b6, 5 * SIZE(BO)
  823. LD b7, 6 * SIZE(BO)
  824. LD b8, 7 * SIZE(BO)
  825. MADD t11, t11, a1, b1
  826. MADD t21, t21, a2, b1
  827. MADD t31, t31, a3, b1
  828. MADD t41, t41, a4, b1
  829. MADD t12, t12, a1, b2
  830. MADD t22, t22, a2, b2
  831. MADD t32, t32, a3, b2
  832. MADD t42, t42, a4, b2
  833. MADD t13, t13, a1, b3
  834. MADD t23, t23, a2, b3
  835. MADD t33, t33, a3, b3
  836. MADD t43, t43, a4, b3
  837. MADD t14, t14, a1, b4
  838. MADD t24, t24, a2, b4
  839. MADD t34, t34, a3, b4
  840. MADD t44, t44, a4, b4 # fisrt
  841. LD a1, 8 * SIZE(AO)
  842. LD a2, 9 * SIZE(AO)
  843. LD a3, 10 * SIZE(AO)
  844. LD a4, 11 * SIZE(AO)
  845. LD b1, 8 * SIZE(BO)
  846. LD b2, 9 * SIZE(BO)
  847. LD b3, 10 * SIZE(BO)
  848. LD b4, 11 * SIZE(BO)
  849. MADD t11, t11, a5, b5
  850. MADD t21, t21, a6, b5
  851. MADD t31, t31, a7, b5
  852. MADD t41, t41, a8, b5
  853. MADD t12, t12, a5, b6
  854. MADD t22, t22, a6, b6
  855. MADD t32, t32, a7, b6
  856. MADD t42, t42, a8, b6
  857. MADD t13, t13, a5, b7
  858. MADD t23, t23, a6, b7
  859. MADD t33, t33, a7, b7
  860. MADD t43, t43, a8, b7
  861. MADD t14, t14, a5, b8
  862. MADD t24, t24, a6, b8
  863. MADD t34, t34, a7, b8
  864. MADD t44, t44, a8, b8 # second
  865. LD a5, 12 * SIZE(AO)
  866. LD a6, 13 * SIZE(AO)
  867. LD a7, 14 * SIZE(AO)
  868. LD a8, 15 * SIZE(AO)
  869. LD b5, 12 * SIZE(BO)
  870. LD b6, 13 * SIZE(BO)
  871. LD b7, 14 * SIZE(BO)
  872. LD b8, 15 * SIZE(BO)
  873. MADD t11, t11, a1, b1
  874. MADD t21, t21, a2, b1
  875. MADD t31, t31, a3, b1
  876. MADD t41, t41, a4, b1
  877. MADD t12, t12, a1, b2
  878. MADD t22, t22, a2, b2
  879. MADD t32, t32, a3, b2
  880. MADD t42, t42, a4, b2
  881. MADD t13, t13, a1, b3
  882. MADD t23, t23, a2, b3
  883. MADD t33, t33, a3, b3
  884. MADD t43, t43, a4, b3
  885. MADD t14, t14, a1, b4
  886. MADD t24, t24, a2, b4
  887. MADD t34, t34, a3, b4
  888. MADD t44, t44, a4, b4 # third
  889. daddiu AO, AO, 16 * SIZE # AO += 4mr*4kr
  890. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  891. LD a1, 0 * SIZE(AO)
  892. LD a2, 1 * SIZE(AO)
  893. LD a3, 2 * SIZE(AO)
  894. LD a4, 3 * SIZE(AO)
  895. LD b1, 0 * SIZE(BO)
  896. LD b2, 1 * SIZE(BO)
  897. LD b3, 2 * SIZE(BO)
  898. LD b4, 3 * SIZE(BO)
  899. MADD t11, t11, a5, b5
  900. MADD t21, t21, a6, b5
  901. MADD t31, t31, a7, b5
  902. MADD t41, t41, a8, b5
  903. MADD t12, t12, a5, b6
  904. MADD t22, t22, a6, b6
  905. MADD t32, t32, a7, b6
  906. MADD t42, t42, a8, b6
  907. MADD t13, t13, a5, b7
  908. MADD t23, t23, a6, b7
  909. MADD t33, t33, a7, b7
  910. MADD t43, t43, a8, b7
  911. MADD t14, t14, a5, b8
  912. MADD t24, t24, a6, b8
  913. MADD t34, t34, a7, b8
  914. MADD t44, t44, a8, b8 # fouth
  915. daddiu L, L, -1
  916. bgtz L, .L12
  917. NOP
  918. .align 3
  919. .L15: # k remainder for the 4x4 tile: run the (TEMP & 3) steps not covered by the 4x unrolled loop
  920. andi L, TEMP, 3 # L = TEMP & 3
  921. blez L, .L18 # no remainder: go straight to the solve step
  922. NOP # presumably fills the branch delay slot
  923. .align 3
  924. .L16: # one k step per iteration; a1-a4 and b1-b4 are already loaded on entry
  925. MADD t11, t11, a1, b1 # column b1 of the tile (assuming MADD d, c, a, b computes c + a*b)
  926. MADD t21, t21, a2, b1
  927. MADD t31, t31, a3, b1
  928. MADD t41, t41, a4, b1
  929. MADD t12, t12, a1, b2 # column b2
  930. MADD t22, t22, a2, b2
  931. MADD t32, t32, a3, b2
  932. MADD t42, t42, a4, b2
  933. MADD t13, t13, a1, b3 # column b3
  934. MADD t23, t23, a2, b3
  935. MADD t33, t33, a3, b3
  936. MADD t43, t43, a4, b3
  937. MADD t14, t14, a1, b4 # column b4
  938. MADD t24, t24, a2, b4
  939. MADD t34, t34, a3, b4
  940. MADD t44, t44, a4, b4 # end of the single k step
  941. daddiu AO, AO, 4 * SIZE # AO += 4mr
  942. daddiu BO, BO, 4 * SIZE # BO += 4nr
  943. LD a1, 0 * SIZE(AO) # reload the A operands for the next step
  944. LD a2, 1 * SIZE(AO)
  945. LD a3, 2 * SIZE(AO)
  946. LD a4, 3 * SIZE(AO)
  947. LD b1, 0 * SIZE(BO) # reload the B operands for the next step
  948. LD b2, 1 * SIZE(BO)
  949. LD b3, 2 * SIZE(BO)
  950. LD b4, 3 * SIZE(BO)
  951. daddiu L, L, -1
  952. bgtz L, .L16 # repeat while steps remain
  953. NOP
  954. .align # NOTE(review): no alignment operand here, unlike the .align 3 used elsewhere - confirm intended
  955. .L18: # solve step for the 4x4 tile: subtract the accumulated products, back-substitute over the 4 columns, store to packed A and to C
  956. daddiu TEMP, KK, -4 # deal with the triangular data part
  957. dsll L, TEMP, 2 + BASE_SHIFT # byte offset of (KK-4)*4 elements into the A panel
  958. dsll TEMP, TEMP, 2 + BASE_SHIFT # same offset for B (nr=4)
  959. daddu AO, AORIG, L
  960. daddu BO, B, TEMP # BO points to the triangular data part
  961. LD b1, 0 * SIZE(AO) # load the 16 stored values; the solved results are written back to these slots below
  962. LD b2, 1 * SIZE(AO)
  963. LD b3, 2 * SIZE(AO)
  964. LD b4, 3 * SIZE(AO)
  965. SUB t11, b1, t11 # column 1 (assuming SUB d, a, b computes a - b)
  966. SUB t21, b2, t21
  967. SUB t31, b3, t31
  968. SUB t41, b4, t41
  969. LD b5, 4 * SIZE(AO)
  970. LD b6, 5 * SIZE(AO)
  971. LD b7, 6 * SIZE(AO)
  972. LD b8, 7 * SIZE(AO)
  973. SUB t12, b5, t12 # column 2
  974. SUB t22, b6, t22
  975. SUB t32, b7, t32
  976. SUB t42, b8, t42
  977. LD b1, 8 * SIZE(AO)
  978. LD b2, 9 * SIZE(AO)
  979. LD b3, 10 * SIZE(AO)
  980. LD b4, 11 * SIZE(AO)
  981. SUB t13, b1, t13 # column 3
  982. SUB t23, b2, t23
  983. SUB t33, b3, t33
  984. SUB t43, b4, t43
  985. LD b5, 12 * SIZE(AO)
  986. LD b6, 13 * SIZE(AO)
  987. LD b7, 14 * SIZE(AO)
  988. LD b8, 15 * SIZE(AO)
  989. SUB t14, b5, t14 # column 4
  990. SUB t24, b6, t24
  991. SUB t34, b7, t34
  992. SUB t44, b8, t44
  993. LD b1, 15 * SIZE(BO) # last row of the 4x4 B block, read from the diagonal entry backwards
  994. LD b2, 14 * SIZE(BO)
  995. LD b3, 13 * SIZE(BO)
  996. LD b4, 12 * SIZE(BO)
  997. MUL t14, b1, t14 # scale column 4 by BO[15] (presumably a pre-inverted diagonal - confirm against the packing code)
  998. MUL t24, b1, t24
  999. MUL t34, b1, t34
  1000. MUL t44, b1, t44
  1001. NMSUB t13, t13, b2, t14 # remove column 4 from column 3 (assuming NMSUB d, c, a, b computes c - a*b)
  1002. NMSUB t23, t23, b2, t24
  1003. NMSUB t33, t33, b2, t34
  1004. NMSUB t43, t43, b2, t44
  1005. NMSUB t12, t12, b3, t14 # remove column 4 from column 2
  1006. NMSUB t22, t22, b3, t24
  1007. NMSUB t32, t32, b3, t34
  1008. NMSUB t42, t42, b3, t44
  1009. NMSUB t11, t11, b4, t14 # remove column 4 from column 1
  1010. NMSUB t21, t21, b4, t24
  1011. NMSUB t31, t31, b4, t34
  1012. NMSUB t41, t41, b4, t44
  1013. LD b5, 10 * SIZE(BO) # third row of the B block
  1014. LD b6, 9 * SIZE(BO)
  1015. LD b7, 8 * SIZE(BO)
  1016. MUL t13, b5, t13 # scale column 3 by BO[10]
  1017. MUL t23, b5, t23
  1018. MUL t33, b5, t33
  1019. MUL t43, b5, t43
  1020. NMSUB t12, t12, b6, t13 # remove column 3 from column 2
  1021. NMSUB t22, t22, b6, t23
  1022. NMSUB t32, t32, b6, t33
  1023. NMSUB t42, t42, b6, t43
  1024. NMSUB t11, t11, b7, t13 # remove column 3 from column 1
  1025. NMSUB t21, t21, b7, t23
  1026. NMSUB t31, t31, b7, t33
  1027. NMSUB t41, t41, b7, t43
  1028. LD b8, 5 * SIZE(BO) # second row of the B block
  1029. LD b1, 4 * SIZE(BO)
  1030. MUL t12, b8, t12 # scale column 2 by BO[5]
  1031. MUL t22, b8, t22
  1032. MUL t32, b8, t32
  1033. MUL t42, b8, t42
  1034. NMSUB t11, t11, b1, t12 # remove column 2 from column 1
  1035. NMSUB t21, t21, b1, t22
  1036. NMSUB t31, t31, b1, t32
  1037. NMSUB t41, t41, b1, t42
  1038. LD b2, 0 * SIZE(BO) # first row of the B block
  1039. MUL t11, b2, t11 # scale column 1 by BO[0]
  1040. MUL t21, b2, t21
  1041. MUL t31, b2, t31
  1042. MUL t41, b2, t41
  1043. ST t11, 0 * SIZE(AO) # update packed A
  1044. ST t21, 1 * SIZE(AO)
  1045. ST t31, 2 * SIZE(AO)
  1046. ST t41, 3 * SIZE(AO)
  1047. ST t12, 4 * SIZE(AO)
  1048. ST t22, 5 * SIZE(AO)
  1049. ST t32, 6 * SIZE(AO)
  1050. ST t42, 7 * SIZE(AO)
  1051. ST t13, 8 * SIZE(AO)
  1052. ST t23, 9 * SIZE(AO)
  1053. ST t33, 10 * SIZE(AO)
  1054. ST t43, 11 * SIZE(AO)
  1055. ST t14, 12 * SIZE(AO)
  1056. ST t24, 13 * SIZE(AO)
  1057. ST t34, 14 * SIZE(AO)
  1058. ST t44, 15 * SIZE(AO)
  1059. ST t11, 0 * SIZE(CO1) # write back to C, one column pointer per result column
  1060. ST t21, 1 * SIZE(CO1)
  1061. ST t31, 2 * SIZE(CO1)
  1062. ST t41, 3 * SIZE(CO1)
  1063. ST t12, 0 * SIZE(CO2)
  1064. ST t22, 1 * SIZE(CO2)
  1065. ST t32, 2 * SIZE(CO2)
  1066. ST t42, 3 * SIZE(CO2)
  1067. ST t13, 0 * SIZE(CO3)
  1068. ST t23, 1 * SIZE(CO3)
  1069. ST t33, 2 * SIZE(CO3)
  1070. ST t43, 3 * SIZE(CO3)
  1071. ST t14, 0 * SIZE(CO4)
  1072. ST t24, 1 * SIZE(CO4)
  1073. ST t34, 2 * SIZE(CO4)
  1074. ST t44, 3 * SIZE(CO4)
  1075. daddiu CO1, CO1, 4 * SIZE # advance the C pointers past the 4 rows just written
  1076. daddiu CO2, CO2, 4 * SIZE
  1077. daddiu CO3, CO3, 4 * SIZE
  1078. daddiu CO4, CO4, 4 * SIZE
  1079. dsll TEMP, K, 2 + BASE_SHIFT # byte size of K*4 elements
  1080. daddu AORIG, AORIG, TEMP # move to next panel Ai
  1081. daddiu I, I, -1
  1082. bgtz I, .L11 # next 4-row tile; .L11 is outside this view
  1083. NOP
  1084. .align 3
  1085. .L20: # 2-row tile (mr=2, nr=4): executed only when bit 1 of M is set
  1086. andi I, M, 2 # mr=2
  1087. blez I, .L40 # no 2-row remainder: try the 1-row case
  1088. NOP
  1089. dsll L, KK, 1 + BASE_SHIFT # mr=2
  1090. dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
  1091. daddu AO, AORIG, L
  1092. daddu BO, B, TEMP # BO points to the rectangular data part, also resets BO
  1093. dsubu TEMP, K, KK # temp = the length of rectangular data part
  1094. MTC $0, t11 # clear 8 results registers
  1095. MOV t21, t11
  1096. MOV t12, t11
  1097. MOV t22, t11
  1098. MOV t13, t11
  1099. MOV t23, t11
  1100. MOV t14, t11
  1101. MOV t24, t11
  1102. LD a1, 0 * SIZE(AO) # preload the operands of the first k step
  1103. LD a2, 1 * SIZE(AO)
  1104. LD b1, 0 * SIZE(BO)
  1105. LD b2, 1 * SIZE(BO)
  1106. LD b3, 2 * SIZE(BO)
  1107. LD b4, 3 * SIZE(BO)
  1108. dsra L, TEMP, 2 # L = TEMP / 4: iterations of the 4x unrolled loop
  1109. blez L, .L25
  1110. NOP
  1111. .align 3
  1112. .L22: # unrolled loop: 4 k steps per iteration, loads for the next step issued before each group of multiply-adds
  1113. LD a5, 2 * SIZE(AO) # operands of k step 2
  1114. LD a6, 3 * SIZE(AO)
  1115. LD b5, 4 * SIZE(BO)
  1116. LD b6, 5 * SIZE(BO)
  1117. LD b7, 6 * SIZE(BO)
  1118. LD b8, 7 * SIZE(BO)
  1119. MADD t11, t11, a1, b1 # k step 1
  1120. MADD t21, t21, a2, b1
  1121. MADD t12, t12, a1, b2
  1122. MADD t22, t22, a2, b2
  1123. MADD t13, t13, a1, b3
  1124. MADD t23, t23, a2, b3
  1125. MADD t14, t14, a1, b4
  1126. MADD t24, t24, a2, b4
  1127. LD a3, 4 * SIZE(AO) # operands of k step 3
  1128. LD a4, 5 * SIZE(AO)
  1129. LD b1, 8 * SIZE(BO)
  1130. LD b2, 9 * SIZE(BO)
  1131. LD b3, 10 * SIZE(BO)
  1132. LD b4, 11 * SIZE(BO)
  1133. MADD t11, t11, a5, b5 # k step 2
  1134. MADD t21, t21, a6, b5
  1135. MADD t12, t12, a5, b6
  1136. MADD t22, t22, a6, b6
  1137. MADD t13, t13, a5, b7
  1138. MADD t23, t23, a6, b7
  1139. MADD t14, t14, a5, b8
  1140. MADD t24, t24, a6, b8
  1141. LD a7, 6 * SIZE(AO) # operands of k step 4
  1142. LD a8, 7 * SIZE(AO)
  1143. LD b5, 12 * SIZE(BO)
  1144. LD b6, 13 * SIZE(BO)
  1145. LD b7, 14 * SIZE(BO)
  1146. LD b8, 15 * SIZE(BO)
  1147. MADD t11, t11, a3, b1 # k step 3
  1148. MADD t21, t21, a4, b1
  1149. MADD t12, t12, a3, b2
  1150. MADD t22, t22, a4, b2
  1151. MADD t13, t13, a3, b3
  1152. MADD t23, t23, a4, b3
  1153. MADD t14, t14, a3, b4
  1154. MADD t24, t24, a4, b4
  1155. daddiu AO, AO, 8 * SIZE # AO += 2mr*4kr
  1156. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  1157. LD a1, 0 * SIZE(AO) # operands of k step 1 of the next iteration
  1158. LD a2, 1 * SIZE(AO)
  1159. LD b1, 0 * SIZE(BO)
  1160. LD b2, 1 * SIZE(BO)
  1161. LD b3, 2 * SIZE(BO)
  1162. LD b4, 3 * SIZE(BO)
  1163. MADD t11, t11, a7, b5 # k step 4
  1164. MADD t21, t21, a8, b5
  1165. MADD t12, t12, a7, b6
  1166. MADD t22, t22, a8, b6
  1167. MADD t13, t13, a7, b7
  1168. MADD t23, t23, a8, b7
  1169. MADD t14, t14, a7, b8
  1170. MADD t24, t24, a8, b8
  1171. daddiu L, L, -1
  1172. bgtz L, .L22
  1173. NOP
  1174. .align 3
  1175. .L25: # k remainder: TEMP & 3 single steps
  1176. andi L, TEMP, 3
  1177. blez L, .L28
  1178. NOP
  1179. .align 3
  1180. .L26: # one k step per iteration
  1181. MADD t11, t11, a1, b1
  1182. MADD t21, t21, a2, b1
  1183. MADD t12, t12, a1, b2
  1184. MADD t22, t22, a2, b2
  1185. MADD t13, t13, a1, b3
  1186. MADD t23, t23, a2, b3
  1187. MADD t14, t14, a1, b4
  1188. MADD t24, t24, a2, b4
  1189. daddiu AO, AO, 2 * SIZE # AO += 2mr
  1190. daddiu BO, BO, 4 * SIZE # BO += 4nr
  1191. LD a1, 0 * SIZE(AO) # reload operands for the next step
  1192. LD a2, 1 * SIZE(AO)
  1193. LD b1, 0 * SIZE(BO)
  1194. LD b2, 1 * SIZE(BO)
  1195. LD b3, 2 * SIZE(BO)
  1196. LD b4, 3 * SIZE(BO)
  1197. daddiu L, L, -1
  1198. bgtz L, .L26
  1199. NOP
  1200. .align # NOTE(review): no alignment operand here, unlike the .align 3 used elsewhere - confirm intended
  1201. .L28: # solve step for the 2x4 tile: same column order as the 4x4 case, two rows per column
  1202. daddiu TEMP, KK, -4 # deal with the triangular data part
  1203. dsll L, TEMP, 1 + BASE_SHIFT # mr=2
  1204. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1205. daddu AO, AORIG, L
  1206. daddu BO, B, TEMP # BO points to the triangular data part
  1207. LD b1, 0 * SIZE(AO) # load the 8 stored values; the solved results are written back to these slots below
  1208. LD b2, 1 * SIZE(AO)
  1209. SUB t11, b1, t11 # column 1 (assuming SUB d, a, b computes a - b)
  1210. SUB t21, b2, t21
  1211. LD b5, 2 * SIZE(AO)
  1212. LD b6, 3 * SIZE(AO)
  1213. SUB t12, b5, t12 # column 2
  1214. SUB t22, b6, t22
  1215. LD b3, 4 * SIZE(AO)
  1216. LD b4, 5 * SIZE(AO)
  1217. SUB t13, b3, t13 # column 3
  1218. SUB t23, b4, t23
  1219. LD b7, 6 * SIZE(AO)
  1220. LD b8, 7 * SIZE(AO)
  1221. SUB t14, b7, t14 # column 4
  1222. SUB t24, b8, t24
  1223. LD b1, 15 * SIZE(BO) # last row of the 4x4 B block
  1224. LD b2, 14 * SIZE(BO)
  1225. LD b3, 13 * SIZE(BO)
  1226. LD b4, 12 * SIZE(BO)
  1227. MUL t14, b1, t14 # scale column 4 by BO[15] (presumably a pre-inverted diagonal - confirm against the packing code)
  1228. MUL t24, b1, t24
  1229. NMSUB t13, t13, b2, t14 # remove column 4 from columns 3, 2 and 1
  1230. NMSUB t23, t23, b2, t24
  1231. NMSUB t12, t12, b3, t14
  1232. NMSUB t22, t22, b3, t24
  1233. NMSUB t11, t11, b4, t14
  1234. NMSUB t21, t21, b4, t24
  1235. LD b5, 10 * SIZE(BO) # third row of the B block
  1236. LD b6, 9 * SIZE(BO)
  1237. LD b7, 8 * SIZE(BO)
  1238. MUL t13, b5, t13 # scale column 3 by BO[10]
  1239. MUL t23, b5, t23
  1240. NMSUB t12, t12, b6, t13 # remove column 3 from columns 2 and 1
  1241. NMSUB t22, t22, b6, t23
  1242. NMSUB t11, t11, b7, t13
  1243. NMSUB t21, t21, b7, t23
  1244. LD b8, 5 * SIZE(BO) # second row of the B block
  1245. LD b1, 4 * SIZE(BO)
  1246. MUL t12, b8, t12 # scale column 2 by BO[5]
  1247. MUL t22, b8, t22
  1248. NMSUB t11, t11, b1, t12 # remove column 2 from column 1
  1249. NMSUB t21, t21, b1, t22
  1250. LD b2, 0 * SIZE(BO) # first row of the B block
  1251. MUL t11, b2, t11 # scale column 1 by BO[0]
  1252. MUL t21, b2, t21
  1253. ST t11, 0 * SIZE(AO) # update packed A
  1254. ST t21, 1 * SIZE(AO)
  1255. ST t12, 2 * SIZE(AO)
  1256. ST t22, 3 * SIZE(AO)
  1257. ST t13, 4 * SIZE(AO)
  1258. ST t23, 5 * SIZE(AO)
  1259. ST t14, 6 * SIZE(AO)
  1260. ST t24, 7 * SIZE(AO)
  1261. ST t11, 0 * SIZE(CO1) # write back to C
  1262. ST t21, 1 * SIZE(CO1)
  1263. ST t12, 0 * SIZE(CO2)
  1264. ST t22, 1 * SIZE(CO2)
  1265. ST t13, 0 * SIZE(CO3)
  1266. ST t23, 1 * SIZE(CO3)
  1267. ST t14, 0 * SIZE(CO4)
  1268. ST t24, 1 * SIZE(CO4)
  1269. daddiu CO1, CO1, 2 * SIZE # advance the C pointers past the 2 rows just written
  1270. daddiu CO2, CO2, 2 * SIZE
  1271. daddiu CO3, CO3, 2 * SIZE
  1272. daddiu CO4, CO4, 2 * SIZE
  1273. dsll TEMP, K, 1 + BASE_SHIFT # mr=2
  1274. daddu AORIG, AORIG, TEMP # move to next panel Ai
  1275. .align 3
  1276. .L40: # 1-row tile (mr=1, nr=4): executed only when bit 0 of M is set
  1277. andi I, M, 1
  1278. blez I, .L29 # no 1-row remainder: finish this column panel
  1279. NOP
  1280. dsll L, KK, BASE_SHIFT # mr=1
  1281. dsll TEMP, KK, 2 + BASE_SHIFT # nr=4
  1282. daddu AO, AORIG, L
  1283. daddu BO, B, TEMP # BO points to the rectangular data part, also resets BO
  1284. dsubu TEMP, K, KK # temp = the length of rectangular data part
  1285. MTC $0, t11 # clear 4 results registers
  1286. MOV t12, t11
  1287. MOV t13, t11
  1288. MOV t14, t11
  1289. LD a1, 0 * SIZE(AO) # preload the operands of the first k step
  1290. LD b1, 0 * SIZE(BO)
  1291. LD b2, 1 * SIZE(BO)
  1292. LD b3, 2 * SIZE(BO)
  1293. LD b4, 3 * SIZE(BO)
  1294. dsra L, TEMP, 2 # L = TEMP / 4: iterations of the 4x unrolled loop
  1295. blez L, .L45
  1296. NOP
  1297. .align 3
  1298. .L42: # unrolled loop: 4 k steps per iteration
  1299. LD a5, 1 * SIZE(AO) # operands of k step 2
  1300. LD b5, 4 * SIZE(BO)
  1301. LD b6, 5 * SIZE(BO)
  1302. LD b7, 6 * SIZE(BO)
  1303. LD b8, 7 * SIZE(BO)
  1304. MADD t11, t11, a1, b1 # k step 1
  1305. MADD t12, t12, a1, b2
  1306. MADD t13, t13, a1, b3
  1307. MADD t14, t14, a1, b4
  1308. LD a3, 2 * SIZE(AO) # operands of k step 3
  1309. LD b1, 8 * SIZE(BO)
  1310. LD b2, 9 * SIZE(BO)
  1311. LD b3, 10 * SIZE(BO)
  1312. LD b4, 11 * SIZE(BO)
  1313. MADD t11, t11, a5, b5 # k step 2
  1314. MADD t12, t12, a5, b6
  1315. MADD t13, t13, a5, b7
  1316. MADD t14, t14, a5, b8
  1317. LD a7, 3 * SIZE(AO) # operands of k step 4
  1318. LD b5, 12 * SIZE(BO)
  1319. LD b6, 13 * SIZE(BO)
  1320. LD b7, 14 * SIZE(BO)
  1321. LD b8, 15 * SIZE(BO)
  1322. MADD t11, t11, a3, b1 # k step 3
  1323. MADD t12, t12, a3, b2
  1324. MADD t13, t13, a3, b3
  1325. MADD t14, t14, a3, b4
  1326. daddiu AO, AO, 4 * SIZE # AO += 1mr*4kr
  1327. daddiu BO, BO, 16 * SIZE # BO += 4nr*4kr
  1328. LD a1, 0 * SIZE(AO) # operands of k step 1 of the next iteration
  1329. LD b1, 0 * SIZE(BO)
  1330. LD b2, 1 * SIZE(BO)
  1331. LD b3, 2 * SIZE(BO)
  1332. LD b4, 3 * SIZE(BO)
  1333. MADD t11, t11, a7, b5 # k step 4
  1334. MADD t12, t12, a7, b6
  1335. MADD t13, t13, a7, b7
  1336. MADD t14, t14, a7, b8
  1337. daddiu L, L, -1
  1338. bgtz L, .L42
  1339. NOP
  1340. .align 3
  1341. .L45: # k remainder: TEMP & 3 single steps
  1342. andi L, TEMP, 3
  1343. blez L, .L48
  1344. NOP
  1345. .align 3
  1346. .L46: # one k step per iteration
  1347. MADD t11, t11, a1, b1
  1348. MADD t12, t12, a1, b2
  1349. MADD t13, t13, a1, b3
  1350. MADD t14, t14, a1, b4
  1351. daddiu AO, AO, 1 * SIZE # AO += 1mr
  1352. daddiu BO, BO, 4 * SIZE # BO += 4nr
  1353. LD a1, 0 * SIZE(AO) # reload operands for the next step
  1354. LD b1, 0 * SIZE(BO)
  1355. LD b2, 1 * SIZE(BO)
  1356. LD b3, 2 * SIZE(BO)
  1357. LD b4, 3 * SIZE(BO)
  1358. daddiu L, L, -1
  1359. bgtz L, .L46
  1360. NOP
  1361. .align # NOTE(review): no alignment operand here, unlike the .align 3 used elsewhere - confirm intended
  1362. .L48: # solve step for the 1x4 tile: same column order as the wider tiles, one row per column
  1363. daddiu TEMP, KK, -4 # deal with the triangular data part
  1364. dsll L, TEMP, BASE_SHIFT # mr=1
  1365. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1366. daddu AO, AORIG, L
  1367. daddu BO, B, TEMP # BO points to the triangular data part
  1368. LD b1, 0 * SIZE(AO) # load the 4 stored values; the solved results are written back to these slots below
  1369. LD b5, 1 * SIZE(AO)
  1370. LD b3, 2 * SIZE(AO)
  1371. LD b7, 3 * SIZE(AO)
  1372. SUB t11, b1, t11 # assuming SUB d, a, b computes a - b
  1373. SUB t12, b5, t12
  1374. SUB t13, b3, t13
  1375. SUB t14, b7, t14
  1376. LD b1, 15 * SIZE(BO) # last row of the 4x4 B block
  1377. LD b2, 14 * SIZE(BO)
  1378. LD b3, 13 * SIZE(BO)
  1379. LD b4, 12 * SIZE(BO)
  1380. MUL t14, b1, t14 # scale column 4 by BO[15] (presumably a pre-inverted diagonal - confirm against the packing code)
  1381. NMSUB t13, t13, b2, t14 # remove column 4 from columns 3, 2 and 1
  1382. NMSUB t12, t12, b3, t14
  1383. NMSUB t11, t11, b4, t14
  1384. LD b5, 10 * SIZE(BO) # third row of the B block
  1385. LD b6, 9 * SIZE(BO)
  1386. LD b7, 8 * SIZE(BO)
  1387. MUL t13, b5, t13 # scale column 3 by BO[10]
  1388. NMSUB t12, t12, b6, t13 # remove column 3 from columns 2 and 1
  1389. NMSUB t11, t11, b7, t13
  1390. LD b8, 5 * SIZE(BO) # second row of the B block
  1391. LD b1, 4 * SIZE(BO)
  1392. MUL t12, b8, t12 # scale column 2 by BO[5]
  1393. NMSUB t11, t11, b1, t12 # remove column 2 from column 1
  1394. LD b2, 0 * SIZE(BO) # first row of the B block
  1395. MUL t11, b2, t11 # scale column 1 by BO[0]
  1396. ST t11, 0 * SIZE(AO) # update packed A
  1397. ST t12, 1 * SIZE(AO)
  1398. ST t13, 2 * SIZE(AO)
  1399. ST t14, 3 * SIZE(AO)
  1400. ST t11, 0 * SIZE(CO1) # write back to C
  1401. ST t12, 0 * SIZE(CO2)
  1402. ST t13, 0 * SIZE(CO3)
  1403. ST t14, 0 * SIZE(CO4)
  1404. daddiu CO1, CO1, 1 * SIZE # advance the C pointers past the row just written
  1405. daddiu CO2, CO2, 1 * SIZE
  1406. daddiu CO3, CO3, 1 * SIZE
  1407. daddiu CO4, CO4, 1 * SIZE
  1408. dsll TEMP, K, BASE_SHIFT # mr=1
  1409. daddu AORIG, AORIG, TEMP # move to next panel Ai
  1410. .L29: # end of one 4-column panel: step KK back and loop while J is positive
  1411. daddiu KK, KK, -4 # KK -= 4, so the rectangular part (K - KK) grows by 4
  1412. bgtz J, .L10 # J is not modified in the visible code; presumably decremented at .L10 - verify
  1413. NOP
  1414. .align 3
  1415. .L999: # common exit: reload registers from the stack frame, release the frame and return
  1416. LDARG $16, 0($sp) # integer registers $16-$21 from offsets 0-40
  1417. LDARG $17, 8($sp)
  1418. LDARG $18, 16($sp)
  1419. LDARG $19, 24($sp)
  1420. LDARG $20, 32($sp)
  1421. LDARG $21, 40($sp)
  1422. ldc1 $f24, 48($sp) # floating-point registers $f24-$f28 from offsets 48-80
  1423. ldc1 $f25, 56($sp)
  1424. ldc1 $f26, 64($sp)
  1425. ldc1 $f27, 72($sp)
  1426. ldc1 $f28, 80($sp)
  1427. LDARG $22, 88($sp) # integer registers $22-$25 from offsets 88-112
  1428. LDARG $23, 96($sp)
  1429. LDARG $24, 104($sp)
  1430. LDARG $25, 112($sp) # NOTE(review): offset 112 is also read into $f20 below when __64BIT__ is undefined - check against the prologue
  1431. #ifndef __64BIT__
  1432. ldc1 $f20,112($sp)
  1433. ldc1 $f21,120($sp)
  1434. ldc1 $f22,128($sp)
  1435. ldc1 $f23,136($sp)
  1436. #endif
  1437. j $31 # return to the address held in $31
  1438. daddiu $sp, $sp, 144 # release the 144-byte frame; presumably executes in the jump delay slot
  1439. EPILOGUE