You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dgemm_kernel_loongson3b_4x4.S 41 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579
  /* Build-time configuration and helper macros for this kernel. */
  /* REALNAME is set to ASMNAME and ASSEMBLER is defined before common.h is included. */
  /* NOTE(review): common.h is not visible here; presumably it keys off both symbols - confirm there. */
  1. #define REALNAME ASMNAME
  2. #define ASSEMBLER
  3. #include "common.h"
  /* FETCH expands to a plain `ld`. The kernel body always issues it with $0 as the destination, */
  /* so the loaded value is discarded - presumably it serves as a software prefetch (TODO confirm). */
  4. #define FETCH ld
  /* gsLQC1 / gsSQC1 emit one hand-encoded instruction word through .word: */
  /* 0x32 (gsLQC1) or 0x3A (gsSQC1) shifted to bit 26, base<<21, ft<<16, bit 15 set, offset<<6, bit 5 set, fq in the low bits. */
  /* The arguments are combined with << and |, so they need to be integer expressions (register numbers), not $-names. */
  /* NOTE(review): the names suggest Loongson quad-word FP load/store; no use of either macro appears in the part of the file visible here - confirm against the Loongson manual. */
  5. #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  6. #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  /* Integer register aliases used by the kernel body. */
  /* M, N, K: incoming sizes. The entry code copies them to MCO / NCO / KCO and then */
  /* reuses M, N and K themselves as loop counters. */
  7. #define M $4
  8. #define N $5
  9. #define K $6
  /* A, B: working pointers into the two input panels; the inner loops advance them. */
  /* C: output pointer, moved forward by four LDC strides per 4-column panel. */
  /* LDC: shifted left by BASE_SHIFT at entry, then used as the byte stride between CO1..CO4. */
  10. #define A $8
  11. #define B $9
  12. #define C $10
  13. #define LDC $11
  /* AO: copy of A taken at entry; A is reset from it for each column panel. */
  /* BO: start of the current B panel; B is reset from it for each row block. */
  14. #define AO $12
  15. #define BO $13
  /* CO1..CO4: four output pointers, set to C, C+LDC, C+2*LDC and C+3*LDC. */
  16. #define CO1 $14
  17. #define CO2 $15
  18. #define CO3 $16
  19. #define CO4 $17
  /* KCO, MCO, NCO: preserved copies of K, M and N. */
  20. #define KCO $18
  21. #define MCO $19
  22. #define NCO $20
  /* SPANB = KCO << (2 + BASE_SHIFT); PREB = BO + SPANB is an address handed to FETCH. */
  23. #define SPANB $21
  24. #define PREB $23
  /* SPANA = KCO << (1 + BASE_SHIFT); PREA = AO + SPANA is an address handed to FETCH. */
  25. #define PREA $24
  26. #define SPANA $25
  /* ALPHA lives in $f15, which is also b7. The entry code stores it at 152($sp) */
  /* and it is reloaded from there before each write-back to C. */
  27. #define ALPHA $f15
  /* TRMM-only registers. OFFSET is fetched with LDARG from 160($sp), i.e. just above this */
  /* function's 160-byte frame. KK is derived from OFFSET; TEMP is a scratch length/offset. */
  28. #if defined(TRMMKERNEL)
  29. #define OFFSET $2
  30. #define KK $3
  31. #define TEMP $7
  32. #endif
  /* Plain register numbers matching A ($8), B ($9) and CO1..CO4 ($14..$17). */
  /* NOTE(review): presumably meant as `base` operands for gsLQC1 / gsSQC1; not used in the code visible here. */
  33. #define R8 8
  34. #define R9 9
  35. #define R14 14
  36. #define R15 15
  37. #define R16 16
  38. #define R17 17
  /* Floating-point register aliases. */
  /* t11..t44: the 16 accumulators, held in $f16..$f31. */
  /* tij collects the products a(i-1) * b(j-1) (e.g. t21 += a1*b0, t12 += a0*b1) */
  /* and is written back to element i-1 of column pointer COj. */
  39. #define t11 $f30
  40. #define t21 $f31
  41. #define t31 $f28
  42. #define t41 $f29
  43. #define t12 $f26
  44. #define t22 $f27
  45. #define t32 $f24
  46. #define t42 $f25
  47. #define t13 $f22
  48. #define t23 $f23
  49. #define t33 $f20
  50. #define t43 $f21
  51. #define t14 $f18
  52. #define t24 $f19
  53. #define t34 $f16
  54. #define t44 $f17
  /* c11..c44: values of C loaded during the GEMM (non-TRMM) write-back. */
  /* They occupy $f0..$f14, the same registers as a0..a7 and b0..b6; in the code visible here */
  /* the c* names are only used in the write-back sections, after the multiply loops. */
  55. #define c11 $f0
  56. #define c21 $f1
  57. #define c31 $f2
  58. #define c41 $f3
  59. #define c12 $f4
  60. #define c22 $f5
  61. #define c32 $f6
  62. #define c42 $f7
  63. #define c13 $f8
  64. #define c23 $f9
  65. #define c33 $f10
  66. #define c43 $f11
  67. #define c14 $f12
  68. #define c24 $f13
  69. #define c34 $f14
  /* c44 shares $f0 with c11 ($f15 is taken by ALPHA). The 4x4 write-back folds c11 into t11 */
  /* before it loads c44, so the two uses do not clash there. */
  70. #define c44 $f0
  /* a0..a7: operands loaded from the A panel. In the 4x4 loop a0..a3 and a4..a7 alternate: */
  /* one set is multiplied while the other is being loaded. */
  71. #define a0 $f0
  72. #define a1 $f1
  73. #define a2 $f2
  74. #define a3 $f3
  75. #define a4 $f4
  76. #define a5 $f5
  77. #define a6 $f6
  78. #define a7 $f7
  /* b0..b7: operands loaded from the B panel, alternating b0..b3 / b4..b7 in the same way. */
  /* b7 is $f15, the same register as ALPHA, which is why ALPHA is kept on the stack during the loops. */
  79. #define b0 $f8
  80. #define b1 $f9
  81. #define b2 $f10
  82. #define b3 $f11
  83. #define b4 $f12
  84. #define b5 $f13
  85. #define b6 $f14
  86. #define b7 $f15
  /* F0..F31: the plain integers 0..31, one per floating-point register number. */
  /* NOTE(review): presumably intended as the fq / ft operands of gsLQC1 / gsSQC1, which need */
  /* numeric register indices; none of these constants is used in the code visible here. */
  87. #define F31 31
  88. #define F30 30
  89. #define F29 29
  90. #define F28 28
  91. #define F27 27
  92. #define F26 26
  93. #define F25 25
  94. #define F24 24
  95. #define F23 23
  96. #define F22 22
  97. #define F21 21
  98. #define F20 20
  99. #define F19 19
  100. #define F18 18
  101. #define F17 17
  102. #define F16 16
  103. #define F15 15
  104. #define F14 14
  105. #define F13 13
  106. #define F12 12
  107. #define F11 11
  108. #define F10 10
  109. #define F9 9
  110. #define F8 8
  111. #define F7 7
  112. #define F6 6
  113. #define F5 5
  114. #define F4 4
  115. #define F3 3
  116. #define F2 2
  117. #define F1 1
  118. #define F0 0
  119. PROLOGUE
  120. daddiu $sp, $sp, -160
  121. sd $16, 0($sp)
  122. sd $17, 8($sp)
  123. sd $18, 16($sp)
  124. sd $19, 24($sp)
  125. sd $20, 32($sp)
  126. sd $21, 40($sp)
  127. sd $22, 48($sp)
  128. ST $f24, 56($sp)
  129. ST $f25, 64($sp)
  130. ST $f26, 72($sp)
  131. ST $f27, 80($sp)
  132. ST $f28, 88($sp)
  133. sd $23, 96($sp)
  134. sd $24, 104($sp)
  135. sd $25, 112($sp)
  136. ST $f20,120($sp)
  137. ST $f21,128($sp)
  138. ST $f22,136($sp)
  139. ST $f23,144($sp)
  140. .align 5
  141. .L0_N4: # Loop N
  142. ST ALPHA,152($sp) # Backup ALPHA
  143. move MCO,M # Backup M
  144. move NCO,N # Backup N
  145. move KCO,K # Backup K
  146. move AO,A # Backup A_addr
  147. dsra N,NCO,2 # N=NCO/2
  148. dsll LDC,LDC,BASE_SHIFT # LDC*8Byte
  149. dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5
  150. #if defined(TRMMKERNEL)
  151. LDARG OFFSET,160($sp) # OFFSET is relate to the data part
  152. #endif
  153. #if defined(TRMMKERNEL) && !defined(LEFT)
  154. neg KK,OFFSET
  155. #endif
  156. move BO,B # Backup B_addr
  157. beq N,$0,.L0_N2 # N=0,NCO<4
  158. dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte
  159. .L0_N4_Lb: # mr=4,nr=4
  160. move CO1,C
  161. dsra M,MCO,2 # M=MCO/2
  162. move A,AO # Reset A
  163. daddu CO2,C,LDC
  164. daddu PREB,BO,SPANB # PreB point next panelB
  165. daddu CO3,CO2,LDC
  166. daddu PREA,AO,SPANA
  167. daddu CO4,CO3,LDC
  168. #if defined(TRMMKERNEL) && defined(LEFT)
  169. move KK,OFFSET
  170. #endif
  171. beqz M,.L14_M2
  172. daddu C,CO4,LDC # move C to next panel Cj
  173. .L10:
  174. #if defined(TRMMKERNEL)
  175. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  176. move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U)
  177. #else
  178. dsll K,KK,2 + BASE_SHIFT # KK is the length that needs to span to the data part
  179. dsll TEMP,KK,2 + BASE_SHIFT
  180. daddu A,A,K # move A B to data part
  181. daddu B,BO,TEMP
  182. #endif
  183. MTC $0,t11 # GEMM part NR=4,MR=4
  184. LD a0,0(A)
  185. MOV t21,t11
  186. MOV t31,t11
  187. LD a1,1*SIZE(A)
  188. MOV t41,t11
  189. MOV t12,t11
  190. LD b0,0(B)
  191. MOV t22,t11
  192. MOV t32,t11
  193. LD b1,1*SIZE(B)
  194. MOV t42,t11
  195. LD a2,2*SIZE(A)
  196. MOV t13,t11
  197. MOV t23,t11
  198. LD b2,2*SIZE(B)
  199. MOV t33,t11
  200. MOV t43,t11
  201. LD a3,3*SIZE(A)
  202. MOV t14,t11
  203. MOV t24,t11
  204. LD b3,3*SIZE(B)
  205. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  206. dsubu TEMP,KCO,KK # temp is the length of the data part
  207. #elif defined(LEFT)
  208. daddiu TEMP, KK, 4 # S=L,U=L
  209. #else
  210. daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part
  211. #endif
  212. dsra K,TEMP,2 # K=KCO/2
  213. MOV t34,t11
  214. beqz K,.L15
  215. MOV t44,t11
  216. #else
  217. move B,BO # Reset B
  218. MTC $0,t11 # GEMM part NR=4,MR=4
  219. LD a0,0(A)
  220. MOV t21,t11
  221. MOV t31,t11
  222. LD a1,1*SIZE(A)
  223. MOV t41,t11
  224. MOV t12,t11
  225. LD b0,0(B)
  226. MOV t22,t11
  227. MOV t32,t11
  228. LD b1,1*SIZE(B)
  229. MOV t42,t11
  230. dsra K,KCO,2 # K=KCO/2
  231. LD a2,2*SIZE(A)
  232. MOV t13,t11
  233. MOV t23,t11
  234. LD b2,2*SIZE(B)
  235. MOV t33,t11
  236. MOV t43,t11
  237. LD a3,3*SIZE(A)
  238. MOV t14,t11
  239. MOV t24,t11
  240. LD b3,3*SIZE(B)
  241. MOV t34,t11
  242. beqz K,.L15
  243. MOV t44,t11 # clear 16 results registers
  244. #endif
  245. .align 5
  246. .L11: # kr=4
  247. MADD t11,t11,a0,b0
  248. MADD t21,t21,a1,b0
  249. LD a4,4*SIZE(A)
  250. MADD t12,t12,a0,b1
  251. MADD t22,t22,a1,b1
  252. LD a5,5*SIZE(A)
  253. MADD t31,t31,a2,b0
  254. MADD t41,t41,a3,b0
  255. LD b4,4*SIZE(B)
  256. MADD t32,t32,a2,b1
  257. MADD t42,t42,a3,b1
  258. LD b5,5*SIZE(B)
  259. FETCH $0,(PREB)
  260. MADD t13,t13,a0,b2
  261. MADD t23,t23,a1,b2
  262. LD a6,6*SIZE(A)
  263. MADD t14,t14,a0,b3
  264. MADD t24,t24,a1,b3
  265. LD b6,6*SIZE(B)
  266. FETCH $0,(PREA)
  267. MADD t33,t33,a2,b2
  268. MADD t43,t43,a3,b2
  269. LD a7,7*SIZE(A)
  270. MADD t34,t34,a2,b3
  271. MADD t44,t44,a3,b3
  272. LD b7,7*SIZE(B)
  273. .L12:
  274. MADD t11,t11,a4,b4
  275. MADD t21,t21,a5,b4
  276. LD a0,8*SIZE(A)
  277. MADD t12,t12,a4,b5
  278. MADD t22,t22,a5,b5
  279. LD a1,9*SIZE(A)
  280. MADD t31,t31,a6,b4
  281. MADD t41,t41,a7,b4
  282. LD b0,8*SIZE(B)
  283. MADD t32,t32,a6,b5
  284. MADD t42,t42,a7,b5
  285. LD b1,9*SIZE(B)
  286. FETCH $0,4*SIZE(PREB)
  287. MADD t13,t13,a4,b6
  288. MADD t23,t23,a5,b6
  289. LD a2,10*SIZE(A)
  290. MADD t14,t14,a4,b7
  291. MADD t24,t24,a5,b7
  292. LD b2,10*SIZE(B)
  293. FETCH $0,4*SIZE(PREA)
  294. MADD t33,t33,a6,b6
  295. MADD t43,t43,a7,b6
  296. LD a3,11*SIZE(A)
  297. MADD t34,t34,a6,b7
  298. MADD t44,t44,a7,b7
  299. LD b3,11*SIZE(B)
  300. .L13:
  301. MADD t11,t11,a0,b0
  302. MADD t21,t21,a1,b0
  303. LD a4,12*SIZE(A)
  304. MADD t12,t12,a0,b1
  305. MADD t22,t22,a1,b1
  306. LD a5,13*SIZE(A)
  307. MADD t31,t31,a2,b0
  308. MADD t41,t41,a3,b0
  309. LD b4,12*SIZE(B)
  310. FETCH $0,8*SIZE(PREA)
  311. MADD t32,t32,a2,b1
  312. MADD t42,t42,a3,b1
  313. LD b5,13*SIZE(B)
  314. FETCH $0,8*SIZE(PREB)
  315. MADD t13,t13,a0,b2
  316. MADD t23,t23,a1,b2
  317. LD a6,14*SIZE(A)
  318. MADD t14,t14,a0,b3
  319. MADD t24,t24,a1,b3
  320. daddu A,A,16*SIZE # 4mr*4kr
  321. LD b6,14*SIZE(B)
  322. MADD t33,t33,a2,b2
  323. MADD t43,t43,a3,b2
  324. daddu B,B,16*SIZE # 4nr*4kr
  325. LD a7,-1*SIZE(A)
  326. MADD t34,t34,a2,b3
  327. MADD t44,t44,a3,b3
  328. LD b7,-1*SIZE(B)
  329. .L14:
  330. MADD t11,t11,a4,b4
  331. MADD t21,t21,a5,b4
  332. LD a0,0(A)
  333. MADD t12,t12,a4,b5
  334. MADD t22,t22,a5,b5
  335. LD a1,1*SIZE(A)
  336. MADD t31,t31,a6,b4
  337. MADD t41,t41,a7,b4
  338. daddiu K,K,-1
  339. LD b0,0(B)
  340. MADD t32,t32,a6,b5
  341. MADD t42,t42,a7,b5
  342. daddu PREA,PREA,16*SIZE
  343. LD b1,1*SIZE(B)
  344. FETCH $0,12*SIZE(PREB)
  345. MADD t13,t13,a4,b6
  346. MADD t23,t23,a5,b6
  347. LD a2,2*SIZE(A)
  348. FETCH $0,-4*SIZE(PREA)
  349. MADD t14,t14,a4,b7
  350. MADD t24,t24,a5,b7
  351. LD b2,2*SIZE(B)
  352. MADD t33,t33,a6,b6
  353. MADD t43,t43,a7,b6
  354. daddu PREB,PREB,16*SIZE
  355. LD a3,3*SIZE(A)
  356. MADD t34,t34,a6,b7
  357. MADD t44,t44,a7,b7
  358. bnez K,.L11
  359. LD b3,3*SIZE(B)
  360. .L15: # kr=2
  361. #ifndef TRMMKERNEL
  362. andi K,KCO,2
  363. #else
  364. andi K,TEMP, 2
  365. #endif
  366. beqz K,.L18
  367. nop
  368. .L16:
  369. MADD t11,t11,a0,b0
  370. MADD t21,t21,a1,b0
  371. LD a4,4*SIZE(A)
  372. MADD t12,t12,a0,b1
  373. MADD t22,t22,a1,b1
  374. LD a5,5*SIZE(A)
  375. MADD t31,t31,a2,b0
  376. MADD t41,t41,a3,b0
  377. LD b4,4*SIZE(B)
  378. FETCH $0,0(PREA)
  379. MADD t32,t32,a2,b1
  380. MADD t42,t42,a3,b1
  381. LD b5,5*SIZE(B)
  382. FETCH $0,0(PREB)
  383. MADD t13,t13,a0,b2
  384. MADD t23,t23,a1,b2
  385. LD a6,6*SIZE(A)
  386. MADD t14,t14,a0,b3
  387. MADD t24,t24,a1,b3
  388. daddu A,A,8*SIZE # 4mr*2kr
  389. LD b6,6*SIZE(B)
  390. MADD t33,t33,a2,b2
  391. MADD t43,t43,a3,b2
  392. daddu B,B,8*SIZE # 4nr*2kr
  393. LD a7,-1*SIZE(A)
  394. MADD t34,t34,a2,b3
  395. MADD t44,t44,a3,b3
  396. LD b7,-1*SIZE(B)
  397. .L17:
  398. MADD t11,t11,a4,b4
  399. MADD t21,t21,a5,b4
  400. LD a0,0*SIZE(A)
  401. MADD t12,t12,a4,b5
  402. MADD t22,t22,a5,b5
  403. LD a1,1*SIZE(A)
  404. MADD t31,t31,a6,b4
  405. MADD t41,t41,a7,b4
  406. LD b0,0*SIZE(B)
  407. MADD t32,t32,a6,b5
  408. MADD t42,t42,a7,b5
  409. LD b1,1*SIZE(B)
  410. FETCH $0,4*SIZE(PREB)
  411. MADD t13,t13,a4,b6
  412. MADD t23,t23,a5,b6
  413. LD a2,2*SIZE(A)
  414. FETCH $0,4*SIZE(PREA)
  415. MADD t14,t14,a4,b7
  416. MADD t24,t24,a5,b7
  417. LD b2,2*SIZE(B)
  418. MADD t33,t33,a6,b6
  419. MADD t43,t43,a7,b6
  420. daddu PREA,PREA,8*SIZE
  421. LD a3,3*SIZE(A)
  422. MADD t34,t34,a6,b7
  423. MADD t44,t44,a7,b7
  424. daddu PREB,PREB,8*SIZE
  425. LD b3,3*SIZE(B)
  426. .L18: # kr=1
  427. #ifndef TRMMKERNEL
  428. andi K,KCO,1
  429. #else
  430. andi K,TEMP,1
  431. #endif
  432. beqz K,.L19
  433. LD ALPHA,152($sp) # Get ALPHA
  434. FETCH $0,0(PREB)
  435. MADD t11,t11,a0,b0
  436. MADD t21,t21,a1,b0
  437. daddu A,A,4*SIZE # 4mr*kr
  438. MADD t12,t12,a0,b1
  439. MADD t22,t22,a1,b1
  440. daddu B,B,4*SIZE # 4nr*kr
  441. FETCH $0,0(PREA)
  442. MADD t31,t31,a2,b0
  443. MADD t41,t41,a3,b0
  444. daddu PREB,PREB,4*SIZE
  445. MADD t32,t32,a2,b1
  446. MADD t42,t42,a3,b1
  447. daddu PREA,PREA,4*SIZE
  448. MADD t13,t13,a0,b2
  449. MADD t23,t23,a1,b2
  450. MADD t14,t14,a0,b3
  451. MADD t24,t24,a1,b3
  452. MADD t33,t33,a2,b2
  453. MADD t43,t43,a3,b2
  454. MADD t34,t34,a2,b3
  455. MADD t44,t44,a3,b3
  456. .L19: # Write Back to C
  457. #ifndef TRMMKERNEL
  458. LD c11,0(CO1) # GEMM write part
  459. LD c21,1*SIZE(CO1) # get 16 C
  460. LD c31,2*SIZE(CO1)
  461. LD c41,3*SIZE(CO1)
  462. LD c12,0(CO2)
  463. MADD t11,c11,t11,ALPHA
  464. LD c22,1*SIZE(CO2)
  465. MADD t21,c21,t21,ALPHA
  466. LD c32,2*SIZE(CO2)
  467. MADD t31,c31,t31,ALPHA
  468. LD c42,3*SIZE(CO2)
  469. MADD t41,c41,t41,ALPHA
  470. LD c13,0(CO3)
  471. MADD t12,c12,t12,ALPHA
  472. LD c23,1*SIZE(CO3)
  473. MADD t22,c22,t22,ALPHA
  474. LD c33,2*SIZE(CO3)
  475. MADD t32,c32,t32,ALPHA
  476. LD c43,3*SIZE(CO3)
  477. MADD t42,c42,t42,ALPHA
  478. LD c14,0(CO4)
  479. MADD t13,c13,t13,ALPHA
  480. LD c24,1*SIZE(CO4)
  481. MADD t23,c23,t23,ALPHA
  482. LD c34,2*SIZE(CO4)
  483. MADD t33,c33,t33,ALPHA
  484. LD c44,3*SIZE(CO4)
  485. MADD t43,c43,t43,ALPHA
  486. ST t11,0(CO1)
  487. MADD t14,c14,t14,ALPHA
  488. ST t21,1*SIZE(CO1)
  489. MADD t24,c24,t24,ALPHA
  490. ST t31,2*SIZE(CO1)
  491. MADD t34,c34,t34,ALPHA
  492. ST t41,3*SIZE(CO1)
  493. MADD t44,c44,t44,ALPHA
  494. daddiu M,M,-1 # M--
  495. ST t12,0(CO2)
  496. ST t22,1*SIZE(CO2)
  497. ST t32,2*SIZE(CO2)
  498. ST t42,3*SIZE(CO2)
  499. ST t13,0(CO3)
  500. ST t23,1*SIZE(CO3)
  501. ST t33,2*SIZE(CO3)
  502. ST t43,3*SIZE(CO3)
  503. FETCH $0,4*SIZE(CO1)
  504. FETCH $0,4*SIZE(CO2)
  505. FETCH $0,4*SIZE(CO3)
  506. FETCH $0,4*SIZE(CO4)
  507. FETCH $0,8*SIZE(CO1)
  508. FETCH $0,8*SIZE(CO2)
  509. FETCH $0,8*SIZE(CO3)
  510. FETCH $0,8*SIZE(CO4)
  511. ST t14,0(CO4)
  512. daddu CO1,CO1,4*SIZE # COi += 4
  513. ST t24,1*SIZE(CO4)
  514. daddu CO2,CO2,4*SIZE
  515. ST t34,2*SIZE(CO4)
  516. daddu CO3,CO3,4*SIZE
  517. ST t44,3*SIZE(CO4)
  518. daddu PREB,BO,SPANB
  519. bnez M,.L10
  520. daddu CO4,CO4,4*SIZE
  521. #else
  522. MUL t11, ALPHA, t11 # TRMM write back part
  523. MUL t21, ALPHA, t21
  524. MUL t31, ALPHA, t31
  525. MUL t41, ALPHA, t41
  526. ST t11, 0 * SIZE(CO1)
  527. MUL t12, ALPHA, t12
  528. ST t21, 1 * SIZE(CO1)
  529. MUL t22, ALPHA, t22
  530. ST t31, 2 * SIZE(CO1)
  531. MUL t32, ALPHA, t32
  532. ST t41, 3 * SIZE(CO1)
  533. MUL t42, ALPHA, t42
  534. ST t12, 0 * SIZE(CO2)
  535. MUL t13, ALPHA, t13
  536. ST t22, 1 * SIZE(CO2)
  537. MUL t23, ALPHA, t23
  538. ST t32, 2 * SIZE(CO2)
  539. MUL t33, ALPHA, t33
  540. ST t42, 3 * SIZE(CO2)
  541. MUL t43, ALPHA, t43
  542. ST t13, 0 * SIZE(CO3)
  543. MUL t14, ALPHA, t14
  544. ST t23, 1 * SIZE(CO3)
  545. MUL t24, ALPHA, t24
  546. ST t33, 2 * SIZE(CO3)
  547. MUL t34, ALPHA, t34
  548. ST t43, 3 * SIZE(CO3)
  549. MUL t44, ALPHA, t44
  550. ST t14, 0 * SIZE(CO4)
  551. daddiu M,M,-1 # M--
  552. ST t24, 1 * SIZE(CO4)
  553. ST t34, 2 * SIZE(CO4)
  554. ST t44, 3 * SIZE(CO4)
  555. daddiu CO1,CO1, 4 * SIZE
  556. daddiu CO2,CO2, 4 * SIZE
  557. daddiu CO3,CO3, 4 * SIZE
  558. daddiu CO4,CO4, 4 * SIZE
  559. FETCH $0,4*SIZE(CO1)
  560. FETCH $0,4*SIZE(CO2)
  561. FETCH $0,4*SIZE(CO3)
  562. FETCH $0,4*SIZE(CO4)
  563. FETCH $0,0(CO1)
  564. FETCH $0,0(CO2)
  565. FETCH $0,0(CO3)
  566. FETCH $0,0(CO4)
  567. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  568. dsubu TEMP,KCO,KK
  569. #ifdef LEFT
  570. daddiu TEMP,TEMP, -4
  571. #else
  572. daddiu TEMP,TEMP, -4
  573. #endif
  574. dsll K,TEMP,2 + BASE_SHIFT
  575. dsll TEMP,TEMP,2 + BASE_SHIFT
  576. daddu A,A,K # mov A to the end of panel Ai
  577. daddu B,B,TEMP # mov B to the end of panel Bj
  578. #endif
  579. #ifdef LEFT
  580. daddiu KK, KK,4
  581. #endif
  582. bnez M,.L10
  583. nop
  584. #endif
  585. .align 3
  586. .L14_M2:
  587. andi M, MCO, 2 # nr=4,mr=2
  588. beqz M,.L14_M1
  589. nop
  590. .L20:
  591. #if defined(TRMMKERNEL)
  592. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  593. move B,BO # Reset B
  594. #else
  595. dsll K,KK,1 + BASE_SHIFT # mr=2
  596. dsll TEMP,KK,2 + BASE_SHIFT # nr=4
  597. daddu A,A,K
  598. daddu B,BO,TEMP
  599. #endif
  600. LD a0,0*SIZE(A)
  601. MTC $0,t11
  602. LD a1,1*SIZE(A)
  603. MOV t21,t11
  604. LD b0,0*SIZE(B)
  605. MOV t12,t11
  606. LD b1,1*SIZE(B)
  607. MOV t22,t11
  608. LD b2,2*SIZE(B)
  609. MOV t13,t11
  610. MOV t23,t11
  611. LD b3,3*SIZE(B)
  612. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  613. dsubu TEMP,KCO,KK
  614. #elif defined(LEFT)
  615. daddiu TEMP,KK,2 # left part,controlled by mr, mr=2
  616. #else
  617. daddiu TEMP,KK,4 # right part,controlled by nr,nr=4
  618. #endif
  619. dsra K,TEMP,2
  620. MOV t14,t11
  621. beqz K,.L25
  622. MOV t24,t11 # clear 2*4=8 results registers
  623. #else
  624. move B,BO # Reset B
  625. LD a0,0*SIZE(A)
  626. MTC $0,t11
  627. LD a1,1*SIZE(A)
  628. MOV t21,t11
  629. LD b0,0*SIZE(B)
  630. MOV t12,t11
  631. LD b1,1*SIZE(B)
  632. MOV t22,t11
  633. dsra K,KCO,2
  634. LD b2,2*SIZE(B)
  635. MOV t13,t11
  636. MOV t23,t11
  637. LD b3,3*SIZE(B)
  638. MOV t14,t11
  639. beqz K,.L25
  640. MOV t24,t11
  641. #endif
  642. .L21: # nr=4,mr=2,kr=4
  643. MADD t11,t11,a0,b0
  644. LD a4,2*SIZE(A)
  645. MADD t21,t21,a1,b0
  646. LD a5,3*SIZE(A)
  647. MADD t12,t12,a0,b1
  648. LD b4,4*SIZE(B)
  649. MADD t22,t22,a1,b1
  650. LD b5,5*SIZE(B)
  651. MADD t13,t13,a0,b2
  652. LD b6,6*SIZE(B)
  653. MADD t23,t23,a1,b2
  654. LD b7,7*SIZE(B)
  655. MADD t14,t14,a0,b3
  656. MADD t24,t24,a1,b3
  657. MADD t11,t11,a4,b4
  658. LD a2,4*SIZE(A)
  659. MADD t21,t21,a5,b4
  660. LD a3,5*SIZE(A)
  661. MADD t12,t12,a4,b5
  662. LD b0,8*SIZE(B)
  663. MADD t22,t22,a5,b5
  664. LD b1,9*SIZE(B)
  665. MADD t13,t13,a4,b6
  666. LD b2,10*SIZE(B)
  667. MADD t23,t23,a5,b6
  668. LD b3,11*SIZE(B)
  669. MADD t14,t14,a4,b7
  670. MADD t24,t24,a5,b7
  671. daddiu K,K,-1
  672. MADD t11,t11,a2,b0
  673. LD a6,6*SIZE(A)
  674. MADD t21,t21,a3,b0
  675. LD a7,7*SIZE(A)
  676. MADD t12,t12,a2,b1
  677. LD b4,12*SIZE(B)
  678. MADD t22,t22,a3,b1
  679. LD b5,13*SIZE(B)
  680. MADD t13,t13,a2,b2
  681. LD b6,14*SIZE(B)
  682. MADD t23,t23,a3,b2
  683. LD b7,15*SIZE(B)
  684. MADD t14,t14,a2,b3
  685. MADD t24,t24,a3,b3
  686. daddu A,A,8*SIZE # 2mr*4kr
  687. daddu B,B,16*SIZE # 4nr*4kr
  688. MADD t11,t11,a6,b4
  689. LD a0,0*SIZE(A)
  690. MADD t21,t21,a7,b4
  691. LD a1,1*SIZE(A)
  692. MADD t12,t12,a6,b5
  693. LD b0,0*SIZE(B)
  694. MADD t22,t22,a7,b5
  695. LD b1,1*SIZE(B)
  696. MADD t13,t13,a6,b6
  697. LD b2,2*SIZE(B)
  698. MADD t23,t23,a7,b6
  699. LD b3,3*SIZE(B)
  700. MADD t14,t14,a6,b7
  701. bnez K,.L21
  702. MADD t24,t24,a7,b7
  703. .L25:
  704. #ifndef TRMMKERNEL
  705. andi K,KCO,2 # kr=2
  706. #else
  707. andi K,TEMP,2
  708. #endif
  709. beqz K,.L28
  710. nop
  711. .L26:
  712. MADD t11,t11,a0,b0
  713. LD a4,2*SIZE(A)
  714. MADD t21,t21,a1,b0
  715. LD a5,3*SIZE(A)
  716. MADD t12,t12,a0,b1
  717. LD b4,4*SIZE(B)
  718. MADD t22,t22,a1,b1
  719. LD b5,5*SIZE(B)
  720. MADD t13,t13,a0,b2
  721. LD b6,6*SIZE(B)
  722. MADD t23,t23,a1,b2
  723. LD b7,7*SIZE(B)
  724. MADD t14,t14,a0,b3
  725. MADD t24,t24,a1,b3
  726. daddu A,A,4*SIZE # 2mr*2kr
  727. daddu B,B,8*SIZE # 4nr*2kr
  728. .L27:
  729. MADD t11,t11,a4,b4
  730. LD a0,0*SIZE(A)
  731. MADD t21,t21,a5,b4
  732. LD a1,1*SIZE(A)
  733. MADD t12,t12,a4,b5
  734. LD b0,0*SIZE(B)
  735. MADD t22,t22,a5,b5
  736. LD b1,1*SIZE(B)
  737. MADD t13,t13,a4,b6
  738. LD b2,2*SIZE(B)
  739. MADD t23,t23,a5,b6
  740. LD b3,3*SIZE(B)
  741. MADD t14,t14,a4,b7
  742. MADD t24,t24,a5,b7
  743. .L28: # kr=1
  744. #ifndef TRMMKERNEL
  745. andi K,KCO,1
  746. #else
  747. andi K,TEMP,1
  748. #endif
  749. beqz K,.L29
  750. LD ALPHA,152($sp) # Get ALPHA
  751. MADD t11,t11,a0,b0
  752. MADD t21,t21,a1,b0
  753. daddu A,A,2*SIZE # 2mr*kr
  754. daddu B,B,4*SIZE # 4nr*kr
  755. MADD t12,t12,a0,b1
  756. MADD t22,t22,a1,b1
  757. MADD t13,t13,a0,b2
  758. MADD t23,t23,a1,b2
  759. MADD t14,t14,a0,b3
  760. MADD t24,t24,a1,b3
  761. .L29: # Write Back to C
  762. #ifndef TRMMKERNEL
  763. LD c11,0(CO1) # GEMM write back part
  764. LD c21,1*SIZE(CO1)
  765. LD c12,0(CO2)
  766. LD c22,1*SIZE(CO2)
  767. LD c13,0(CO3)
  768. MADD t11,c11,t11,ALPHA
  769. LD c23,1*SIZE(CO3)
  770. MADD t21,c21,t21,ALPHA
  771. LD c14,0(CO4)
  772. MADD t12,c12,t12,ALPHA
  773. LD c24,1*SIZE(CO4)
  774. MADD t22,c22,t22,ALPHA
  775. ST t11,0(CO1)
  776. MADD t13,c13,t13,ALPHA
  777. ST t21,1*SIZE(CO1)
  778. MADD t23,c23,t23,ALPHA
  779. ST t12,0(CO2)
  780. MADD t14,c14,t14,ALPHA
  781. ST t22,1*SIZE(CO2)
  782. MADD t24,c24,t24,ALPHA
  783. ST t13,0(CO3)
  784. daddu CO1,CO1,2*SIZE # COi += 2
  785. ST t23,1*SIZE(CO3)
  786. daddu CO2,CO2,2*SIZE
  787. ST t14,0(CO4)
  788. daddu CO3,CO3,2*SIZE
  789. ST t24,1*SIZE(CO4)
  790. daddu CO4,CO4,2*SIZE
  791. FETCH $0,0(CO1)
  792. FETCH $0,0(CO2)
  793. FETCH $0,0(CO3)
  794. FETCH $0,0(CO4)
  795. #else
  796. MUL t11, ALPHA, t11 # TRMM write back part
  797. MUL t21, ALPHA, t21
  798. ST t11, 0 * SIZE(CO1)
  799. MUL t12, ALPHA, t12
  800. ST t21, 1 * SIZE(CO1)
  801. MUL t22, ALPHA, t22
  802. ST t12, 0 * SIZE(CO2)
  803. MUL t13, ALPHA, t13
  804. ST t22, 1 * SIZE(CO2)
  805. MUL t23, ALPHA, t23
  806. ST t13, 0 * SIZE(CO3)
  807. MUL t14, ALPHA, t14
  808. ST t23, 1 * SIZE(CO3)
  809. MUL t24, ALPHA, t24
  810. ST t14, 0 * SIZE(CO4)
  811. ST t24, 1 * SIZE(CO4)
  812. daddiu CO1,CO1, 2 * SIZE
  813. daddiu CO2,CO2, 2 * SIZE
  814. daddiu CO3,CO3, 2 * SIZE
  815. daddiu CO4,CO4, 2 * SIZE
  816. FETCH $0,0(CO1)
  817. FETCH $0,0(CO2)
  818. FETCH $0,0(CO3)
  819. FETCH $0,0(CO4)
  820. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  821. dsubu TEMP,KCO,KK
  822. #ifdef LEFT
  823. daddiu TEMP,TEMP,-2
  824. #else
  825. daddiu TEMP,TEMP,-4
  826. #endif
  827. dsll K,TEMP,1 + BASE_SHIFT
  828. dsll TEMP,TEMP,2 + BASE_SHIFT
  829. daddu A,A,K # move A to next panel Ai
  830. daddu B,B,TEMP # move B to next panel Bj
  831. #endif
  832. #ifdef LEFT
  833. daddiu KK, KK, 2
  834. #endif
  835. #endif
  836. .align 3
  837. .L14_M1: # nr=4 panel: leftover single row (1x4 tile)
  838. andi M,MCO,1 # mr=1
  839. beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj
  840. nop
  841. .L30:
  842. #if defined(TRMMKERNEL)
  843. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  844. move B,BO # Reset B
  845. #else
  846. dsll K,KK, BASE_SHIFT # A offset = KK << BASE_SHIFT (mr=1)
  847. dsll TEMP,KK,2 + BASE_SHIFT # B offset = KK << (2+BASE_SHIFT) (nr=4)
  848. daddu A,A,K
  849. daddu B,BO,TEMP
  850. #endif
  851. LD a0, 0 * SIZE(A) # a0
  852. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  853. LD b0,0*SIZE(B)
  854. MOV t12,t11
  855. LD b1,1*SIZE(B)
  856. MOV t13,t11
  857. LD b2,2*SIZE(B)
  858. MOV t14,t11
  859. LD b3,3*SIZE(B)
  860. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  861. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  862. #elif defined(LEFT)
  863. daddiu TEMP, KK, 1 # TEMP = KK + mr
  864. #else
  865. daddiu TEMP, KK, 4 # TEMP = KK + nr
  866. #endif
  867. dsra K,TEMP, 2 # K = TEMP/4 passes of the kr=4 loop
  868. nop
  869. beqz K,.L35
  870. nop
  871. #else
  872. move B,BO # Reset B, GEMM part
  873. dsra K,KCO,2 # K=KCO/4 passes of the kr=4 loop
  874. LD a0, 0 * SIZE(A) # a0
  875. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  876. LD b0,0*SIZE(B)
  877. MOV t12,t11
  878. LD b1,1*SIZE(B)
  879. MOV t13,t11
  880. LD b2,2*SIZE(B)
  881. MOV t14,t11
  882. beqz K,.L35
  883. LD b3,3*SIZE(B) # delay slot: b3 is loaded on both paths
  884. #endif
  885. .L31: # nr=4,mr=1,kr=4
  886. LD a1, 1*SIZE(A) # load a1
  887. MADD t11,t11,a0,b0 # k step 0: t1j += a0*bj
  888. LD b4,4*SIZE(B)
  889. LD b5,5*SIZE(B)
  890. MADD t12,t12,a0,b1
  891. LD b6,6*SIZE(B)
  892. LD b7,7*SIZE(B)
  893. MADD t13,t13,a0,b2
  894. MADD t14,t14,a0,b3
  895. LD a2, 2*SIZE(A) # a2
  896. MADD t11,t11,a1,b4 # k step 1
  897. LD b0,8*SIZE(B)
  898. LD b1,9*SIZE(B)
  899. MADD t12,t12,a1,b5
  900. LD b2,10*SIZE(B)
  901. LD b3,11*SIZE(B)
  902. MADD t13,t13,a1,b6
  903. MADD t14,t14,a1,b7
  904. LD a3, 3*SIZE(A) # a3
  905. MADD t11,t11,a2,b0 # k step 2
  906. daddiu K,K,-1 # one kr=4 pass consumed
  907. LD b4,12*SIZE(B)
  908. LD b5,13*SIZE(B)
  909. MADD t12,t12,a2,b1
  910. daddu A,A,4*SIZE # 1mr*4kr
  911. LD b6,14*SIZE(B)
  912. LD b7,15*SIZE(B)
  913. MADD t13,t13,a2,b2
  914. MADD t14,t14,a2,b3
  915. LD a0, 0*SIZE(A) # a0
  916. daddu B,B,16*SIZE # 4nr*4kr
  917. MADD t11,t11,a3,b4 # k step 3
  918. LD b0,0*SIZE(B)
  919. MADD t12,t12,a3,b5
  920. LD b1,1*SIZE(B)
  921. MADD t13,t13,a3,b6
  922. LD b2,2*SIZE(B)
  923. MADD t14,t14,a3,b7
  924. bnez K,.L31
  925. LD b3,3*SIZE(B) # delay slot: a0,b0-b3 are preloaded for the next pass or the tails
  926. .L35: # kr=2
  927. #ifndef TRMMKERNEL
  928. andi K,KCO,2
  929. #else
  930. andi K,TEMP,2
  931. #endif
  932. beqz K,.L38
  933. nop
  934. .L36:
  935. LD a1,1*SIZE(A) # load a1
  936. MADD t11,t11,a0,b0
  937. LD b4,4*SIZE(B)
  938. LD b5,5*SIZE(B)
  939. MADD t12,t12,a0,b1
  940. daddu A,A,2*SIZE # mr*2kr
  941. LD b6,6*SIZE(B)
  942. MADD t13,t13,a0,b2
  943. LD b7,7*SIZE(B)
  944. MADD t14,t14,a0,b3
  945. daddu B,B,8*SIZE # 4nr*2kr
  946. .L37:
  947. LD a0,0(A) # preload operands for the kr=1 tail
  948. MADD t11,t11,a1,b4
  949. LD b0,0*SIZE(B)
  950. LD b1,1*SIZE(B)
  951. MADD t12,t12,a1,b5
  952. LD b2,2*SIZE(B)
  953. LD b3,3*SIZE(B)
  954. MADD t13,t13,a1,b6
  955. MADD t14,t14,a1,b7
  956. .L38: # kr=1
  957. #ifndef TRMMKERNEL
  958. andi K,KCO,1
  959. #else
  960. andi K,TEMP,1
  961. #endif
  962. beqz K,.L39
  963. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  964. MADD t11,t11,a0,b0
  965. MADD t12,t12,a0,b1
  966. daddu A,A,1*SIZE
  967. daddu B,B,4*SIZE
  968. MADD t13,t13,a0,b2
  969. MADD t14,t14,a0,b3
  970. .L39: # Write Back
  971. #ifndef TRMMKERNEL
  972. LD c11,0(CO1) # one C element from each of the four columns
  973. LD c12,0(CO2)
  974. LD c13,0(CO3)
  975. LD c14,0(CO4)
  976. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  977. MADD t12,c12,t12,ALPHA
  978. MADD t13,c13,t13,ALPHA
  979. MADD t14,c14,t14,ALPHA
  980. ST t11,0(CO1)
  981. ST t12,0(CO2)
  982. ST t13,0(CO3)
  983. ST t14,0(CO4)
  984. #else
  985. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  986. MUL t12, ALPHA, t12
  987. MUL t13, ALPHA, t13
  988. MUL t14, ALPHA, t14
  989. ST t11, 0 * SIZE(CO1)
  990. ST t12, 0 * SIZE(CO2)
  991. ST t13, 0 * SIZE(CO3)
  992. ST t14, 0 * SIZE(CO4)
  993. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  994. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  995. #ifdef LEFT
  996. daddiu TEMP, TEMP, -1 # minus mr
  997. #else
  998. daddiu TEMP, TEMP, -4 # minus nr
  999. #endif
  1000. dsll K,TEMP, BASE_SHIFT # A skip = TEMP << BASE_SHIFT
  1001. dsll TEMP,TEMP, 2 + BASE_SHIFT # B skip = TEMP << (2+BASE_SHIFT)
  1002. daddu A,A,K
  1003. daddu B,B,TEMP
  1004. #endif
  1005. #ifdef LEFT
  1006. daddiu KK, KK, 1 # KK advances by mr
  1007. #endif
  1008. #endif
  1009. .align 3
  1010. .L0_N4_Loop: # mc finished
  1011. daddiu N,N,-1 # N--
  1012. #if defined(TRMMKERNEL) && !defined(LEFT)
  1013. daddiu KK, KK,4 # KK advances by nr=4 per column panel
  1014. #endif
  1015. bnez N,.L0_N4_Lb # loop while 4-column panels remain
  1016. move BO,B # Set BO point to next panel Bj (delay slot: runs on both paths)
  1017. .align 5
  1018. .L0_N2: # two-column remainder of N
  1019. andi N,NCO,2 # nr = 2
  1020. beqz N,.L0_N1
  1021. nop
  1022. .L0_N2_Lb:
  1023. move CO1,C # CO1 = first column of this panel
  1024. daddu CO2,C,LDC # CO2 = CO1 + LDC (second column)
  1025. dsra M,MCO,2 # M = MCO/4 tiles of mr=4
  1026. move A,AO # Reset A
  1027. daddu PREA,AO,SPANA # PREA = AO + SPANA, used as the FETCH pointer for A
  1028. daddu C,CO2,LDC # C now points past the two columns
  1029. #if defined(TRMMKERNEL) && defined(LEFT)
  1030. move KK, OFFSET # restart KK from OFFSET for this panel
  1031. #endif
  1032. beqz M,.L12_M2 # no mr=4 tiles: go to the mr=2 remainder
  1033. nop
  1034. .L40: # nr=2, mr=4 tile: set up pointers, zero 8 accumulators
  1035. #if defined(TRMMKERNEL)
  1036. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1037. move B,BO # Reset B
  1038. #else
  1039. dsll K,KK, 2 + BASE_SHIFT # A offset = KK << (2+BASE_SHIFT) (mr=4)
  1040. dsll TEMP, KK,1 + BASE_SHIFT # B offset = KK << (1+BASE_SHIFT) (nr=2)
  1041. daddu A,A,K
  1042. daddu B,BO,TEMP
  1043. #endif
  1044. LD a0,0*SIZE(A)
  1045. MTC $0,t11 # gemm part
  1046. LD a1,1*SIZE(A)
  1047. MOV t21,t11
  1048. LD b0,0*SIZE(B)
  1049. MOV t31,t11
  1050. LD b1,1*SIZE(B)
  1051. MOV t41,t11
  1052. LD a2,2*SIZE(A)
  1053. LD a3,3*SIZE(A)
  1054. MOV t12,t11
  1055. MOV t22,t11
  1056. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1057. dsubu TEMP,KCO,KK # TEMP = KCO - KK
  1058. #elif defined(LEFT)
  1059. daddiu TEMP, KK, 4 # TEMP = KK + mr
  1060. #else
  1061. daddiu TEMP, KK, 2 # TEMP = KK + nr
  1062. #endif
  1063. dsra K,TEMP,2 # K = TEMP/4 passes of the kr=4 loop
  1064. MOV t32,t11
  1065. beqz K,.L45
  1066. MOV t42,t11 # delay slot: last accumulator cleared on both paths
  1067. #else
  1068. move B,BO # Reset B
  1069. LD a0,0*SIZE(A)
  1070. MTC $0,t11 # gemm part
  1071. LD a1,1*SIZE(A)
  1072. MOV t21,t11
  1073. LD b0,0*SIZE(B)
  1074. MOV t31,t11
  1075. LD b1,1*SIZE(B)
  1076. MOV t41,t11
  1077. LD a2,2*SIZE(A)
  1078. dsra K,KCO,2 # K=KCO/4 passes of the kr=4 loop
  1079. LD a3,3*SIZE(A)
  1080. MOV t12,t11
  1081. MOV t22,t11
  1082. MOV t32,t11
  1083. beqz K,.L45
  1084. MOV t42,t11 # delay slot: last accumulator cleared on both paths
  1085. #endif
  1086. .L41: # nr=2,mr=kr=4
  1087. MADD t11,t11,a0,b0 # k step 0 uses a0-a3,b0,b1; loads a4-a7,b4,b5
  1088. LD a4,4*SIZE(A)
  1089. MADD t21,t21,a1,b0
  1090. LD a5,5*SIZE(A)
  1091. MADD t12,t12,a0,b1
  1092. LD b4,2*SIZE(B)
  1093. MADD t22,t22,a1,b1
  1094. LD b5,3*SIZE(B)
  1095. MADD t31,t31,a2,b0
  1096. LD a6,6*SIZE(A)
  1097. MADD t41,t41,a3,b0
  1098. LD a7,7*SIZE(A)
  1099. FETCH $0,(PREA)
  1100. MADD t32,t32,a2,b1
  1101. MADD t42,t42,a3,b1
  1102. .L42: # k step 1 uses a4-a7,b4,b5; loads a0-a3,b2,b3
  1103. MADD t11,t11,a4,b4
  1104. LD a0,8*SIZE(A)
  1105. MADD t21,t21,a5,b4
  1106. LD a1,9*SIZE(A)
  1107. MADD t12,t12,a4,b5
  1108. LD b2,4*SIZE(B)
  1109. MADD t22,t22,a5,b5
  1110. LD b3,5*SIZE(B)
  1111. MADD t31,t31,a6,b4
  1112. LD a2,10*SIZE(A)
  1113. MADD t41,t41,a7,b4
  1114. LD a3,11*SIZE(A)
  1115. FETCH $0,4*SIZE(PREA)
  1116. MADD t32,t32,a6,b5
  1117. MADD t42,t42,a7,b5
  1118. .L43: # k step 2 uses a0-a3,b2,b3; loads a4-a7,b6,b7
  1119. MADD t11,t11,a0,b2
  1120. LD a4,12*SIZE(A)
  1121. MADD t21,t21,a1,b2
  1122. LD a5,13*SIZE(A)
  1123. MADD t12,t12,a0,b3
  1124. LD b6,6*SIZE(B)
  1125. MADD t22,t22,a1,b3
  1126. LD b7,7*SIZE(B)
  1127. MADD t31,t31,a2,b2
  1128. LD a6,14*SIZE(A)
  1129. MADD t41,t41,a3,b2
  1130. LD a7,15*SIZE(A)
  1131. FETCH $0,8*SIZE(PREA)
  1132. MADD t32,t32,a2,b3
  1133. MADD t42,t42,a3,b3
  1134. daddu A,A,16*SIZE # 4mr*4kr
  1135. daddu B,B,8*SIZE # 2nr*4kr
  1136. .L44: # k step 3 uses a4-a7,b6,b7; reloads a0-a3,b0,b1 from the advanced pointers
  1137. MADD t11,t11,a4,b6
  1138. LD a0,0*SIZE(A)
  1139. MADD t21,t21,a5,b6
  1140. LD a1,1*SIZE(A)
  1141. MADD t12,t12,a4,b7
  1142. LD b0,0*SIZE(B)
  1143. MADD t22,t22,a5,b7
  1144. LD b1,1*SIZE(B)
  1145. daddiu K,K,-1 # one kr=4 pass consumed
  1146. daddu PREA,PREA,16*SIZE
  1147. MADD t31,t31,a6,b6
  1148. LD a2,2*SIZE(A)
  1149. MADD t41,t41,a7,b6
  1150. LD a3,3*SIZE(A)
  1151. FETCH $0,-4*SIZE(PREA) # PREA was already advanced by 16*SIZE above
  1152. MADD t32,t32,a6,b7
  1153. bnez K,.L41
  1154. MADD t42,t42,a7,b7 # delay slot: final MADD of the pass
  1155. .L45: # kr=2
  1156. #ifndef TRMMKERNEL
  1157. andi K,KCO,2
  1158. #else
  1159. andi K,TEMP,2
  1160. #endif
  1161. beqz K,.L48
  1162. nop
  1163. .L46:
  1164. MADD t11,t11,a0,b0
  1165. LD a4,4*SIZE(A)
  1166. MADD t21,t21,a1,b0
  1167. LD a5,5*SIZE(A)
  1168. MADD t12,t12,a0,b1
  1169. LD b4,2*SIZE(B)
  1170. MADD t22,t22,a1,b1
  1171. LD b5,3*SIZE(B)
  1172. MADD t31,t31,a2,b0
  1173. LD a6,6*SIZE(A)
  1174. MADD t41,t41,a3,b0
  1175. LD a7,7*SIZE(A)
  1176. FETCH $0,0(PREA)
  1177. MADD t32,t32,a2,b1
  1178. daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32
  1179. MADD t42,t42,a3,b1
  1180. daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
  1181. .L47: # second k step; preloads a0-a3,b0,b1 for the kr=1 tail
  1182. MADD t11,t11,a4,b4
  1183. LD a0,0*SIZE(A)
  1184. MADD t21,t21,a5,b4
  1185. LD a1,1*SIZE(A)
  1186. MADD t12,t12,a4,b5
  1187. LD b0,0*SIZE(B)
  1188. MADD t22,t22,a5,b5
  1189. LD b1,1*SIZE(B)
  1190. MADD t31,t31,a6,b4
  1191. LD a2,2*SIZE(A)
  1192. MADD t41,t41,a7,b4
  1193. LD a3,3*SIZE(A)
  1194. FETCH $0,4*SIZE(PREA)
  1195. MADD t32,t32,a6,b5
  1196. MADD t42,t42,a7,b5
  1197. daddu PREA,PREA,8*SIZE
  1198. .L48: # kr=1
  1199. #ifndef TRMMKERNEL
  1200. andi K,KCO,1
  1201. #else
  1202. andi K,TEMP,1
  1203. #endif
  1204. beqz K,.L49
  1205. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  1206. FETCH $0,0(PREA)
  1207. MADD t11,t11,a0,b0
  1208. MADD t21,t21,a1,b0
  1209. daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
  1210. MADD t12,t12,a0,b1
  1211. MADD t22,t22,a1,b1
  1212. daddu B,B,2*SIZE
  1213. daddu PREA,PREA,4*SIZE
  1214. MADD t31,t31,a2,b0
  1215. MADD t41,t41,a3,b0
  1216. MADD t32,t32,a2,b1
  1217. MADD t42,t42,a3,b1
  1218. .L49: # Write Back (nr=2, mr=4 tile)
  1219. #ifndef TRMMKERNEL
  1220. LD c11,0(CO1) # gemm write back part: load the 4x2 tile of C
  1221. LD c21,1*SIZE(CO1)
  1222. LD c31,2*SIZE(CO1)
  1223. LD c41,3*SIZE(CO1)
  1224. LD c12,0(CO2)
  1225. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  1226. LD c22,1*SIZE(CO2)
  1227. MADD t21,c21,t21,ALPHA
  1228. LD c32,2*SIZE(CO2)
  1229. MADD t31,c31,t31,ALPHA
  1230. LD c42,3*SIZE(CO2)
  1231. MADD t41,c41,t41,ALPHA
  1232. ST t11,0(CO1)
  1233. MADD t12,c12,t12,ALPHA
  1234. ST t21,1*SIZE(CO1)
  1235. MADD t22,c22,t22,ALPHA
  1236. ST t31,2*SIZE(CO1)
  1237. MADD t32,c32,t32,ALPHA
  1238. ST t41,3*SIZE(CO1)
  1239. MADD t42,c42,t42,ALPHA
  1240. daddiu M,M,-1 # one mr=4 tile done
  1241. ST t12,0(CO2)
  1242. ST t22,1*SIZE(CO2)
  1243. ST t32,2*SIZE(CO2)
  1244. ST t42,3*SIZE(CO2)
  1245. FETCH $0,4*SIZE(CO1) # touch the next tile of C (pointers not yet advanced)
  1246. FETCH $0,4*SIZE(CO2)
  1247. FETCH $0,8*SIZE(CO1)
  1248. FETCH $0,8*SIZE(CO2)
  1249. daddu CO1,CO1,4*SIZE
  1250. bnez M,.L40
  1251. daddu CO2,CO2,4*SIZE # delay slot: CO2 advanced on both paths
  1252. #else
  1253. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  1254. MUL t21, ALPHA, t21
  1255. MUL t31, ALPHA, t31
  1256. MUL t41, ALPHA, t41
  1257. MUL t12, ALPHA, t12
  1258. ST t11, 0 * SIZE(CO1)
  1259. MUL t22, ALPHA, t22
  1260. ST t21, 1 * SIZE(CO1)
  1261. MUL t32, ALPHA, t32
  1262. ST t31, 2 * SIZE(CO1)
  1263. MUL t42, ALPHA, t42
  1264. ST t41, 3 * SIZE(CO1)
  1265. ST t12, 0 * SIZE(CO2)
  1266. daddiu M,M,-1 # one mr=4 tile done
  1267. ST t22, 1 * SIZE(CO2)
  1268. ST t32, 2 * SIZE(CO2)
  1269. ST t42, 3 * SIZE(CO2)
  1270. daddiu CO1,CO1, 4*SIZE
  1271. daddiu CO2,CO2, 4*SIZE
  1272. FETCH $0,0(CO1) # pointers already advanced: same two spots as the gemm path
  1273. FETCH $0,0(CO2)
  1274. FETCH $0,4*SIZE(CO1) # offset is in elements; a raw 4 was 4 bytes past the fetch above
  1275. FETCH $0,4*SIZE(CO2)
  1276. #if ( defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
  1277. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1278. #ifdef LEFT
  1279. daddiu TEMP, TEMP, -4 # minus mr
  1280. #else
  1281. daddiu TEMP, TEMP, -2 # minus nr
  1282. #endif
  1283. dsll K,TEMP, 2 + BASE_SHIFT # A skip = TEMP << (2+BASE_SHIFT)
  1284. dsll TEMP, TEMP, 1 + BASE_SHIFT # B skip = TEMP << (1+BASE_SHIFT)
  1285. daddu A,A,K
  1286. daddu B,B,TEMP
  1287. #endif
  1288. #ifdef LEFT
  1289. daddiu KK, KK, 4 # KK advances by mr
  1290. #endif
  1291. bnez M,.L40
  1292. nop
  1293. #endif
  1294. .align 3
  1295. .L12_M2: # nr=2 panel: two-row remainder (2x2 tile)
  1296. andi M,MCO,2 # mr = 2
  1297. beqz M,.L12_M1
  1298. nop
  1299. .L50:
  1300. #if defined(TRMMKERNEL)
  1301. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1302. move B,BO
  1303. #else
  1304. dsll K, KK, 1 + BASE_SHIFT #mr=2
  1305. dsll TEMP, KK, 1 + BASE_SHIFT #nr=2
  1306. daddu A, A, K
  1307. daddu B, BO, TEMP
  1308. #endif
  1309. LD a0,0*SIZE(A)
  1310. LD a1,1*SIZE(A)
  1311. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  1312. LD b0,0*SIZE(B)
  1313. MOV t21,t11
  1314. LD b1,1*SIZE(B)
  1315. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1316. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1317. #elif defined(LEFT)
  1318. daddiu TEMP, KK, 2 # TEMP = KK + mr
  1319. #else
  1320. daddiu TEMP, KK, 2 # TEMP = KK + nr
  1321. #endif
  1322. dsra K,TEMP,2 # K = TEMP/4 passes of the kr=4 loop
  1323. MOV t12,t11
  1324. beqz K,.L55
  1325. MOV t22,t11 # delay slot: last accumulator cleared on both paths
  1326. #else
  1327. move B,BO
  1328. LD a0,0*SIZE(A)
  1329. dsra K,KCO,2 # K=KCO/4 passes of the kr=4 loop
  1330. LD a1,1*SIZE(A)
  1331. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  1332. LD b0,0*SIZE(B)
  1333. MOV t21,t11
  1334. LD b1,1*SIZE(B)
  1335. MOV t12,t11
  1336. beqz K,.L55
  1337. MOV t22,t11 # delay slot: last accumulator cleared on both paths
  1338. #endif
  1339. .L51: # nr=2 mr=2,kr=4
  1340. MADD t11,t11,a0,b0 # k step 0
  1341. LD a4,2*SIZE(A)
  1342. MADD t21,t21,a1,b0
  1343. LD b4,2*SIZE(B)
  1344. MADD t12,t12,a0,b1
  1345. LD a5,3*SIZE(A)
  1346. MADD t22,t22,a1,b1
  1347. LD b5,3*SIZE(B)
  1348. MADD t11,t11,a4,b4 # k step 1
  1349. LD a2,4*SIZE(A)
  1350. MADD t21,t21,a5,b4
  1351. LD b2,4*SIZE(B)
  1352. MADD t12,t12,a4,b5
  1353. LD a3,5*SIZE(A)
  1354. MADD t22,t22,a5,b5
  1355. daddiu K,K,-1 # one kr=4 pass consumed
  1356. LD b3,5*SIZE(B)
  1357. MADD t11,t11,a2,b2 # k step 2
  1358. LD a6,6*SIZE(A)
  1359. MADD t21,t21,a3,b2
  1360. daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
  1361. LD b6,6*SIZE(B)
  1362. MADD t12,t12,a2,b3
  1363. daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
  1364. LD a7,-1*SIZE(A) # negative offset: A was already advanced above
  1365. MADD t22,t22,a3,b3
  1366. LD b7,-1*SIZE(B) # negative offset: B was already advanced above
  1367. MADD t11,t11,a6,b6 # k step 3
  1368. LD a0,0*SIZE(A)
  1369. MADD t21,t21,a7,b6
  1370. LD b0,0*SIZE(B)
  1371. MADD t12,t12,a6,b7
  1372. LD a1,1*SIZE(A)
  1373. MADD t22,t22,a7,b7
  1374. bnez K,.L51
  1375. LD b1,1*SIZE(B) # delay slot: a0,a1,b0,b1 are preloaded for the next pass or the tails
  1376. .L55: # kr=2
  1377. #ifndef TRMMKERNEL
  1378. andi K,KCO,2
  1379. #else
  1380. andi K,TEMP,2
  1381. #endif
  1382. beqz K,.L58
  1383. nop
  1384. .L56:
  1385. MADD t11,t11,a0,b0
  1386. LD a4,2*SIZE(A)
  1387. MADD t21,t21,a1,b0
  1388. daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
  1389. LD b4,2*SIZE(B)
  1390. MADD t12,t12,a0,b1
  1391. daddu B,B,4*SIZE # 2nr*2kr
  1392. LD a5,-1*SIZE(A) # negative offset: A was already advanced above
  1393. MADD t22,t22,a1,b1
  1394. LD b5,-1*SIZE(B) # negative offset: B was already advanced above
  1395. .L57: # second k step; preloads a0,a1,b0,b1 for the kr=1 tail
  1396. MADD t11,t11,a4,b4
  1397. LD a0,0*SIZE(A)
  1398. MADD t21,t21,a5,b4
  1399. LD b0,0*SIZE(B)
  1400. MADD t12,t12,a4,b5
  1401. LD a1,1*SIZE(A)
  1402. MADD t22,t22,a5,b5
  1403. LD b1,1*SIZE(B)
  1404. .L58: # kr=1
  1405. #ifndef TRMMKERNEL
  1406. andi K,KCO,1
  1407. #else
  1408. andi K,TEMP, 1
  1409. #endif
  1410. beqz K,.L59
  1411. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  1412. MADD t11,t11,a0,b0
  1413. MADD t21,t21,a1,b0
  1414. daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
  1415. daddu B,B,2*SIZE # 2nr*kr
  1416. MADD t12,t12,a0,b1
  1417. MADD t22,t22,a1,b1
  1418. .L59: # Write Back
  1419. #ifndef TRMMKERNEL
  1420. LD c11,0(CO1) # write gemm part back: load the 2x2 tile of C
  1421. LD c21,1*SIZE(CO1)
  1422. LD c12,0(CO2)
  1423. LD c22,1*SIZE(CO2)
  1424. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  1425. MADD t21,c21,t21,ALPHA
  1426. MADD t12,c12,t12,ALPHA
  1427. MADD t22,c22,t22,ALPHA
  1428. ST t11,0(CO1)
  1429. ST t21,1*SIZE(CO1)
  1430. ST t12,0(CO2)
  1431. ST t22,1*SIZE(CO2)
  1432. daddu CO1,CO1,2*SIZE
  1433. daddu CO2,CO2,2*SIZE
  1434. FETCH $0,0(CO1)
  1435. FETCH $0,0(CO2)
  1436. #else
  1437. daddiu M, M, -1 # NOTE(review): M is recomputed by the andi at .L12_M1, so this write looks dead
  1438. daddiu CO1,CO1, 2 * SIZE
  1439. daddiu CO2,CO2, 2 * SIZE
  1440. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  1441. MUL t21, ALPHA, t21
  1442. MUL t12, ALPHA, t12
  1443. MUL t22, ALPHA, t22
  1444. ST t11, -2 * SIZE(CO1) # negative offsets: CO1 and CO2 were already advanced above
  1445. ST t21, -1 * SIZE(CO1)
  1446. ST t12, -2 * SIZE(CO2)
  1447. ST t22, -1 * SIZE(CO2)
  1448. FETCH $0,0(CO1)
  1449. FETCH $0,0(CO2)
  1450. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1451. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1452. #ifdef LEFT
  1453. daddiu TEMP, TEMP, -2 # minus mr
  1454. #else
  1455. daddiu TEMP, TEMP, -2 # minus nr
  1456. #endif
  1457. dsll K, TEMP, 1 + BASE_SHIFT # A skip = TEMP << (1+BASE_SHIFT)
  1458. dsll TEMP, TEMP, 1 + BASE_SHIFT # B skip = TEMP << (1+BASE_SHIFT)
  1459. daddu A, A, K
  1460. daddu B, B, TEMP
  1461. #endif
  1462. #ifdef LEFT
  1463. daddiu KK, KK, 2 # KK advances by mr
  1464. #endif
  1465. #endif
  1466. .align 3
  1467. .L12_M1: # nr=2 panel: leftover single row (1x2 tile)
  1468. andi M,MCO,1 # mr = 1
  1469. beqz M,.L0_N2_Loop
  1470. nop
  1471. .L60:
  1472. #if defined(TRMMKERNEL)
  1473. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1474. move B,BO # Reset B
  1475. #else
  1476. dsll K, KK, 0 + BASE_SHIFT # A offset = KK << BASE_SHIFT (mr=1)
  1477. dsll TEMP, KK, 1 + BASE_SHIFT # B offset = KK << (1+BASE_SHIFT) (nr=2)
  1478. daddu A, A, K
  1479. daddu B, BO, TEMP
  1480. #endif
  1481. LD a0,0*SIZE(A)
  1482. MTC $0,t11 # t11 <- $0; t21 and t22 are cleared too but only t11,t12 are stored below
  1483. MOV t21,t11
  1484. LD b0,0*SIZE(B)
  1485. MOV t12,t11
  1486. LD b1,1*SIZE(B)
  1487. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1488. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1489. #elif defined(LEFT)
  1490. daddiu TEMP, KK, 1 # TEMP = KK + mr
  1491. #else
  1492. daddiu TEMP, KK, 2 # TEMP = KK + nr
  1493. #endif
  1494. dsra K,TEMP,2 # K = TEMP/4 passes of the kr=4 loop
  1495. MOV t22,t11
  1496. beqz K,.L65
  1497. nop
  1498. #else
  1499. dsra K,KCO,2 # K = KCO/4 passes of the kr=4 loop
  1500. move B,BO # Reset B
  1501. LD a0,0*SIZE(A)
  1502. MTC $0,t11 # t11 <- $0; t21 and t22 are cleared too but only t11,t12 are stored below
  1503. MOV t21,t11
  1504. LD b0,0*SIZE(B)
  1505. MOV t12,t11
  1506. LD b1,1*SIZE(B)
  1507. beqz K,.L65
  1508. MOV t22,t11 # delay slot
  1509. #endif
  1510. .L61: # nr=2,mr=1,kr=4
  1511. LD a4, 1*SIZE(A) # a4 = A element for k step 1
  1512. LD b4, 2*SIZE(B)
  1513. MADD t11,t11,a0,b0 # k step 0
  1514. LD b5,3*SIZE(B)
  1515. MADD t12,t12,a0,b1
  1516. LD a2, 2*SIZE(A) # a2 = A element for k step 2
  1517. LD b2,4*SIZE(B)
  1518. MADD t11,t11,a4,b4 # k step 1
  1519. LD b3,5*SIZE(B)
  1520. MADD t12,t12,a4,b5
  1521. LD a6, 3*SIZE(A) # a6 = A element for k step 3
  1522. daddiu K,K,-1 # one kr=4 pass consumed
  1523. LD b6,6*SIZE(B)
  1524. MADD t11,t11,a2,b2 # k step 2
  1525. LD b7,7*SIZE(B)
  1526. MADD t12,t12,a2,b3
  1527. daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
  1528. LD a0, 0*SIZE(A)
  1529. daddu B,B,8*SIZE # B+=2(nr)*4(kr)*8Byte=8*SIZE
  1530. LD b0,0*SIZE(B)
  1531. MADD t11,t11,a6,b6 # k step 3
  1532. LD b1,1*SIZE(B)
  1533. bnez K,.L61
  1534. MADD t12,t12,a6,b7 # delay slot: final MADD of the pass
  1535. .L65: # kr=2
  1536. #ifndef TRMMKERNEL
  1537. andi K,KCO,2
  1538. #else
  1539. andi K,TEMP,2
  1540. #endif
  1541. beqz K,.L68
  1542. nop
  1543. .L66:
  1544. LD a4, 1*SIZE(A) # a4 = A element for the second k step
  1545. MADD t11,t11,a0,b0
  1546. LD b4,2*SIZE(B)
  1547. daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
  1548. LD b5,3*SIZE(B)
  1549. MADD t12,t12,a0,b1
  1550. daddu B,B,4*SIZE
  1551. .L67:
  1552. LD a0,0(A) # a0
  1553. LD b0,0*SIZE(B)
  1554. MADD t11,t11,a4,b4
  1555. LD b1,1*SIZE(B)
  1556. MADD t12,t12,a4,b5
  1557. .L68: # kr=1
  1558. #ifndef TRMMKERNEL
  1559. andi K,KCO,1
  1560. #else
  1561. andi K,TEMP,1
  1562. #endif
  1563. beqz K,.L69
  1564. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  1565. MADD t11,t11,a0,b0
  1566. MADD t12,t12,a0,b1
  1567. daddu A,A,1*SIZE # A+=1(mr)*1(kr)*8Byte=8
  1568. daddu B,B,2*SIZE
  1569. .L69: # Write Back
  1570. #ifndef TRMMKERNEL
  1571. LD c11,0(CO1) # load the 1x2 tile of C
  1572. LD c12,0(CO2)
  1573. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  1574. MADD t12,c12,t12,ALPHA
  1575. ST t11,0(CO1)
  1576. ST t12,0(CO2)
  1577. daddu CO1,CO1,1*SIZE
  1578. daddu CO2,CO2,1*SIZE
  1579. #else
  1580. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  1581. MUL t12, ALPHA, t12
  1582. ST t11, 0 * SIZE(CO1)
  1583. ST t12, 0 * SIZE(CO2)
  1584. daddu CO1,CO1,1*SIZE
  1585. daddu CO2,CO2,1*SIZE
  1586. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1587. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1588. #ifdef LEFT
  1589. daddiu TEMP, TEMP, -1 # minus mr
  1590. #else
  1591. daddiu TEMP, TEMP, -2 # minus nr
  1592. #endif
  1593. dsll K, TEMP, 0 + BASE_SHIFT # A skip = TEMP << BASE_SHIFT
  1594. dsll TEMP, TEMP, 1 + BASE_SHIFT # B skip = TEMP << (1+BASE_SHIFT)
  1595. daddu A, A, K
  1596. daddu B, B, TEMP
  1597. #endif
  1598. #ifdef LEFT
  1599. daddiu KK, KK, 1 # KK advances by mr
  1600. #endif
  1601. #endif
  1602. .L0_N2_Loop: # end of the nr=2 panel (falls through, no branch back)
  1603. #if defined(TRMMKERNEL) && !defined(LEFT)
  1604. daddiu KK, KK, 2 # KK advances by nr=2
  1605. #endif
  1606. move BO, B # BO = start of the next panel of B
  1607. .align 5
  1608. .L0_N1: # one-column remainder of N
  1609. andi N,NCO,1 # nr = 1
  1610. beqz N,.L999 # nothing left: go to the epilogue
  1611. nop
  1612. move CO1,C # CO1 = the single remaining column
  1613. dsra M,MCO,2 # M = MCO/4 tiles of mr=4
  1614. move A,AO # Reset A
  1615. daddu PREA,AO,SPANA # PREA = AO + SPANA, used as the FETCH pointer for A
  1616. #if defined(TRMMKERNEL) && defined(LEFT)
  1617. move KK, OFFSET # restart KK from OFFSET for this panel
  1618. #endif
  1619. beqz M,.L11_M2 # no mr=4 tiles: go to the mr=2 remainder
  1620. daddu C,CO1,LDC # delay slot: C advanced past this column on both paths
  1621. .L70: # nr=1, mr=4 tile
  1622. #if defined(TRMMKERNEL)
  1623. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1624. move B, BO # Reset B
  1625. #else
  1626. dsll K, KK, 2 + BASE_SHIFT # A offset = KK << (2+BASE_SHIFT) (mr=4)
  1627. dsll TEMP, KK, 0 + BASE_SHIFT # B offset = KK << BASE_SHIFT (nr=1)
  1628. daddu A, A, K
  1629. daddu B, BO, TEMP
  1630. #endif
  1631. LD b0, 0*SIZE(B)
  1632. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  1633. LD a0,0*SIZE(A)
  1634. MOV t21,t11
  1635. LD a1,1*SIZE(A)
  1636. MOV t31,t11
  1637. LD a2,2*SIZE(A)
  1638. MOV t41,t11
  1639. LD a3,3*SIZE(A)
  1640. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1641. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1642. #elif defined(LEFT)
  1643. daddiu TEMP, KK, 4 # TEMP = KK + mr
  1644. #else
  1645. daddiu TEMP, KK, 1 # TEMP = KK + nr
  1646. #endif
  1647. dsra K,TEMP,2 # K = TEMP/4 passes of the kr=4 loop
  1648. beqz K,.L75
  1649. nop
  1650. #else
  1651. move B, BO # Reset B
  1652. dsra K,KCO,2 # K = KCO/4 passes of the kr=4 loop
  1653. LD b0, 0*SIZE(B)
  1654. MTC $0,t11 # t11 <- $0: start the four accumulators at zero
  1655. LD a0,0*SIZE(A)
  1656. MOV t21,t11
  1657. LD a1,1*SIZE(A)
  1658. MOV t31,t11
  1659. LD a2,2*SIZE(A)
  1660. MOV t41,t11
  1661. beqz K,.L75
  1662. LD a3,3*SIZE(A) # delay slot: a3 is loaded on both paths
  1663. #endif
  1664. .L71: # nr=1,mr=kr=4
  1665. LD b4, 1*SIZE(B) # b1
  1666. MADD t11,t11,a0,b0 # k step 0 uses a0-a3,b0; loads a4-a7
  1667. LD a4, 4*SIZE(A)
  1668. MADD t21,t21,a1,b0
  1669. LD a5, 5*SIZE(A)
  1670. FETCH $0,(PREA)
  1671. LD a6,6*SIZE(A)
  1672. MADD t31,t31,a2,b0
  1673. LD a7,7*SIZE(A)
  1674. MADD t41,t41,a3,b0
  1675. .L72: # k step 1 uses a4-a7,b4; loads a0-a3
  1676. LD b2, 2*SIZE(B) # b2
  1677. MADD t11,t11,a4,b4
  1678. LD a0,8*SIZE(A)
  1679. MADD t21,t21,a5,b4
  1680. LD a1,9*SIZE(A)
  1681. FETCH $0,4*SIZE(PREA)
  1682. LD a2,10*SIZE(A)
  1683. MADD t31,t31,a6,b4
  1684. LD a3,11*SIZE(A)
  1685. MADD t41,t41,a7,b4
  1686. .L73: # k step 2 uses a0-a3,b2; loads a4-a7
  1687. LD b6, 3*SIZE(B)
  1688. MADD t11,t11,a0,b2
  1689. LD a4,12*SIZE(A)
  1690. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1691. LD a5,13*SIZE(A)
  1692. MADD t21,t21,a1,b2
  1693. LD a6,14*SIZE(A)
  1694. FETCH $0,8*SIZE(PREA)
  1695. MADD t31,t31,a2,b2
  1696. LD a7,15*SIZE(A)
  1697. MADD t41,t41,a3,b2
  1698. daddu A,A,16*SIZE # A+=4(mr)*4(kr)*8Byte=16*SIZE
  1699. .L74: # k step 3 uses a4-a7,b6; reloads a0-a3,b0 from the advanced pointers
  1700. LD b0, 0*SIZE(B)
  1701. MADD t11,t11,a4,b6
  1702. LD a0,0*SIZE(A)
  1703. daddu PREA,PREA,16*SIZE
  1704. LD a1,1*SIZE(A)
  1705. MADD t21,t21,a5,b6
  1706. LD a2,2*SIZE(A)
  1707. daddiu K,K,-1 # one kr=4 pass consumed
  1708. MADD t31,t31,a6,b6
  1709. LD a3,3*SIZE(A)
  1710. MADD t41,t41,a7,b6
  1711. bnez K,.L71
  1712. FETCH $0,-32(PREA) # delay slot; raw byte offset, PREA was already advanced above
  1713. .L75: # kr=2
  1714. #ifndef TRMMKERNEL
  1715. andi K,KCO,2
  1716. #else
  1717. andi K,TEMP,2
  1718. #endif
  1719. beqz K,.L78
  1720. nop
  1721. .L76:
  1722. LD b4, 1*SIZE(B)
  1723. MADD t11,t11,a0,b0
  1724. LD a4,4*SIZE(A)
  1725. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  1726. LD a5,5*SIZE(A)
  1727. MADD t21,t21,a1,b0
  1728. FETCH $0,0(PREA)
  1729. LD a6,6*SIZE(A)
  1730. MADD t31,t31,a2,b0
  1731. LD a7,7*SIZE(A)
  1732. MADD t41,t41,a3,b0
  1733. daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE
  1734. .L77: # second k step; preloads a0-a3,b0 for the kr=1 tail
  1735. LD b0,0(B)
  1736. MADD t11,t11,a4,b4
  1737. LD a0,0*SIZE(A)
  1738. MADD t21,t21,a5,b4
  1739. FETCH $0,4*SIZE(PREA)
  1740. LD a1,1*SIZE(A)
  1741. MADD t31,t31,a6,b4
  1742. LD a2,2*SIZE(A)
  1743. MADD t41,t41,a7,b4
  1744. LD a3,3*SIZE(A)
  1745. daddu PREA,PREA,8*SIZE
  1746. .L78: # kr=1
  1747. #ifndef TRMMKERNEL
  1748. andi K,KCO,1
  1749. #else
  1750. andi K,TEMP,1
  1751. #endif
  1752. beqz K,.L79
  1753. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  1754. FETCH $0,0(PREA)
  1755. MADD t11,t11,a0,b0
  1756. MADD t21,t21,a1,b0
  1757. daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32
  1758. MADD t31,t31,a2,b0
  1759. MADD t41,t41,a3,b0
  1760. daddu B,B,1*SIZE
  1761. daddu PREA,PREA,4*SIZE
  1762. .L79: # Write Back
  1763. #ifndef TRMMKERNEL
  1764. LD c11,0(CO1) # load the 4x1 tile of C
  1765. LD c21,1*SIZE(CO1)
  1766. LD c31,2*SIZE(CO1)
  1767. LD c41,3*SIZE(CO1)
  1768. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  1769. MADD t21,c21,t21,ALPHA
  1770. MADD t31,c31,t31,ALPHA
  1771. MADD t41,c41,t41,ALPHA
  1772. ST t11,0(CO1)
  1773. ST t21,1*SIZE(CO1)
  1774. ST t31,2*SIZE(CO1)
  1775. ST t41,3*SIZE(CO1)
  1776. daddiu M,M,-1 # M--
  1777. FETCH $0,4*SIZE(CO1)
  1778. FETCH $0,8*SIZE(CO1)
  1779. bnez M,.L70 # M!=0
  1780. daddu CO1,CO1,4*SIZE # COx += 4*8Byte
  1781. #else
  1782. daddiu M,M,-1 # M--
  1783. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  1784. MUL t21, ALPHA, t21
  1785. MUL t31, ALPHA, t31
  1786. MUL t41, ALPHA, t41
  1787. ST t11,0(CO1)
  1788. ST t21,1*SIZE(CO1)
  1789. ST t31,2*SIZE(CO1)
  1790. ST t41,3*SIZE(CO1)
  1791. FETCH $0,4*SIZE(CO1)
  1792. FETCH $0,8*SIZE(CO1)
  1793. daddu CO1,CO1,4*SIZE
  1794. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1795. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1796. #ifdef LEFT
  1797. daddiu TEMP, TEMP, -4 # minus mr
  1798. #else
  1799. daddiu TEMP, TEMP, -1 # minus nr
  1800. #endif
  1801. dsll K, TEMP, 2 + BASE_SHIFT # A skip = TEMP << (2+BASE_SHIFT)
  1802. dsll TEMP, TEMP, 0 + BASE_SHIFT # B skip = TEMP << BASE_SHIFT
  1803. daddu A, A,K
  1804. daddu B, B, TEMP
  1805. #endif
  1806. #ifdef LEFT
  1807. daddiu KK, KK, 4 # KK advances by mr
  1808. #endif
  1809. bnez M,.L70
  1810. nop
  1811. #endif
  1812. .align 3
  1813. .L11_M2: # nr=1 panel: two-row remainder (2x1 tile)
  1814. andi M,MCO,2 # mr = 2
  1815. beqz M,.L11_M1
  1816. nop
  1817. .L80:
  1818. #if defined(TRMMKERNEL)
  1819. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1820. move B, BO
  1821. #else
  1822. dsll K, KK, 1 + BASE_SHIFT # A offset = KK << (1+BASE_SHIFT) (mr=2)
  1823. dsll TEMP, KK, 0 + BASE_SHIFT # B offset = KK << BASE_SHIFT (nr=1)
  1824. daddu A, A, K
  1825. daddu B, BO, TEMP
  1826. #endif
  1827. LD b0, 0*SIZE(B)
  1828. MTC $0,t11 # t11 <- $0: start both accumulators at zero
  1829. MOV t21,t11
  1830. LD a0,0*SIZE(A)
  1831. LD a1,1*SIZE(A)
  1832. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1833. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1834. #elif defined(LEFT)
  1835. daddiu TEMP, KK, 2 # TEMP = KK + mr
  1836. #else
  1837. daddiu TEMP, KK, 1 # TEMP = KK + nr
  1838. #endif
  1839. dsra K,TEMP,2 # K=TEMP/4 passes of the kr=4 loop
  1840. beqz K,.L85
  1841. nop
  1842. #else
  1843. move B, BO
  1844. dsra K,KCO,2 # K = KCO/4 passes of the kr=4 loop
  1845. LD b0, 0*SIZE(B)
  1846. MTC $0,t11 # t11 <- $0: start both accumulators at zero
  1847. MOV t21,t11
  1848. LD a0,0*SIZE(A)
  1849. beqz K,.L85
  1850. LD a1,1*SIZE(A) # delay slot: a1 is loaded on both paths
  1851. #endif
  1852. .L81: # nr=1,mr=2,kr=4
  1853. LD b4, 1*SIZE(B)
  1854. LD a4,2*SIZE(A)
  1855. MADD t11,t11,a0,b0 # k step 0
  1856. LD a5,3*SIZE(A)
  1857. MADD t21,t21,a1,b0
  1858. LD b2, 2*SIZE(B)
  1859. LD a2,4*SIZE(A)
  1860. MADD t11,t11,a4,b4 # k step 1
  1861. LD a3,5*SIZE(A)
  1862. MADD t21,t21,a5,b4
  1863. LD b6, 3*SIZE(B)
  1864. LD a6,6*SIZE(A)
  1865. MADD t11,t11,a2,b2 # k step 2
  1866. LD a7,7*SIZE(A)
  1867. MADD t21,t21,a3,b2
  1868. daddu A,A,8*SIZE # A+=2(mr)*4(kr)*8Byte=8*SIZE
  1869. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1870. LD b0, 0*SIZE(B)
  1871. daddiu K,K,-1 # one kr=4 pass consumed
  1872. LD a0,0*SIZE(A)
  1873. MADD t11,t11,a6,b6 # k step 3
  1874. LD a1,1*SIZE(A)
  1875. bnez K,.L81
  1876. MADD t21,t21,a7,b6 # delay slot: final MADD of the pass
  1877. .L85: # kr=2
  1878. #ifndef TRMMKERNEL
  1879. andi K,KCO,2
  1880. #else
  1881. andi K,TEMP,2
  1882. #endif
  1883. beqz K,.L88
  1884. nop
  1885. .L86:
  1886. LD b4, 1*SIZE(B)
  1887. LD a4,2*SIZE(A)
  1888. MADD t11,t11,a0,b0
  1889. LD a5,3*SIZE(A)
  1890. MADD t21,t21,a1,b0
  1891. daddu A,A,4*SIZE # A+=2(mr)*2(kr)*8Byte=32
  1892. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  1893. LD b0,0(B) # preload operands for the kr=1 tail
  1894. LD a0,0*SIZE(A)
  1895. MADD t11,t11,a4,b4
  1896. LD a1,1*SIZE(A)
  1897. MADD t21,t21,a5,b4
  1898. .L88: # kr=1
  1899. #ifndef TRMMKERNEL
  1900. andi K,KCO,1
  1901. #else
  1902. andi K,TEMP,1
  1903. #endif
  1904. beqz K,.L89
  1905. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  1906. MADD t11,t11,a0,b0
  1907. MADD t21,t21,a1,b0
  1908. daddu A,A,2*SIZE # A+=2(mr)*1(kr)*8Byte=16
  1909. daddu B,B,1*SIZE
  1910. .L89: # Write Back
  1911. #ifndef TRMMKERNEL
  1912. LD c11,0(CO1) # load the 2x1 tile of C
  1913. LD c21,1*SIZE(CO1)
  1914. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  1915. MADD t21,c21,t21,ALPHA
  1916. ST t11,0(CO1)
  1917. ST t21,1*SIZE(CO1)
  1918. FETCH $0,2*SIZE(CO1)
  1919. daddu CO1,CO1,2*SIZE # COx += 2*8Byte
  1920. #else
  1921. daddu CO1,CO1,2*SIZE # COx += 2*8Byte
  1922. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  1923. MUL t21, ALPHA, t21
  1924. FETCH $0,0(CO1)
  1925. ST t11, -2 * SIZE(CO1) # negative offsets: CO1 was already advanced above
  1926. ST t21, -1 * SIZE(CO1)
  1927. #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1928. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1929. #ifdef LEFT
  1930. daddiu TEMP, TEMP, -2 # minus mr
  1931. #else
  1932. daddiu TEMP, TEMP, -1 # minus nr
  1933. #endif
  1934. dsll K, TEMP, 1 + BASE_SHIFT # A skip = TEMP << (1+BASE_SHIFT)
  1935. dsll TEMP, TEMP, 0 + BASE_SHIFT # B skip = TEMP << BASE_SHIFT
  1936. daddu A, A, K
  1937. daddu B, B, TEMP
  1938. #endif
  1939. #ifdef LEFT
  1940. daddiu KK, KK, 2 # KK advances by mr
  1941. #endif
  1942. #endif
  1943. .align 3
  1944. .L11_M1: # nr=1 panel: leftover single row (1x1 tile)
  1945. andi M,MCO,1 # mr = 1
  1946. beqz M,.L999
  1947. nop
  1948. .L90:
  1949. #if defined(TRMMKERNEL)
  1950. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1951. move B, BO
  1952. #else
  1953. dsll K, KK, 0 + BASE_SHIFT # A offset = KK << BASE_SHIFT (mr=1)
  1954. dsll TEMP, KK, 0 + BASE_SHIFT # B offset = KK << BASE_SHIFT (nr=1)
  1955. daddu A, A, K
  1956. daddu B, BO, TEMP
  1957. #endif
  1958. LD a0, 0*SIZE(A)
  1959. LD b0, 0*SIZE(B)
  1960. MTC $0,t11 # t11 <- $0: start the accumulator at zero
  1961. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1962. dsubu TEMP, KCO, KK # TEMP = KCO - KK
  1963. #elif defined(LEFT)
  1964. daddiu TEMP, KK, 1 # TEMP = KK + mr
  1965. #else
  1966. daddiu TEMP, KK, 1 # TEMP = KK + nr
  1967. #endif
  1968. dsra K, TEMP, 2 # K = TEMP/4 passes of the kr=4 loop
  1969. beqz K,.L95
  1970. nop
  1971. #else
  1972. move B, BO
  1973. LD a0, 0*SIZE(A)
  1974. LD b0, 0*SIZE(B)
  1975. dsra K,KCO,2 # K = KCO/4 passes of the kr=4 loop
  1976. beqz K,.L95
  1977. MTC $0,t11 # delay slot: accumulator cleared on both paths
  1978. #endif
  1979. .L91: # nr=mr=1,kr=4
  1980. LD a4, 1*SIZE(A)
  1981. LD b4, 1*SIZE(B)
  1982. MADD t11,t11,a0,b0 # k step 0
  1983. LD a2, 2*SIZE(A)
  1984. LD b2, 2*SIZE(B)
  1985. MADD t11,t11,a4,b4 # k step 1
  1986. LD a6, 3*SIZE(A)
  1987. LD b6, 3*SIZE(B)
  1988. MADD t11,t11,a2,b2 # k step 2
  1989. daddu A,A,4*SIZE # A+=1(mr)*4(kr)*8Byte=32
  1990. daddu B,B,4*SIZE # B+=1(nr)*4(kr)*8Byte=32
  1991. LD a0, 0*SIZE(A)
  1992. LD b0, 0*SIZE(B)
  1993. MADD t11,t11,a6,b6 # k step 3
  1994. daddiu K,K,-1 # one kr=4 pass consumed
  1995. bnez K,.L91
  1996. nop
  1997. .L95: # kr=2
  1998. #ifndef TRMMKERNEL
  1999. andi K,KCO,2
  2000. #else
  2001. andi K,TEMP,2
  2002. #endif
  2003. beqz K,.L98
  2004. nop
  2005. .L96:
  2006. LD a4, 1*SIZE(A)
  2007. LD b4, 1*SIZE(B)
  2008. MADD t11,t11,a0,b0
  2009. daddu B,B,2*SIZE # B+=1(nr)*2(kr)*8Byte=16
  2010. daddu A,A,2*SIZE # A+=1(mr)*2(kr)*8Byte=16
  2011. LD b0,0(B) # preload operands for the kr=1 tail
  2012. LD a0,0(A)
  2013. MADD t11,t11,a4,b4
  2014. .L98: # kr=1
  2015. #ifndef TRMMKERNEL
  2016. andi K,KCO,1
  2017. #else
  2018. andi K,TEMP,1
  2019. #endif
  2020. beqz K,.L99
  2021. LD ALPHA,152($sp) # Get ALPHA (delay slot: loaded on both paths)
  2022. MADD t11,t11,a0,b0 # A and B are not advanced here: this is the last tile
  2023. .L99: # Write Back
  2024. #ifndef TRMMKERNEL
  2025. LD c11,0(CO1) # load the single C element
  2026. MADD t11,c11,t11,ALPHA # t11 = c11 + t11*ALPHA
  2027. ST t11,0(CO1)
  2028. #else
  2029. MUL t11, ALPHA, t11 # TRMM: C is overwritten, not accumulated
  2030. ST t11, 0 * SIZE(CO1)
  2031. #endif
  2032. .L999: # End: restore saved registers from the 160-byte frame and return
  2033. ld $16, 0($sp) # integer registers $16-$22 from offsets 0..48
  2034. ld $17, 8($sp)
  2035. ld $18, 16($sp)
  2036. ld $19, 24($sp)
  2037. ld $20, 32($sp)
  2038. ld $21, 40($sp)
  2039. ld $22, 48($sp)
  2040. LD $f24, 56($sp) # floating-point registers $f24-$f28 from offsets 56..88
  2041. LD $f25, 64($sp)
  2042. LD $f26, 72($sp)
  2043. LD $f27, 80($sp)
  2044. LD $f28, 88($sp)
  2045. ld $23, 96($sp) # integer registers $23-$25 from offsets 96..112
  2046. ld $24, 104($sp)
  2047. ld $25, 112($sp)
  2048. LD $f20,120($sp) # floating-point registers $f20-$f23 from offsets 120..144
  2049. LD $f21,128($sp)
  2050. LD $f22,136($sp)
  2051. LD $f23,144($sp)
  2052. j $31 # return to the caller
  2053. daddiu $sp, $sp, 160 # delay slot: pop the frame (offset 152 is the ALPHA slot read by the tiles)
  2054. EPILOGUE