You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_LT.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER /* defined ahead of the include below; presumably selects the assembler-side parts of common.h - TODO confirm */
  39. #include "common.h" /* presumably supplies the macros used but not defined in this file (PROLOGUE, SDARG, LDARG, LD, MADD, NMSUB, MUL, SUB, MOV, MTC, NOP, SIZE, BASE_SHIFT) - confirm */
  40. #define M $4 /* integer register: dimension M; the I counter is set to M >> 1 */
  41. #define N $5 /* integer register: dimension N; the J counter is set to N >> 3 */
  42. #define K $6 /* integer register: dimension K; scales the A/B pointer adjustments on the LN/RT paths */
  43. #define A $8 /* base pointer of the A operand (copied to AO or AORIG per tile) */
  44. #define B $9 /* base pointer of the B operand (copied to BO, or offset by KK) */
  45. #define C $10 /* base pointer of the C operand; CO1..CO8 are derived from it */
  46. #define LDC $11 /* stride between CO1..CO8; shifted left by BASE_SHIFT right after the prologue */
  47. #define AO $12 /* running pointer into A used by the loads in the compute loops */
  48. #define BO $13 /* running pointer into B used by the loads in the compute loops */
  49. #define I $2 /* counter initialised to M >> 1 and tested before entering .L11 */
  50. #define J $3 /* counter initialised to N >> 3 and decremented in .L10 */
  51. #define L $7 /* inner-loop trip count (KK or TEMP, >> 2 for .L12, & 3 for .L16); also reused as a scratch byte offset */
  52. #define CO1 $14 /* CO1 = C */
  53. #define CO2 $15 /* CO2 = C + LDC */
  54. #define CO3 $16 /* CO3 = CO2 + LDC; $16 is saved on the stack in the prologue */
  55. #define CO4 $17 /* CO4 = CO3 + LDC; $17 is saved on the stack in the prologue */
  56. #define CO5 $18 /* CO5 = CO4 + LDC; $18 is saved on the stack in the prologue */
  57. #define CO6 $19 /* CO6 = CO5 + LDC; $19 is saved on the stack in the prologue */
  58. #define CO7 $20 /* CO7 = CO6 + LDC; $20 is saved on the stack in the prologue */
  59. #define CO8 $21 /* CO8 = CO7 + LDC; $21 is saved on the stack in the prologue */
  60. #define OFFSET $22 /* loaded from 144($sp) after the frame is set up; $22 is saved first */
  61. #define KK $23 /* running offset derived from OFFSET (combined with M or N depending on LN/LT/RN/RT) */
  62. #define TEMP $24 /* integer scratch register */
  63. #define AORIG $25 /* copy of A kept on the LN/RT paths; AO is recomputed from it */
  64. #define a1 $f0 /* a1..a4: FP registers loaded from AO in the compute loops */
  65. #define a2 $f1
  66. #define a3 $f27 /* $f27 is saved on the stack in the prologue */
  67. #define a4 $f28 /* $f28 is saved on the stack in the prologue */
  68. #define b1 $f2 /* b1..b8: FP registers loaded from B/BO in the compute loops and reused as temporaries in the solve step */
  69. #define b2 $f3
  70. #define b3 $f4
  71. #define b4 $f5
  72. #define b5 $f6
  73. #define b6 $f7
  74. #define b7 $f8
  75. #define b8 $f9
  76. #define a5 b8 /* a5 is a second name for b8 ($f9), not a separate register */
  77. #define c11 $f10 /* c11..c82: sixteen FP accumulators updated by the MADD sequences; c11 is zeroed with MTC and copied into the others with MOV */
  78. #define c12 $f11
  79. #define c21 $f12
  80. #define c22 $f13
  81. #define c31 $f14
  82. #define c32 $f16 /* numbering skips $f15, which is assigned to ALPHA below */
  83. #define c41 $f17
  84. #define c42 $f18
  85. #define c51 $f19
  86. #define c52 $f20 /* $f20..$f23 are saved in the prologue only when __64BIT__ is not defined */
  87. #define c61 $f21
  88. #define c62 $f22
  89. #define c71 $f23
  90. #define c72 $f24 /* $f24..$f26 are saved on the stack in the prologue */
  91. #define c81 $f25
  92. #define c82 $f26
  93. #define ALPHA $f15 /* NOTE(review): no use of ALPHA is visible in the reviewed portion of this file - confirm whether it is still needed */
  94. PROLOGUE
  95. daddiu $sp, $sp, -144
  96. SDARG $16, 0($sp)
  97. SDARG $17, 8($sp)
  98. SDARG $18, 16($sp)
  99. SDARG $19, 24($sp)
  100. SDARG $20, 32($sp)
  101. SDARG $21, 40($sp)
  102. sdc1 $f24, 48($sp)
  103. sdc1 $f25, 56($sp)
  104. sdc1 $f26, 64($sp)
  105. sdc1 $f27, 72($sp)
  106. sdc1 $f28, 80($sp)
  107. SDARG $22, 88($sp)
  108. SDARG $23, 96($sp)
  109. SDARG $24, 104($sp)
  110. SDARG $25, 112($sp)
  111. #ifndef __64BIT__
  112. sdc1 $f20,112($sp)
  113. sdc1 $f21,120($sp)
  114. sdc1 $f22,128($sp)
  115. sdc1 $f23,136($sp)
  116. #endif
  117. LDARG OFFSET, 144($sp)
  118. dsll LDC, LDC, BASE_SHIFT
  119. #ifdef LN
  120. mult M, K
  121. mflo TEMP
  122. dsll TEMP, TEMP, BASE_SHIFT
  123. daddu A, A, TEMP
  124. dsll TEMP, M, BASE_SHIFT
  125. daddu C, C, TEMP
  126. #endif
  127. #ifdef RN
  128. neg KK, OFFSET
  129. #endif
  130. #ifdef RT
  131. mult N, K
  132. mflo TEMP
  133. dsll TEMP, TEMP, BASE_SHIFT
  134. daddu B, B, TEMP
  135. mult N, LDC
  136. mflo TEMP
  137. daddu C, C, TEMP
  138. dsubu KK, N, OFFSET
  139. #endif
  140. dsra J, N, 3
  141. blez J, .L30
  142. nop
  143. .L10:
  144. #ifdef RT
  145. dsll TEMP, K, 3 + BASE_SHIFT
  146. dsubu B, B, TEMP
  147. dsll TEMP, LDC, 3
  148. dsubu C, C, TEMP
  149. #endif
  150. move CO1, C
  151. MTC $0, c11
  152. daddu CO2, C, LDC
  153. daddu CO3, CO2, LDC
  154. daddiu J, J, -1
  155. daddu CO4, CO3, LDC
  156. MOV c21, c11
  157. daddu CO5, CO4, LDC
  158. MOV c31, c11
  159. daddu CO6, CO5, LDC
  160. MOV c41, c11
  161. daddu CO7, CO6, LDC
  162. MOV c51, c11
  163. daddu CO8, CO7, LDC
  164. dsra I, M, 1
  165. #ifdef LN
  166. daddu KK, M, OFFSET
  167. #endif
  168. #ifdef LT
  169. move KK, OFFSET
  170. #endif
  171. #if defined(LN) || defined(RT)
  172. move AORIG, A
  173. #else
  174. move AO, A
  175. #endif
  176. #ifndef RT
  177. daddu C, CO8, LDC
  178. #endif
  179. blez I, .L20
  180. MOV c61, c11
  181. .L11:
  182. #if defined(LT) || defined(RN)
  183. LD a1, 0 * SIZE(AO)
  184. MOV c71, c11
  185. LD b1, 0 * SIZE(B)
  186. MOV c81, c11
  187. LD a3, 4 * SIZE(AO)
  188. MOV c12, c11
  189. LD b2, 1 * SIZE(B)
  190. MOV c22, c11
  191. dsra L, KK, 2
  192. MOV c32, c11
  193. LD b3, 2 * SIZE(B)
  194. MOV c42, c11
  195. LD b4, 3 * SIZE(B)
  196. MOV c52, c11
  197. LD b5, 4 * SIZE(B)
  198. MOV c62, c11
  199. LD b6, 8 * SIZE(B)
  200. MOV c72, c11
  201. LD b7, 12 * SIZE(B)
  202. MOV c82, c11
  203. blez L, .L15
  204. move BO, B
  205. #else
  206. #ifdef LN
  207. dsll TEMP, K, 1 + BASE_SHIFT
  208. dsubu AORIG, AORIG, TEMP
  209. #endif
  210. dsll L, KK, 1 + BASE_SHIFT
  211. dsll TEMP, KK, 3 + BASE_SHIFT
  212. daddu AO, AORIG, L
  213. daddu BO, B, TEMP
  214. dsubu TEMP, K, KK
  215. LD a1, 0 * SIZE(AO)
  216. MOV c71, c11
  217. LD b1, 0 * SIZE(BO)
  218. MOV c81, c11
  219. LD a3, 4 * SIZE(AO)
  220. MOV c12, c11
  221. LD b2, 1 * SIZE(BO)
  222. MOV c22, c11
  223. dsra L, TEMP, 2
  224. MOV c32, c11
  225. LD b3, 2 * SIZE(BO)
  226. MOV c42, c11
  227. LD b4, 3 * SIZE(BO)
  228. MOV c52, c11
  229. LD b5, 4 * SIZE(BO)
  230. MOV c62, c11
  231. LD b6, 8 * SIZE(BO)
  232. MOV c72, c11
  233. LD b7, 12 * SIZE(BO)
  234. MOV c82, c11
  235. blez L, .L15
  236. NOP
  237. #endif
  238. MADD c11, c11, a1, b1
  239. LD a2, 1 * SIZE(AO)
  240. MADD c21, c21, a1, b2
  241. daddiu L, L, -1
  242. MADD c31, c31, a1, b3
  243. blez L, .L13
  244. MADD c41, c41, a1, b4
  245. NOP
  246. .align 3
  247. .L12:
  248. MADD c12, c12, a2, b1
  249. LD b1, 16 * SIZE(BO)
  250. MADD c22, c22, a2, b2
  251. LD b2, 5 * SIZE(BO)
  252. MADD c32, c32, a2, b3
  253. LD b3, 6 * SIZE(BO)
  254. MADD c42, c42, a2, b4
  255. LD b4, 7 * SIZE(BO)
  256. MADD c51, c51, a1, b5
  257. NOP
  258. MADD c61, c61, a1, b2
  259. LD a4, 2 * SIZE(AO)
  260. MADD c71, c71, a1, b3
  261. NOP
  262. MADD c81, c81, a1, b4
  263. LD a1, 8 * SIZE(AO)
  264. MADD c52, c52, a2, b5
  265. LD b5, 20 * SIZE(BO)
  266. MADD c62, c62, a2, b2
  267. LD b2, 9 * SIZE(BO)
  268. MADD c72, c72, a2, b3
  269. LD b3, 10 * SIZE(BO)
  270. MADD c82, c82, a2, b4
  271. LD b4, 11 * SIZE(BO)
  272. MADD c11, c11, a4, b6
  273. LD a2, 3 * SIZE(AO)
  274. MADD c21, c21, a4, b2
  275. NOP
  276. MADD c31, c31, a4, b3
  277. NOP
  278. MADD c41, c41, a4, b4
  279. NOP
  280. MADD c12, c12, a2, b6
  281. LD b6, 24 * SIZE(BO)
  282. MADD c22, c22, a2, b2
  283. LD b2, 13 * SIZE(BO)
  284. MADD c32, c32, a2, b3
  285. LD b3, 14 * SIZE(BO)
  286. MADD c42, c42, a2, b4
  287. LD b4, 15 * SIZE(BO)
  288. MADD c51, c51, a4, b7
  289. NOP
  290. MADD c61, c61, a4, b2
  291. NOP
  292. MADD c71, c71, a4, b3
  293. NOP
  294. MADD c81, c81, a4, b4
  295. NOP
  296. MADD c52, c52, a2, b7
  297. LD b7, 28 * SIZE(BO)
  298. MADD c62, c62, a2, b2
  299. LD b2, 17 * SIZE(BO)
  300. MADD c72, c72, a2, b3
  301. LD b3, 18 * SIZE(BO)
  302. MADD c82, c82, a2, b4
  303. LD b4, 19 * SIZE(BO)
  304. MADD c11, c11, a3, b1
  305. LD a2, 5 * SIZE(AO)
  306. MADD c21, c21, a3, b2
  307. NOP
  308. MADD c31, c31, a3, b3
  309. NOP
  310. MADD c41, c41, a3, b4
  311. NOP
  312. MADD c12, c12, a2, b1
  313. LD b1, 32 * SIZE(BO)
  314. MADD c22, c22, a2, b2
  315. LD b2, 21 * SIZE(BO)
  316. MADD c32, c32, a2, b3
  317. LD b3, 22 * SIZE(BO)
  318. MADD c42, c42, a2, b4
  319. LD b4, 23 * SIZE(BO)
  320. MADD c51, c51, a3, b5
  321. NOP
  322. MADD c61, c61, a3, b2
  323. LD a4, 6 * SIZE(AO)
  324. MADD c71, c71, a3, b3
  325. NOP
  326. MADD c81, c81, a3, b4
  327. LD a3, 12 * SIZE(AO)
  328. MADD c52, c52, a2, b5
  329. LD b5, 36 * SIZE(BO)
  330. MADD c62, c62, a2, b2
  331. LD b2, 25 * SIZE(BO)
  332. MADD c72, c72, a2, b3
  333. LD b3, 26 * SIZE(BO)
  334. MADD c82, c82, a2, b4
  335. LD b4, 27 * SIZE(BO)
  336. MADD c11, c11, a4, b6
  337. LD a2, 7 * SIZE(AO)
  338. MADD c21, c21, a4, b2
  339. NOP
  340. MADD c31, c31, a4, b3
  341. NOP
  342. MADD c41, c41, a4, b4
  343. daddiu L, L, -1
  344. MADD c12, c12, a2, b6
  345. LD b6, 40 * SIZE(BO)
  346. MADD c22, c22, a2, b2
  347. LD b2, 29 * SIZE(BO)
  348. MADD c32, c32, a2, b3
  349. LD b3, 30 * SIZE(BO)
  350. MADD c42, c42, a2, b4
  351. LD b4, 31 * SIZE(BO)
  352. MADD c51, c51, a4, b7
  353. daddiu BO, BO, 32 * SIZE
  354. MADD c61, c61, a4, b2
  355. daddiu AO, AO, 8 * SIZE
  356. MADD c71, c71, a4, b3
  357. NOP
  358. MADD c81, c81, a4, b4
  359. NOP
  360. MADD c52, c52, a2, b7
  361. LD b7, 12 * SIZE(BO)
  362. MADD c62, c62, a2, b2
  363. LD b2, 1 * SIZE(BO)
  364. MADD c72, c72, a2, b3
  365. LD b3, 2 * SIZE(BO)
  366. MADD c82, c82, a2, b4
  367. LD b4, 3 * SIZE(BO)
  368. MADD c11, c11, a1, b1
  369. LD a2, 1 * SIZE(AO)
  370. MADD c21, c21, a1, b2
  371. NOP
  372. MADD c31, c31, a1, b3
  373. bgtz L, .L12
  374. MADD c41, c41, a1, b4
  375. NOP
  376. .align 3
  377. .L13:
  378. MADD c12, c12, a2, b1
  379. LD b1, 16 * SIZE(BO)
  380. MADD c22, c22, a2, b2
  381. LD b2, 5 * SIZE(BO)
  382. MADD c32, c32, a2, b3
  383. LD b3, 6 * SIZE(BO)
  384. MADD c42, c42, a2, b4
  385. LD b4, 7 * SIZE(BO)
  386. MADD c51, c51, a1, b5
  387. NOP
  388. MADD c61, c61, a1, b2
  389. LD a4, 2 * SIZE(AO)
  390. MADD c71, c71, a1, b3
  391. NOP
  392. MADD c81, c81, a1, b4
  393. LD a1, 8 * SIZE(AO)
  394. MADD c52, c52, a2, b5
  395. LD b5, 20 * SIZE(BO)
  396. MADD c62, c62, a2, b2
  397. LD b2, 9 * SIZE(BO)
  398. MADD c72, c72, a2, b3
  399. LD b3, 10 * SIZE(BO)
  400. MADD c82, c82, a2, b4
  401. LD b4, 11 * SIZE(BO)
  402. MADD c11, c11, a4, b6
  403. LD a2, 3 * SIZE(AO)
  404. MADD c21, c21, a4, b2
  405. NOP
  406. MADD c31, c31, a4, b3
  407. NOP
  408. MADD c41, c41, a4, b4
  409. NOP
  410. MADD c12, c12, a2, b6
  411. LD b6, 24 * SIZE(BO)
  412. MADD c22, c22, a2, b2
  413. LD b2, 13 * SIZE(BO)
  414. MADD c32, c32, a2, b3
  415. LD b3, 14 * SIZE(BO)
  416. MADD c42, c42, a2, b4
  417. LD b4, 15 * SIZE(BO)
  418. MADD c51, c51, a4, b7
  419. NOP
  420. MADD c61, c61, a4, b2
  421. NOP
  422. MADD c71, c71, a4, b3
  423. NOP
  424. MADD c81, c81, a4, b4
  425. NOP
  426. MADD c52, c52, a2, b7
  427. LD b7, 28 * SIZE(BO)
  428. MADD c62, c62, a2, b2
  429. LD b2, 17 * SIZE(BO)
  430. MADD c72, c72, a2, b3
  431. LD b3, 18 * SIZE(BO)
  432. MADD c82, c82, a2, b4
  433. LD b4, 19 * SIZE(BO)
  434. MADD c11, c11, a3, b1
  435. LD a2, 5 * SIZE(AO)
  436. MADD c21, c21, a3, b2
  437. NOP
  438. MADD c31, c31, a3, b3
  439. NOP
  440. MADD c41, c41, a3, b4
  441. NOP
  442. MADD c12, c12, a2, b1
  443. LD b1, 32 * SIZE(BO)
  444. MADD c22, c22, a2, b2
  445. LD b2, 21 * SIZE(BO)
  446. MADD c32, c32, a2, b3
  447. LD b3, 22 * SIZE(BO)
  448. MADD c42, c42, a2, b4
  449. LD b4, 23 * SIZE(BO)
  450. MADD c51, c51, a3, b5
  451. NOP
  452. MADD c61, c61, a3, b2
  453. LD a4, 6 * SIZE(AO)
  454. MADD c71, c71, a3, b3
  455. NOP
  456. MADD c81, c81, a3, b4
  457. LD a3, 12 * SIZE(AO)
  458. MADD c52, c52, a2, b5
  459. LD b5, 36 * SIZE(BO)
  460. MADD c62, c62, a2, b2
  461. LD b2, 25 * SIZE(BO)
  462. MADD c72, c72, a2, b3
  463. LD b3, 26 * SIZE(BO)
  464. MADD c82, c82, a2, b4
  465. LD b4, 27 * SIZE(BO)
  466. MADD c11, c11, a4, b6
  467. LD a2, 7 * SIZE(AO)
  468. MADD c21, c21, a4, b2
  469. NOP
  470. MADD c31, c31, a4, b3
  471. NOP
  472. MADD c41, c41, a4, b4
  473. NOP
  474. MADD c12, c12, a2, b6
  475. LD b6, 40 * SIZE(BO)
  476. MADD c22, c22, a2, b2
  477. LD b2, 29 * SIZE(BO)
  478. MADD c32, c32, a2, b3
  479. LD b3, 30 * SIZE(BO)
  480. MADD c42, c42, a2, b4
  481. LD b4, 31 * SIZE(BO)
  482. MADD c51, c51, a4, b7
  483. daddiu BO, BO, 32 * SIZE
  484. MADD c61, c61, a4, b2
  485. daddiu AO, AO, 8 * SIZE
  486. MADD c71, c71, a4, b3
  487. NOP
  488. MADD c81, c81, a4, b4
  489. NOP
  490. MADD c52, c52, a2, b7
  491. LD b7, 12 * SIZE(BO)
  492. MADD c62, c62, a2, b2
  493. LD b2, 1 * SIZE(BO)
  494. MADD c72, c72, a2, b3
  495. LD b3, 2 * SIZE(BO)
  496. MADD c82, c82, a2, b4
  497. LD b4, 3 * SIZE(BO)
  498. .align 3
  499. .L15:
  500. #if defined(LT) || defined(RN)
  501. andi L, KK, 3
  502. #else
  503. andi L, TEMP, 3
  504. #endif
  505. blez L, .L18
  506. NOP
  507. .align 3
  508. .L16:
  509. MADD c11, c11, a1, b1
  510. LD a2, 1 * SIZE(AO)
  511. MADD c21, c21, a1, b2
  512. NOP
  513. MADD c31, c31, a1, b3
  514. NOP
  515. MADD c41, c41, a1, b4
  516. NOP
  517. MADD c12, c12, a2, b1
  518. LD b1, 8 * SIZE(BO)
  519. MADD c22, c22, a2, b2
  520. LD b2, 5 * SIZE(BO)
  521. MADD c32, c32, a2, b3
  522. LD b3, 6 * SIZE(BO)
  523. MADD c42, c42, a2, b4
  524. LD b4, 7 * SIZE(BO)
  525. MADD c51, c51, a1, b5
  526. daddiu L, L, -1
  527. MADD c61, c61, a1, b2
  528. daddiu AO, AO, 2 * SIZE
  529. MADD c71, c71, a1, b3
  530. daddiu BO, BO, 8 * SIZE
  531. MADD c81, c81, a1, b4
  532. LD a1, 0 * SIZE(AO)
  533. MADD c52, c52, a2, b5
  534. LD b5, 4 * SIZE(BO)
  535. MADD c62, c62, a2, b2
  536. LD b2, 1 * SIZE(BO)
  537. MADD c72, c72, a2, b3
  538. LD b3, 2 * SIZE(BO)
  539. MADD c82, c82, a2, b4
  540. bgtz L, .L16
  541. LD b4, 3 * SIZE(BO)
  542. .L18:
  543. #if defined(LN) || defined(RT)
  544. #ifdef LN
  545. daddiu TEMP, KK, -2
  546. #else
  547. daddiu TEMP, KK, -8
  548. #endif
  549. dsll L, TEMP, 1 + BASE_SHIFT
  550. dsll TEMP, TEMP, 3 + BASE_SHIFT
  551. daddu AO, AORIG, L
  552. daddu BO, B, TEMP
  553. #endif
  554. #if defined(LN) || defined(LT)
  555. LD b1, 0 * SIZE(BO)
  556. LD b2, 1 * SIZE(BO)
  557. LD b3, 2 * SIZE(BO)
  558. LD b4, 3 * SIZE(BO)
  559. SUB c11, b1, c11
  560. LD b5, 4 * SIZE(BO)
  561. SUB c21, b2, c21
  562. LD b6, 5 * SIZE(BO)
  563. SUB c31, b3, c31
  564. LD b7, 6 * SIZE(BO)
  565. SUB c41, b4, c41
  566. LD b8, 7 * SIZE(BO)
  567. SUB c51, b5, c51
  568. LD b1, 8 * SIZE(BO)
  569. SUB c61, b6, c61
  570. LD b2, 9 * SIZE(BO)
  571. SUB c71, b7, c71
  572. LD b3, 10 * SIZE(BO)
  573. SUB c81, b8, c81
  574. LD b4, 11 * SIZE(BO)
  575. SUB c12, b1, c12
  576. LD b5, 12 * SIZE(BO)
  577. SUB c22, b2, c22
  578. LD b6, 13 * SIZE(BO)
  579. SUB c32, b3, c32
  580. LD b7, 14 * SIZE(BO)
  581. SUB c42, b4, c42
  582. LD b8, 15 * SIZE(BO)
  583. SUB c52, b5, c52
  584. #ifdef LN
  585. LD b1, 3 * SIZE(AO)
  586. #else
  587. LD b1, 0 * SIZE(AO)
  588. #endif
  589. SUB c62, b6, c62
  590. SUB c72, b7, c72
  591. SUB c82, b8, c82
  592. #else
  593. LD b1, 0 * SIZE(AO)
  594. LD b2, 1 * SIZE(AO)
  595. LD b3, 2 * SIZE(AO)
  596. LD b4, 3 * SIZE(AO)
  597. SUB c11, b1, c11
  598. LD b5, 4 * SIZE(AO)
  599. SUB c12, b2, c12
  600. LD b6, 5 * SIZE(AO)
  601. SUB c21, b3, c21
  602. LD b7, 6 * SIZE(AO)
  603. SUB c22, b4, c22
  604. LD b8, 7 * SIZE(AO)
  605. SUB c31, b5, c31
  606. LD b1, 8 * SIZE(AO)
  607. SUB c32, b6, c32
  608. LD b2, 9 * SIZE(AO)
  609. SUB c41, b7, c41
  610. LD b3, 10 * SIZE(AO)
  611. SUB c42, b8, c42
  612. LD b4, 11 * SIZE(AO)
  613. LD b5, 12 * SIZE(AO)
  614. SUB c51, b1, c51
  615. LD b6, 13 * SIZE(AO)
  616. SUB c52, b2, c52
  617. LD b7, 14 * SIZE(AO)
  618. SUB c61, b3, c61
  619. LD b8, 15 * SIZE(AO)
  620. SUB c62, b4, c62
  621. SUB c71, b5, c71
  622. SUB c72, b6, c72
  623. SUB c81, b7, c81
  624. SUB c82, b8, c82
  625. #endif
  626. #ifdef LN
  627. MUL c12, b1, c12
  628. LD b2, 2 * SIZE(AO)
  629. MUL c22, b1, c22
  630. MUL c32, b1, c32
  631. MUL c42, b1, c42
  632. MUL c52, b1, c52
  633. MUL c62, b1, c62
  634. MUL c72, b1, c72
  635. MUL c82, b1, c82
  636. NMSUB c11, c11, b2, c12
  637. LD b3, 0 * SIZE(AO)
  638. NMSUB c21, c21, b2, c22
  639. NMSUB c31, c31, b2, c32
  640. NMSUB c41, c41, b2, c42
  641. NMSUB c51, c51, b2, c52
  642. NMSUB c61, c61, b2, c62
  643. NMSUB c71, c71, b2, c72
  644. NMSUB c81, c81, b2, c82
  645. MUL c11, b3, c11
  646. daddiu CO1, CO1, -2 * SIZE
  647. MUL c21, b3, c21
  648. daddiu CO2, CO2, -2 * SIZE
  649. MUL c31, b3, c31
  650. daddiu CO3, CO3, -2 * SIZE
  651. MUL c41, b3, c41
  652. daddiu CO4, CO4, -2 * SIZE
  653. MUL c51, b3, c51
  654. daddiu CO5, CO5, -2 * SIZE
  655. MUL c61, b3, c61
  656. daddiu CO6, CO6, -2 * SIZE
  657. MUL c71, b3, c71
  658. daddiu CO7, CO7, -2 * SIZE
  659. MUL c81, b3, c81
  660. daddiu CO8, CO8, -2 * SIZE
  661. #endif
  662. #ifdef LT
  663. MUL c11, b1, c11
  664. LD b2, 1 * SIZE(AO)
  665. MUL c21, b1, c21
  666. MUL c31, b1, c31
  667. MUL c41, b1, c41
  668. MUL c51, b1, c51
  669. MUL c61, b1, c61
  670. MUL c71, b1, c71
  671. MUL c81, b1, c81
  672. NMSUB c12, c12, b2, c11
  673. LD b3, 3 * SIZE(AO)
  674. NMSUB c22, c22, b2, c21
  675. NMSUB c32, c32, b2, c31
  676. NMSUB c42, c42, b2, c41
  677. NMSUB c52, c52, b2, c51
  678. NMSUB c62, c62, b2, c61
  679. NMSUB c72, c72, b2, c71
  680. NMSUB c82, c82, b2, c81
  681. MUL c12, b3, c12
  682. MUL c22, b3, c22
  683. MUL c32, b3, c32
  684. MUL c42, b3, c42
  685. MUL c52, b3, c52
  686. MUL c62, b3, c62
  687. MUL c72, b3, c72
  688. MUL c82, b3, c82
  689. #endif
  690. #ifdef RN
  691. LD b1, 0 * SIZE(BO)
  692. LD b2, 1 * SIZE(BO)
  693. LD b3, 2 * SIZE(BO)
  694. LD b4, 3 * SIZE(BO)
  695. MUL c11, b1, c11
  696. MUL c12, b1, c12
  697. LD b5, 4 * SIZE(BO)
  698. NMSUB c21, c21, b2, c11
  699. NMSUB c22, c22, b2, c12
  700. LD b6, 5 * SIZE(BO)
  701. NMSUB c31, c31, b3, c11
  702. NMSUB c32, c32, b3, c12
  703. LD b7, 6 * SIZE(BO)
  704. NMSUB c41, c41, b4, c11
  705. NMSUB c42, c42, b4, c12
  706. LD b8, 7 * SIZE(BO)
  707. NMSUB c51, c51, b5, c11
  708. NMSUB c52, c52, b5, c12
  709. LD b2, 9 * SIZE(BO)
  710. NMSUB c61, c61, b6, c11
  711. NMSUB c62, c62, b6, c12
  712. LD b3, 10 * SIZE(BO)
  713. NMSUB c71, c71, b7, c11
  714. NMSUB c72, c72, b7, c12
  715. LD b4, 11 * SIZE(BO)
  716. NMSUB c81, c81, b8, c11
  717. NMSUB c82, c82, b8, c12
  718. LD b5, 12 * SIZE(BO)
  719. MUL c21, b2, c21
  720. MUL c22, b2, c22
  721. LD b6, 13 * SIZE(BO)
  722. NMSUB c31, c31, b3, c21
  723. NMSUB c32, c32, b3, c22
  724. LD b7, 14 * SIZE(BO)
  725. NMSUB c41, c41, b4, c21
  726. NMSUB c42, c42, b4, c22
  727. LD b8, 15 * SIZE(BO)
  728. NMSUB c51, c51, b5, c21
  729. NMSUB c52, c52, b5, c22
  730. LD b3, 18 * SIZE(BO)
  731. NMSUB c61, c61, b6, c21
  732. NMSUB c62, c62, b6, c22
  733. LD b4, 19 * SIZE(BO)
  734. NMSUB c71, c71, b7, c21
  735. NMSUB c72, c72, b7, c22
  736. LD b5, 20 * SIZE(BO)
  737. NMSUB c81, c81, b8, c21
  738. NMSUB c82, c82, b8, c22
  739. LD b6, 21 * SIZE(BO)
  740. MUL c31, b3, c31
  741. MUL c32, b3, c32
  742. LD b7, 22 * SIZE(BO)
  743. NMSUB c41, c41, b4, c31
  744. NMSUB c42, c42, b4, c32
  745. LD b8, 23 * SIZE(BO)
  746. NMSUB c51, c51, b5, c31
  747. NMSUB c52, c52, b5, c32
  748. LD b4, 27 * SIZE(BO)
  749. NMSUB c61, c61, b6, c31
  750. NMSUB c62, c62, b6, c32
  751. LD b5, 28 * SIZE(BO)
  752. NMSUB c71, c71, b7, c31
  753. NMSUB c72, c72, b7, c32
  754. LD b6, 29 * SIZE(BO)
  755. NMSUB c81, c81, b8, c31
  756. NMSUB c82, c82, b8, c32
  757. LD b7, 30 * SIZE(BO)
  758. MUL c41, b4, c41
  759. MUL c42, b4, c42
  760. LD b8, 31 * SIZE(BO)
  761. NMSUB c51, c51, b5, c41
  762. NMSUB c52, c52, b5, c42
  763. LD b5, 36 * SIZE(BO)
  764. NMSUB c61, c61, b6, c41
  765. NMSUB c62, c62, b6, c42
  766. LD b6, 37 * SIZE(BO)
  767. NMSUB c71, c71, b7, c41
  768. NMSUB c72, c72, b7, c42
  769. LD b7, 38 * SIZE(BO)
  770. NMSUB c81, c81, b8, c41
  771. NMSUB c82, c82, b8, c42
  772. LD b8, 39 * SIZE(BO)
  773. MUL c51, b5, c51
  774. MUL c52, b5, c52
  775. NMSUB c61, c61, b6, c51
  776. NMSUB c62, c62, b6, c52
  777. LD b6, 45 * SIZE(BO)
  778. NMSUB c71, c71, b7, c51
  779. NMSUB c72, c72, b7, c52
  780. LD b7, 46 * SIZE(BO)
  781. NMSUB c81, c81, b8, c51
  782. NMSUB c82, c82, b8, c52
  783. LD b8, 47 * SIZE(BO)
  784. MUL c61, b6, c61
  785. MUL c62, b6, c62
  786. NMSUB c71, c71, b7, c61
  787. NMSUB c72, c72, b7, c62
  788. LD b7, 54 * SIZE(BO)
  789. NMSUB c81, c81, b8, c61
  790. NMSUB c82, c82, b8, c62
  791. LD b8, 55 * SIZE(BO)
  792. MUL c71, b7, c71
  793. MUL c72, b7, c72
  794. NMSUB c81, c81, b8, c71
  795. NMSUB c82, c82, b8, c72
  796. LD b8, 63 * SIZE(BO)
  797. MUL c81, b8, c81
  798. MUL c82, b8, c82
  799. #endif
  800. #ifdef RT
  801. LD b1, 63 * SIZE(BO)
  802. LD b2, 62 * SIZE(BO)
  803. LD b3, 61 * SIZE(BO)
  804. LD b4, 60 * SIZE(BO)
  805. MUL c81, b1, c81
  806. MUL c82, b1, c82
  807. LD b5, 59 * SIZE(BO)
  808. NMSUB c71, c71, b2, c81
  809. NMSUB c72, c72, b2, c82
  810. LD b6, 58 * SIZE(BO)
  811. NMSUB c61, c61, b3, c81
  812. NMSUB c62, c62, b3, c82
  813. LD b7, 57 * SIZE(BO)
  814. NMSUB c51, c51, b4, c81
  815. NMSUB c52, c52, b4, c82
  816. LD b8, 56 * SIZE(BO)
  817. NMSUB c41, c41, b5, c81
  818. NMSUB c42, c42, b5, c82
  819. LD b2, 54 * SIZE(BO)
  820. NMSUB c31, c31, b6, c81
  821. NMSUB c32, c32, b6, c82
  822. LD b3, 53 * SIZE(BO)
  823. NMSUB c21, c21, b7, c81
  824. NMSUB c22, c22, b7, c82
  825. LD b4, 52 * SIZE(BO)
  826. NMSUB c11, c11, b8, c81
  827. NMSUB c12, c12, b8, c82
  828. LD b5, 51 * SIZE(BO)
  829. MUL c71, b2, c71
  830. MUL c72, b2, c72
  831. LD b6, 50 * SIZE(BO)
  832. NMSUB c61, c61, b3, c71
  833. NMSUB c62, c62, b3, c72
  834. LD b7, 49 * SIZE(BO)
  835. NMSUB c51, c51, b4, c71
  836. NMSUB c52, c52, b4, c72
  837. LD b8, 48 * SIZE(BO)
  838. NMSUB c41, c41, b5, c71
  839. NMSUB c42, c42, b5, c72
  840. LD b3, 45 * SIZE(BO)
  841. NMSUB c31, c31, b6, c71
  842. NMSUB c32, c32, b6, c72
  843. LD b4, 44 * SIZE(BO)
  844. NMSUB c21, c21, b7, c71
  845. NMSUB c22, c22, b7, c72
  846. LD b5, 43 * SIZE(BO)
  847. NMSUB c11, c11, b8, c71
  848. NMSUB c12, c12, b8, c72
  849. LD b6, 42 * SIZE(BO)
  850. MUL c61, b3, c61
  851. MUL c62, b3, c62
  852. LD b7, 41 * SIZE(BO)
  853. NMSUB c51, c51, b4, c61
  854. NMSUB c52, c52, b4, c62
  855. LD b8, 40 * SIZE(BO)
  856. NMSUB c41, c41, b5, c61
  857. NMSUB c42, c42, b5, c62
  858. LD b4, 36 * SIZE(BO)
  859. NMSUB c31, c31, b6, c61
  860. NMSUB c32, c32, b6, c62
  861. LD b5, 35 * SIZE(BO)
  862. NMSUB c21, c21, b7, c61
  863. NMSUB c22, c22, b7, c62
  864. LD b6, 34 * SIZE(BO)
  865. NMSUB c11, c11, b8, c61
  866. NMSUB c12, c12, b8, c62
  867. LD b7, 33 * SIZE(BO)
  868. MUL c51, b4, c51
  869. MUL c52, b4, c52
  870. LD b8, 32 * SIZE(BO)
  871. NMSUB c41, c41, b5, c51
  872. NMSUB c42, c42, b5, c52
  873. LD b5, 27 * SIZE(BO)
  874. NMSUB c31, c31, b6, c51
  875. NMSUB c32, c32, b6, c52
  876. LD b6, 26 * SIZE(BO)
  877. NMSUB c21, c21, b7, c51
  878. NMSUB c22, c22, b7, c52
  879. LD b7, 25 * SIZE(BO)
  880. NMSUB c11, c11, b8, c51
  881. NMSUB c12, c12, b8, c52
  882. LD b8, 24 * SIZE(BO)
  883. MUL c41, b5, c41
  884. MUL c42, b5, c42
  885. NMSUB c31, c31, b6, c41
  886. NMSUB c32, c32, b6, c42
  887. LD b6, 18 * SIZE(BO)
  888. NMSUB c21, c21, b7, c41
  889. NMSUB c22, c22, b7, c42
  890. LD b7, 17 * SIZE(BO)
  891. NMSUB c11, c11, b8, c41
  892. NMSUB c12, c12, b8, c42
  893. LD b8, 16 * SIZE(BO)
  894. MUL c31, b6, c31
  895. MUL c32, b6, c32
  896. NMSUB c21, c21, b7, c31
  897. NMSUB c22, c22, b7, c32
  898. LD b7, 9 * SIZE(BO)
  899. NMSUB c11, c11, b8, c31
  900. NMSUB c12, c12, b8, c32
  901. LD b8, 8 * SIZE(BO)
  902. MUL c21, b7, c21
  903. MUL c22, b7, c22
  904. NMSUB c11, c11, b8, c21
  905. NMSUB c12, c12, b8, c22
  906. LD b8, 0 * SIZE(BO)
  907. MUL c11, b8, c11
  908. MUL c12, b8, c12
  909. #endif
  910. #if defined(LN) || defined(LT)
  911. ST c11, 0 * SIZE(BO)
  912. ST c21, 1 * SIZE(BO)
  913. ST c31, 2 * SIZE(BO)
  914. ST c41, 3 * SIZE(BO)
  915. ST c51, 4 * SIZE(BO)
  916. ST c61, 5 * SIZE(BO)
  917. ST c71, 6 * SIZE(BO)
  918. ST c81, 7 * SIZE(BO)
  919. ST c12, 8 * SIZE(BO)
  920. ST c22, 9 * SIZE(BO)
  921. ST c32, 10 * SIZE(BO)
  922. ST c42, 11 * SIZE(BO)
  923. ST c52, 12 * SIZE(BO)
  924. ST c62, 13 * SIZE(BO)
  925. ST c72, 14 * SIZE(BO)
  926. ST c82, 15 * SIZE(BO)
  927. #else
  928. ST c11, 0 * SIZE(AO)
  929. ST c12, 1 * SIZE(AO)
  930. ST c21, 2 * SIZE(AO)
  931. ST c22, 3 * SIZE(AO)
  932. ST c31, 4 * SIZE(AO)
  933. ST c32, 5 * SIZE(AO)
  934. ST c41, 6 * SIZE(AO)
  935. ST c42, 7 * SIZE(AO)
  936. ST c51, 8 * SIZE(AO)
  937. ST c52, 9 * SIZE(AO)
  938. ST c61, 10 * SIZE(AO)
  939. ST c62, 11 * SIZE(AO)
  940. ST c71, 12 * SIZE(AO)
  941. ST c72, 13 * SIZE(AO)
  942. ST c81, 14 * SIZE(AO)
  943. ST c82, 15 * SIZE(AO)
  944. #endif
  945. ST c11, 0 * SIZE(CO1)
  946. ST c12, 1 * SIZE(CO1)
  947. ST c21, 0 * SIZE(CO2)
  948. ST c22, 1 * SIZE(CO2)
  949. ST c31, 0 * SIZE(CO3)
  950. ST c32, 1 * SIZE(CO3)
  951. ST c41, 0 * SIZE(CO4)
  952. ST c42, 1 * SIZE(CO4)
  953. ST c51, 0 * SIZE(CO5)
  954. ST c52, 1 * SIZE(CO5)
  955. ST c61, 0 * SIZE(CO6)
  956. ST c62, 1 * SIZE(CO6)
  957. ST c71, 0 * SIZE(CO7)
  958. ST c72, 1 * SIZE(CO7)
  959. ST c81, 0 * SIZE(CO8)
  960. ST c82, 1 * SIZE(CO8)
  961. MTC $0, a1
  962. #ifndef LN
  963. daddiu CO1, CO1, 2 * SIZE
  964. daddiu CO2, CO2, 2 * SIZE
  965. daddiu CO3, CO3, 2 * SIZE
  966. daddiu CO4, CO4, 2 * SIZE
  967. daddiu CO5, CO5, 2 * SIZE
  968. daddiu CO6, CO6, 2 * SIZE
  969. daddiu CO7, CO7, 2 * SIZE
  970. daddiu CO8, CO8, 2 * SIZE
  971. #endif
  972. MOV c11, a1
  973. MOV c21, a1
  974. #ifdef RT
  975. dsll TEMP, K, 1 + BASE_SHIFT
  976. daddu AORIG, AORIG, TEMP
  977. #endif
  978. MOV c31, a1
  979. MOV c41, a1
  980. #if defined(LT) || defined(RN)
  981. dsubu TEMP, K, KK
  982. dsll L, TEMP, 1 + BASE_SHIFT
  983. dsll TEMP, TEMP, 3 + BASE_SHIFT
  984. daddu AO, AO, L
  985. daddu BO, BO, TEMP
  986. #endif
  987. #ifdef LT
  988. daddiu KK, KK, 2
  989. #endif
  990. #ifdef LN
  991. daddiu KK, KK, -2
  992. #endif
  993. daddiu I, I, -1
  994. MOV c51, a1
  995. bgtz I, .L11
  996. MOV c61, a1
  /*
   * .L20 -- set-up for the single row left over in the current 8-column
   * panel.  I = M & 1; when that is zero the section is skipped (.L29).
   * NOTE(review): the instruction after each branch is presumably a MIPS
   * branch delay slot (runs whether or not the branch is taken) --
   * confirm the file is assembled in noreorder mode.
   * LD/MOV/SIZE/BASE_SHIFT are macros defined outside this view.
   */
  997. .align 3
  998. .L20:
  999. andi I, M, 1
  /* c61, c71, c81 are copied from c11; c11 is presumably zero here -- confirm on every path into .L20. */
  1000. MOV c61, c11
  1001. blez I, .L29
  1002. MOV c71, c11
  1003. #if defined(LT) || defined(RN)
  /* Preload a1..a4 from AO[0..3] and b1..b5, b6, b7 from B[0..4], B[8], B[12]. */
  1004. LD a1, 0 * SIZE(AO)
  1005. LD a2, 1 * SIZE(AO)
  1006. LD a3, 2 * SIZE(AO)
  1007. LD a4, 3 * SIZE(AO)
  1008. LD b1, 0 * SIZE(B)
  1009. LD b2, 1 * SIZE(B)
  1010. LD b3, 2 * SIZE(B)
  1011. LD b4, 3 * SIZE(B)
  1012. LD b5, 4 * SIZE(B)
  1013. LD b6, 8 * SIZE(B)
  1014. LD b7, 12 * SIZE(B)
  /* L = KK >> 2: trip count of the 4-step loop at .L22; none -> .L25. */
  1015. dsra L, KK, 2
  1016. MOV c81, c11
  1017. blez L, .L25
  /* Follows the branch: BO = B. */
  1018. move BO, B
  1019. #else
  1020. #ifdef LN
  /* LN only: step AORIG back by K << BASE_SHIFT bytes. */
  1021. dsll TEMP, K, 0 + BASE_SHIFT
  1022. dsubu AORIG, AORIG, TEMP
  1023. #endif
  /* AO = AORIG + (KK << BASE_SHIFT); BO = B + (KK << (3 + BASE_SHIFT)). */
  1024. dsll L, KK, 0 + BASE_SHIFT
  1025. dsll TEMP, KK, 3 + BASE_SHIFT
  1026. daddu AO, AORIG, L
  1027. daddu BO, B, TEMP
  /* TEMP = K - KK; it is read again at .L25 for the remainder count. */
  1028. dsubu TEMP, K, KK
  1029. LD a1, 0 * SIZE(AO)
  1030. LD a2, 1 * SIZE(AO)
  1031. LD a3, 2 * SIZE(AO)
  1032. LD a4, 3 * SIZE(AO)
  1033. LD b1, 0 * SIZE(BO)
  1034. LD b2, 1 * SIZE(BO)
  1035. LD b3, 2 * SIZE(BO)
  1036. LD b4, 3 * SIZE(BO)
  1037. LD b5, 4 * SIZE(BO)
  1038. LD b6, 8 * SIZE(BO)
  1039. LD b7, 12 * SIZE(BO)
  /* L = (K - KK) >> 2: trip count of the 4-step loop at .L22. */
  1040. dsra L, TEMP, 2
  1041. MOV c81, c11
  1042. blez L, .L25
  1043. NOP
  1044. #endif
  /*
   * .L22 -- accumulate loop for the 1x8 tile, four k-steps per trip.
   * Each trip issues 4 groups of 8 MADDs (one group per a1..a4) into
   * c11..c81, advances AO by 4*SIZE and BO by 32*SIZE, and decrements L
   * once; it repeats while L > 0.
   * Loads are interleaved with the MADDs so the operands of later steps
   * (and of the next trip) are fetched ahead of use.
   * Assumes MADD d, c, a, b computes d = c + a*b -- the macro is defined
   * outside this view; TODO confirm.
   */
  1045. .align 3
  1046. .L22:
  /* k-step 0: uses a1. */
  1047. MADD c11, c11, a1, b1
  1048. LD b1, 16 * SIZE(BO)
  1049. MADD c21, c21, a1, b2
  1050. LD b2, 5 * SIZE(BO)
  1051. MADD c31, c31, a1, b3
  1052. LD b3, 6 * SIZE(BO)
  1053. MADD c41, c41, a1, b4
  1054. LD b4, 7 * SIZE(BO)
  1055. MADD c51, c51, a1, b5
  1056. LD b5, 20 * SIZE(BO)
  1057. MADD c61, c61, a1, b2
  1058. LD b2, 9 * SIZE(BO)
  1059. MADD c71, c71, a1, b3
  1060. LD b3, 10 * SIZE(BO)
  1061. MADD c81, c81, a1, b4
  1062. LD b4, 11 * SIZE(BO)
  /* a1 for the next trip (AO has not been advanced yet). */
  1063. LD a1, 4 * SIZE(AO)
  1064. daddiu L, L, -1
  /* k-step 1: uses a2. */
  1065. MADD c11, c11, a2, b6
  1066. LD b6, 24 * SIZE(BO)
  1067. MADD c21, c21, a2, b2
  1068. LD b2, 13 * SIZE(BO)
  1069. MADD c31, c31, a2, b3
  1070. LD b3, 14 * SIZE(BO)
  1071. MADD c41, c41, a2, b4
  1072. LD b4, 15 * SIZE(BO)
  1073. MADD c51, c51, a2, b7
  1074. LD b7, 28 * SIZE(BO)
  1075. MADD c61, c61, a2, b2
  1076. LD b2, 17 * SIZE(BO)
  1077. MADD c71, c71, a2, b3
  1078. LD b3, 18 * SIZE(BO)
  1079. MADD c81, c81, a2, b4
  1080. LD b4, 19 * SIZE(BO)
  1081. LD a2, 5 * SIZE(AO)
  /* AO += 4*SIZE; AO offsets below are relative to the advanced pointer. */
  1082. daddiu AO, AO, 4 * SIZE
  /* k-step 2: uses a3. */
  1083. MADD c11, c11, a3, b1
  1084. LD b1, 32 * SIZE(BO)
  1085. MADD c21, c21, a3, b2
  1086. LD b2, 21 * SIZE(BO)
  1087. MADD c31, c31, a3, b3
  1088. LD b3, 22 * SIZE(BO)
  1089. MADD c41, c41, a3, b4
  1090. LD b4, 23 * SIZE(BO)
  1091. MADD c51, c51, a3, b5
  1092. LD b5, 36 * SIZE(BO)
  1093. MADD c61, c61, a3, b2
  1094. LD b2, 25 * SIZE(BO)
  1095. MADD c71, c71, a3, b3
  1096. LD b3, 26 * SIZE(BO)
  1097. MADD c81, c81, a3, b4
  1098. LD b4, 27 * SIZE(BO)
  1099. LD a3, 2 * SIZE(AO)
  /* BO += 32*SIZE; BO offsets below are relative to the advanced pointer. */
  1100. daddiu BO, BO, 32 * SIZE
  /* k-step 3: uses a4. */
  1101. MADD c11, c11, a4, b6
  1102. LD b6, 8 * SIZE(BO)
  1103. MADD c21, c21, a4, b2
  1104. LD b2, -3 * SIZE(BO)
  1105. MADD c31, c31, a4, b3
  1106. LD b3, -2 * SIZE(BO)
  1107. MADD c41, c41, a4, b4
  1108. LD b4, -1 * SIZE(BO)
  1109. MADD c51, c51, a4, b7
  1110. LD b7, 12 * SIZE(BO)
  1111. MADD c61, c61, a4, b2
  1112. LD b2, 1 * SIZE(BO)
  1113. MADD c71, c71, a4, b3
  1114. LD b3, 2 * SIZE(BO)
  1115. MADD c81, c81, a4, b4
  1116. LD b4, 3 * SIZE(BO)
  1117. bgtz L, .L22
  /* Follows the branch (presumably its delay slot): reload a4. */
  1118. LD a4, 3 * SIZE(AO)
  /*
   * .L25 -- remainder count for the 1x8 tile: L = count & 3, where the
   * count is KK (LT/RN) or TEMP (otherwise; TEMP was set to K - KK in
   * the .L20 set-up).  With no remainder, go straight to .L28.
   */
  1119. .align 3
  1120. .L25:
  1121. #if defined(LT) || defined(RN)
  1122. andi L, KK, 3
  1123. #else
  1124. andi L, TEMP, 3
  1125. #endif
  1126. NOP
  1127. blez L, .L28
  1128. NOP
  /*
   * .L26 -- remainder loop for the 1x8 tile, one k-step per trip.
   * Eight MADDs with a1 into c11..c81; AO advances by 1*SIZE and BO by
   * 8*SIZE mid-loop, so loads after the bumps use offsets relative to
   * the advanced pointers.  Repeats while L > 0.
   * Assumes MADD d, c, a, b computes d = c + a*b -- TODO confirm.
   */
  1129. .align 3
  1130. .L26:
  1131. MADD c11, c11, a1, b1
  1132. LD b1, 8 * SIZE(BO)
  1133. MADD c21, c21, a1, b2
  1134. LD b2, 5 * SIZE(BO)
  1135. MADD c31, c31, a1, b3
  1136. LD b3, 6 * SIZE(BO)
  1137. MADD c41, c41, a1, b4
  1138. LD b4, 7 * SIZE(BO)
  1139. daddiu L, L, -1
  /* Source and destination are the same register; presumably a scheduling filler -- confirm. */
  1140. MOV a2, a2
  1141. daddiu AO, AO, 1 * SIZE
  1142. daddiu BO, BO, 8 * SIZE
  1143. MADD c51, c51, a1, b5
  1144. LD b5, 4 * SIZE(BO)
  1145. MADD c61, c61, a1, b2
  1146. LD b2, 1 * SIZE(BO)
  1147. MADD c71, c71, a1, b3
  1148. LD b3, 2 * SIZE(BO)
  1149. MADD c81, c81, a1, b4
  /* a1 for the next trip, read only after its last use above. */
  1150. LD a1, 0 * SIZE(AO)
  1151. bgtz L, .L26
  1152. LD b4, 3 * SIZE(BO)
  /*
   * .L28 -- solve and write-back for the 1x8 tile held in c11..c81.
   * Steps: (1) LN/RT re-position AO/BO, (2) form c = loaded - c,
   * (3) apply the triangular solve for the active variant, (4) store the
   * result to the packed buffer and to C through CO1..CO8, (5) advance
   * pointers and KK.
   * Assumes SUB d, a, b = a - b, MUL d, a, b = a * b and
   * NMSUB d, c, a, b = c - a*b -- macros defined outside this view;
   * TODO confirm.
   */
  1153. .L28:
  1154. #if defined(LN) || defined(RT)
  1155. #ifdef LN
  1156. daddiu TEMP, KK, -1
  1157. #else
  1158. daddiu TEMP, KK, -8
  1159. #endif
  /* AO = AORIG + (TEMP << BASE_SHIFT); BO = B + (TEMP << (3 + BASE_SHIFT)). */
  1160. dsll L, TEMP, 0 + BASE_SHIFT
  1161. dsll TEMP, TEMP, 3 + BASE_SHIFT
  1162. daddu AO, AORIG, L
  1163. daddu BO, B, TEMP
  1164. #endif
  1165. #if defined(LN) || defined(LT)
  /* LN/LT: c = BO[0..7] - c. */
  1166. LD b1, 0 * SIZE(BO)
  1167. LD b2, 1 * SIZE(BO)
  1168. LD b3, 2 * SIZE(BO)
  1169. LD b4, 3 * SIZE(BO)
  1170. LD b5, 4 * SIZE(BO)
  1171. LD b6, 5 * SIZE(BO)
  1172. LD b7, 6 * SIZE(BO)
  1173. LD b8, 7 * SIZE(BO)
  1174. SUB c11, b1, c11
  1175. SUB c21, b2, c21
  1176. SUB c31, b3, c31
  1177. SUB c41, b4, c41
  1178. SUB c51, b5, c51
  1179. SUB c61, b6, c61
  1180. SUB c71, b7, c71
  1181. SUB c81, b8, c81
  1182. #else
  /* RN/RT: c = AO[0..7] - c. */
  1183. LD b1, 0 * SIZE(AO)
  1184. LD b2, 1 * SIZE(AO)
  1185. LD b3, 2 * SIZE(AO)
  1186. LD b4, 3 * SIZE(AO)
  1187. LD b5, 4 * SIZE(AO)
  1188. LD b6, 5 * SIZE(AO)
  1189. LD b7, 6 * SIZE(AO)
  1190. LD b8, 7 * SIZE(AO)
  1191. SUB c11, b1, c11
  1192. SUB c21, b2, c21
  1193. SUB c31, b3, c31
  1194. SUB c41, b4, c41
  1195. SUB c51, b5, c51
  1196. SUB c61, b6, c61
  1197. SUB c71, b7, c71
  1198. SUB c81, b8, c81
  1199. #endif
  1200. #if defined(LN) || defined(LT)
  /*
   * LN/LT: scale all eight values by AO[0].  A multiply is used where a
   * solve would divide, so AO[0] is presumably a pre-inverted diagonal
   * entry -- confirm against the packing routine.
   */
  1201. LD b1, 0 * SIZE(AO)
  1202. MUL c11, b1, c11
  1203. MUL c21, b1, c21
  1204. MUL c31, b1, c31
  1205. MUL c41, b1, c41
  1206. MUL c51, b1, c51
  1207. MUL c61, b1, c61
  1208. MUL c71, b1, c71
  1209. MUL c81, b1, c81
  1210. #endif
  1211. #ifdef RN
  /*
   * RN: forward sweep c11 -> c81.  Step i reads BO[9*i .. 8*i + 7]: the
   * first entry scales the current value, the rest eliminate it from the
   * later ones (entries on/after the diagonal of an 8x8 block, stride 8).
   */
  1212. LD b1, 0 * SIZE(BO)
  1213. LD b2, 1 * SIZE(BO)
  1214. LD b3, 2 * SIZE(BO)
  1215. LD b4, 3 * SIZE(BO)
  1216. LD b5, 4 * SIZE(BO)
  1217. LD b6, 5 * SIZE(BO)
  1218. LD b7, 6 * SIZE(BO)
  1219. LD b8, 7 * SIZE(BO)
  1220. MUL c11, b1, c11
  1221. NMSUB c21, c21, b2, c11
  1222. NMSUB c31, c31, b3, c11
  1223. NMSUB c41, c41, b4, c11
  1224. NMSUB c51, c51, b5, c11
  1225. NMSUB c61, c61, b6, c11
  1226. NMSUB c71, c71, b7, c11
  1227. NMSUB c81, c81, b8, c11
  1228. LD b2, 9 * SIZE(BO)
  1229. LD b3, 10 * SIZE(BO)
  1230. LD b4, 11 * SIZE(BO)
  1231. LD b5, 12 * SIZE(BO)
  1232. LD b6, 13 * SIZE(BO)
  1233. LD b7, 14 * SIZE(BO)
  1234. LD b8, 15 * SIZE(BO)
  1235. MUL c21, b2, c21
  1236. NMSUB c31, c31, b3, c21
  1237. NMSUB c41, c41, b4, c21
  1238. NMSUB c51, c51, b5, c21
  1239. NMSUB c61, c61, b6, c21
  1240. NMSUB c71, c71, b7, c21
  1241. NMSUB c81, c81, b8, c21
  1242. LD b3, 18 * SIZE(BO)
  1243. LD b4, 19 * SIZE(BO)
  1244. LD b5, 20 * SIZE(BO)
  1245. LD b6, 21 * SIZE(BO)
  1246. LD b7, 22 * SIZE(BO)
  1247. LD b8, 23 * SIZE(BO)
  1248. MUL c31, b3, c31
  1249. NMSUB c41, c41, b4, c31
  1250. NMSUB c51, c51, b5, c31
  1251. NMSUB c61, c61, b6, c31
  1252. NMSUB c71, c71, b7, c31
  1253. NMSUB c81, c81, b8, c31
  1254. LD b4, 27 * SIZE(BO)
  1255. LD b5, 28 * SIZE(BO)
  1256. LD b6, 29 * SIZE(BO)
  1257. LD b7, 30 * SIZE(BO)
  1258. LD b8, 31 * SIZE(BO)
  1259. MUL c41, b4, c41
  1260. NMSUB c51, c51, b5, c41
  1261. NMSUB c61, c61, b6, c41
  1262. NMSUB c71, c71, b7, c41
  1263. NMSUB c81, c81, b8, c41
  1264. LD b5, 36 * SIZE(BO)
  1265. LD b6, 37 * SIZE(BO)
  1266. LD b7, 38 * SIZE(BO)
  1267. LD b8, 39 * SIZE(BO)
  1268. MUL c51, b5, c51
  1269. NMSUB c61, c61, b6, c51
  1270. NMSUB c71, c71, b7, c51
  1271. NMSUB c81, c81, b8, c51
  1272. LD b6, 45 * SIZE(BO)
  1273. LD b7, 46 * SIZE(BO)
  1274. LD b8, 47 * SIZE(BO)
  1275. MUL c61, b6, c61
  1276. NMSUB c71, c71, b7, c61
  1277. NMSUB c81, c81, b8, c61
  1278. LD b7, 54 * SIZE(BO)
  1279. LD b8, 55 * SIZE(BO)
  1280. MUL c71, b7, c71
  1281. NMSUB c81, c81, b8, c71
  1282. LD b8, 63 * SIZE(BO)
  1283. MUL c81, b8, c81
  1284. #endif
  1285. #ifdef RT
  /*
   * RT: backward sweep c81 -> c11.  Step i (7 down to 0) reads
   * BO[9*i] down to BO[8*i]: the first entry scales the current value,
   * the rest eliminate it from the earlier ones.
   */
  1286. LD b1, 63 * SIZE(BO)
  1287. LD b2, 62 * SIZE(BO)
  1288. LD b3, 61 * SIZE(BO)
  1289. LD b4, 60 * SIZE(BO)
  1290. LD b5, 59 * SIZE(BO)
  1291. LD b6, 58 * SIZE(BO)
  1292. LD b7, 57 * SIZE(BO)
  1293. LD b8, 56 * SIZE(BO)
  1294. MUL c81, b1, c81
  1295. NMSUB c71, c71, b2, c81
  1296. NMSUB c61, c61, b3, c81
  1297. NMSUB c51, c51, b4, c81
  1298. NMSUB c41, c41, b5, c81
  1299. NMSUB c31, c31, b6, c81
  1300. NMSUB c21, c21, b7, c81
  1301. NMSUB c11, c11, b8, c81
  1302. LD b2, 54 * SIZE(BO)
  1303. LD b3, 53 * SIZE(BO)
  1304. LD b4, 52 * SIZE(BO)
  1305. LD b5, 51 * SIZE(BO)
  1306. LD b6, 50 * SIZE(BO)
  1307. LD b7, 49 * SIZE(BO)
  1308. LD b8, 48 * SIZE(BO)
  1309. MUL c71, b2, c71
  1310. NMSUB c61, c61, b3, c71
  1311. NMSUB c51, c51, b4, c71
  1312. NMSUB c41, c41, b5, c71
  1313. NMSUB c31, c31, b6, c71
  1314. NMSUB c21, c21, b7, c71
  1315. NMSUB c11, c11, b8, c71
  1316. LD b3, 45 * SIZE(BO)
  1317. LD b4, 44 * SIZE(BO)
  1318. LD b5, 43 * SIZE(BO)
  1319. LD b6, 42 * SIZE(BO)
  1320. LD b7, 41 * SIZE(BO)
  1321. LD b8, 40 * SIZE(BO)
  1322. MUL c61, b3, c61
  1323. NMSUB c51, c51, b4, c61
  1324. NMSUB c41, c41, b5, c61
  1325. NMSUB c31, c31, b6, c61
  1326. NMSUB c21, c21, b7, c61
  1327. NMSUB c11, c11, b8, c61
  1328. LD b4, 36 * SIZE(BO)
  1329. LD b5, 35 * SIZE(BO)
  1330. LD b6, 34 * SIZE(BO)
  1331. LD b7, 33 * SIZE(BO)
  1332. LD b8, 32 * SIZE(BO)
  1333. MUL c51, b4, c51
  1334. NMSUB c41, c41, b5, c51
  1335. NMSUB c31, c31, b6, c51
  1336. NMSUB c21, c21, b7, c51
  1337. NMSUB c11, c11, b8, c51
  1338. LD b5, 27 * SIZE(BO)
  1339. LD b6, 26 * SIZE(BO)
  1340. LD b7, 25 * SIZE(BO)
  1341. LD b8, 24 * SIZE(BO)
  1342. MUL c41, b5, c41
  1343. NMSUB c31, c31, b6, c41
  1344. NMSUB c21, c21, b7, c41
  1345. NMSUB c11, c11, b8, c41
  1346. LD b6, 18 * SIZE(BO)
  1347. LD b7, 17 * SIZE(BO)
  1348. LD b8, 16 * SIZE(BO)
  1349. MUL c31, b6, c31
  1350. NMSUB c21, c21, b7, c31
  1351. NMSUB c11, c11, b8, c31
  1352. LD b7, 9 * SIZE(BO)
  1353. LD b8, 8 * SIZE(BO)
  1354. MUL c21, b7, c21
  1355. NMSUB c11, c11, b8, c21
  1356. LD b8, 0 * SIZE(BO)
  1357. MUL c11, b8, c11
  1358. #endif
  1359. #ifdef LN
  /* LN: move the eight C pointers back one element before the store. */
  1360. daddiu CO1, CO1, -1 * SIZE
  1361. daddiu CO2, CO2, -1 * SIZE
  1362. daddiu CO3, CO3, -1 * SIZE
  1363. daddiu CO4, CO4, -1 * SIZE
  1364. daddiu CO5, CO5, -1 * SIZE
  1365. daddiu CO6, CO6, -1 * SIZE
  1366. daddiu CO7, CO7, -1 * SIZE
  1367. daddiu CO8, CO8, -1 * SIZE
  1368. #endif
  /* Write the solved values back over the packed data they were loaded from in step (2). */
  1369. #if defined(LN) || defined(LT)
  1370. ST c11, 0 * SIZE(BO)
  1371. ST c21, 1 * SIZE(BO)
  1372. ST c31, 2 * SIZE(BO)
  1373. ST c41, 3 * SIZE(BO)
  1374. ST c51, 4 * SIZE(BO)
  1375. ST c61, 5 * SIZE(BO)
  1376. ST c71, 6 * SIZE(BO)
  1377. ST c81, 7 * SIZE(BO)
  1378. #else
  1379. ST c11, 0 * SIZE(AO)
  1380. ST c21, 1 * SIZE(AO)
  1381. ST c31, 2 * SIZE(AO)
  1382. ST c41, 3 * SIZE(AO)
  1383. ST c51, 4 * SIZE(AO)
  1384. ST c61, 5 * SIZE(AO)
  1385. ST c71, 6 * SIZE(AO)
  1386. ST c81, 7 * SIZE(AO)
  1387. #endif
  /* One element per output pointer CO1..CO8. */
  1388. ST c11, 0 * SIZE(CO1)
  1389. ST c21, 0 * SIZE(CO2)
  1390. ST c31, 0 * SIZE(CO3)
  1391. ST c41, 0 * SIZE(CO4)
  1392. ST c51, 0 * SIZE(CO5)
  1393. ST c61, 0 * SIZE(CO6)
  1394. ST c71, 0 * SIZE(CO7)
  1395. ST c81, 0 * SIZE(CO8)
  1396. #ifndef LN
  /* Non-LN variants advance the C pointers after the store instead. */
  1397. daddiu CO1, CO1, 1 * SIZE
  1398. daddiu CO2, CO2, 1 * SIZE
  1399. daddiu CO3, CO3, 1 * SIZE
  1400. daddiu CO4, CO4, 1 * SIZE
  1401. daddiu CO5, CO5, 1 * SIZE
  1402. daddiu CO6, CO6, 1 * SIZE
  1403. daddiu CO7, CO7, 1 * SIZE
  1404. daddiu CO8, CO8, 1 * SIZE
  1405. #endif
  1406. #ifdef RT
  /* RT: AORIG += K << BASE_SHIFT. */
  1407. dsll TEMP, K, BASE_SHIFT
  1408. daddu AORIG, AORIG, TEMP
  1409. #endif
  1410. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps not consumed: AO by 1 element each, BO by 8 each. */
  1411. dsubu TEMP, K, KK
  1412. dsll L, TEMP, 0 + BASE_SHIFT
  1413. dsll TEMP, TEMP, 3 + BASE_SHIFT
  1414. daddu AO, AO, L
  1415. daddu BO, BO, TEMP
  1416. #endif
  1417. #ifdef LT
  1418. daddiu KK, KK, 1
  1419. #endif
  1420. #ifdef LN
  1421. daddiu KK, KK, -1
  1422. #endif
  /*
   * .L29 -- end of one 8-column panel iteration.
   * Updates B and KK for the active variant, then loops back to .L10
   * while J > 0.  J is not modified in this view; presumably it is
   * decremented near .L10 -- confirm.
   */
  1423. .align 3
  1424. .L29:
  1425. #ifdef LN
  /* LN: B += K << (3 + BASE_SHIFT), i.e. K steps of 8 elements. */
  1426. dsll TEMP, K, 3 + BASE_SHIFT
  1427. daddu B, B, TEMP
  1428. #endif
  1429. #if defined(LT) || defined(RN)
  /* LT/RN: B takes over wherever BO ended up. */
  1430. move B, BO
  1431. #endif
  1432. #ifdef RN
  1433. daddiu KK, KK, 8
  1434. #endif
  1435. #ifdef RT
  1436. daddiu KK, KK, -8
  1437. #endif
  1438. bgtz J, .L10
  1439. NOP
  /*
   * .L30 -- set-up for the 4-column panel, taken when bit 2 of N is set
   * (J = N & 4); otherwise control goes to .L50.
   * Establishes CO1..CO4 as four pointers LDC apart starting at C,
   * initialises KK / AORIG / AO for the active variant, and sets I to
   * the number of 2-row tiles (M >> 1).
   * NOTE(review): the instruction after each branch is presumably a MIPS
   * branch delay slot -- confirm noreorder mode.
   */
  1440. .align 3
  1441. .L30:
  1442. andi J, N, 4
  1443. blez J, .L50
  1444. move AO, A
  1445. #ifdef RT
  /* RT: step B back by K << (2 + BASE_SHIFT) and C back by LDC << 2. */
  1446. dsll TEMP, K, 2 + BASE_SHIFT
  1447. dsubu B, B, TEMP
  1448. dsll TEMP, LDC, 2
  1449. dsubu C, C, TEMP
  1450. #endif
  1451. move CO1, C
  /* Moves integer register $0 into c11; presumably this zeroes the accumulator (MTC is a macro outside this view). */
  1452. MTC $0, c11
  1453. daddu CO2, C, LDC
  1454. daddu CO3, CO2, LDC
  1455. daddu CO4, CO3, LDC
  1456. MOV c21, c11
  1457. dsra I, M, 1
  1458. MOV c31, c11
  1459. #ifdef LN
  1460. daddu KK, M, OFFSET
  1461. #endif
  1462. #ifdef LT
  1463. move KK, OFFSET
  1464. #endif
  1465. #if defined(LN) || defined(RT)
  1466. move AORIG, A
  1467. #else
  1468. move AO, A
  1469. #endif
  1470. #ifndef RT
  /* Non-RT: C = CO4 + LDC. */
  1471. daddu C, CO4, LDC
  1472. #endif
  /* No 2-row tiles -> handle the odd row at .L40. */
  1473. blez I, .L40
  1474. MOV c41, c11
  /*
   * .L31 -- per-tile set-up for a 2x4 tile of the 4-column panel.
   * Positions AO/BO, preloads a1, a3 and b1..b7, copies c11 into the
   * second-row accumulators c12..c42, and computes the trip count L of
   * the 4-step loop at .L32 (KK >> 2 for LT/RN, (K - KK) >> 2 otherwise).
   * c11 is presumably zero on entry -- confirm on every path into .L31.
   */
  1475. .L31:
  1476. #if defined(LT) || defined(RN)
  1477. LD a1, 0 * SIZE(AO)
  1478. LD a3, 4 * SIZE(AO)
  1479. LD b1, 0 * SIZE(B)
  1480. MOV c12, c11
  1481. LD b2, 1 * SIZE(B)
  1482. MOV c22, c11
  1483. LD b3, 2 * SIZE(B)
  1484. MOV c32, c11
  1485. LD b4, 3 * SIZE(B)
  1486. MOV c42, c11
  1487. LD b5, 4 * SIZE(B)
  1488. dsra L, KK, 2
  1489. LD b6, 8 * SIZE(B)
  1490. LD b7, 12 * SIZE(B)
  1491. blez L, .L35
  /* Follows the branch: BO = B. */
  1492. move BO, B
  1493. #else
  1494. #ifdef LN
  /* LN only: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  1495. dsll TEMP, K, 1 + BASE_SHIFT
  1496. dsubu AORIG, AORIG, TEMP
  1497. #endif
  /* AO = AORIG + (KK << (1 + BASE_SHIFT)); BO = B + (KK << (2 + BASE_SHIFT)). */
  1498. dsll L, KK, 1 + BASE_SHIFT
  1499. dsll TEMP, KK, 2 + BASE_SHIFT
  1500. daddu AO, AORIG, L
  1501. daddu BO, B, TEMP
  /* TEMP = K - KK; it is read again at .L35 for the remainder count. */
  1502. dsubu TEMP, K, KK
  1503. LD a1, 0 * SIZE(AO)
  1504. LD a3, 4 * SIZE(AO)
  1505. LD b1, 0 * SIZE(BO)
  1506. MOV c12, c11
  1507. LD b2, 1 * SIZE(BO)
  1508. MOV c22, c11
  1509. LD b3, 2 * SIZE(BO)
  1510. MOV c32, c11
  1511. LD b4, 3 * SIZE(BO)
  1512. MOV c42, c11
  1513. LD b5, 4 * SIZE(BO)
  1514. dsra L, TEMP, 2
  1515. LD b6, 8 * SIZE(BO)
  1516. LD b7, 12 * SIZE(BO)
  1517. blez L, .L35
  1518. NOP
  1519. #endif
  /*
   * .L32 -- accumulate loop for the 2x4 tile, four k-steps per trip.
   * Each k-step issues 4 MADDs into c11..c41 with the first A value and
   * 4 MADDs into c12..c42 with the second.  Per trip AO advances by
   * 8*SIZE, BO by 16*SIZE, and L is decremented once; repeats while
   * L > 0.  The NOPs are presumably issue-slot padding -- confirm.
   * Assumes MADD d, c, a, b computes d = c + a*b -- TODO confirm.
   */
  1520. .align 3
  1521. .L32:
  /* k-step 0: A values AO[0] (a1) and AO[1] (a2). */
  1522. MADD c11, c11, a1, b1
  1523. LD a2, 1 * SIZE(AO)
  1524. MADD c21, c21, a1, b2
  1525. daddiu L, L, -1
  1526. MADD c31, c31, a1, b3
  1527. NOP
  1528. MADD c41, c41, a1, b4
  1529. LD a1, 2 * SIZE(AO)
  1530. MADD c12, c12, a2, b1
  1531. LD b1, 16 * SIZE(BO)
  1532. MADD c22, c22, a2, b2
  1533. LD b2, 5 * SIZE(BO)
  1534. MADD c32, c32, a2, b3
  1535. LD b3, 6 * SIZE(BO)
  1536. MADD c42, c42, a2, b4
  1537. LD b4, 7 * SIZE(BO)
  /* k-step 1: A values AO[2] (a1) and AO[3] (a2). */
  1538. MADD c11, c11, a1, b5
  1539. LD a2, 3 * SIZE(AO)
  1540. MADD c21, c21, a1, b2
  1541. NOP
  1542. MADD c31, c31, a1, b3
  1543. NOP
  1544. MADD c41, c41, a1, b4
  1545. LD a1, 8 * SIZE(AO)
  1546. MADD c12, c12, a2, b5
  1547. LD b5, 20 * SIZE(BO)
  1548. MADD c22, c22, a2, b2
  1549. LD b2, 9 * SIZE(BO)
  1550. MADD c32, c32, a2, b3
  1551. LD b3, 10 * SIZE(BO)
  1552. MADD c42, c42, a2, b4
  1553. LD b4, 11 * SIZE(BO)
  /* k-step 2: A values AO[4] (a3) and AO[5] (a2). */
  1554. MADD c11, c11, a3, b6
  1555. LD a2, 5 * SIZE(AO)
  1556. MADD c21, c21, a3, b2
  1557. NOP
  1558. MADD c31, c31, a3, b3
  1559. NOP
  1560. MADD c41, c41, a3, b4
  1561. LD a3, 6 * SIZE(AO)
  1562. MADD c12, c12, a2, b6
  1563. LD b6, 24 * SIZE(BO)
  1564. MADD c22, c22, a2, b2
  1565. LD b2, 13 * SIZE(BO)
  1566. MADD c32, c32, a2, b3
  1567. LD b3, 14 * SIZE(BO)
  1568. MADD c42, c42, a2, b4
  1569. LD b4, 15 * SIZE(BO)
  /* k-step 3: A values AO[6] (a3) and AO[7] (a2); pointers are advanced inside this step. */
  1570. MADD c11, c11, a3, b7
  1571. LD a2, 7 * SIZE(AO)
  1572. MADD c21, c21, a3, b2
  1573. daddiu AO, AO, 8 * SIZE
  1574. MADD c31, c31, a3, b3
  1575. daddiu BO, BO, 16 * SIZE
  1576. MADD c41, c41, a3, b4
  /* Offsets from here on are relative to the advanced AO/BO. */
  1577. LD a3, 4 * SIZE(AO)
  1578. MADD c12, c12, a2, b7
  1579. LD b7, 12 * SIZE(BO)
  1580. MADD c22, c22, a2, b2
  1581. LD b2, 1 * SIZE(BO)
  1582. MADD c32, c32, a2, b3
  1583. LD b3, 2 * SIZE(BO)
  1584. MADD c42, c42, a2, b4
  1585. NOP
  1586. bgtz L, .L32
  /* Follows the branch (presumably its delay slot): reload b4. */
  1587. LD b4, 3 * SIZE(BO)
  /*
   * .L35 -- remainder count for the 2x4 tile: L = count & 3, where the
   * count is KK (LT/RN) or TEMP (otherwise; TEMP was set to K - KK in
   * the .L31 set-up).  With no remainder, go straight to .L38.
   */
  1588. .align 3
  1589. .L35:
  1590. #if defined(LT) || defined(RN)
  1591. andi L, KK, 3
  1592. #else
  1593. andi L, TEMP, 3
  1594. #endif
  1595. NOP
  1596. blez L, .L38
  1597. NOP
  /*
   * .L36 -- remainder loop for the 2x4 tile, one k-step per trip.
   * Four MADDs with a1 into c11..c41 and four with a2 into c12..c42.
   * AO advances by 2*SIZE mid-loop; BO advances by 4*SIZE in the
   * instruction after the branch, so the b1..b4 reloads use offsets 4..7
   * from the not-yet-advanced BO.  Repeats while L > 0.
   * Assumes MADD d, c, a, b computes d = c + a*b -- TODO confirm.
   */
  1598. .align 3
  1599. .L36:
  1600. MADD c11, c11, a1, b1
  1601. LD a2, 1 * SIZE(AO)
  1602. MADD c21, c21, a1, b2
  1603. daddiu L, L, -1
  1604. MADD c31, c31, a1, b3
  1605. daddiu AO, AO, 2 * SIZE
  1606. MADD c41, c41, a1, b4
  /* a1 for the next trip (AO is already advanced). */
  1607. LD a1, 0 * SIZE(AO)
  1608. MADD c12, c12, a2, b1
  1609. LD b1, 4 * SIZE(BO)
  1610. MADD c22, c22, a2, b2
  1611. LD b2, 5 * SIZE(BO)
  1612. MADD c32, c32, a2, b3
  1613. LD b3, 6 * SIZE(BO)
  1614. MADD c42, c42, a2, b4
  1615. LD b4, 7 * SIZE(BO)
  1616. bgtz L, .L36
  1617. daddiu BO, BO, 4 * SIZE
  /*
   * .L38 -- solve and write-back for the 2x4 tile (c11..c41 = first row,
   * c12..c42 = second row), then loop back to .L31 for the next tile.
   * Steps: (1) LN/RT re-position AO/BO, (2) form c = loaded - c,
   * (3) apply the triangular solve for the active variant, (4) store the
   * result to the packed buffer and to C through CO1..CO4, (5) advance
   * pointers and KK, (6) clear the accumulators.
   * Assumes SUB d, a, b = a - b, MUL d, a, b = a * b and
   * NMSUB d, c, a, b = c - a*b -- macros defined outside this view;
   * TODO confirm.
   */
  1618. .L38:
  1619. #if defined(LN) || defined(RT)
  1620. #ifdef LN
  1621. daddiu TEMP, KK, -2
  1622. #else
  1623. daddiu TEMP, KK, -4
  1624. #endif
  /* AO = AORIG + (TEMP << (1 + BASE_SHIFT)); BO = B + (TEMP << (2 + BASE_SHIFT)). */
  1625. dsll L, TEMP, 1 + BASE_SHIFT
  1626. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1627. daddu AO, AORIG, L
  1628. daddu BO, B, TEMP
  1629. #endif
  1630. #if defined(LN) || defined(LT)
  /* LN/LT: BO[0..3] pairs with the first row, BO[4..7] with the second. */
  1631. LD b1, 0 * SIZE(BO)
  1632. LD b2, 1 * SIZE(BO)
  1633. LD b3, 2 * SIZE(BO)
  1634. LD b4, 3 * SIZE(BO)
  1635. LD b5, 4 * SIZE(BO)
  1636. LD b6, 5 * SIZE(BO)
  1637. LD b7, 6 * SIZE(BO)
  1638. LD b8, 7 * SIZE(BO)
  1639. SUB c11, b1, c11
  1640. SUB c21, b2, c21
  1641. SUB c31, b3, c31
  1642. SUB c41, b4, c41
  1643. SUB c12, b5, c12
  1644. SUB c22, b6, c22
  1645. SUB c32, b7, c32
  1646. SUB c42, b8, c42
  1647. #else
  /* RN/RT: AO[0..7] is interleaved by row: c11, c12, c21, c22, ... */
  1648. LD b1, 0 * SIZE(AO)
  1649. LD b2, 1 * SIZE(AO)
  1650. LD b3, 2 * SIZE(AO)
  1651. LD b4, 3 * SIZE(AO)
  1652. LD b5, 4 * SIZE(AO)
  1653. LD b6, 5 * SIZE(AO)
  1654. LD b7, 6 * SIZE(AO)
  1655. LD b8, 7 * SIZE(AO)
  1656. SUB c11, b1, c11
  1657. SUB c12, b2, c12
  1658. SUB c21, b3, c21
  1659. SUB c22, b4, c22
  1660. SUB c31, b5, c31
  1661. SUB c32, b6, c32
  1662. SUB c41, b7, c41
  1663. SUB c42, b8, c42
  1664. #endif
  1665. #ifdef LN
  /*
   * LN: second row first.  Scale it by AO[3], eliminate it from the first
   * row with AO[2], then scale the first row by AO[0].  The scale factors
   * are presumably pre-inverted diagonal entries -- confirm.
   */
  1666. LD b1, 3 * SIZE(AO)
  1667. LD b2, 2 * SIZE(AO)
  1668. LD b3, 0 * SIZE(AO)
  1669. MUL c12, b1, c12
  1670. MUL c22, b1, c22
  1671. MUL c32, b1, c32
  1672. MUL c42, b1, c42
  1673. NMSUB c11, c11, b2, c12
  1674. NMSUB c21, c21, b2, c22
  1675. NMSUB c31, c31, b2, c32
  1676. NMSUB c41, c41, b2, c42
  1677. MUL c11, b3, c11
  1678. MUL c21, b3, c21
  1679. MUL c31, b3, c31
  1680. MUL c41, b3, c41
  1681. #endif
  1682. #ifdef LT
  /*
   * LT: first row first.  Scale it by AO[0], eliminate it from the second
   * row with AO[1], then scale the second row by AO[3].
   */
  1683. LD b1, 0 * SIZE(AO)
  1684. LD b2, 1 * SIZE(AO)
  1685. LD b3, 3 * SIZE(AO)
  1686. MUL c11, b1, c11
  1687. MUL c21, b1, c21
  1688. MUL c31, b1, c31
  1689. MUL c41, b1, c41
  1690. NMSUB c12, c12, b2, c11
  1691. NMSUB c22, c22, b2, c21
  1692. NMSUB c32, c32, b2, c31
  1693. NMSUB c42, c42, b2, c41
  1694. MUL c12, b3, c12
  1695. MUL c22, b3, c22
  1696. MUL c32, b3, c32
  1697. MUL c42, b3, c42
  1698. #endif
  1699. #ifdef RN
  /*
   * RN: forward sweep over the four columns.  Step i reads
   * BO[5*i .. 4*i + 3]: the first entry scales column i, the rest
   * eliminate it from the later columns.  Both rows are updated together.
   */
  1700. LD b1, 0 * SIZE(BO)
  1701. LD b2, 1 * SIZE(BO)
  1702. LD b3, 2 * SIZE(BO)
  1703. LD b4, 3 * SIZE(BO)
  1704. MUL c11, b1, c11
  1705. MUL c12, b1, c12
  1706. NMSUB c21, c21, b2, c11
  1707. NMSUB c22, c22, b2, c12
  1708. NMSUB c31, c31, b3, c11
  1709. NMSUB c32, c32, b3, c12
  1710. NMSUB c41, c41, b4, c11
  1711. NMSUB c42, c42, b4, c12
  1712. LD b2, 5 * SIZE(BO)
  1713. LD b3, 6 * SIZE(BO)
  1714. LD b4, 7 * SIZE(BO)
  1715. MUL c21, b2, c21
  1716. MUL c22, b2, c22
  1717. NMSUB c31, c31, b3, c21
  1718. NMSUB c32, c32, b3, c22
  1719. NMSUB c41, c41, b4, c21
  1720. NMSUB c42, c42, b4, c22
  1721. LD b3, 10 * SIZE(BO)
  1722. LD b4, 11 * SIZE(BO)
  1723. MUL c31, b3, c31
  1724. MUL c32, b3, c32
  1725. NMSUB c41, c41, b4, c31
  1726. NMSUB c42, c42, b4, c32
  1727. LD b4, 15 * SIZE(BO)
  1728. MUL c41, b4, c41
  1729. MUL c42, b4, c42
  1730. #endif
  1731. #ifdef RT
  /*
   * RT: backward sweep over the four columns.  Step i (3 down to 0) reads
   * BO[5*i] down to BO[4*i]: the first entry scales column i, the rest
   * eliminate it from the earlier columns.
   */
  1732. LD b5, 15 * SIZE(BO)
  1733. LD b6, 14 * SIZE(BO)
  1734. LD b7, 13 * SIZE(BO)
  1735. LD b8, 12 * SIZE(BO)
  1736. MUL c41, b5, c41
  1737. MUL c42, b5, c42
  1738. NMSUB c31, c31, b6, c41
  1739. NMSUB c32, c32, b6, c42
  1740. NMSUB c21, c21, b7, c41
  1741. NMSUB c22, c22, b7, c42
  1742. NMSUB c11, c11, b8, c41
  1743. NMSUB c12, c12, b8, c42
  1744. LD b6, 10 * SIZE(BO)
  1745. LD b7, 9 * SIZE(BO)
  1746. LD b8, 8 * SIZE(BO)
  1747. MUL c31, b6, c31
  1748. MUL c32, b6, c32
  1749. NMSUB c21, c21, b7, c31
  1750. NMSUB c22, c22, b7, c32
  1751. NMSUB c11, c11, b8, c31
  1752. NMSUB c12, c12, b8, c32
  1753. LD b7, 5 * SIZE(BO)
  1754. LD b8, 4 * SIZE(BO)
  1755. MUL c21, b7, c21
  1756. MUL c22, b7, c22
  1757. NMSUB c11, c11, b8, c21
  1758. NMSUB c12, c12, b8, c22
  1759. LD b8, 0 * SIZE(BO)
  1760. MUL c11, b8, c11
  1761. MUL c12, b8, c12
  1762. #endif
  1763. #ifdef LN
  /* LN: move the four C pointers back two elements before the store. */
  1764. daddiu CO1, CO1, -2 * SIZE
  1765. daddiu CO2, CO2, -2 * SIZE
  1766. daddiu CO3, CO3, -2 * SIZE
  1767. daddiu CO4, CO4, -2 * SIZE
  1768. #endif
  /* Write the solved values back over the packed data, in the same layout used in step (2). */
  1769. #if defined(LN) || defined(LT)
  1770. ST c11, 0 * SIZE(BO)
  1771. ST c21, 1 * SIZE(BO)
  1772. ST c31, 2 * SIZE(BO)
  1773. ST c41, 3 * SIZE(BO)
  1774. ST c12, 4 * SIZE(BO)
  1775. ST c22, 5 * SIZE(BO)
  1776. ST c32, 6 * SIZE(BO)
  1777. ST c42, 7 * SIZE(BO)
  1778. #else
  1779. ST c11, 0 * SIZE(AO)
  1780. ST c12, 1 * SIZE(AO)
  1781. ST c21, 2 * SIZE(AO)
  1782. ST c22, 3 * SIZE(AO)
  1783. ST c31, 4 * SIZE(AO)
  1784. ST c32, 5 * SIZE(AO)
  1785. ST c41, 6 * SIZE(AO)
  1786. ST c42, 7 * SIZE(AO)
  1787. #endif
  /* Two elements per output pointer CO1..CO4. */
  1788. ST c11, 0 * SIZE(CO1)
  1789. ST c12, 1 * SIZE(CO1)
  1790. ST c21, 0 * SIZE(CO2)
  1791. ST c22, 1 * SIZE(CO2)
  1792. ST c31, 0 * SIZE(CO3)
  1793. ST c32, 1 * SIZE(CO3)
  1794. ST c41, 0 * SIZE(CO4)
  1795. ST c42, 1 * SIZE(CO4)
  1796. #ifndef LN
  /* Non-LN variants advance the C pointers after the store instead. */
  1797. daddiu CO1, CO1, 2 * SIZE
  1798. daddiu CO2, CO2, 2 * SIZE
  1799. daddiu CO3, CO3, 2 * SIZE
  1800. daddiu CO4, CO4, 2 * SIZE
  1801. #endif
  1802. #ifdef RT
  /* RT: AORIG += K << (1 + BASE_SHIFT). */
  1803. dsll TEMP, K, 1 + BASE_SHIFT
  1804. daddu AORIG, AORIG, TEMP
  1805. #endif
  1806. #if defined(LT) || defined(RN)
  /* LT/RN: skip the K - KK steps not consumed: AO by 2 elements each, BO by 4 each. */
  1807. dsubu TEMP, K, KK
  1808. dsll L, TEMP, 1 + BASE_SHIFT
  1809. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1810. daddu AO, AO, L
  1811. daddu BO, BO, TEMP
  1812. #endif
  1813. #ifdef LT
  1814. daddiu KK, KK, 2
  1815. #endif
  1816. #ifdef LN
  1817. daddiu KK, KK, -2
  1818. #endif
  /* a1 <- integer register $0 (presumably zero), then copied into the accumulators for the next tile. */
  1819. MTC $0, a1
  1820. MOV c11, a1
  1821. MOV c21, a1
  1822. MOV c31, a1
  1823. daddiu I, I, -1
  1824. bgtz I, .L31
  1825. MOV c41, c11
  /*
   * .L40 -- set-up for the single row left over in the 4-column panel.
   * I = M & 1; when that is zero the section is skipped (.L49).
   * Positions AO/BO, preloads a1, a2 and b1..b7, and computes the trip
   * count L of the 4-step loop at .L42 (KK >> 2 for LT/RN,
   * (K - KK) >> 2 otherwise).
   * c61/c71/c81 are copied from c11 here, but the 1x4 code visible in
   * this chunk only accumulates into c11..c41.
   */
  1826. .align 3
  1827. .L40:
  1828. andi I, M, 1
  1829. blez I, .L49
  1830. MOV c61, c11
  1831. #if defined(LT) || defined(RN)
  1832. LD a1, 0 * SIZE(AO)
  1833. MOV c71, c11
  1834. LD a2, 1 * SIZE(AO)
  1835. MOV c81, c11
  1836. LD b1, 0 * SIZE(B)
  1837. LD b2, 1 * SIZE(B)
  1838. LD b3, 2 * SIZE(B)
  1839. LD b4, 3 * SIZE(B)
  1840. LD b5, 4 * SIZE(B)
  1841. LD b6, 8 * SIZE(B)
  1842. LD b7, 12 * SIZE(B)
  1843. dsra L, KK, 2
  1844. blez L, .L45
  /* Follows the branch: BO = B. */
  1845. move BO, B
  1846. #else
  1847. #ifdef LN
  /* LN only: step AORIG back by K << BASE_SHIFT bytes. */
  1848. dsll TEMP, K, BASE_SHIFT
  1849. dsubu AORIG, AORIG, TEMP
  1850. #endif
  /* AO = AORIG + (KK << BASE_SHIFT); BO = B + (KK << (2 + BASE_SHIFT)). */
  1851. dsll L, KK, 0 + BASE_SHIFT
  1852. dsll TEMP, KK, 2 + BASE_SHIFT
  1853. daddu AO, AORIG, L
  1854. daddu BO, B, TEMP
  /* TEMP = K - KK; it is read again at .L45 for the remainder count. */
  1855. dsubu TEMP, K, KK
  1856. LD a1, 0 * SIZE(AO)
  1857. MOV c71, c11
  1858. LD a2, 1 * SIZE(AO)
  1859. MOV c81, c11
  1860. LD b1, 0 * SIZE(BO)
  1861. LD b2, 1 * SIZE(BO)
  1862. LD b3, 2 * SIZE(BO)
  1863. LD b4, 3 * SIZE(BO)
  1864. LD b5, 4 * SIZE(BO)
  1865. LD b6, 8 * SIZE(BO)
  1866. LD b7, 12 * SIZE(BO)
  1867. dsra L, TEMP, 2
  1868. blez L, .L45
  1869. NOP
  1870. #endif
  /*
   * .L42 -- accumulate loop for the 1x4 tile, four k-steps per trip.
   * Each k-step issues 4 MADDs into c11..c41.  Per trip AO advances by
   * 4*SIZE, BO by 16*SIZE, and L is decremented once; repeats while
   * L > 0.  a1 carries the first A value of each trip; a2 is reloaded
   * for each of the other three.
   * Assumes MADD d, c, a, b computes d = c + a*b -- TODO confirm.
   */
  1871. .align 3
  1872. .L42:
  /* k-step 0: uses a1. */
  1873. MADD c11, c11, a1, b1
  1874. LD b1, 16 * SIZE(BO)
  1875. MADD c21, c21, a1, b2
  1876. LD b2, 5 * SIZE(BO)
  1877. MADD c31, c31, a1, b3
  1878. LD b3, 6 * SIZE(BO)
  1879. MADD c41, c41, a1, b4
  1880. LD b4, 7 * SIZE(BO)
  /* a1 for the next trip (AO has not been advanced yet). */
  1881. LD a1, 4 * SIZE(AO)
  1882. daddiu L, L, -1
  /* k-step 1: uses a2 (old AO[1]). */
  1883. MADD c11, c11, a2, b5
  1884. LD b5, 20 * SIZE(BO)
  1885. MADD c21, c21, a2, b2
  1886. LD b2, 9 * SIZE(BO)
  1887. MADD c31, c31, a2, b3
  1888. LD b3, 10 * SIZE(BO)
  1889. MADD c41, c41, a2, b4
  1890. LD b4, 11 * SIZE(BO)
  1891. LD a2, 2 * SIZE(AO)
  1892. daddiu AO, AO, 4 * SIZE
  /* k-step 2: uses a2 (old AO[2]). */
  1893. MADD c11, c11, a2, b6
  1894. LD b6, 24 * SIZE(BO)
  1895. MADD c21, c21, a2, b2
  1896. LD b2, 13 * SIZE(BO)
  1897. MADD c31, c31, a2, b3
  1898. LD b3, 14 * SIZE(BO)
  1899. MADD c41, c41, a2, b4
  1900. LD b4, 15 * SIZE(BO)
  /* AO is already advanced, so offset -1 is the fourth A value of this trip. */
  1901. LD a2, -1 * SIZE(AO)
  1902. daddiu BO, BO, 16 * SIZE
  /* k-step 3: uses a2 (old AO[3]); BO offsets are relative to the advanced pointer. */
  1903. MADD c11, c11, a2, b7
  1904. LD b7, 12 * SIZE(BO)
  1905. MADD c21, c21, a2, b2
  1906. LD b2, 1 * SIZE(BO)
  1907. MADD c31, c31, a2, b3
  1908. LD b3, 2 * SIZE(BO)
  1909. MADD c41, c41, a2, b4
  1910. LD b4, 3 * SIZE(BO)
  1911. bgtz L, .L42
  /* Follows the branch (presumably its delay slot): a2 for the next trip. */
  1912. LD a2, 1 * SIZE(AO)
  1913. .align 3
  1914. .L45:
  1915. #if defined(LT) || defined(RN)
  1916. andi L, KK, 3
  1917. #else
  1918. andi L, TEMP, 3
  1919. #endif
  1920. NOP
  1921. blez L, .L48
  1922. NOP
  1923. .align 3
  1924. .L46:
  1925. MADD c11, c11, a1, b1
  1926. LD b1, 4 * SIZE(BO)
  1927. MADD c21, c21, a1, b2
  1928. LD b2, 5 * SIZE(BO)
  1929. MADD c31, c31, a1, b3
  1930. LD b3, 6 * SIZE(BO)
  1931. MADD c41, c41, a1, b4
  1932. LD a1, 1 * SIZE(AO)
  1933. LD b4, 7 * SIZE(BO)
  1934. daddiu L, L, -1
  1935. daddiu AO, AO, 1 * SIZE
  1936. MOV a2, a2
  1937. bgtz L, .L46
  1938. daddiu BO, BO, 4 * SIZE
  /*
   * .L48 -- finish the single-row tile accumulated in c11/c21/c31/c41:
   * combine with the packed right-hand side, apply the triangular factors,
   * write the result back to the packed buffer and to CO1..CO4, then advance
   * the pointers and KK.
   *
   * NOTE(review): SUB/MUL/NMSUB/ST are macros defined outside this chunk.
   * SUB d, a, b is presumably d = a - b and NMSUB d, c, a, b presumably
   * d = c - a * b -- confirm.
   */
  1939. .L48:
  1940. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at element index KK - 1 (LN) or KK - 4 (RT). */
  1941. #ifdef LN
  1942. daddiu TEMP, KK, -1
  1943. #else
  1944. daddiu TEMP, KK, -4
  1945. #endif
  1946. dsll L, TEMP, 0 + BASE_SHIFT
  1947. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1948. daddu AO, AORIG, L
  1949. daddu BO, B, TEMP
  1950. #endif
  /* Right-hand side comes from BO for LN/LT and from AO otherwise. */
  1951. #if defined(LN) || defined(LT)
  1952. LD b1, 0 * SIZE(BO)
  1953. LD b2, 1 * SIZE(BO)
  1954. LD b3, 2 * SIZE(BO)
  1955. LD b4, 3 * SIZE(BO)
  1956. SUB c11, b1, c11
  1957. SUB c21, b2, c21
  1958. SUB c31, b3, c31
  1959. SUB c41, b4, c41
  1960. #else
  1961. LD b1, 0 * SIZE(AO)
  1962. LD b2, 1 * SIZE(AO)
  1963. LD b3, 2 * SIZE(AO)
  1964. LD b4, 3 * SIZE(AO)
  1965. SUB c11, b1, c11
  1966. SUB c21, b2, c21
  1967. SUB c31, b3, c31
  1968. SUB c41, b4, c41
  1969. #endif
  1970. #if defined(LN) || defined(LT)
  /*
   * All four values are scaled by the single element at AO[0].
   * NOTE(review): a multiply is used, so the diagonal is presumably stored
   * already inverted -- confirm against the packing routine.
   */
  1971. LD b1, 0 * SIZE(AO)
  1972. MUL c11, b1, c11
  1973. MUL c21, b1, c21
  1974. MUL c31, b1, c31
  1975. MUL c41, b1, c41
  1976. #endif
  1977. #ifdef RN
  /*
   * Substitution in the order c11 -> c21 -> c31 -> c41, reading BO at
   * offsets 0..3, 5..7, 10..11 and 15.
   */
  1978. LD b1, 0 * SIZE(BO)
  1979. LD b2, 1 * SIZE(BO)
  1980. LD b3, 2 * SIZE(BO)
  1981. LD b4, 3 * SIZE(BO)
  1982. MUL c11, b1, c11
  1983. NMSUB c21, c21, b2, c11
  1984. NMSUB c31, c31, b3, c11
  1985. NMSUB c41, c41, b4, c11
  1986. LD b2, 5 * SIZE(BO)
  1987. LD b3, 6 * SIZE(BO)
  1988. LD b4, 7 * SIZE(BO)
  1989. MUL c21, b2, c21
  1990. NMSUB c31, c31, b3, c21
  1991. NMSUB c41, c41, b4, c21
  1992. LD b3, 10 * SIZE(BO)
  1993. LD b4, 11 * SIZE(BO)
  1994. MUL c31, b3, c31
  1995. NMSUB c41, c41, b4, c31
  1996. LD b4, 15 * SIZE(BO)
  1997. MUL c41, b4, c41
  1998. #endif
  1999. #ifdef RT
  /*
   * Substitution in the reverse order c41 -> c31 -> c21 -> c11, reading BO
   * at offsets 15..12, 10..8, 5..4 and 0.
   */
  2000. LD b5, 15 * SIZE(BO)
  2001. LD b6, 14 * SIZE(BO)
  2002. LD b7, 13 * SIZE(BO)
  2003. LD b8, 12 * SIZE(BO)
  2004. MUL c41, b5, c41
  2005. NMSUB c31, c31, b6, c41
  2006. NMSUB c21, c21, b7, c41
  2007. NMSUB c11, c11, b8, c41
  2008. LD b6, 10 * SIZE(BO)
  2009. LD b7, 9 * SIZE(BO)
  2010. LD b8, 8 * SIZE(BO)
  2011. MUL c31, b6, c31
  2012. NMSUB c21, c21, b7, c31
  2013. NMSUB c11, c11, b8, c31
  2014. LD b7, 5 * SIZE(BO)
  2015. LD b8, 4 * SIZE(BO)
  2016. MUL c21, b7, c21
  2017. NMSUB c11, c11, b8, c21
  2018. LD b8, 0 * SIZE(BO)
  2019. MUL c11, b8, c11
  2020. #endif
  2021. #ifdef LN
  /* LN: output pointers are moved back one element before the store. */
  2022. daddiu CO1, CO1, -1 * SIZE
  2023. daddiu CO2, CO2, -1 * SIZE
  2024. daddiu CO3, CO3, -1 * SIZE
  2025. daddiu CO4, CO4, -1 * SIZE
  2026. #endif
  /* Write the solved values back into the buffer they were read from ... */
  2027. #if defined(LN) || defined(LT)
  2028. ST c11, 0 * SIZE(BO)
  2029. ST c21, 1 * SIZE(BO)
  2030. ST c31, 2 * SIZE(BO)
  2031. ST c41, 3 * SIZE(BO)
  2032. #else
  2033. ST c11, 0 * SIZE(AO)
  2034. ST c21, 1 * SIZE(AO)
  2035. ST c31, 2 * SIZE(AO)
  2036. ST c41, 3 * SIZE(AO)
  2037. #endif
  /* ... and one element to each of the four output pointers. */
  2038. ST c11, 0 * SIZE(CO1)
  2039. ST c21, 0 * SIZE(CO2)
  2040. ST c31, 0 * SIZE(CO3)
  2041. ST c41, 0 * SIZE(CO4)
  2042. #ifndef LN
  /* All variants except LN move the output pointers forward after the store. */
  2043. daddiu CO1, CO1, 1 * SIZE
  2044. daddiu CO2, CO2, 1 * SIZE
  2045. daddiu CO3, CO3, 1 * SIZE
  2046. daddiu CO4, CO4, 1 * SIZE
  2047. #endif
  2048. #ifdef RT
  2049. dsll TEMP, K, BASE_SHIFT
  2050. daddu AORIG, AORIG, TEMP
  2051. #endif
  2052. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: AO by 1 element each, BO by 4 each. */
  2053. dsubu TEMP, K, KK
  2054. dsll L, TEMP, 0 + BASE_SHIFT
  2055. dsll TEMP, TEMP, 2 + BASE_SHIFT
  2056. daddu AO, AO, L
  2057. daddu BO, BO, TEMP
  2058. #endif
  2059. #ifdef LT
  2060. daddiu KK, KK, 1
  2061. #endif
  2062. #ifdef LN
  2063. daddiu KK, KK, -1
  2064. #endif
  2065. .align 3
  /*
   * .L49 -- bookkeeping after the panel handled above:
   *   LN    : B += K << (2 + BASE_SHIFT)
   *   LT/RN : B = BO
   *   RN    : KK += 4
   *   RT    : KK -= 4
   */
  2066. .L49:
  2067. #ifdef LN
  2068. dsll TEMP, K, 2 + BASE_SHIFT
  2069. daddu B, B, TEMP
  2070. #endif
  2071. #if defined(LT) || defined(RN)
  2072. move B, BO
  2073. #endif
  2074. #ifdef RN
  2075. daddiu KK, KK, 4
  2076. #endif
  2077. #ifdef RT
  2078. daddiu KK, KK, -4
  2079. #endif
  2080. .align 3
  /*
   * .L50 -- set up the two-column panel; skipped (to .L70) when bit 1 of N
   * is clear.
   *   RT     : B -= K << (1 + BASE_SHIFT), C -= LDC << 1
   *   CO1 = C, CO2 = C + LDC
   *   LN     : KK = M + OFFSET;  LT : KK = OFFSET
   *   LN/RT  : AORIG = A;  otherwise AO = A
   *   not RT : C = CO2 + LDC
   * I = M >> 1 is the number of two-row tiles; zero goes straight to .L60.
   *
   * NOTE(review): no explicit NOP follows the blez below, so the next
   * instruction after preprocessing (dsll TEMP for RT, else move AO, A)
   * appears to run even when the branch is taken -- confirm this is
   * intended; .L70 recomputes both values before using them.
   */
  2081. .L50:
  2082. andi J, N, 2
  2083. blez J, .L70
  2084. #ifdef RT
  2085. dsll TEMP, K, 1 + BASE_SHIFT
  2086. dsubu B, B, TEMP
  2087. dsll TEMP, LDC, 1
  2088. dsubu C, C, TEMP
  2089. #endif
  2090. move AO, A
  2091. move CO1, C
  2092. daddu CO2, C, LDC
  2093. #ifdef LN
  2094. daddu KK, M, OFFSET
  2095. #endif
  2096. #ifdef LT
  2097. move KK, OFFSET
  2098. #endif
  2099. #if defined(LN) || defined(RT)
  2100. move AORIG, A
  2101. #else
  /* Repeats the unconditional "move AO, A" a few lines above. */
  2102. move AO, A
  2103. #endif
  2104. #ifndef RT
  2105. daddu C, CO2, LDC
  2106. #endif
  2107. dsra I, M, 1
  2108. blez I, .L60
  2109. NOP
  /*
   * .L51 -- accumulate one two-row tile of the two-column panel into
   * c11/c21/c12/c22 (a1 feeds c11/c21, a2 feeds c12/c22).
   *   LT/RN : BO = B,                       L = KK >> 2
   *   LN/RT : AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *           BO = B     + (KK << (1 + BASE_SHIFT)),
   *           TEMP = K - KK,                L = TEMP >> 2
   * b6/b7 are loaded during setup but not read by .L52/.L56.
   *
   * NOTE(review): MTC $0, c11 presumably clears c11 (macro defined outside
   * this chunk); the MOVs then copy it to c21/c12/c22 -- confirm.
   */
  2110. .L51:
  2111. #if defined(LT) || defined(RN)
  2112. LD a1, 0 * SIZE(AO)
  2113. MTC $0, c11
  2114. LD a2, 1 * SIZE(AO)
  2115. MOV c21, c11
  2116. LD a5, 4 * SIZE(AO)
  2117. LD b1, 0 * SIZE(B)
  2118. MOV c12, c11
  2119. LD b2, 1 * SIZE(B)
  2120. MOV c22, c11
  2121. LD b3, 2 * SIZE(B)
  2122. LD b5, 4 * SIZE(B)
  2123. dsra L, KK, 2
  2124. LD b6, 8 * SIZE(B)
  2125. LD b7, 12 * SIZE(B)
  2126. blez L, .L55
  2127. move BO, B
  2128. #else
  2129. #ifdef LN
  /* LN only: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  2130. dsll TEMP, K, 1 + BASE_SHIFT
  2131. dsubu AORIG, AORIG, TEMP
  2132. #endif
  /* L and TEMP get the same shift here: both operands advance 2 per k-step. */
  2133. dsll L, KK, 1 + BASE_SHIFT
  2134. dsll TEMP, KK, 1 + BASE_SHIFT
  2135. daddu AO, AORIG, L
  2136. daddu BO, B, TEMP
  /* TEMP = K - KK stays live for the remainder count at .L55. */
  2137. dsubu TEMP, K, KK
  2138. LD a1, 0 * SIZE(AO)
  2139. MTC $0, c11
  2140. LD a2, 1 * SIZE(AO)
  2141. MOV c21, c11
  2142. LD a5, 4 * SIZE(AO)
  2143. LD b1, 0 * SIZE(BO)
  2144. MOV c12, c11
  2145. LD b2, 1 * SIZE(BO)
  2146. MOV c22, c11
  2147. LD b3, 2 * SIZE(BO)
  2148. LD b5, 4 * SIZE(BO)
  2149. dsra L, TEMP, 2
  2150. LD b6, 8 * SIZE(BO)
  2151. LD b7, 12 * SIZE(BO)
  2152. blez L, .L55
  2153. NOP
  2154. #endif
  2155. .align 3
  /*
   * .L52 -- main loop, four k-steps per pass; AO and BO each advance by
   * 8 * SIZE. Loads for later steps are interleaved with the MADDs.
   */
  2156. .L52:
  2157. MADD c11, c11, a1, b1
  2158. LD a3, 2 * SIZE(AO)
  2159. MADD c21, c21, a1, b2
  2160. LD b4, 3 * SIZE(BO)
  2161. MADD c12, c12, a2, b1
  2162. LD a4, 3 * SIZE(AO)
  2163. MADD c22, c22, a2, b2
  2164. LD b1, 8 * SIZE(BO)
  2165. MADD c11, c11, a3, b3
  2166. LD a1, 8 * SIZE(AO)
  2167. MADD c21, c21, a3, b4
  2168. LD b2, 5 * SIZE(BO)
  2169. MADD c12, c12, a4, b3
  2170. LD a2, 5 * SIZE(AO)
  2171. MADD c22, c22, a4, b4
  2172. LD b3, 6 * SIZE(BO)
  2173. MADD c11, c11, a5, b5
  2174. LD a3, 6 * SIZE(AO)
  2175. MADD c21, c21, a5, b2
  2176. LD b4, 7 * SIZE(BO)
  2177. MADD c12, c12, a2, b5
  2178. LD a4, 7 * SIZE(AO)
  2179. MADD c22, c22, a2, b2
  2180. LD b5, 12 * SIZE(BO)
  2181. MADD c11, c11, a3, b3
  2182. LD a5, 12 * SIZE(AO)
  2183. MADD c21, c21, a3, b4
  2184. LD b2, 9 * SIZE(BO)
  2185. MADD c12, c12, a4, b3
  2186. LD a2, 9 * SIZE(AO)
  2187. MADD c22, c22, a4, b4
  2188. LD b3, 10 * SIZE(BO)
  2189. daddiu AO, AO, 8 * SIZE
  2190. daddiu L, L, -1
  2191. bgtz L, .L52
  2192. daddiu BO, BO, 8 * SIZE
  2193. .align 3
  /* .L55 -- remainder count: low two bits of KK (LT/RN) or of TEMP = K - KK. */
  2194. .L55:
  2195. #if defined(LT) || defined(RN)
  2196. andi L, KK, 3
  2197. #else
  2198. andi L, TEMP, 3
  2199. #endif
  2200. NOP
  2201. blez L, .L58
  2202. NOP
  2203. .align 3
  /* .L56 -- one k-step per pass; AO and BO each advance by 2 * SIZE. */
  2204. .L56:
  2205. MADD c11, c11, a1, b1
  2206. LD a2, 1 * SIZE(AO)
  2207. MADD c21, c21, a1, b2
  2208. LD a1, 2 * SIZE(AO)
  2209. MADD c12, c12, a2, b1
  2210. LD b1, 2 * SIZE(BO)
  2211. MADD c22, c22, a2, b2
  2212. LD b2, 3 * SIZE(BO)
  2213. daddiu L, L, -1
  2214. daddiu AO, AO, 2 * SIZE
  2215. bgtz L, .L56
  2216. daddiu BO, BO, 2 * SIZE
  /*
   * .L58 -- finish the 2x2 tile held in c11/c21/c12/c22: combine with the
   * packed right-hand side, apply the 2x2 triangular factor for the active
   * variant, store to the packed buffer and to CO1/CO2, advance pointers and
   * KK, then loop back to .L51 while I > 0.
   *
   * NOTE(review): SUB d, a, b is presumably d = a - b and NMSUB d, c, a, b
   * presumably d = c - a * b (macros defined outside this chunk) -- confirm.
   */
  2217. .L58:
  2218. #if defined(LN) || defined(RT)
  /* Both arms of this #ifdef compute KK - 2. */
  2219. #ifdef LN
  2220. daddiu TEMP, KK, -2
  2221. #else
  2222. daddiu TEMP, KK, -2
  2223. #endif
  2224. dsll L, TEMP, 1 + BASE_SHIFT
  2225. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2226. daddu AO, AORIG, L
  2227. daddu BO, B, TEMP
  2228. #endif
  /*
   * Element order differs by buffer: BO holds c11, c21, c12, c22 while AO
   * holds c11, c12, c21, c22.
   */
  2229. #if defined(LN) || defined(LT)
  2230. LD b1, 0 * SIZE(BO)
  2231. LD b2, 1 * SIZE(BO)
  2232. LD b3, 2 * SIZE(BO)
  2233. LD b4, 3 * SIZE(BO)
  2234. SUB c11, b1, c11
  2235. SUB c21, b2, c21
  2236. SUB c12, b3, c12
  2237. SUB c22, b4, c22
  2238. #else
  2239. LD b1, 0 * SIZE(AO)
  2240. LD b2, 1 * SIZE(AO)
  2241. LD b3, 2 * SIZE(AO)
  2242. LD b4, 3 * SIZE(AO)
  2243. SUB c11, b1, c11
  2244. SUB c12, b2, c12
  2245. SUB c21, b3, c21
  2246. SUB c22, b4, c22
  2247. #endif
  2248. #ifdef LN
  /* Second row first (AO[3]), eliminate with AO[2], then first row (AO[0]). */
  2249. LD b1, 3 * SIZE(AO)
  2250. LD b2, 2 * SIZE(AO)
  2251. LD b3, 0 * SIZE(AO)
  2252. MUL c12, b1, c12
  2253. MUL c22, b1, c22
  2254. NMSUB c11, c11, b2, c12
  2255. NMSUB c21, c21, b2, c22
  2256. MUL c11, b3, c11
  2257. MUL c21, b3, c21
  2258. #endif
  2259. #ifdef LT
  /* First row first (AO[0]), eliminate with AO[1], then second row (AO[3]). */
  2260. LD b1, 0 * SIZE(AO)
  2261. LD b2, 1 * SIZE(AO)
  2262. LD b3, 3 * SIZE(AO)
  2263. MUL c11, b1, c11
  2264. MUL c21, b1, c21
  2265. NMSUB c12, c12, b2, c11
  2266. NMSUB c22, c22, b2, c21
  2267. MUL c12, b3, c12
  2268. MUL c22, b3, c22
  2269. #endif
  2270. #ifdef RN
  /* First column first (BO[0]), eliminate with BO[1], then second (BO[3]). */
  2271. LD b1, 0 * SIZE(BO)
  2272. LD b2, 1 * SIZE(BO)
  2273. LD b3, 3 * SIZE(BO)
  2274. MUL c11, b1, c11
  2275. MUL c12, b1, c12
  2276. NMSUB c21, c21, b2, c11
  2277. NMSUB c22, c22, b2, c12
  2278. MUL c21, b3, c21
  2279. MUL c22, b3, c22
  2280. #endif
  2281. #ifdef RT
  /* Second column first (BO[3]), eliminate with BO[2], then first (BO[0]). */
  2282. LD b1, 3 * SIZE(BO)
  2283. LD b2, 2 * SIZE(BO)
  2284. LD b3, 0 * SIZE(BO)
  2285. MUL c21, b1, c21
  2286. MUL c22, b1, c22
  2287. NMSUB c11, c11, b2, c21
  2288. NMSUB c12, c12, b2, c22
  2289. MUL c11, b3, c11
  2290. MUL c12, b3, c12
  2291. #endif
  2292. #ifdef LN
  /* LN: output pointers are moved back two elements before the store. */
  2293. daddiu CO1, CO1, -2 * SIZE
  2294. daddiu CO2, CO2, -2 * SIZE
  2295. #endif
  2296. #if defined(LN) || defined(LT)
  2297. ST c11, 0 * SIZE(BO)
  2298. ST c21, 1 * SIZE(BO)
  2299. ST c12, 2 * SIZE(BO)
  2300. ST c22, 3 * SIZE(BO)
  2301. #else
  2302. ST c11, 0 * SIZE(AO)
  2303. ST c12, 1 * SIZE(AO)
  2304. ST c21, 2 * SIZE(AO)
  2305. ST c22, 3 * SIZE(AO)
  2306. #endif
  /* CO1 receives c11/c12, CO2 receives c21/c22. */
  2307. ST c11, 0 * SIZE(CO1)
  2308. ST c12, 1 * SIZE(CO1)
  2309. ST c21, 0 * SIZE(CO2)
  2310. ST c22, 1 * SIZE(CO2)
  2311. #ifndef LN
  2312. daddiu CO1, CO1, 2 * SIZE
  2313. daddiu CO2, CO2, 2 * SIZE
  2314. #endif
  2315. #ifdef RT
  2316. dsll TEMP, K, 1 + BASE_SHIFT
  2317. daddu AORIG, AORIG, TEMP
  2318. #endif
  2319. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps; AO and BO move by the same byte count. */
  2320. dsubu TEMP, K, KK
  2321. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2322. daddu AO, AO, TEMP
  2323. daddu BO, BO, TEMP
  2324. #endif
  2325. #ifdef LT
  2326. daddiu KK, KK, 2
  2327. #endif
  2328. #ifdef LN
  2329. daddiu KK, KK, -2
  2330. #endif
  /*
   * a1 is set from $0 and copied into c11/c21/c31; c41 is copied from c11 in
   * the instruction that follows the branch. .L51 itself re-initialises
   * c11/c21/c12/c22 and reloads a1 on entry.
   */
  2331. MTC $0, a1
  2332. MOV c11, a1
  2333. MOV c21, a1
  2334. MOV c31, a1
  2335. daddiu I, I, -1
  2336. bgtz I, .L51
  2337. MOV c41, c11
  2338. .align 3
  /*
   * .L60 -- leftover single row of the two-column panel (runs only when the
   * low bit of M is set; otherwise control goes to .L69).
   * Four accumulators are used: even k-steps add into c11/c21, odd k-steps
   * into c31/c41.
   *   LT/RN : BO = B,                       L = KK >> 2
   *   LN/RT : AO = AORIG + (KK << BASE_SHIFT),
   *           BO = B + (KK << (1 + BASE_SHIFT)),
   *           TEMP = K - KK,                L = TEMP >> 2
   * b5/b6/b7 are loaded during setup but not read by .L62/.L66.
   *
   * NOTE(review): MTC $0, c11 presumably clears c11 (macro defined outside
   * this chunk) -- confirm.
   */
  2339. .L60:
  2340. andi I, M, 1
  2341. blez I, .L69
  2342. NOP
  2343. #if defined(LT) || defined(RN)
  2344. dsra L, KK, 2
  2345. LD a1, 0 * SIZE(AO)
  2346. MTC $0, c11
  2347. LD a2, 1 * SIZE(AO)
  2348. MOV c21, c11
  2349. LD a3, 2 * SIZE(AO)
  2350. MOV c31, c11
  2351. LD a4, 3 * SIZE(AO)
  2352. MOV c41, c11
  2353. LD b1, 0 * SIZE(B)
  2354. LD b2, 1 * SIZE(B)
  2355. LD b3, 2 * SIZE(B)
  2356. LD b4, 3 * SIZE(B)
  2357. LD b5, 4 * SIZE(B)
  2358. LD b6, 8 * SIZE(B)
  2359. LD b7, 12 * SIZE(B)
  2360. blez L, .L65
  2361. move BO, B
  2362. #else
  2363. #ifdef LN
  /* LN only: step AORIG back by K << BASE_SHIFT bytes. */
  2364. dsll TEMP, K, BASE_SHIFT
  2365. dsubu AORIG, AORIG, TEMP
  2366. #endif
  2367. dsll L, KK, 0 + BASE_SHIFT
  2368. dsll TEMP, KK, 1 + BASE_SHIFT
  2369. daddu AO, AORIG, L
  2370. daddu BO, B, TEMP
  /* TEMP = K - KK stays live for the remainder count at .L65. */
  2371. dsubu TEMP, K, KK
  2372. dsra L, TEMP, 2
  2373. LD a1, 0 * SIZE(AO)
  2374. MTC $0, c11
  2375. LD a2, 1 * SIZE(AO)
  2376. MOV c21, c11
  2377. LD a3, 2 * SIZE(AO)
  2378. MOV c31, c11
  2379. LD a4, 3 * SIZE(AO)
  2380. MOV c41, c11
  2381. LD b1, 0 * SIZE(BO)
  2382. LD b2, 1 * SIZE(BO)
  2383. LD b3, 2 * SIZE(BO)
  2384. LD b4, 3 * SIZE(BO)
  2385. LD b5, 4 * SIZE(BO)
  2386. LD b6, 8 * SIZE(BO)
  2387. LD b7, 12 * SIZE(BO)
  2388. blez L, .L65
  2389. NOP
  2390. #endif
  2391. .align 3
  /*
   * .L62 -- main loop, four k-steps per pass: AO advances 4 * SIZE and BO
   * 8 * SIZE.
   */
  2392. .L62:
  2393. MADD c11, c11, a1, b1
  2394. LD b1, 4 * SIZE(BO)
  2395. MADD c21, c21, a1, b2
  2396. LD b2, 5 * SIZE(BO)
  2397. MADD c31, c31, a2, b3
  2398. LD b3, 6 * SIZE(BO)
  2399. MADD c41, c41, a2, b4
  2400. LD b4, 7 * SIZE(BO)
  2401. LD a1, 4 * SIZE(AO)
  2402. LD a2, 5 * SIZE(AO)
  2403. MADD c11, c11, a3, b1
  2404. LD b1, 8 * SIZE(BO)
  2405. MADD c21, c21, a3, b2
  2406. LD b2, 9 * SIZE(BO)
  2407. MADD c31, c31, a4, b3
  2408. LD b3, 10 * SIZE(BO)
  2409. MADD c41, c41, a4, b4
  2410. LD b4, 11 * SIZE(BO)
  2411. LD a3, 6 * SIZE(AO)
  2412. LD a4, 7 * SIZE(AO)
  2413. daddiu L, L, -1
  2414. daddiu AO, AO, 4 * SIZE
  2415. bgtz L, .L62
  2416. daddiu BO, BO, 8 * SIZE
  2417. .align 3
  /* .L65 -- remainder count: low two bits of KK (LT/RN) or of TEMP = K - KK. */
  2418. .L65:
  2419. #if defined(LT) || defined(RN)
  2420. andi L, KK, 3
  2421. #else
  2422. andi L, TEMP, 3
  2423. #endif
  2424. NOP
  2425. blez L, .L68
  2426. NOP
  2427. .align 3
  /*
   * .L66 -- one k-step per pass into c11/c21 only; AO advances 1 * SIZE and
   * BO 2 * SIZE.
   */
  2428. .L66:
  2429. MADD c11, c11, a1, b1
  2430. LD b1, 2 * SIZE(BO)
  2431. MADD c21, c21, a1, b2
  2432. LD b2, 3 * SIZE(BO)
  2433. LD a1, 1 * SIZE(AO)
  2434. daddiu L, L, -1
  2435. daddiu AO, AO, 1 * SIZE
  2436. bgtz L, .L66
  2437. daddiu BO, BO, 2 * SIZE
  /*
   * .L68 -- finish the single-row tile of the two-column panel. The partial
   * sums c31/c41 are first folded into c11/c21; the two results are then
   * combined with the packed right-hand side, solved, stored to the packed
   * buffer and to CO1/CO2, and the pointers and KK are advanced.
   *
   * NOTE(review): SUB d, a, b is presumably d = a - b and NMSUB d, c, a, b
   * presumably d = c - a * b (macros defined outside this chunk) -- confirm.
   */
  2438. .L68:
  2439. ADD c11, c11, c31
  2440. ADD c21, c21, c41
  2441. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at element index KK - 1 (LN) or KK - 2 (RT). */
  2442. #ifdef LN
  2443. daddiu TEMP, KK, -1
  2444. #else
  2445. daddiu TEMP, KK, -2
  2446. #endif
  2447. dsll L, TEMP, 0 + BASE_SHIFT
  2448. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2449. daddu AO, AORIG, L
  2450. daddu BO, B, TEMP
  2451. #endif
  2452. #if defined(LN) || defined(LT)
  2453. LD b1, 0 * SIZE(BO)
  2454. LD b2, 1 * SIZE(BO)
  2455. SUB c11, b1, c11
  2456. SUB c21, b2, c21
  2457. #else
  2458. LD b1, 0 * SIZE(AO)
  2459. LD b2, 1 * SIZE(AO)
  2460. SUB c11, b1, c11
  2461. SUB c21, b2, c21
  2462. #endif
  2463. #if defined(LN) || defined(LT)
  /* Both values are scaled by the single element at AO[0]. */
  2464. LD b3, 0 * SIZE(AO)
  2465. MUL c11, b3, c11
  2466. MUL c21, b3, c21
  2467. #endif
  2468. #ifdef RN
  /* c11 first (BO[0]), eliminate with BO[1], then c21 (BO[3]). */
  2469. LD b1, 0 * SIZE(BO)
  2470. LD b2, 1 * SIZE(BO)
  2471. LD b3, 3 * SIZE(BO)
  2472. MUL c11, b1, c11
  2473. NMSUB c21, c21, b2, c11
  2474. MUL c21, b3, c21
  2475. #endif
  2476. #ifdef RT
  /* c21 first (BO[3]), eliminate with BO[2], then c11 (BO[0]). */
  2477. LD b1, 3 * SIZE(BO)
  2478. LD b2, 2 * SIZE(BO)
  2479. LD b3, 0 * SIZE(BO)
  2480. MUL c21, b1, c21
  2481. NMSUB c11, c11, b2, c21
  2482. MUL c11, b3, c11
  2483. #endif
  2484. #ifdef LN
  /* LN: output pointers are moved back one element before the store. */
  2485. daddiu CO1, CO1, -1 * SIZE
  2486. daddiu CO2, CO2, -1 * SIZE
  2487. #endif
  2488. #if defined(LN) || defined(LT)
  2489. ST c11, 0 * SIZE(BO)
  2490. ST c21, 1 * SIZE(BO)
  2491. #else
  2492. ST c11, 0 * SIZE(AO)
  2493. ST c21, 1 * SIZE(AO)
  2494. #endif
  2495. ST c11, 0 * SIZE(CO1)
  2496. ST c21, 0 * SIZE(CO2)
  2497. #ifndef LN
  2498. daddiu CO1, CO1, 1 * SIZE
  2499. daddiu CO2, CO2, 1 * SIZE
  2500. #endif
  2501. #ifdef RT
  2502. dsll TEMP, K, 0 + BASE_SHIFT
  2503. daddu AORIG, AORIG, TEMP
  2504. #endif
  2505. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: AO by 1 element each, BO by 2 each. */
  2506. dsubu TEMP, K, KK
  2507. dsll L, TEMP, 0 + BASE_SHIFT
  2508. dsll TEMP, TEMP, 1 + BASE_SHIFT
  2509. daddu AO, AO, L
  2510. daddu BO, BO, TEMP
  2511. #endif
  2512. #ifdef LT
  2513. daddiu KK, KK, 1
  2514. #endif
  2515. #ifdef LN
  2516. daddiu KK, KK, -1
  2517. #endif
  2518. .align 3
  /*
   * .L69 -- bookkeeping after the two-column panel:
   *   LN    : B += K << (1 + BASE_SHIFT)
   *   LT/RN : B = BO
   *   RN    : KK += 2
   *   RT    : KK -= 2
   */
  2519. .L69:
  2520. #ifdef LN
  2521. dsll TEMP, K, 1 + BASE_SHIFT
  2522. daddu B, B, TEMP
  2523. #endif
  2524. #if defined(LT) || defined(RN)
  2525. move B, BO
  2526. #endif
  2527. #ifdef RN
  2528. daddiu KK, KK, 2
  2529. #endif
  2530. #ifdef RT
  2531. daddiu KK, KK, -2
  2532. #endif
  2533. .align 3
  /*
   * .L70 -- set up the final single-column panel; skipped (to .L999) when
   * the low bit of N is clear.
   *   RT     : B -= K << BASE_SHIFT, C -= LDC
   *   CO1 = C
   *   LN     : KK = M + OFFSET;  LT : KK = OFFSET
   *   LN/RT  : AORIG = A;  otherwise AO = A
   *   not RT : C = CO1 + LDC
   * I = M >> 1 is the number of two-row tiles; zero goes straight to .L80.
   */
  2534. .L70:
  2535. andi J, N, 1
  2536. blez J, .L999
  2537. NOP
  2538. #ifdef RT
  2539. dsll TEMP, K, BASE_SHIFT
  2540. dsubu B, B, TEMP
  2541. dsubu C, C, LDC
  2542. #endif
  2543. move AO, A
  2544. move CO1, C
  2545. #ifdef LN
  2546. daddu KK, M, OFFSET
  2547. #endif
  2548. #ifdef LT
  2549. move KK, OFFSET
  2550. #endif
  2551. #if defined(LN) || defined(RT)
  2552. move AORIG, A
  2553. #else
  /* Repeats the unconditional "move AO, A" a few lines above. */
  2554. move AO, A
  2555. #endif
  2556. #ifndef RT
  2557. daddu C, CO1, LDC
  2558. #endif
  2559. dsra I, M, 1
  2560. blez I, .L80
  2561. NOP
  /*
   * .L71 -- accumulate one two-row tile of the single-column panel into
   * c11 (first row) and c12 (second row).
   *   LT/RN : BO = B,                       L = KK >> 2
   *   LN/RT : AO = AORIG + (KK << (1 + BASE_SHIFT)),
   *           BO = B + (KK << BASE_SHIFT),
   *           TEMP = K - KK,                L = TEMP >> 2
   * The loops below reload a1/a2/b1 from memory at every step, so the
   * values preloaded during setup are overwritten before use. c21/c22 are
   * initialised from c11 but never accumulated into by .L72/.L76.
   *
   * NOTE(review): MTC $0, c11 presumably clears c11 (macro defined outside
   * this chunk) -- confirm.
   */
  2562. .L71:
  2563. #if defined(LT) || defined(RN)
  2564. LD a1, 0 * SIZE(AO)
  2565. MTC $0, c11
  2566. LD a2, 1 * SIZE(AO)
  2567. MOV c21, c11
  2568. LD a5, 4 * SIZE(AO)
  2569. LD b1, 0 * SIZE(B)
  2570. MOV c12, c11
  2571. LD b2, 1 * SIZE(B)
  2572. MOV c22, c11
  2573. LD b3, 2 * SIZE(B)
  2574. LD b5, 4 * SIZE(B)
  2575. dsra L, KK, 2
  2576. LD b6, 8 * SIZE(B)
  2577. LD b7, 12 * SIZE(B)
  2578. blez L, .L75
  2579. move BO, B
  2580. #else
  2581. #ifdef LN
  /* LN only: step AORIG back by K << (1 + BASE_SHIFT) bytes. */
  2582. dsll TEMP, K, 1 + BASE_SHIFT
  2583. dsubu AORIG, AORIG, TEMP
  2584. #endif
  2585. dsll L, KK, 1 + BASE_SHIFT
  2586. dsll TEMP, KK, 0 + BASE_SHIFT
  2587. daddu AO, AORIG, L
  2588. daddu BO, B, TEMP
  /* TEMP = K - KK stays live for the remainder count at .L75. */
  2589. dsubu TEMP, K, KK
  2590. LD a1, 0 * SIZE(AO)
  2591. MTC $0, c11
  2592. LD a2, 1 * SIZE(AO)
  2593. MOV c21, c11
  2594. LD a5, 4 * SIZE(AO)
  2595. LD b1, 0 * SIZE(BO)
  2596. MOV c12, c11
  2597. LD b2, 1 * SIZE(BO)
  2598. MOV c22, c11
  2599. LD b3, 2 * SIZE(BO)
  2600. LD b5, 4 * SIZE(BO)
  2601. dsra L, TEMP, 2
  2602. LD b6, 8 * SIZE(BO)
  2603. LD b7, 12 * SIZE(BO)
  2604. blez L, .L75
  2605. NOP
  2606. #endif
  2607. .align 3
  /*
   * .L72 -- main loop, four k-steps per pass: AO advances 8 * SIZE and BO
   * 4 * SIZE.
   */
  2608. .L72:
  2609. LD a1, 0 * SIZE(AO)
  2610. LD a2, 1 * SIZE(AO)
  2611. LD b1, 0 * SIZE(BO)
  2612. MADD c11, c11, a1, b1
  2613. MADD c12, c12, a2, b1
  2614. LD a1, 2 * SIZE(AO)
  2615. LD a2, 3 * SIZE(AO)
  2616. LD b1, 1 * SIZE(BO)
  2617. MADD c11, c11, a1, b1
  2618. MADD c12, c12, a2, b1
  2619. LD a1, 4 * SIZE(AO)
  2620. LD a2, 5 * SIZE(AO)
  2621. LD b1, 2 * SIZE(BO)
  2622. MADD c11, c11, a1, b1
  2623. MADD c12, c12, a2, b1
  2624. LD a1, 6 * SIZE(AO)
  2625. LD a2, 7 * SIZE(AO)
  2626. LD b1, 3 * SIZE(BO)
  2627. MADD c11, c11, a1, b1
  2628. MADD c12, c12, a2, b1
  2629. daddiu L, L, -1
  2630. daddiu AO, AO, 8 * SIZE
  2631. bgtz L, .L72
  2632. daddiu BO, BO, 4 * SIZE
  2633. .align 3
  /* .L75 -- remainder count: low two bits of KK (LT/RN) or of TEMP = K - KK. */
  2634. .L75:
  2635. #if defined(LT) || defined(RN)
  2636. andi L, KK, 3
  2637. #else
  2638. andi L, TEMP, 3
  2639. #endif
  2640. NOP
  2641. blez L, .L78
  2642. NOP
  2643. .align 3
  /* .L76 -- one k-step per pass; AO advances 2 * SIZE and BO 1 * SIZE. */
  2644. .L76:
  2645. LD a1, 0 * SIZE(AO)
  2646. LD a2, 1 * SIZE(AO)
  2647. LD b1, 0 * SIZE(BO)
  2648. MADD c11, c11, a1, b1
  2649. MADD c12, c12, a2, b1
  2650. daddiu L, L, -1
  2651. daddiu AO, AO, 2 * SIZE
  2652. bgtz L, .L76
  2653. daddiu BO, BO, 1 * SIZE
  /*
   * .L78 -- finish the two-row tile of the single-column panel: c21/c22 are
   * added into c11/c12, the results are combined with the packed right-hand
   * side, solved, stored to the packed buffer and to CO1, and the pointers
   * and KK are advanced; loops back to .L71 while I > 0.
   *
   * NOTE(review): the accumulation loops .L72/.L76 never write c21/c22, so
   * these two ADDs presumably add the zero they were initialised with in
   * .L71 -- confirm.
   * NOTE(review): SUB d, a, b is presumably d = a - b and NMSUB d, c, a, b
   * presumably d = c - a * b (macros defined outside this chunk) -- confirm.
   */
  2654. .L78:
  2655. ADD c11, c11, c21
  2656. ADD c12, c12, c22
  2657. #if defined(LN) || defined(RT)
  /* Re-point AO/BO at element index KK - 2 (LN) or KK - 1 (RT). */
  2658. #ifdef LN
  2659. daddiu TEMP, KK, -2
  2660. #else
  2661. daddiu TEMP, KK, -1
  2662. #endif
  2663. dsll L, TEMP, 1 + BASE_SHIFT
  2664. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2665. daddu AO, AORIG, L
  2666. daddu BO, B, TEMP
  2667. #endif
  2668. #if defined(LN) || defined(LT)
  2669. LD b1, 0 * SIZE(BO)
  2670. LD b2, 1 * SIZE(BO)
  2671. SUB c11, b1, c11
  2672. SUB c12, b2, c12
  2673. #else
  2674. LD b1, 0 * SIZE(AO)
  2675. LD b2, 1 * SIZE(AO)
  2676. SUB c11, b1, c11
  2677. SUB c12, b2, c12
  2678. #endif
  2679. #ifdef LN
  /* c12 first (AO[3]), eliminate with AO[2], then c11 (AO[0]). */
  2680. LD b1, 3 * SIZE(AO)
  2681. LD b2, 2 * SIZE(AO)
  2682. LD b3, 0 * SIZE(AO)
  2683. MUL c12, b1, c12
  2684. NMSUB c11, c11, b2, c12
  2685. MUL c11, b3, c11
  2686. #endif
  2687. #ifdef LT
  /* c11 first (AO[0]), eliminate with AO[1], then c12 (AO[3]). */
  2688. LD b1, 0 * SIZE(AO)
  2689. LD b2, 1 * SIZE(AO)
  2690. LD b3, 3 * SIZE(AO)
  2691. MUL c11, b1, c11
  2692. NMSUB c12, c12, b2, c11
  2693. MUL c12, b3, c12
  2694. #endif
  2695. #if defined(RN) || defined(RT)
  /* Single column: both rows are scaled by the one element at BO[0]. */
  2696. LD b1, 0 * SIZE(BO)
  2697. MUL c11, b1, c11
  2698. MUL c12, b1, c12
  2699. #endif
  2700. #ifdef LN
  /* LN: the output pointer is moved back two elements before the store. */
  2701. daddiu CO1, CO1, -2 * SIZE
  2702. #endif
  2703. #if defined(LN) || defined(LT)
  2704. ST c11, 0 * SIZE(BO)
  2705. ST c12, 1 * SIZE(BO)
  2706. #else
  2707. ST c11, 0 * SIZE(AO)
  2708. ST c12, 1 * SIZE(AO)
  2709. #endif
  2710. ST c11, 0 * SIZE(CO1)
  2711. ST c12, 1 * SIZE(CO1)
  2712. #ifndef LN
  2713. daddiu CO1, CO1, 2 * SIZE
  2714. #endif
  2715. #ifdef RT
  2716. dsll TEMP, K, 1 + BASE_SHIFT
  2717. daddu AORIG, AORIG, TEMP
  2718. #endif
  2719. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: AO by 2 elements each, BO by 1 each. */
  2720. dsubu TEMP, K, KK
  2721. dsll L, TEMP, 1 + BASE_SHIFT
  2722. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2723. daddu AO, AO, L
  2724. daddu BO, BO, TEMP
  2725. #endif
  2726. #ifdef LT
  2727. daddiu KK, KK, 2
  2728. #endif
  2729. #ifdef LN
  2730. daddiu KK, KK, -2
  2731. #endif
  2732. daddiu I, I, -1
  2733. bgtz I, .L71
  2734. NOP
  2735. .align 3
  /*
   * .L80 -- leftover single element of the single-column panel (runs only
   * when the low bit of M is set; otherwise control goes to .L89).
   * Two accumulators are used: the main loop alternates between c11 and
   * c21; the remainder loop adds into c11 only.
   *   LT/RN : BO = B,                       L = KK >> 2
   *   LN/RT : AO = AORIG + (KK << BASE_SHIFT),
   *           BO = B     + (KK << BASE_SHIFT),
   *           TEMP = K - KK,                L = TEMP >> 2
   * The loops reload a1/b1 from memory at every step, so the values
   * preloaded during setup (a1..a4, b1..b7) are overwritten or unused.
   *
   * NOTE(review): MTC $0, c11 presumably clears c11 (macro defined outside
   * this chunk) -- confirm.
   */
  2736. .L80:
  2737. andi I, M, 1
  2738. blez I, .L89
  2739. NOP
  2740. #if defined(LT) || defined(RN)
  2741. LD a1, 0 * SIZE(AO)
  2742. MTC $0, c11
  2743. LD a2, 1 * SIZE(AO)
  2744. MOV c21, c11
  2745. LD a3, 2 * SIZE(AO)
  2746. LD a4, 3 * SIZE(AO)
  2747. LD b1, 0 * SIZE(B)
  2748. LD b2, 1 * SIZE(B)
  2749. LD b3, 2 * SIZE(B)
  2750. LD b4, 3 * SIZE(B)
  2751. LD b5, 4 * SIZE(B)
  2752. LD b6, 8 * SIZE(B)
  2753. LD b7, 12 * SIZE(B)
  2754. dsra L, KK, 2
  2755. blez L, .L85
  2756. move BO, B
  2757. #else
  2758. #ifdef LN
  /* LN only: step AORIG back by K << BASE_SHIFT bytes. */
  2759. dsll TEMP, K, BASE_SHIFT
  2760. dsubu AORIG, AORIG, TEMP
  2761. #endif
  /* One byte offset serves both pointers: each advances 1 element per k-step. */
  2762. dsll TEMP, KK, BASE_SHIFT
  2763. daddu AO, AORIG, TEMP
  2764. daddu BO, B, TEMP
  /* TEMP = K - KK stays live for the remainder count at .L85. */
  2765. dsubu TEMP, K, KK
  2766. LD a1, 0 * SIZE(AO)
  2767. MTC $0, c11
  2768. LD a2, 1 * SIZE(AO)
  2769. MOV c21, c11
  2770. LD a3, 2 * SIZE(AO)
  2771. LD a4, 3 * SIZE(AO)
  2772. LD b1, 0 * SIZE(BO)
  2773. LD b2, 1 * SIZE(BO)
  2774. LD b3, 2 * SIZE(BO)
  2775. LD b4, 3 * SIZE(BO)
  2776. LD b5, 4 * SIZE(BO)
  2777. LD b6, 8 * SIZE(BO)
  2778. LD b7, 12 * SIZE(BO)
  2779. dsra L, TEMP, 2
  2780. blez L, .L85
  2781. NOP
  2782. #endif
  2783. .align 3
  /* .L82 -- main loop, four k-steps per pass; AO and BO each advance 4 * SIZE. */
  2784. .L82:
  2785. LD a1, 0 * SIZE(AO)
  2786. LD b1, 0 * SIZE(BO)
  2787. MADD c11, c11, a1, b1
  2788. LD a1, 1 * SIZE(AO)
  2789. LD b1, 1 * SIZE(BO)
  2790. MADD c21, c21, a1, b1
  2791. LD a1, 2 * SIZE(AO)
  2792. LD b1, 2 * SIZE(BO)
  2793. MADD c11, c11, a1, b1
  2794. LD a1, 3 * SIZE(AO)
  2795. LD b1, 3 * SIZE(BO)
  2796. MADD c21, c21, a1, b1
  2797. daddiu L, L, -1
  2798. daddiu AO, AO, 4 * SIZE
  2799. bgtz L, .L82
  2800. daddiu BO, BO, 4 * SIZE
  2801. .align 3
  /* .L85 -- remainder count: low two bits of KK (LT/RN) or of TEMP = K - KK. */
  2802. .L85:
  2803. #if defined(LT) || defined(RN)
  2804. andi L, KK, 3
  2805. #else
  2806. andi L, TEMP, 3
  2807. #endif
  2808. NOP
  2809. blez L, .L88
  2810. NOP
  2811. .align 3
  /* .L86 -- one k-step per pass; AO and BO each advance 1 * SIZE. */
  2812. .L86:
  2813. LD a1, 0 * SIZE(AO)
  2814. LD b1, 0 * SIZE(BO)
  2815. MADD c11, c11, a1, b1
  2816. daddiu L, L, -1
  2817. daddiu AO, AO, 1 * SIZE
  2818. bgtz L, .L86
  2819. daddiu BO, BO, 1 * SIZE
  /*
   * .L88 -- finish the 1x1 tile: fold the second partial sum c21 into c11,
   * combine with the packed right-hand side, scale, store to the packed
   * buffer and to CO1, then advance the pointers and KK.
   *
   * NOTE(review): SUB d, a, b is presumably d = a - b (macro defined outside
   * this chunk) -- confirm.
   */
  2820. .L88:
  2821. ADD c11, c11, c21
  2822. #if defined(LN) || defined(RT)
  /* Both arms of this #ifdef compute KK - 1. */
  2823. #ifdef LN
  2824. daddiu TEMP, KK, -1
  2825. #else
  2826. daddiu TEMP, KK, -1
  2827. #endif
  2828. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2829. daddu AO, AORIG, TEMP
  2830. daddu BO, B, TEMP
  2831. #endif
  2832. #if defined(LN) || defined(LT)
  2833. LD b1, 0 * SIZE(BO)
  2834. SUB c11, b1, c11
  2835. #else
  2836. LD b1, 0 * SIZE(AO)
  2837. SUB c11, b1, c11
  2838. #endif
  /* Scale by the single factor element: AO[0] for LN/LT, BO[0] for RN/RT. */
  2839. #if defined(LN) || defined(LT)
  2840. LD b1, 0 * SIZE(AO)
  2841. MUL c11, b1, c11
  2842. #endif
  2843. #if defined(RN) || defined(RT)
  2844. LD b1, 0 * SIZE(BO)
  2845. MUL c11, b1, c11
  2846. #endif
  2847. #ifdef LN
  /* LN: the output pointer is moved back one element before the store. */
  2848. daddiu CO1, CO1, -1 * SIZE
  2849. #endif
  2850. #if defined(LN) || defined(LT)
  2851. ST c11, 0 * SIZE(BO)
  2852. #else
  2853. ST c11, 0 * SIZE(AO)
  2854. #endif
  2855. ST c11, 0 * SIZE(CO1)
  2856. #ifndef LN
  2857. daddiu CO1, CO1, 1 * SIZE
  2858. #endif
  2859. #ifdef RT
  2860. dsll TEMP, K, BASE_SHIFT
  2861. daddu AORIG, AORIG, TEMP
  2862. #endif
  2863. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps; AO and BO move by the same byte count. */
  2864. dsubu TEMP, K, KK
  2865. dsll TEMP, TEMP, 0 + BASE_SHIFT
  2866. daddu AO, AO, TEMP
  2867. daddu BO, BO, TEMP
  2868. #endif
  2869. #ifdef LT
  2870. daddiu KK, KK, 1
  2871. #endif
  2872. #ifdef LN
  2873. daddiu KK, KK, -1
  2874. #endif
  2875. .align 3
  /*
   * .L89 -- bookkeeping after the single-column panel:
   *   LN    : B += K << BASE_SHIFT
   *   LT/RN : B = BO
   *   RN    : KK += 1
   *   RT    : KK -= 1
   */
  2876. .L89:
  2877. #ifdef LN
  2878. dsll TEMP, K, BASE_SHIFT
  2879. daddu B, B, TEMP
  2880. #endif
  2881. #if defined(LT) || defined(RN)
  2882. move B, BO
  2883. #endif
  2884. #ifdef RN
  2885. daddiu KK, KK, 1
  2886. #endif
  2887. #ifdef RT
  2888. daddiu KK, KK, -1
  2889. #endif
  2890. .align 3
  /*
   * .L999 -- common exit: reload $16..$25 and $f24..$f28 (plus $f20..$f23
   * when __64BIT__ is not defined) from the stack frame, return through $31
   * and release the 144-byte frame in the instruction that follows the jump.
   *
   * NOTE(review): the matching stores are in the prologue, outside this
   * chunk -- keep the offsets below in sync with it.
   */
  2891. .L999:
  2892. LDARG $16, 0($sp)
  2893. LDARG $17, 8($sp)
  2894. LDARG $18, 16($sp)
  2895. LDARG $19, 24($sp)
  2896. LDARG $20, 32($sp)
  2897. LDARG $21, 40($sp)
  2898. ldc1 $f24, 48($sp)
  2899. ldc1 $f25, 56($sp)
  2900. ldc1 $f26, 64($sp)
  2901. ldc1 $f27, 72($sp)
  2902. ldc1 $f28, 80($sp)
  2903. LDARG $22, 88($sp)
  2904. LDARG $23, 96($sp)
  2905. LDARG $24, 104($sp)
  2906. LDARG $25, 112($sp)
  2907. #ifndef __64BIT__
  /*
   * NOTE(review): $f20 is reloaded from 112($sp), the same offset used for
   * $25 just above. If the prologue stores both to that slot, one of the two
   * restores reads the other's data -- verify against the prologue.
   */
  2908. ldc1 $f20,112($sp)
  2909. ldc1 $f21,120($sp)
  2910. ldc1 $f22,128($sp)
  2911. ldc1 $f23,136($sp)
  2912. #endif
  2913. j $31
  2914. daddiu $sp, $sp, 144
  2915. EPILOGUE