You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

trsm_kernel_RT.S 59 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274327532763277327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define M $4
  41. #define N $5
  42. #define K $6
  43. #define A $8
  44. #define B $9
  45. #define C $10
  46. #define LDC $11
  47. #define AO $12
  48. #define BO $13
  49. #define I $2
  50. #define J $3
  51. #define L $7
  52. #define CO1 $14
  53. #define CO2 $15
  54. #define CO3 $16
  55. #define CO4 $17
  56. #define CO5 $18
  57. #define CO6 $19
  58. #define CO7 $20
  59. #define CO8 $21
  60. #define OFFSET $22
  61. #define KK $23
  62. #define TEMP $24
  63. #define AORIG $25
  64. #define a1 $f0
  65. #define a2 $f1
  66. #define a3 $f27
  67. #define a4 $f28
  68. #define b1 $f2
  69. #define b2 $f3
  70. #define b3 $f4
  71. #define b4 $f5
  72. #define b5 $f6
  73. #define b6 $f7
  74. #define b7 $f8
  75. #define b8 $f9
  76. #define a5 b8
  77. #define c11 $f10
  78. #define c12 $f11
  79. #define c21 $f12
  80. #define c22 $f13
  81. #define c31 $f14
  82. #define c32 $f16
  83. #define c41 $f17
  84. #define c42 $f18
  85. #define c51 $f19
  86. #define c52 $f20
  87. #define c61 $f21
  88. #define c62 $f22
  89. #define c71 $f23
  90. #define c72 $f24
  91. #define c81 $f25
  92. #define c82 $f26
  93. #define ALPHA $f15
  94. PROLOGUE
  95. daddiu $sp, $sp, -144
  96. SDARG $16, 0($sp)
  97. SDARG $17, 8($sp)
  98. SDARG $18, 16($sp)
  99. SDARG $19, 24($sp)
  100. SDARG $20, 32($sp)
  101. SDARG $21, 40($sp)
  102. sdc1 $f24, 48($sp)
  103. sdc1 $f25, 56($sp)
  104. sdc1 $f26, 64($sp)
  105. sdc1 $f27, 72($sp)
  106. sdc1 $f28, 80($sp)
  107. SDARG $22, 88($sp)
  108. SDARG $23, 96($sp)
  109. SDARG $24, 104($sp)
  110. SDARG $25, 112($sp)
  111. #ifndef __64BIT__
  112. sdc1 $f20,112($sp)
  113. sdc1 $f21,120($sp)
  114. sdc1 $f22,128($sp)
  115. sdc1 $f23,136($sp)
  116. #endif
  117. LDARG OFFSET, 144($sp)
  118. dsll LDC, LDC, BASE_SHIFT
  119. #ifdef LN
  120. mult M, K
  121. mflo TEMP
  122. dsll TEMP, TEMP, BASE_SHIFT
  123. daddu A, A, TEMP
  124. dsll TEMP, M, BASE_SHIFT
  125. daddu C, C, TEMP
  126. #endif
  127. #ifdef RN
  128. neg KK, OFFSET
  129. #endif
  130. #ifdef RT
  131. mult N, K
  132. mflo TEMP
  133. dsll TEMP, TEMP, BASE_SHIFT
  134. daddu B, B, TEMP
  135. mult N, LDC
  136. mflo TEMP
  137. daddu C, C, TEMP
  138. dsubu KK, N, OFFSET
  139. #endif
  140. andi J, N, 1
  141. blez J, .L30
  142. NOP
  143. #ifdef RT
  144. dsll TEMP, K, BASE_SHIFT
  145. dsubu B, B, TEMP
  146. dsubu C, C, LDC
  147. #endif
  148. move AO, A
  149. move CO1, C
  150. #ifdef LN
  151. daddu KK, M, OFFSET
  152. #endif
  153. #ifdef LT
  154. move KK, OFFSET
  155. #endif
  156. #if defined(LN) || defined(RT)
  157. move AORIG, A
  158. #else
  159. move AO, A
  160. #endif
  161. #ifndef RT
  162. daddu C, CO1, LDC
  163. #endif
  164. dsra I, M, 1
  165. blez I, .L80
  166. NOP
  167. .L71:
  168. #if defined(LT) || defined(RN)
  169. LD a1, 0 * SIZE(AO)
  170. MTC $0, c11
  171. LD a2, 1 * SIZE(AO)
  172. MOV c21, c11
  173. LD a5, 4 * SIZE(AO)
  174. LD b1, 0 * SIZE(B)
  175. MOV c12, c11
  176. LD b2, 1 * SIZE(B)
  177. MOV c22, c11
  178. LD b3, 2 * SIZE(B)
  179. LD b5, 4 * SIZE(B)
  180. dsra L, KK, 2
  181. LD b6, 8 * SIZE(B)
  182. LD b7, 12 * SIZE(B)
  183. blez L, .L75
  184. move BO, B
  185. #else
  186. #ifdef LN
  187. dsll TEMP, K, 1 + BASE_SHIFT
  188. dsubu AORIG, AORIG, TEMP
  189. #endif
  190. dsll L, KK, 1 + BASE_SHIFT
  191. dsll TEMP, KK, 0 + BASE_SHIFT
  192. daddu AO, AORIG, L
  193. daddu BO, B, TEMP
  194. dsubu TEMP, K, KK
  195. LD a1, 0 * SIZE(AO)
  196. MTC $0, c11
  197. LD a2, 1 * SIZE(AO)
  198. MOV c21, c11
  199. LD a5, 4 * SIZE(AO)
  200. LD b1, 0 * SIZE(BO)
  201. MOV c12, c11
  202. LD b2, 1 * SIZE(BO)
  203. MOV c22, c11
  204. LD b3, 2 * SIZE(BO)
  205. LD b5, 4 * SIZE(BO)
  206. dsra L, TEMP, 2
  207. LD b6, 8 * SIZE(BO)
  208. LD b7, 12 * SIZE(BO)
  209. blez L, .L75
  210. NOP
  211. #endif
  212. .align 3
  213. .L72:
  214. LD a1, 0 * SIZE(AO)
  215. LD a2, 1 * SIZE(AO)
  216. LD b1, 0 * SIZE(BO)
  217. MADD c11, c11, a1, b1
  218. MADD c12, c12, a2, b1
  219. LD a1, 2 * SIZE(AO)
  220. LD a2, 3 * SIZE(AO)
  221. LD b1, 1 * SIZE(BO)
  222. MADD c11, c11, a1, b1
  223. MADD c12, c12, a2, b1
  224. LD a1, 4 * SIZE(AO)
  225. LD a2, 5 * SIZE(AO)
  226. LD b1, 2 * SIZE(BO)
  227. MADD c11, c11, a1, b1
  228. MADD c12, c12, a2, b1
  229. LD a1, 6 * SIZE(AO)
  230. LD a2, 7 * SIZE(AO)
  231. LD b1, 3 * SIZE(BO)
  232. MADD c11, c11, a1, b1
  233. MADD c12, c12, a2, b1
  234. daddiu L, L, -1
  235. daddiu AO, AO, 8 * SIZE
  236. bgtz L, .L72
  237. daddiu BO, BO, 4 * SIZE
  238. .align 3
  239. .L75:
  240. #if defined(LT) || defined(RN)
  241. andi L, KK, 3
  242. #else
  243. andi L, TEMP, 3
  244. #endif
  245. NOP
  246. blez L, .L78
  247. NOP
  248. .align 3
  249. .L76:
  250. LD a1, 0 * SIZE(AO)
  251. LD a2, 1 * SIZE(AO)
  252. LD b1, 0 * SIZE(BO)
  253. MADD c11, c11, a1, b1
  254. MADD c12, c12, a2, b1
  255. daddiu L, L, -1
  256. daddiu AO, AO, 2 * SIZE
  257. bgtz L, .L76
  258. daddiu BO, BO, 1 * SIZE
  259. .L78:
  260. ADD c11, c11, c21
  261. ADD c12, c12, c22
  262. #if defined(LN) || defined(RT)
  263. #ifdef LN
  264. daddiu TEMP, KK, -2
  265. #else
  266. daddiu TEMP, KK, -1
  267. #endif
  268. dsll L, TEMP, 1 + BASE_SHIFT
  269. dsll TEMP, TEMP, 0 + BASE_SHIFT
  270. daddu AO, AORIG, L
  271. daddu BO, B, TEMP
  272. #endif
  273. #if defined(LN) || defined(LT)
  274. LD b1, 0 * SIZE(BO)
  275. LD b2, 1 * SIZE(BO)
  276. SUB c11, b1, c11
  277. SUB c12, b2, c12
  278. #else
  279. LD b1, 0 * SIZE(AO)
  280. LD b2, 1 * SIZE(AO)
  281. SUB c11, b1, c11
  282. SUB c12, b2, c12
  283. #endif
  284. #ifdef LN
  285. LD b1, 3 * SIZE(AO)
  286. LD b2, 2 * SIZE(AO)
  287. LD b3, 0 * SIZE(AO)
  288. MUL c12, b1, c12
  289. NMSUB c11, c11, b2, c12
  290. MUL c11, b3, c11
  291. #endif
  292. #ifdef LT
  293. LD b1, 0 * SIZE(AO)
  294. LD b2, 1 * SIZE(AO)
  295. LD b3, 3 * SIZE(AO)
  296. MUL c11, b1, c11
  297. NMSUB c12, c12, b2, c11
  298. MUL c12, b3, c12
  299. #endif
  300. #if defined(RN) || defined(RT)
  301. LD b1, 0 * SIZE(BO)
  302. MUL c11, b1, c11
  303. MUL c12, b1, c12
  304. #endif
  305. #ifdef LN
  306. daddiu CO1, CO1, -2 * SIZE
  307. #endif
  308. #if defined(LN) || defined(LT)
  309. ST c11, 0 * SIZE(BO)
  310. ST c12, 1 * SIZE(BO)
  311. #else
  312. ST c11, 0 * SIZE(AO)
  313. ST c12, 1 * SIZE(AO)
  314. #endif
  315. ST c11, 0 * SIZE(CO1)
  316. ST c12, 1 * SIZE(CO1)
  317. #ifndef LN
  318. daddiu CO1, CO1, 2 * SIZE
  319. #endif
  320. #ifdef RT
  321. dsll TEMP, K, 1 + BASE_SHIFT
  322. daddu AORIG, AORIG, TEMP
  323. #endif
  324. #if defined(LT) || defined(RN)
  325. dsubu TEMP, K, KK
  326. dsll L, TEMP, 1 + BASE_SHIFT
  327. dsll TEMP, TEMP, 0 + BASE_SHIFT
  328. daddu AO, AO, L
  329. daddu BO, BO, TEMP
  330. #endif
  331. #ifdef LT
  332. daddiu KK, KK, 2
  333. #endif
  334. #ifdef LN
  335. daddiu KK, KK, -2
  336. #endif
  337. daddiu I, I, -1
  338. bgtz I, .L71
  339. NOP
  340. .align 3
  341. .L80:
  342. andi I, M, 1
  343. blez I, .L89
  344. NOP
  345. #if defined(LT) || defined(RN)
  346. LD a1, 0 * SIZE(AO)
  347. MTC $0, c11
  348. LD a2, 1 * SIZE(AO)
  349. LD a3, 2 * SIZE(AO)
  350. LD a4, 3 * SIZE(AO)
  351. LD b1, 0 * SIZE(B)
  352. LD b2, 1 * SIZE(B)
  353. MOV c21, c11
  354. LD b3, 2 * SIZE(B)
  355. LD b4, 3 * SIZE(B)
  356. LD b5, 4 * SIZE(B)
  357. LD b6, 8 * SIZE(B)
  358. LD b7, 12 * SIZE(B)
  359. dsra L, KK, 2
  360. blez L, .L85
  361. move BO, B
  362. #else
  363. #ifdef LN
  364. dsll TEMP, K, BASE_SHIFT
  365. dsubu AORIG, AORIG, TEMP
  366. #endif
  367. dsll TEMP, KK, BASE_SHIFT
  368. daddu AO, AORIG, TEMP
  369. daddu BO, B, TEMP
  370. dsubu TEMP, K, KK
  371. LD a1, 0 * SIZE(AO)
  372. MTC $0, c11
  373. LD a2, 1 * SIZE(AO)
  374. LD a3, 2 * SIZE(AO)
  375. LD a4, 3 * SIZE(AO)
  376. LD b1, 0 * SIZE(BO)
  377. LD b2, 1 * SIZE(BO)
  378. LD b3, 2 * SIZE(BO)
  379. LD b4, 3 * SIZE(BO)
  380. MOV c21, c11
  381. LD b5, 4 * SIZE(BO)
  382. LD b6, 8 * SIZE(BO)
  383. LD b7, 12 * SIZE(BO)
  384. dsra L, TEMP, 2
  385. blez L, .L85
  386. NOP
  387. #endif
  388. .align 3
  389. .L82:
  390. LD a1, 0 * SIZE(AO)
  391. LD b1, 0 * SIZE(BO)
  392. MADD c11, c11, a1, b1
  393. LD a1, 1 * SIZE(AO)
  394. LD b1, 1 * SIZE(BO)
  395. MADD c21, c21, a1, b1
  396. LD a1, 2 * SIZE(AO)
  397. LD b1, 2 * SIZE(BO)
  398. MADD c11, c11, a1, b1
  399. LD a1, 3 * SIZE(AO)
  400. LD b1, 3 * SIZE(BO)
  401. MADD c21, c21, a1, b1
  402. daddiu L, L, -1
  403. daddiu AO, AO, 4 * SIZE
  404. bgtz L, .L82
  405. daddiu BO, BO, 4 * SIZE
  406. .align 3
  407. .L85:
  408. #if defined(LT) || defined(RN)
  409. andi L, KK, 3
  410. #else
  411. andi L, TEMP, 3
  412. #endif
  413. NOP
  414. blez L, .L88
  415. NOP
  416. .align 3
  417. .L86:
  418. LD a1, 0 * SIZE(AO)
  419. LD b1, 0 * SIZE(BO)
  420. MADD c11, c11, a1, b1
  421. daddiu L, L, -1
  422. daddiu AO, AO, 1 * SIZE
  423. bgtz L, .L86
  424. daddiu BO, BO, 1 * SIZE
  425. .L88:
  426. ADD c11, c11, c21
  427. #if defined(LN) || defined(RT)
  428. #ifdef LN
  429. daddiu TEMP, KK, -1
  430. #else
  431. daddiu TEMP, KK, -1
  432. #endif
  433. dsll TEMP, TEMP, 0 + BASE_SHIFT
  434. daddu AO, AORIG, TEMP
  435. daddu BO, B, TEMP
  436. #endif
  437. #if defined(LN) || defined(LT)
  438. LD b1, 0 * SIZE(BO)
  439. SUB c11, b1, c11
  440. #else
  441. LD b1, 0 * SIZE(AO)
  442. SUB c11, b1, c11
  443. #endif
  444. #if defined(LN) || defined(LT)
  445. LD b1, 0 * SIZE(AO)
  446. MUL c11, b1, c11
  447. #endif
  448. #if defined(RN) || defined(RT)
  449. LD b1, 0 * SIZE(BO)
  450. MUL c11, b1, c11
  451. #endif
  452. #ifdef LN
  453. daddiu CO1, CO1, -1 * SIZE
  454. #endif
  455. #if defined(LN) || defined(LT)
  456. ST c11, 0 * SIZE(BO)
  457. #else
  458. ST c11, 0 * SIZE(AO)
  459. #endif
  460. ST c11, 0 * SIZE(CO1)
  461. #ifndef LN
  462. daddiu CO1, CO1, 1 * SIZE
  463. #endif
  464. #ifdef RT
  465. dsll TEMP, K, BASE_SHIFT
  466. daddu AORIG, AORIG, TEMP
  467. #endif
  468. #if defined(LT) || defined(RN)
  469. dsubu TEMP, K, KK
  470. dsll TEMP, TEMP, 0 + BASE_SHIFT
  471. daddu AO, AO, TEMP
  472. daddu BO, BO, TEMP
  473. #endif
  474. #ifdef LT
  475. daddiu KK, KK, 1
  476. #endif
  477. #ifdef LN
  478. daddiu KK, KK, -1
  479. #endif
  480. .align 3
  481. .L89:
  482. #ifdef LN
  483. dsll TEMP, K, BASE_SHIFT
  484. daddu B, B, TEMP
  485. #endif
  486. #if defined(LT) || defined(RN)
  487. move B, BO
  488. #endif
  489. #ifdef RN
  490. daddiu KK, KK, 1
  491. #endif
  492. #ifdef RT
  493. daddiu KK, KK, -1
  494. #endif
  495. .align 3
  496. .L30:
  497. andi J, N, 2
  498. blez J, .L50
  499. NOP
  500. #ifdef RT
  501. dsll TEMP, K, 1 + BASE_SHIFT
  502. dsubu B, B, TEMP
  503. dsll TEMP, LDC, 1
  504. dsubu C, C, TEMP
  505. #endif
  506. move AO, A
  507. move CO1, C
  508. daddu CO2, C, LDC
  509. #ifdef LN
  510. daddu KK, M, OFFSET
  511. #endif
  512. #ifdef LT
  513. move KK, OFFSET
  514. #endif
  515. #if defined(LN) || defined(RT)
  516. move AORIG, A
  517. #else
  518. move AO, A
  519. #endif
  520. #ifndef RT
  521. daddu C, CO2, LDC
  522. #endif
  523. dsra I, M, 1
  524. blez I, .L60
  525. NOP
  526. .L51:
  527. #if defined(LT) || defined(RN)
  528. LD a1, 0 * SIZE(AO)
  529. MTC $0, c11
  530. LD a2, 1 * SIZE(AO)
  531. MOV c21, c11
  532. LD a5, 4 * SIZE(AO)
  533. LD b1, 0 * SIZE(B)
  534. MOV c12, c11
  535. LD b2, 1 * SIZE(B)
  536. MOV c22, c11
  537. LD b3, 2 * SIZE(B)
  538. LD b5, 4 * SIZE(B)
  539. dsra L, KK, 2
  540. LD b6, 8 * SIZE(B)
  541. LD b7, 12 * SIZE(B)
  542. blez L, .L55
  543. move BO, B
  544. #else
  545. #ifdef LN
  546. dsll TEMP, K, 1 + BASE_SHIFT
  547. dsubu AORIG, AORIG, TEMP
  548. #endif
  549. dsll L, KK, 1 + BASE_SHIFT
  550. dsll TEMP, KK, 1 + BASE_SHIFT
  551. daddu AO, AORIG, L
  552. daddu BO, B, TEMP
  553. dsubu TEMP, K, KK
  554. LD a1, 0 * SIZE(AO)
  555. MTC $0, c11
  556. LD a2, 1 * SIZE(AO)
  557. MOV c21, c11
  558. LD a5, 4 * SIZE(AO)
  559. LD b1, 0 * SIZE(BO)
  560. MOV c12, c11
  561. LD b2, 1 * SIZE(BO)
  562. MOV c22, c11
  563. LD b3, 2 * SIZE(BO)
  564. LD b5, 4 * SIZE(BO)
  565. dsra L, TEMP, 2
  566. LD b6, 8 * SIZE(BO)
  567. LD b7, 12 * SIZE(BO)
  568. blez L, .L55
  569. NOP
  570. #endif
  571. .align 3
  572. .L52:
  573. MADD c11, c11, a1, b1
  574. LD a3, 2 * SIZE(AO)
  575. MADD c21, c21, a1, b2
  576. LD b4, 3 * SIZE(BO)
  577. MADD c12, c12, a2, b1
  578. LD a4, 3 * SIZE(AO)
  579. MADD c22, c22, a2, b2
  580. LD b1, 8 * SIZE(BO)
  581. MADD c11, c11, a3, b3
  582. LD a1, 8 * SIZE(AO)
  583. MADD c21, c21, a3, b4
  584. LD b2, 5 * SIZE(BO)
  585. MADD c12, c12, a4, b3
  586. LD a2, 5 * SIZE(AO)
  587. MADD c22, c22, a4, b4
  588. LD b3, 6 * SIZE(BO)
  589. MADD c11, c11, a5, b5
  590. LD a3, 6 * SIZE(AO)
  591. MADD c21, c21, a5, b2
  592. LD b4, 7 * SIZE(BO)
  593. MADD c12, c12, a2, b5
  594. LD a4, 7 * SIZE(AO)
  595. MADD c22, c22, a2, b2
  596. LD b5, 12 * SIZE(BO)
  597. MADD c11, c11, a3, b3
  598. LD a5, 12 * SIZE(AO)
  599. MADD c21, c21, a3, b4
  600. LD b2, 9 * SIZE(BO)
  601. MADD c12, c12, a4, b3
  602. LD a2, 9 * SIZE(AO)
  603. MADD c22, c22, a4, b4
  604. LD b3, 10 * SIZE(BO)
  605. daddiu AO, AO, 8 * SIZE
  606. daddiu L, L, -1
  607. bgtz L, .L52
  608. daddiu BO, BO, 8 * SIZE
  609. .align 3
  610. .L55:
  611. #if defined(LT) || defined(RN)
  612. andi L, KK, 3
  613. #else
  614. andi L, TEMP, 3
  615. #endif
  616. NOP
  617. blez L, .L58
  618. NOP
  619. .align 3
  620. .L56:
  621. MADD c11, c11, a1, b1
  622. LD a2, 1 * SIZE(AO)
  623. MADD c21, c21, a1, b2
  624. LD a1, 2 * SIZE(AO)
  625. MADD c12, c12, a2, b1
  626. LD b1, 2 * SIZE(BO)
  627. MADD c22, c22, a2, b2
  628. LD b2, 3 * SIZE(BO)
  629. daddiu L, L, -1
  630. daddiu AO, AO, 2 * SIZE
  631. bgtz L, .L56
  632. daddiu BO, BO, 2 * SIZE
  633. .L58:
  634. #if defined(LN) || defined(RT)
  635. #ifdef LN
  636. daddiu TEMP, KK, -2
  637. #else
  638. daddiu TEMP, KK, -2
  639. #endif
  640. dsll L, TEMP, 1 + BASE_SHIFT
  641. dsll TEMP, TEMP, 1 + BASE_SHIFT
  642. daddu AO, AORIG, L
  643. daddu BO, B, TEMP
  644. #endif
  645. #if defined(LN) || defined(LT)
  646. LD b1, 0 * SIZE(BO)
  647. LD b2, 1 * SIZE(BO)
  648. LD b3, 2 * SIZE(BO)
  649. LD b4, 3 * SIZE(BO)
  650. SUB c11, b1, c11
  651. SUB c21, b2, c21
  652. SUB c12, b3, c12
  653. SUB c22, b4, c22
  654. #else
  655. LD b1, 0 * SIZE(AO)
  656. LD b2, 1 * SIZE(AO)
  657. LD b3, 2 * SIZE(AO)
  658. LD b4, 3 * SIZE(AO)
  659. SUB c11, b1, c11
  660. SUB c12, b2, c12
  661. SUB c21, b3, c21
  662. SUB c22, b4, c22
  663. #endif
  664. #ifdef LN
  665. LD b1, 3 * SIZE(AO)
  666. LD b2, 2 * SIZE(AO)
  667. LD b3, 0 * SIZE(AO)
  668. MUL c12, b1, c12
  669. MUL c22, b1, c22
  670. NMSUB c11, c11, b2, c12
  671. NMSUB c21, c21, b2, c22
  672. MUL c11, b3, c11
  673. MUL c21, b3, c21
  674. #endif
  675. #ifdef LT
  676. LD b1, 0 * SIZE(AO)
  677. LD b2, 1 * SIZE(AO)
  678. LD b3, 3 * SIZE(AO)
  679. MUL c11, b1, c11
  680. MUL c21, b1, c21
  681. NMSUB c12, c12, b2, c11
  682. NMSUB c22, c22, b2, c21
  683. MUL c12, b3, c12
  684. MUL c22, b3, c22
  685. #endif
  686. #ifdef RN
  687. LD b1, 0 * SIZE(BO)
  688. LD b2, 1 * SIZE(BO)
  689. LD b3, 3 * SIZE(BO)
  690. MUL c11, b1, c11
  691. MUL c12, b1, c12
  692. NMSUB c21, c21, b2, c11
  693. NMSUB c22, c22, b2, c12
  694. MUL c21, b3, c21
  695. MUL c22, b3, c22
  696. #endif
  697. #ifdef RT
  698. LD b1, 3 * SIZE(BO)
  699. LD b2, 2 * SIZE(BO)
  700. LD b3, 0 * SIZE(BO)
  701. MUL c21, b1, c21
  702. MUL c22, b1, c22
  703. NMSUB c11, c11, b2, c21
  704. NMSUB c12, c12, b2, c22
  705. MUL c11, b3, c11
  706. MUL c12, b3, c12
  707. #endif
  708. #ifdef LN
  709. daddiu CO1, CO1, -2 * SIZE
  710. daddiu CO2, CO2, -2 * SIZE
  711. #endif
  712. #if defined(LN) || defined(LT)
  713. ST c11, 0 * SIZE(BO)
  714. ST c21, 1 * SIZE(BO)
  715. ST c12, 2 * SIZE(BO)
  716. ST c22, 3 * SIZE(BO)
  717. #else
  718. ST c11, 0 * SIZE(AO)
  719. ST c12, 1 * SIZE(AO)
  720. ST c21, 2 * SIZE(AO)
  721. ST c22, 3 * SIZE(AO)
  722. #endif
  723. ST c11, 0 * SIZE(CO1)
  724. ST c12, 1 * SIZE(CO1)
  725. ST c21, 0 * SIZE(CO2)
  726. ST c22, 1 * SIZE(CO2)
  727. #ifndef LN
  728. daddiu CO1, CO1, 2 * SIZE
  729. daddiu CO2, CO2, 2 * SIZE
  730. #endif
  731. #ifdef RT
  732. dsll TEMP, K, 1 + BASE_SHIFT
  733. daddu AORIG, AORIG, TEMP
  734. #endif
  735. #if defined(LT) || defined(RN)
  736. dsubu TEMP, K, KK
  737. dsll TEMP, TEMP, 1 + BASE_SHIFT
  738. daddu AO, AO, TEMP
  739. daddu BO, BO, TEMP
  740. #endif
  741. #ifdef LT
  742. daddiu KK, KK, 2
  743. #endif
  744. #ifdef LN
  745. daddiu KK, KK, -2
  746. #endif
  747. MTC $0, a1
  748. MOV c11, a1
  749. MOV c21, a1
  750. MOV c31, a1
  751. daddiu I, I, -1
  752. bgtz I, .L51
  753. MOV c41, c11
  754. .align 3
  755. .L60:
  756. andi I, M, 1
  757. blez I, .L69
  758. NOP
  759. #if defined(LT) || defined(RN)
  760. dsra L, KK, 2
  761. LD a1, 0 * SIZE(AO)
  762. MTC $0, c11
  763. LD a2, 1 * SIZE(AO)
  764. MOV c21, c11
  765. LD a3, 2 * SIZE(AO)
  766. MOV c31, c11
  767. LD a4, 3 * SIZE(AO)
  768. MOV c41, c11
  769. LD b1, 0 * SIZE(B)
  770. LD b2, 1 * SIZE(B)
  771. LD b3, 2 * SIZE(B)
  772. LD b4, 3 * SIZE(B)
  773. LD b5, 4 * SIZE(B)
  774. LD b6, 8 * SIZE(B)
  775. LD b7, 12 * SIZE(B)
  776. blez L, .L65
  777. move BO, B
  778. #else
  779. #ifdef LN
  780. dsll TEMP, K, BASE_SHIFT
  781. dsubu AORIG, AORIG, TEMP
  782. #endif
  783. dsll L, KK, 0 + BASE_SHIFT
  784. dsll TEMP, KK, 1 + BASE_SHIFT
  785. daddu AO, AORIG, L
  786. daddu BO, B, TEMP
  787. dsubu TEMP, K, KK
  788. dsra L, TEMP, 2
  789. LD a1, 0 * SIZE(AO)
  790. MTC $0, c11
  791. LD a2, 1 * SIZE(AO)
  792. MOV c21, c11
  793. LD a3, 2 * SIZE(AO)
  794. MOV c31, c11
  795. LD a4, 3 * SIZE(AO)
  796. MOV c41, c11
  797. LD b1, 0 * SIZE(BO)
  798. LD b2, 1 * SIZE(BO)
  799. LD b3, 2 * SIZE(BO)
  800. LD b4, 3 * SIZE(BO)
  801. LD b5, 4 * SIZE(BO)
  802. LD b6, 8 * SIZE(BO)
  803. LD b7, 12 * SIZE(BO)
  804. blez L, .L65
  805. NOP
  806. #endif
  807. .align 3
  808. .L62:
  809. MADD c11, c11, a1, b1
  810. LD b1, 4 * SIZE(BO)
  811. MADD c21, c21, a1, b2
  812. LD b2, 5 * SIZE(BO)
  813. MADD c31, c31, a2, b3
  814. LD b3, 6 * SIZE(BO)
  815. MADD c41, c41, a2, b4
  816. LD b4, 7 * SIZE(BO)
  817. LD a1, 4 * SIZE(AO)
  818. LD a2, 5 * SIZE(AO)
  819. MADD c11, c11, a3, b1
  820. LD b1, 8 * SIZE(BO)
  821. MADD c21, c21, a3, b2
  822. LD b2, 9 * SIZE(BO)
  823. MADD c31, c31, a4, b3
  824. LD b3, 10 * SIZE(BO)
  825. MADD c41, c41, a4, b4
  826. LD b4, 11 * SIZE(BO)
  827. LD a3, 6 * SIZE(AO)
  828. LD a4, 7 * SIZE(AO)
  829. daddiu L, L, -1
  830. daddiu AO, AO, 4 * SIZE
  831. bgtz L, .L62
  832. daddiu BO, BO, 8 * SIZE
  833. .align 3
  834. .L65:
  835. #if defined(LT) || defined(RN)
  836. andi L, KK, 3
  837. #else
  838. andi L, TEMP, 3
  839. #endif
  840. NOP
  841. blez L, .L68
  842. NOP
  843. .align 3
  844. .L66:
  845. MADD c11, c11, a1, b1
  846. LD b1, 2 * SIZE(BO)
  847. MADD c21, c21, a1, b2
  848. LD b2, 3 * SIZE(BO)
  849. LD a1, 1 * SIZE(AO)
  850. daddiu L, L, -1
  851. daddiu AO, AO, 1 * SIZE
  852. bgtz L, .L66
  853. daddiu BO, BO, 2 * SIZE
  854. .L68:
  855. ADD c11, c11, c31
  856. ADD c21, c21, c41
  857. #if defined(LN) || defined(RT)
  858. #ifdef LN
  859. daddiu TEMP, KK, -1
  860. #else
  861. daddiu TEMP, KK, -2
  862. #endif
  863. dsll L, TEMP, 0 + BASE_SHIFT
  864. dsll TEMP, TEMP, 1 + BASE_SHIFT
  865. daddu AO, AORIG, L
  866. daddu BO, B, TEMP
  867. #endif
  868. #if defined(LN) || defined(LT)
  869. LD b1, 0 * SIZE(BO)
  870. LD b2, 1 * SIZE(BO)
  871. SUB c11, b1, c11
  872. SUB c21, b2, c21
  873. #else
  874. LD b1, 0 * SIZE(AO)
  875. LD b2, 1 * SIZE(AO)
  876. SUB c11, b1, c11
  877. SUB c21, b2, c21
  878. #endif
  879. #if defined(LN) || defined(LT)
  880. LD b3, 0 * SIZE(AO)
  881. MUL c11, b3, c11
  882. MUL c21, b3, c21
  883. #endif
  884. #ifdef RN
  885. LD b1, 0 * SIZE(BO)
  886. LD b2, 1 * SIZE(BO)
  887. LD b3, 3 * SIZE(BO)
  888. MUL c11, b1, c11
  889. NMSUB c21, c21, b2, c11
  890. MUL c21, b3, c21
  891. #endif
  892. #ifdef RT
  893. LD b1, 3 * SIZE(BO)
  894. LD b2, 2 * SIZE(BO)
  895. LD b3, 0 * SIZE(BO)
  896. MUL c21, b1, c21
  897. NMSUB c11, c11, b2, c21
  898. MUL c11, b3, c11
  899. #endif
  900. #ifdef LN
  901. daddiu CO1, CO1, -1 * SIZE
  902. daddiu CO2, CO2, -1 * SIZE
  903. #endif
  904. #if defined(LN) || defined(LT)
  905. ST c11, 0 * SIZE(BO)
  906. ST c21, 1 * SIZE(BO)
  907. #else
  908. ST c11, 0 * SIZE(AO)
  909. ST c21, 1 * SIZE(AO)
  910. #endif
  911. ST c11, 0 * SIZE(CO1)
  912. ST c21, 0 * SIZE(CO2)
  913. #ifndef LN
  914. daddiu CO1, CO1, 1 * SIZE
  915. daddiu CO2, CO2, 1 * SIZE
  916. #endif
  917. #ifdef RT
  918. dsll TEMP, K, 0 + BASE_SHIFT
  919. daddu AORIG, AORIG, TEMP
  920. #endif
  921. #if defined(LT) || defined(RN)
  922. dsubu TEMP, K, KK
  923. dsll L, TEMP, 0 + BASE_SHIFT
  924. dsll TEMP, TEMP, 1 + BASE_SHIFT
  925. daddu AO, AO, L
  926. daddu BO, BO, TEMP
  927. #endif
  928. #ifdef LT
  929. daddiu KK, KK, 1
  930. #endif
  931. #ifdef LN
  932. daddiu KK, KK, -1
  933. #endif
  934. .align 3
  935. .L69:
  936. #ifdef LN
  937. dsll TEMP, K, 1 + BASE_SHIFT
  938. daddu B, B, TEMP
  939. #endif
  940. #if defined(LT) || defined(RN)
  941. move B, BO
  942. #endif
  943. #ifdef RN
  944. daddiu KK, KK, 2
  945. #endif
  946. #ifdef RT
  947. daddiu KK, KK, -2
  948. #endif
  949. .align 3
  950. .L50:
  951. andi J, N, 4
  952. blez J, .L70
  953. move AO, A
  954. #ifdef RT
  955. dsll TEMP, K, 2 + BASE_SHIFT
  956. dsubu B, B, TEMP
  957. dsll TEMP, LDC, 2
  958. dsubu C, C, TEMP
  959. #endif
  960. move CO1, C
  961. MTC $0, c11
  962. daddu CO2, C, LDC
  963. daddu CO3, CO2, LDC
  964. daddu CO4, CO3, LDC
  965. MOV c21, c11
  966. dsra I, M, 1
  967. MOV c31, c11
  968. #ifdef LN
  969. daddu KK, M, OFFSET
  970. #endif
  971. #ifdef LT
  972. move KK, OFFSET
  973. #endif
  974. #if defined(LN) || defined(RT)
  975. move AORIG, A
  976. #else
  977. move AO, A
  978. #endif
  979. #ifndef RT
  980. daddu C, CO4, LDC
  981. #endif
  982. blez I, .L40
  983. MOV c41, c11
  984. .L31:
  985. #if defined(LT) || defined(RN)
  986. LD a1, 0 * SIZE(AO)
  987. LD a3, 4 * SIZE(AO)
  988. LD b1, 0 * SIZE(B)
  989. MOV c12, c11
  990. LD b2, 1 * SIZE(B)
  991. MOV c22, c11
  992. LD b3, 2 * SIZE(B)
  993. MOV c32, c11
  994. LD b4, 3 * SIZE(B)
  995. MOV c42, c11
  996. LD b5, 4 * SIZE(B)
  997. dsra L, KK, 2
  998. LD b6, 8 * SIZE(B)
  999. LD b7, 12 * SIZE(B)
  1000. blez L, .L35
  1001. move BO, B
  1002. #else
  1003. #ifdef LN
  1004. dsll TEMP, K, 1 + BASE_SHIFT
  1005. dsubu AORIG, AORIG, TEMP
  1006. #endif
  1007. dsll L, KK, 1 + BASE_SHIFT
  1008. dsll TEMP, KK, 2 + BASE_SHIFT
  1009. daddu AO, AORIG, L
  1010. daddu BO, B, TEMP
  1011. dsubu TEMP, K, KK
  1012. LD a1, 0 * SIZE(AO)
  1013. LD a3, 4 * SIZE(AO)
  1014. LD b1, 0 * SIZE(BO)
  1015. MOV c12, c11
  1016. LD b2, 1 * SIZE(BO)
  1017. MOV c22, c11
  1018. LD b3, 2 * SIZE(BO)
  1019. MOV c32, c11
  1020. LD b4, 3 * SIZE(BO)
  1021. MOV c42, c11
  1022. LD b5, 4 * SIZE(BO)
  1023. dsra L, TEMP, 2
  1024. LD b6, 8 * SIZE(BO)
  1025. LD b7, 12 * SIZE(BO)
  1026. blez L, .L35
  1027. NOP
  1028. #endif
  1029. .align 3
  1030. .L32:
  1031. MADD c11, c11, a1, b1
  1032. LD a2, 1 * SIZE(AO)
  1033. MADD c21, c21, a1, b2
  1034. daddiu L, L, -1
  1035. MADD c31, c31, a1, b3
  1036. NOP
  1037. MADD c41, c41, a1, b4
  1038. LD a1, 2 * SIZE(AO)
  1039. MADD c12, c12, a2, b1
  1040. LD b1, 16 * SIZE(BO)
  1041. MADD c22, c22, a2, b2
  1042. LD b2, 5 * SIZE(BO)
  1043. MADD c32, c32, a2, b3
  1044. LD b3, 6 * SIZE(BO)
  1045. MADD c42, c42, a2, b4
  1046. LD b4, 7 * SIZE(BO)
  1047. MADD c11, c11, a1, b5
  1048. LD a2, 3 * SIZE(AO)
  1049. MADD c21, c21, a1, b2
  1050. NOP
  1051. MADD c31, c31, a1, b3
  1052. NOP
  1053. MADD c41, c41, a1, b4
  1054. LD a1, 8 * SIZE(AO)
  1055. MADD c12, c12, a2, b5
  1056. LD b5, 20 * SIZE(BO)
  1057. MADD c22, c22, a2, b2
  1058. LD b2, 9 * SIZE(BO)
  1059. MADD c32, c32, a2, b3
  1060. LD b3, 10 * SIZE(BO)
  1061. MADD c42, c42, a2, b4
  1062. LD b4, 11 * SIZE(BO)
  1063. MADD c11, c11, a3, b6
  1064. LD a2, 5 * SIZE(AO)
  1065. MADD c21, c21, a3, b2
  1066. NOP
  1067. MADD c31, c31, a3, b3
  1068. NOP
  1069. MADD c41, c41, a3, b4
  1070. LD a3, 6 * SIZE(AO)
  1071. MADD c12, c12, a2, b6
  1072. LD b6, 24 * SIZE(BO)
  1073. MADD c22, c22, a2, b2
  1074. LD b2, 13 * SIZE(BO)
  1075. MADD c32, c32, a2, b3
  1076. LD b3, 14 * SIZE(BO)
  1077. MADD c42, c42, a2, b4
  1078. LD b4, 15 * SIZE(BO)
  1079. MADD c11, c11, a3, b7
  1080. LD a2, 7 * SIZE(AO)
  1081. MADD c21, c21, a3, b2
  1082. daddiu AO, AO, 8 * SIZE
  1083. MADD c31, c31, a3, b3
  1084. daddiu BO, BO, 16 * SIZE
  1085. MADD c41, c41, a3, b4
  1086. LD a3, 4 * SIZE(AO)
  1087. MADD c12, c12, a2, b7
  1088. LD b7, 12 * SIZE(BO)
  1089. MADD c22, c22, a2, b2
  1090. LD b2, 1 * SIZE(BO)
  1091. MADD c32, c32, a2, b3
  1092. LD b3, 2 * SIZE(BO)
  1093. MADD c42, c42, a2, b4
  1094. NOP
  1095. bgtz L, .L32
  1096. LD b4, 3 * SIZE(BO)
  1097. .align 3
  1098. .L35:
  1099. #if defined(LT) || defined(RN)
  1100. andi L, KK, 3
  1101. #else
  1102. andi L, TEMP, 3
  1103. #endif
  1104. NOP
  1105. blez L, .L38
  1106. NOP
  1107. .align 3
  1108. .L36:
  1109. MADD c11, c11, a1, b1
  1110. LD a2, 1 * SIZE(AO)
  1111. MADD c21, c21, a1, b2
  1112. daddiu L, L, -1
  1113. MADD c31, c31, a1, b3
  1114. daddiu AO, AO, 2 * SIZE
  1115. MADD c41, c41, a1, b4
  1116. LD a1, 0 * SIZE(AO)
  1117. MADD c12, c12, a2, b1
  1118. LD b1, 4 * SIZE(BO)
  1119. MADD c22, c22, a2, b2
  1120. LD b2, 5 * SIZE(BO)
  1121. MADD c32, c32, a2, b3
  1122. LD b3, 6 * SIZE(BO)
  1123. MADD c42, c42, a2, b4
  1124. LD b4, 7 * SIZE(BO)
  1125. bgtz L, .L36
  1126. daddiu BO, BO, 4 * SIZE
  1127. .L38:
  1128. #if defined(LN) || defined(RT)
  1129. #ifdef LN
  1130. daddiu TEMP, KK, -2
  1131. #else
  1132. daddiu TEMP, KK, -4
  1133. #endif
  1134. dsll L, TEMP, 1 + BASE_SHIFT
  1135. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1136. daddu AO, AORIG, L
  1137. daddu BO, B, TEMP
  1138. #endif
  1139. #if defined(LN) || defined(LT)
  1140. LD b1, 0 * SIZE(BO)
  1141. LD b2, 1 * SIZE(BO)
  1142. LD b3, 2 * SIZE(BO)
  1143. LD b4, 3 * SIZE(BO)
  1144. LD b5, 4 * SIZE(BO)
  1145. LD b6, 5 * SIZE(BO)
  1146. LD b7, 6 * SIZE(BO)
  1147. LD b8, 7 * SIZE(BO)
  1148. SUB c11, b1, c11
  1149. SUB c21, b2, c21
  1150. SUB c31, b3, c31
  1151. SUB c41, b4, c41
  1152. SUB c12, b5, c12
  1153. SUB c22, b6, c22
  1154. SUB c32, b7, c32
  1155. SUB c42, b8, c42
  1156. #else
  1157. LD b1, 0 * SIZE(AO)
  1158. LD b2, 1 * SIZE(AO)
  1159. LD b3, 2 * SIZE(AO)
  1160. LD b4, 3 * SIZE(AO)
  1161. LD b5, 4 * SIZE(AO)
  1162. LD b6, 5 * SIZE(AO)
  1163. LD b7, 6 * SIZE(AO)
  1164. LD b8, 7 * SIZE(AO)
  1165. SUB c11, b1, c11
  1166. SUB c12, b2, c12
  1167. SUB c21, b3, c21
  1168. SUB c22, b4, c22
  1169. SUB c31, b5, c31
  1170. SUB c32, b6, c32
  1171. SUB c41, b7, c41
  1172. SUB c42, b8, c42
  1173. #endif
  1174. #ifdef LN
  1175. LD b1, 3 * SIZE(AO)
  1176. LD b2, 2 * SIZE(AO)
  1177. LD b3, 0 * SIZE(AO)
  1178. MUL c12, b1, c12
  1179. MUL c22, b1, c22
  1180. MUL c32, b1, c32
  1181. MUL c42, b1, c42
  1182. NMSUB c11, c11, b2, c12
  1183. NMSUB c21, c21, b2, c22
  1184. NMSUB c31, c31, b2, c32
  1185. NMSUB c41, c41, b2, c42
  1186. MUL c11, b3, c11
  1187. MUL c21, b3, c21
  1188. MUL c31, b3, c31
  1189. MUL c41, b3, c41
  1190. #endif
  1191. #ifdef LT
  1192. LD b1, 0 * SIZE(AO)
  1193. LD b2, 1 * SIZE(AO)
  1194. LD b3, 3 * SIZE(AO)
  1195. MUL c11, b1, c11
  1196. MUL c21, b1, c21
  1197. MUL c31, b1, c31
  1198. MUL c41, b1, c41
  1199. NMSUB c12, c12, b2, c11
  1200. NMSUB c22, c22, b2, c21
  1201. NMSUB c32, c32, b2, c31
  1202. NMSUB c42, c42, b2, c41
  1203. MUL c12, b3, c12
  1204. MUL c22, b3, c22
  1205. MUL c32, b3, c32
  1206. MUL c42, b3, c42
  1207. #endif
  1208. #ifdef RN
  1209. LD b1, 0 * SIZE(BO)
  1210. LD b2, 1 * SIZE(BO)
  1211. LD b3, 2 * SIZE(BO)
  1212. LD b4, 3 * SIZE(BO)
  1213. MUL c11, b1, c11
  1214. MUL c12, b1, c12
  1215. NMSUB c21, c21, b2, c11
  1216. NMSUB c22, c22, b2, c12
  1217. NMSUB c31, c31, b3, c11
  1218. NMSUB c32, c32, b3, c12
  1219. NMSUB c41, c41, b4, c11
  1220. NMSUB c42, c42, b4, c12
  1221. LD b2, 5 * SIZE(BO)
  1222. LD b3, 6 * SIZE(BO)
  1223. LD b4, 7 * SIZE(BO)
  1224. MUL c21, b2, c21
  1225. MUL c22, b2, c22
  1226. NMSUB c31, c31, b3, c21
  1227. NMSUB c32, c32, b3, c22
  1228. NMSUB c41, c41, b4, c21
  1229. NMSUB c42, c42, b4, c22
  1230. LD b3, 10 * SIZE(BO)
  1231. LD b4, 11 * SIZE(BO)
  1232. MUL c31, b3, c31
  1233. MUL c32, b3, c32
  1234. NMSUB c41, c41, b4, c31
  1235. NMSUB c42, c42, b4, c32
  1236. LD b4, 15 * SIZE(BO)
  1237. MUL c41, b4, c41
  1238. MUL c42, b4, c42
  1239. #endif
  1240. #ifdef RT
  1241. LD b5, 15 * SIZE(BO)
  1242. LD b6, 14 * SIZE(BO)
  1243. LD b7, 13 * SIZE(BO)
  1244. LD b8, 12 * SIZE(BO)
  1245. MUL c41, b5, c41
  1246. MUL c42, b5, c42
  1247. NMSUB c31, c31, b6, c41
  1248. NMSUB c32, c32, b6, c42
  1249. NMSUB c21, c21, b7, c41
  1250. NMSUB c22, c22, b7, c42
  1251. NMSUB c11, c11, b8, c41
  1252. NMSUB c12, c12, b8, c42
  1253. LD b6, 10 * SIZE(BO)
  1254. LD b7, 9 * SIZE(BO)
  1255. LD b8, 8 * SIZE(BO)
  1256. MUL c31, b6, c31
  1257. MUL c32, b6, c32
  1258. NMSUB c21, c21, b7, c31
  1259. NMSUB c22, c22, b7, c32
  1260. NMSUB c11, c11, b8, c31
  1261. NMSUB c12, c12, b8, c32
  1262. LD b7, 5 * SIZE(BO)
  1263. LD b8, 4 * SIZE(BO)
  1264. MUL c21, b7, c21
  1265. MUL c22, b7, c22
  1266. NMSUB c11, c11, b8, c21
  1267. NMSUB c12, c12, b8, c22
  1268. LD b8, 0 * SIZE(BO)
  1269. MUL c11, b8, c11
  1270. MUL c12, b8, c12
  1271. #endif
  1272. #ifdef LN
  1273. daddiu CO1, CO1, -2 * SIZE
  1274. daddiu CO2, CO2, -2 * SIZE
  1275. daddiu CO3, CO3, -2 * SIZE
  1276. daddiu CO4, CO4, -2 * SIZE
  1277. #endif
  1278. #if defined(LN) || defined(LT)
  1279. ST c11, 0 * SIZE(BO)
  1280. ST c21, 1 * SIZE(BO)
  1281. ST c31, 2 * SIZE(BO)
  1282. ST c41, 3 * SIZE(BO)
  1283. ST c12, 4 * SIZE(BO)
  1284. ST c22, 5 * SIZE(BO)
  1285. ST c32, 6 * SIZE(BO)
  1286. ST c42, 7 * SIZE(BO)
  1287. #else
  1288. ST c11, 0 * SIZE(AO)
  1289. ST c12, 1 * SIZE(AO)
  1290. ST c21, 2 * SIZE(AO)
  1291. ST c22, 3 * SIZE(AO)
  1292. ST c31, 4 * SIZE(AO)
  1293. ST c32, 5 * SIZE(AO)
  1294. ST c41, 6 * SIZE(AO)
  1295. ST c42, 7 * SIZE(AO)
  1296. #endif
  1297. ST c11, 0 * SIZE(CO1)
  1298. ST c12, 1 * SIZE(CO1)
  1299. ST c21, 0 * SIZE(CO2)
  1300. ST c22, 1 * SIZE(CO2)
  1301. ST c31, 0 * SIZE(CO3)
  1302. ST c32, 1 * SIZE(CO3)
  1303. ST c41, 0 * SIZE(CO4)
  1304. ST c42, 1 * SIZE(CO4)
  1305. #ifndef LN
  1306. daddiu CO1, CO1, 2 * SIZE
  1307. daddiu CO2, CO2, 2 * SIZE
  1308. daddiu CO3, CO3, 2 * SIZE
  1309. daddiu CO4, CO4, 2 * SIZE
  1310. #endif
  1311. #ifdef RT
  1312. dsll TEMP, K, 1 + BASE_SHIFT
  1313. daddu AORIG, AORIG, TEMP
  1314. #endif
  1315. #if defined(LT) || defined(RN)
  1316. dsubu TEMP, K, KK
  1317. dsll L, TEMP, 1 + BASE_SHIFT
  1318. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1319. daddu AO, AO, L
  1320. daddu BO, BO, TEMP
  1321. #endif
  1322. #ifdef LT
  1323. daddiu KK, KK, 2
  1324. #endif
  1325. #ifdef LN
  1326. daddiu KK, KK, -2
  1327. #endif
  1328. MTC $0, a1
  1329. MOV c11, a1
  1330. MOV c21, a1
  1331. MOV c31, a1
  1332. daddiu I, I, -1
  1333. bgtz I, .L31
  1334. MOV c41, c11
  1335. .align 3
  1336. .L40:
  1337. andi I, M, 1
  1338. blez I, .L49
  1339. MOV c61, c11
  1340. #if defined(LT) || defined(RN)
  1341. LD a1, 0 * SIZE(AO)
  1342. MOV c71, c11
  1343. LD a2, 1 * SIZE(AO)
  1344. MOV c81, c11
  1345. LD b1, 0 * SIZE(B)
  1346. LD b2, 1 * SIZE(B)
  1347. LD b3, 2 * SIZE(B)
  1348. LD b4, 3 * SIZE(B)
  1349. LD b5, 4 * SIZE(B)
  1350. LD b6, 8 * SIZE(B)
  1351. LD b7, 12 * SIZE(B)
  1352. dsra L, KK, 2
  1353. blez L, .L45
  1354. move BO, B
  1355. #else
  1356. #ifdef LN
  1357. dsll TEMP, K, BASE_SHIFT
  1358. dsubu AORIG, AORIG, TEMP
  1359. #endif
  1360. dsll L, KK, 0 + BASE_SHIFT
  1361. dsll TEMP, KK, 2 + BASE_SHIFT
  1362. daddu AO, AORIG, L
  1363. daddu BO, B, TEMP
  1364. dsubu TEMP, K, KK
  1365. LD a1, 0 * SIZE(AO)
  1366. MOV c71, c11
  1367. LD a2, 1 * SIZE(AO)
  1368. MOV c81, c11
  1369. LD b1, 0 * SIZE(BO)
  1370. LD b2, 1 * SIZE(BO)
  1371. LD b3, 2 * SIZE(BO)
  1372. LD b4, 3 * SIZE(BO)
  1373. LD b5, 4 * SIZE(BO)
  1374. LD b6, 8 * SIZE(BO)
  1375. LD b7, 12 * SIZE(BO)
  1376. dsra L, TEMP, 2
  1377. blez L, .L45
  1378. NOP
  1379. #endif
  1380. .align 3
  1381. .L42:
  1382. MADD c11, c11, a1, b1
  1383. LD b1, 16 * SIZE(BO)
  1384. MADD c21, c21, a1, b2
  1385. LD b2, 5 * SIZE(BO)
  1386. MADD c31, c31, a1, b3
  1387. LD b3, 6 * SIZE(BO)
  1388. MADD c41, c41, a1, b4
  1389. LD b4, 7 * SIZE(BO)
  1390. LD a1, 4 * SIZE(AO)
  1391. daddiu L, L, -1
  1392. MADD c11, c11, a2, b5
  1393. LD b5, 20 * SIZE(BO)
  1394. MADD c21, c21, a2, b2
  1395. LD b2, 9 * SIZE(BO)
  1396. MADD c31, c31, a2, b3
  1397. LD b3, 10 * SIZE(BO)
  1398. MADD c41, c41, a2, b4
  1399. LD b4, 11 * SIZE(BO)
  1400. LD a2, 2 * SIZE(AO)
  1401. daddiu AO, AO, 4 * SIZE
  1402. MADD c11, c11, a2, b6
  1403. LD b6, 24 * SIZE(BO)
  1404. MADD c21, c21, a2, b2
  1405. LD b2, 13 * SIZE(BO)
  1406. MADD c31, c31, a2, b3
  1407. LD b3, 14 * SIZE(BO)
  1408. MADD c41, c41, a2, b4
  1409. LD b4, 15 * SIZE(BO)
  1410. LD a2, -1 * SIZE(AO)
  1411. daddiu BO, BO, 16 * SIZE
  1412. MADD c11, c11, a2, b7
  1413. LD b7, 12 * SIZE(BO)
  1414. MADD c21, c21, a2, b2
  1415. LD b2, 1 * SIZE(BO)
  1416. MADD c31, c31, a2, b3
  1417. LD b3, 2 * SIZE(BO)
  1418. MADD c41, c41, a2, b4
  1419. LD b4, 3 * SIZE(BO)
  1420. bgtz L, .L42
  1421. LD a2, 1 * SIZE(AO)
  1422. .align 3
  1423. .L45:
  1424. #if defined(LT) || defined(RN)
  1425. andi L, KK, 3
  1426. #else
  1427. andi L, TEMP, 3
  1428. #endif
  1429. NOP
  1430. blez L, .L48
  1431. NOP
  1432. .align 3
  1433. .L46:
  1434. MADD c11, c11, a1, b1
  1435. LD b1, 4 * SIZE(BO)
  1436. MADD c21, c21, a1, b2
  1437. LD b2, 5 * SIZE(BO)
  1438. MADD c31, c31, a1, b3
  1439. LD b3, 6 * SIZE(BO)
  1440. MADD c41, c41, a1, b4
  1441. LD a1, 1 * SIZE(AO)
  1442. LD b4, 7 * SIZE(BO)
  1443. daddiu L, L, -1
  1444. daddiu AO, AO, 1 * SIZE
  1445. MOV a2, a2
  1446. bgtz L, .L46
  1447. daddiu BO, BO, 4 * SIZE
  1448. .L48:
  1449. #if defined(LN) || defined(RT)
  1450. #ifdef LN
  1451. daddiu TEMP, KK, -1
  1452. #else
  1453. daddiu TEMP, KK, -4
  1454. #endif
  1455. dsll L, TEMP, 0 + BASE_SHIFT
  1456. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1457. daddu AO, AORIG, L
  1458. daddu BO, B, TEMP
  1459. #endif
  1460. #if defined(LN) || defined(LT)
  1461. LD b1, 0 * SIZE(BO)
  1462. LD b2, 1 * SIZE(BO)
  1463. LD b3, 2 * SIZE(BO)
  1464. LD b4, 3 * SIZE(BO)
  1465. SUB c11, b1, c11
  1466. SUB c21, b2, c21
  1467. SUB c31, b3, c31
  1468. SUB c41, b4, c41
  1469. #else
  1470. LD b1, 0 * SIZE(AO)
  1471. LD b2, 1 * SIZE(AO)
  1472. LD b3, 2 * SIZE(AO)
  1473. LD b4, 3 * SIZE(AO)
  1474. SUB c11, b1, c11
  1475. SUB c21, b2, c21
  1476. SUB c31, b3, c31
  1477. SUB c41, b4, c41
  1478. #endif
  1479. #if defined(LN) || defined(LT)
  1480. LD b1, 0 * SIZE(AO)
  1481. MUL c11, b1, c11
  1482. MUL c21, b1, c21
  1483. MUL c31, b1, c31
  1484. MUL c41, b1, c41
  1485. #endif
  1486. #ifdef RN
  1487. LD b1, 0 * SIZE(BO)
  1488. LD b2, 1 * SIZE(BO)
  1489. LD b3, 2 * SIZE(BO)
  1490. LD b4, 3 * SIZE(BO)
  1491. MUL c11, b1, c11
  1492. NMSUB c21, c21, b2, c11
  1493. NMSUB c31, c31, b3, c11
  1494. NMSUB c41, c41, b4, c11
  1495. LD b2, 5 * SIZE(BO)
  1496. LD b3, 6 * SIZE(BO)
  1497. LD b4, 7 * SIZE(BO)
  1498. MUL c21, b2, c21
  1499. NMSUB c31, c31, b3, c21
  1500. NMSUB c41, c41, b4, c21
  1501. LD b3, 10 * SIZE(BO)
  1502. LD b4, 11 * SIZE(BO)
  1503. MUL c31, b3, c31
  1504. NMSUB c41, c41, b4, c31
  1505. LD b4, 15 * SIZE(BO)
  1506. MUL c41, b4, c41
  1507. #endif
  1508. #ifdef RT
  1509. LD b5, 15 * SIZE(BO)
  1510. LD b6, 14 * SIZE(BO)
  1511. LD b7, 13 * SIZE(BO)
  1512. LD b8, 12 * SIZE(BO)
  1513. MUL c41, b5, c41
  1514. NMSUB c31, c31, b6, c41
  1515. NMSUB c21, c21, b7, c41
  1516. NMSUB c11, c11, b8, c41
  1517. LD b6, 10 * SIZE(BO)
  1518. LD b7, 9 * SIZE(BO)
  1519. LD b8, 8 * SIZE(BO)
  1520. MUL c31, b6, c31
  1521. NMSUB c21, c21, b7, c31
  1522. NMSUB c11, c11, b8, c31
  1523. LD b7, 5 * SIZE(BO)
  1524. LD b8, 4 * SIZE(BO)
  1525. MUL c21, b7, c21
  1526. NMSUB c11, c11, b8, c21
  1527. LD b8, 0 * SIZE(BO)
  1528. MUL c11, b8, c11
  1529. #endif
  1530. #ifdef LN
  1531. daddiu CO1, CO1, -1 * SIZE
  1532. daddiu CO2, CO2, -1 * SIZE
  1533. daddiu CO3, CO3, -1 * SIZE
  1534. daddiu CO4, CO4, -1 * SIZE
  1535. #endif
  1536. #if defined(LN) || defined(LT)
  1537. ST c11, 0 * SIZE(BO)
  1538. ST c21, 1 * SIZE(BO)
  1539. ST c31, 2 * SIZE(BO)
  1540. ST c41, 3 * SIZE(BO)
  1541. #else
  1542. ST c11, 0 * SIZE(AO)
  1543. ST c21, 1 * SIZE(AO)
  1544. ST c31, 2 * SIZE(AO)
  1545. ST c41, 3 * SIZE(AO)
  1546. #endif
  1547. ST c11, 0 * SIZE(CO1)
  1548. ST c21, 0 * SIZE(CO2)
  1549. ST c31, 0 * SIZE(CO3)
  1550. ST c41, 0 * SIZE(CO4)
  1551. #ifndef LN
  1552. daddiu CO1, CO1, 1 * SIZE
  1553. daddiu CO2, CO2, 1 * SIZE
  1554. daddiu CO3, CO3, 1 * SIZE
  1555. daddiu CO4, CO4, 1 * SIZE
  1556. #endif
  1557. #ifdef RT
  1558. dsll TEMP, K, BASE_SHIFT
  1559. daddu AORIG, AORIG, TEMP
  1560. #endif
  1561. #if defined(LT) || defined(RN)
  1562. dsubu TEMP, K, KK
  1563. dsll L, TEMP, 0 + BASE_SHIFT
  1564. dsll TEMP, TEMP, 2 + BASE_SHIFT
  1565. daddu AO, AO, L
  1566. daddu BO, BO, TEMP
  1567. #endif
  1568. #ifdef LT
  1569. daddiu KK, KK, 1
  1570. #endif
  1571. #ifdef LN
  1572. daddiu KK, KK, -1
  1573. #endif
  1574. .align 3
  1575. .L49:
  1576. #ifdef LN
  1577. dsll TEMP, K, 2 + BASE_SHIFT
  1578. daddu B, B, TEMP
  1579. #endif
  1580. #if defined(LT) || defined(RN)
  1581. move B, BO
  1582. #endif
  1583. #ifdef RN
  1584. daddiu KK, KK, 4
  1585. #endif
  1586. #ifdef RT
  1587. daddiu KK, KK, -4
  1588. #endif
  1589. .align 3
  1590. .L70:
  1591. dsra J, N, 3
  1592. blez J, .L999
  1593. nop
  1594. .L10:
  1595. #ifdef RT
  1596. dsll TEMP, K, 3 + BASE_SHIFT
  1597. dsubu B, B, TEMP
  1598. dsll TEMP, LDC, 3
  1599. dsubu C, C, TEMP
  1600. #endif
  1601. move CO1, C
  1602. MTC $0, c11
  1603. daddu CO2, C, LDC
  1604. daddu CO3, CO2, LDC
  1605. daddiu J, J, -1
  1606. daddu CO4, CO3, LDC
  1607. MOV c21, c11
  1608. daddu CO5, CO4, LDC
  1609. MOV c31, c11
  1610. daddu CO6, CO5, LDC
  1611. MOV c41, c11
  1612. daddu CO7, CO6, LDC
  1613. MOV c51, c11
  1614. daddu CO8, CO7, LDC
  1615. dsra I, M, 1
  1616. #ifdef LN
  1617. daddu KK, M, OFFSET
  1618. #endif
  1619. #ifdef LT
  1620. move KK, OFFSET
  1621. #endif
  1622. #if defined(LN) || defined(RT)
  1623. move AORIG, A
  1624. #else
  1625. move AO, A
  1626. #endif
  1627. #ifndef RT
  1628. daddu C, CO8, LDC
  1629. #endif
  1630. blez I, .L20
  1631. MOV c61, c11
  1632. .L11:
  1633. #if defined(LT) || defined(RN)
  1634. LD a1, 0 * SIZE(AO)
  1635. MOV c71, c11
  1636. LD b1, 0 * SIZE(B)
  1637. MOV c81, c11
  1638. LD a3, 4 * SIZE(AO)
  1639. MOV c12, c11
  1640. LD b2, 1 * SIZE(B)
  1641. MOV c22, c11
  1642. dsra L, KK, 2
  1643. MOV c32, c11
  1644. LD b3, 2 * SIZE(B)
  1645. MOV c42, c11
  1646. LD b4, 3 * SIZE(B)
  1647. MOV c52, c11
  1648. LD b5, 4 * SIZE(B)
  1649. MOV c62, c11
  1650. LD b6, 8 * SIZE(B)
  1651. MOV c72, c11
  1652. LD b7, 12 * SIZE(B)
  1653. MOV c82, c11
  1654. blez L, .L15
  1655. move BO, B
  1656. #else
  1657. #ifdef LN
  1658. dsll TEMP, K, 1 + BASE_SHIFT
  1659. dsubu AORIG, AORIG, TEMP
  1660. #endif
  1661. dsll L, KK, 1 + BASE_SHIFT
  1662. dsll TEMP, KK, 3 + BASE_SHIFT
  1663. daddu AO, AORIG, L
  1664. daddu BO, B, TEMP
  1665. dsubu TEMP, K, KK
  1666. LD a1, 0 * SIZE(AO)
  1667. MOV c71, c11
  1668. LD b1, 0 * SIZE(BO)
  1669. MOV c81, c11
  1670. LD a3, 4 * SIZE(AO)
  1671. MOV c12, c11
  1672. LD b2, 1 * SIZE(BO)
  1673. MOV c22, c11
  1674. MOV c32, c11
  1675. LD b3, 2 * SIZE(BO)
  1676. MOV c42, c11
  1677. LD b4, 3 * SIZE(BO)
  1678. MOV c52, c11
  1679. LD b5, 4 * SIZE(BO)
  1680. MOV c62, c11
  1681. LD b6, 8 * SIZE(BO)
  1682. MOV c72, c11
  1683. LD b7, 12 * SIZE(BO)
  1684. MOV c82, c11
  1685. dsra L, TEMP, 2
  1686. blez L, .L15
  1687. NOP
  1688. #endif
  1689. MADD c11, c11, a1, b1
  1690. LD a2, 1 * SIZE(AO)
  1691. MADD c21, c21, a1, b2
  1692. daddiu L, L, -1
  1693. MADD c31, c31, a1, b3
  1694. blez L, .L13
  1695. MADD c41, c41, a1, b4
  1696. NOP
  1697. .align 3
  1698. .L12:
  1699. MADD c12, c12, a2, b1
  1700. LD b1, 16 * SIZE(BO)
  1701. MADD c22, c22, a2, b2
  1702. LD b2, 5 * SIZE(BO)
  1703. MADD c32, c32, a2, b3
  1704. LD b3, 6 * SIZE(BO)
  1705. MADD c42, c42, a2, b4
  1706. LD b4, 7 * SIZE(BO)
  1707. MADD c51, c51, a1, b5
  1708. NOP
  1709. MADD c61, c61, a1, b2
  1710. LD a4, 2 * SIZE(AO)
  1711. MADD c71, c71, a1, b3
  1712. NOP
  1713. MADD c81, c81, a1, b4
  1714. LD a1, 8 * SIZE(AO)
  1715. MADD c52, c52, a2, b5
  1716. LD b5, 20 * SIZE(BO)
  1717. MADD c62, c62, a2, b2
  1718. LD b2, 9 * SIZE(BO)
  1719. MADD c72, c72, a2, b3
  1720. LD b3, 10 * SIZE(BO)
  1721. MADD c82, c82, a2, b4
  1722. LD b4, 11 * SIZE(BO)
  1723. MADD c11, c11, a4, b6
  1724. LD a2, 3 * SIZE(AO)
  1725. MADD c21, c21, a4, b2
  1726. NOP
  1727. MADD c31, c31, a4, b3
  1728. NOP
  1729. MADD c41, c41, a4, b4
  1730. NOP
  1731. MADD c12, c12, a2, b6
  1732. LD b6, 24 * SIZE(BO)
  1733. MADD c22, c22, a2, b2
  1734. LD b2, 13 * SIZE(BO)
  1735. MADD c32, c32, a2, b3
  1736. LD b3, 14 * SIZE(BO)
  1737. MADD c42, c42, a2, b4
  1738. LD b4, 15 * SIZE(BO)
  1739. MADD c51, c51, a4, b7
  1740. NOP
  1741. MADD c61, c61, a4, b2
  1742. NOP
  1743. MADD c71, c71, a4, b3
  1744. NOP
  1745. MADD c81, c81, a4, b4
  1746. NOP
  1747. MADD c52, c52, a2, b7
  1748. LD b7, 28 * SIZE(BO)
  1749. MADD c62, c62, a2, b2
  1750. LD b2, 17 * SIZE(BO)
  1751. MADD c72, c72, a2, b3
  1752. LD b3, 18 * SIZE(BO)
  1753. MADD c82, c82, a2, b4
  1754. LD b4, 19 * SIZE(BO)
  1755. MADD c11, c11, a3, b1
  1756. LD a2, 5 * SIZE(AO)
  1757. MADD c21, c21, a3, b2
  1758. NOP
  1759. MADD c31, c31, a3, b3
  1760. NOP
  1761. MADD c41, c41, a3, b4
  1762. NOP
  1763. MADD c12, c12, a2, b1
  1764. LD b1, 32 * SIZE(BO)
  1765. MADD c22, c22, a2, b2
  1766. LD b2, 21 * SIZE(BO)
  1767. MADD c32, c32, a2, b3
  1768. LD b3, 22 * SIZE(BO)
  1769. MADD c42, c42, a2, b4
  1770. LD b4, 23 * SIZE(BO)
  1771. MADD c51, c51, a3, b5
  1772. NOP
  1773. MADD c61, c61, a3, b2
  1774. LD a4, 6 * SIZE(AO)
  1775. MADD c71, c71, a3, b3
  1776. NOP
  1777. MADD c81, c81, a3, b4
  1778. LD a3, 12 * SIZE(AO)
  1779. MADD c52, c52, a2, b5
  1780. LD b5, 36 * SIZE(BO)
  1781. MADD c62, c62, a2, b2
  1782. LD b2, 25 * SIZE(BO)
  1783. MADD c72, c72, a2, b3
  1784. LD b3, 26 * SIZE(BO)
  1785. MADD c82, c82, a2, b4
  1786. LD b4, 27 * SIZE(BO)
  1787. MADD c11, c11, a4, b6
  1788. LD a2, 7 * SIZE(AO)
  1789. MADD c21, c21, a4, b2
  1790. NOP
  1791. MADD c31, c31, a4, b3
  1792. NOP
  1793. MADD c41, c41, a4, b4
  1794. daddiu L, L, -1
  1795. MADD c12, c12, a2, b6
  1796. LD b6, 40 * SIZE(BO)
  1797. MADD c22, c22, a2, b2
  1798. LD b2, 29 * SIZE(BO)
  1799. MADD c32, c32, a2, b3
  1800. LD b3, 30 * SIZE(BO)
  1801. MADD c42, c42, a2, b4
  1802. LD b4, 31 * SIZE(BO)
  1803. MADD c51, c51, a4, b7
  1804. daddiu BO, BO, 32 * SIZE
  1805. MADD c61, c61, a4, b2
  1806. daddiu AO, AO, 8 * SIZE
  1807. MADD c71, c71, a4, b3
  1808. NOP
  1809. MADD c81, c81, a4, b4
  1810. NOP
  1811. MADD c52, c52, a2, b7
  1812. LD b7, 12 * SIZE(BO)
  1813. MADD c62, c62, a2, b2
  1814. LD b2, 1 * SIZE(BO)
  1815. MADD c72, c72, a2, b3
  1816. LD b3, 2 * SIZE(BO)
  1817. MADD c82, c82, a2, b4
  1818. LD b4, 3 * SIZE(BO)
  1819. MADD c11, c11, a1, b1
  1820. LD a2, 1 * SIZE(AO)
  1821. MADD c21, c21, a1, b2
  1822. NOP
  1823. MADD c31, c31, a1, b3
  1824. bgtz L, .L12
  1825. MADD c41, c41, a1, b4
  1826. NOP
  1827. .align 3
  1828. .L13:
  1829. MADD c12, c12, a2, b1
  1830. LD b1, 16 * SIZE(BO)
  1831. MADD c22, c22, a2, b2
  1832. LD b2, 5 * SIZE(BO)
  1833. MADD c32, c32, a2, b3
  1834. LD b3, 6 * SIZE(BO)
  1835. MADD c42, c42, a2, b4
  1836. LD b4, 7 * SIZE(BO)
  1837. MADD c51, c51, a1, b5
  1838. NOP
  1839. MADD c61, c61, a1, b2
  1840. LD a4, 2 * SIZE(AO)
  1841. MADD c71, c71, a1, b3
  1842. NOP
  1843. MADD c81, c81, a1, b4
  1844. LD a1, 8 * SIZE(AO)
  1845. MADD c52, c52, a2, b5
  1846. LD b5, 20 * SIZE(BO)
  1847. MADD c62, c62, a2, b2
  1848. LD b2, 9 * SIZE(BO)
  1849. MADD c72, c72, a2, b3
  1850. LD b3, 10 * SIZE(BO)
  1851. MADD c82, c82, a2, b4
  1852. LD b4, 11 * SIZE(BO)
  1853. MADD c11, c11, a4, b6
  1854. LD a2, 3 * SIZE(AO)
  1855. MADD c21, c21, a4, b2
  1856. NOP
  1857. MADD c31, c31, a4, b3
  1858. NOP
  1859. MADD c41, c41, a4, b4
  1860. NOP
  1861. MADD c12, c12, a2, b6
  1862. LD b6, 24 * SIZE(BO)
  1863. MADD c22, c22, a2, b2
  1864. LD b2, 13 * SIZE(BO)
  1865. MADD c32, c32, a2, b3
  1866. LD b3, 14 * SIZE(BO)
  1867. MADD c42, c42, a2, b4
  1868. LD b4, 15 * SIZE(BO)
  1869. MADD c51, c51, a4, b7
  1870. NOP
  1871. MADD c61, c61, a4, b2
  1872. NOP
  1873. MADD c71, c71, a4, b3
  1874. NOP
  1875. MADD c81, c81, a4, b4
  1876. NOP
  1877. MADD c52, c52, a2, b7
  1878. LD b7, 28 * SIZE(BO)
  1879. MADD c62, c62, a2, b2
  1880. LD b2, 17 * SIZE(BO)
  1881. MADD c72, c72, a2, b3
  1882. LD b3, 18 * SIZE(BO)
  1883. MADD c82, c82, a2, b4
  1884. LD b4, 19 * SIZE(BO)
  1885. MADD c11, c11, a3, b1
  1886. LD a2, 5 * SIZE(AO)
  1887. MADD c21, c21, a3, b2
  1888. NOP
  1889. MADD c31, c31, a3, b3
  1890. NOP
  1891. MADD c41, c41, a3, b4
  1892. NOP
  1893. MADD c12, c12, a2, b1
  1894. LD b1, 32 * SIZE(BO)
  1895. MADD c22, c22, a2, b2
  1896. LD b2, 21 * SIZE(BO)
  1897. MADD c32, c32, a2, b3
  1898. LD b3, 22 * SIZE(BO)
  1899. MADD c42, c42, a2, b4
  1900. LD b4, 23 * SIZE(BO)
  1901. MADD c51, c51, a3, b5
  1902. NOP
  1903. MADD c61, c61, a3, b2
  1904. LD a4, 6 * SIZE(AO)
  1905. MADD c71, c71, a3, b3
  1906. NOP
  1907. MADD c81, c81, a3, b4
  1908. LD a3, 12 * SIZE(AO)
  1909. MADD c52, c52, a2, b5
  1910. LD b5, 36 * SIZE(BO)
  1911. MADD c62, c62, a2, b2
  1912. LD b2, 25 * SIZE(BO)
  1913. MADD c72, c72, a2, b3
  1914. LD b3, 26 * SIZE(BO)
  1915. MADD c82, c82, a2, b4
  1916. LD b4, 27 * SIZE(BO)
  1917. MADD c11, c11, a4, b6
  1918. LD a2, 7 * SIZE(AO)
  1919. MADD c21, c21, a4, b2
  1920. NOP
  1921. MADD c31, c31, a4, b3
  1922. NOP
  1923. MADD c41, c41, a4, b4
  1924. NOP
  1925. MADD c12, c12, a2, b6
  1926. LD b6, 40 * SIZE(BO)
  1927. MADD c22, c22, a2, b2
  1928. LD b2, 29 * SIZE(BO)
  1929. MADD c32, c32, a2, b3
  1930. LD b3, 30 * SIZE(BO)
  1931. MADD c42, c42, a2, b4
  1932. LD b4, 31 * SIZE(BO)
  1933. MADD c51, c51, a4, b7
  1934. daddiu BO, BO, 32 * SIZE
  1935. MADD c61, c61, a4, b2
  1936. daddiu AO, AO, 8 * SIZE
  1937. MADD c71, c71, a4, b3
  1938. NOP
  1939. MADD c81, c81, a4, b4
  1940. NOP
  1941. MADD c52, c52, a2, b7
  1942. LD b7, 12 * SIZE(BO)
  1943. MADD c62, c62, a2, b2
  1944. LD b2, 1 * SIZE(BO)
  1945. MADD c72, c72, a2, b3
  1946. LD b3, 2 * SIZE(BO)
  1947. MADD c82, c82, a2, b4
  1948. LD b4, 3 * SIZE(BO)
  1949. .align 3
  1950. .L15:
  1951. #if defined(LT) || defined(RN)
  1952. andi L, KK, 3
  1953. #else
  1954. andi L, TEMP, 3
  1955. #endif
  1956. blez L, .L18
  1957. NOP
  1958. .align 3
  1959. .L16:
  1960. MADD c11, c11, a1, b1
  1961. LD a2, 1 * SIZE(AO)
  1962. MADD c21, c21, a1, b2
  1963. NOP
  1964. MADD c31, c31, a1, b3
  1965. NOP
  1966. MADD c41, c41, a1, b4
  1967. NOP
  1968. MADD c12, c12, a2, b1
  1969. LD b1, 8 * SIZE(BO)
  1970. MADD c22, c22, a2, b2
  1971. LD b2, 5 * SIZE(BO)
  1972. MADD c32, c32, a2, b3
  1973. LD b3, 6 * SIZE(BO)
  1974. MADD c42, c42, a2, b4
  1975. LD b4, 7 * SIZE(BO)
  1976. MADD c51, c51, a1, b5
  1977. daddiu L, L, -1
  1978. MADD c61, c61, a1, b2
  1979. daddiu AO, AO, 2 * SIZE
  1980. MADD c71, c71, a1, b3
  1981. daddiu BO, BO, 8 * SIZE
  1982. MADD c81, c81, a1, b4
  1983. LD a1, 0 * SIZE(AO)
  1984. MADD c52, c52, a2, b5
  1985. LD b5, 4 * SIZE(BO)
  1986. MADD c62, c62, a2, b2
  1987. LD b2, 1 * SIZE(BO)
  1988. MADD c72, c72, a2, b3
  1989. LD b3, 2 * SIZE(BO)
  1990. MADD c82, c82, a2, b4
  1991. bgtz L, .L16
  1992. LD b4, 3 * SIZE(BO)
  1993. .L18:
  1994. #if defined(LN) || defined(RT)
  1995. #ifdef LN
  1996. daddiu TEMP, KK, -2
  1997. #else
  1998. daddiu TEMP, KK, -8
  1999. #endif
  2000. dsll L, TEMP, 1 + BASE_SHIFT
  2001. dsll TEMP, TEMP, 3 + BASE_SHIFT
  2002. daddu AO, AORIG, L
  2003. daddu BO, B, TEMP
  2004. #endif
  2005. #if defined(LN) || defined(LT)
  2006. LD b1, 0 * SIZE(BO)
  2007. LD b2, 1 * SIZE(BO)
  2008. LD b3, 2 * SIZE(BO)
  2009. LD b4, 3 * SIZE(BO)
  2010. SUB c11, b1, c11
  2011. LD b5, 4 * SIZE(BO)
  2012. SUB c21, b2, c21
  2013. LD b6, 5 * SIZE(BO)
  2014. SUB c31, b3, c31
  2015. LD b7, 6 * SIZE(BO)
  2016. SUB c41, b4, c41
  2017. LD b8, 7 * SIZE(BO)
  2018. SUB c51, b5, c51
  2019. LD b1, 8 * SIZE(BO)
  2020. SUB c61, b6, c61
  2021. LD b2, 9 * SIZE(BO)
  2022. SUB c71, b7, c71
  2023. LD b3, 10 * SIZE(BO)
  2024. SUB c81, b8, c81
  2025. LD b4, 11 * SIZE(BO)
  2026. SUB c12, b1, c12
  2027. LD b5, 12 * SIZE(BO)
  2028. SUB c22, b2, c22
  2029. LD b6, 13 * SIZE(BO)
  2030. SUB c32, b3, c32
  2031. LD b7, 14 * SIZE(BO)
  2032. SUB c42, b4, c42
  2033. LD b8, 15 * SIZE(BO)
  2034. SUB c52, b5, c52
  2035. #ifdef LN
  2036. LD b1, 3 * SIZE(AO)
  2037. #else
  2038. LD b1, 0 * SIZE(AO)
  2039. #endif
  2040. SUB c62, b6, c62
  2041. SUB c72, b7, c72
  2042. SUB c82, b8, c82
  2043. #else
  2044. LD b1, 0 * SIZE(AO)
  2045. LD b2, 1 * SIZE(AO)
  2046. LD b3, 2 * SIZE(AO)
  2047. LD b4, 3 * SIZE(AO)
  2048. SUB c11, b1, c11
  2049. LD b5, 4 * SIZE(AO)
  2050. SUB c12, b2, c12
  2051. LD b6, 5 * SIZE(AO)
  2052. SUB c21, b3, c21
  2053. LD b7, 6 * SIZE(AO)
  2054. SUB c22, b4, c22
  2055. LD b8, 7 * SIZE(AO)
  2056. SUB c31, b5, c31
  2057. LD b1, 8 * SIZE(AO)
  2058. SUB c32, b6, c32
  2059. LD b2, 9 * SIZE(AO)
  2060. SUB c41, b7, c41
  2061. LD b3, 10 * SIZE(AO)
  2062. SUB c42, b8, c42
  2063. LD b4, 11 * SIZE(AO)
  2064. LD b5, 12 * SIZE(AO)
  2065. SUB c51, b1, c51
  2066. LD b6, 13 * SIZE(AO)
  2067. SUB c52, b2, c52
  2068. LD b7, 14 * SIZE(AO)
  2069. SUB c61, b3, c61
  2070. LD b8, 15 * SIZE(AO)
  2071. SUB c62, b4, c62
  2072. SUB c71, b5, c71
  2073. SUB c72, b6, c72
  2074. SUB c81, b7, c81
  2075. SUB c82, b8, c82
  2076. #endif
  2077. #ifdef LN
  2078. MUL c12, b1, c12
  2079. LD b2, 2 * SIZE(AO)
  2080. MUL c22, b1, c22
  2081. MUL c32, b1, c32
  2082. MUL c42, b1, c42
  2083. MUL c52, b1, c52
  2084. MUL c62, b1, c62
  2085. MUL c72, b1, c72
  2086. MUL c82, b1, c82
  2087. NMSUB c11, c11, b2, c12
  2088. LD b3, 0 * SIZE(AO)
  2089. NMSUB c21, c21, b2, c22
  2090. NMSUB c31, c31, b2, c32
  2091. NMSUB c41, c41, b2, c42
  2092. NMSUB c51, c51, b2, c52
  2093. NMSUB c61, c61, b2, c62
  2094. NMSUB c71, c71, b2, c72
  2095. NMSUB c81, c81, b2, c82
  2096. MUL c11, b3, c11
  2097. daddiu CO1, CO1, -2 * SIZE
  2098. MUL c21, b3, c21
  2099. daddiu CO2, CO2, -2 * SIZE
  2100. MUL c31, b3, c31
  2101. daddiu CO3, CO3, -2 * SIZE
  2102. MUL c41, b3, c41
  2103. daddiu CO4, CO4, -2 * SIZE
  2104. MUL c51, b3, c51
  2105. daddiu CO5, CO5, -2 * SIZE
  2106. MUL c61, b3, c61
  2107. daddiu CO6, CO6, -2 * SIZE
  2108. MUL c71, b3, c71
  2109. daddiu CO7, CO7, -2 * SIZE
  2110. MUL c81, b3, c81
  2111. daddiu CO8, CO8, -2 * SIZE
  2112. #endif
  2113. #ifdef LT
  2114. MUL c11, b1, c11
  2115. LD b2, 1 * SIZE(AO)
  2116. MUL c21, b1, c21
  2117. MUL c31, b1, c31
  2118. MUL c41, b1, c41
  2119. MUL c51, b1, c51
  2120. MUL c61, b1, c61
  2121. MUL c71, b1, c71
  2122. MUL c81, b1, c81
  2123. NMSUB c12, c12, b2, c11
  2124. LD b3, 3 * SIZE(AO)
  2125. NMSUB c22, c22, b2, c21
  2126. NMSUB c32, c32, b2, c31
  2127. NMSUB c42, c42, b2, c41
  2128. NMSUB c52, c52, b2, c51
  2129. NMSUB c62, c62, b2, c61
  2130. NMSUB c72, c72, b2, c71
  2131. NMSUB c82, c82, b2, c81
  2132. MUL c12, b3, c12
  2133. MUL c22, b3, c22
  2134. MUL c32, b3, c32
  2135. MUL c42, b3, c42
  2136. MUL c52, b3, c52
  2137. MUL c62, b3, c62
  2138. MUL c72, b3, c72
  2139. MUL c82, b3, c82
  2140. #endif
  2141. #ifdef RN
  2142. LD b1, 0 * SIZE(BO)
  2143. LD b2, 1 * SIZE(BO)
  2144. LD b3, 2 * SIZE(BO)
  2145. LD b4, 3 * SIZE(BO)
  2146. MUL c11, b1, c11
  2147. MUL c12, b1, c12
  2148. LD b5, 4 * SIZE(BO)
  2149. NMSUB c21, c21, b2, c11
  2150. NMSUB c22, c22, b2, c12
  2151. LD b6, 5 * SIZE(BO)
  2152. NMSUB c31, c31, b3, c11
  2153. NMSUB c32, c32, b3, c12
  2154. LD b7, 6 * SIZE(BO)
  2155. NMSUB c41, c41, b4, c11
  2156. NMSUB c42, c42, b4, c12
  2157. LD b8, 7 * SIZE(BO)
  2158. NMSUB c51, c51, b5, c11
  2159. NMSUB c52, c52, b5, c12
  2160. LD b2, 9 * SIZE(BO)
  2161. NMSUB c61, c61, b6, c11
  2162. NMSUB c62, c62, b6, c12
  2163. LD b3, 10 * SIZE(BO)
  2164. NMSUB c71, c71, b7, c11
  2165. NMSUB c72, c72, b7, c12
  2166. LD b4, 11 * SIZE(BO)
  2167. NMSUB c81, c81, b8, c11
  2168. NMSUB c82, c82, b8, c12
  2169. LD b5, 12 * SIZE(BO)
  2170. MUL c21, b2, c21
  2171. MUL c22, b2, c22
  2172. LD b6, 13 * SIZE(BO)
  2173. NMSUB c31, c31, b3, c21
  2174. NMSUB c32, c32, b3, c22
  2175. LD b7, 14 * SIZE(BO)
  2176. NMSUB c41, c41, b4, c21
  2177. NMSUB c42, c42, b4, c22
  2178. LD b8, 15 * SIZE(BO)
  2179. NMSUB c51, c51, b5, c21
  2180. NMSUB c52, c52, b5, c22
  2181. LD b3, 18 * SIZE(BO)
  2182. NMSUB c61, c61, b6, c21
  2183. NMSUB c62, c62, b6, c22
  2184. LD b4, 19 * SIZE(BO)
  2185. NMSUB c71, c71, b7, c21
  2186. NMSUB c72, c72, b7, c22
  2187. LD b5, 20 * SIZE(BO)
  2188. NMSUB c81, c81, b8, c21
  2189. NMSUB c82, c82, b8, c22
  2190. LD b6, 21 * SIZE(BO)
  2191. MUL c31, b3, c31
  2192. MUL c32, b3, c32
  2193. LD b7, 22 * SIZE(BO)
  2194. NMSUB c41, c41, b4, c31
  2195. NMSUB c42, c42, b4, c32
  2196. LD b8, 23 * SIZE(BO)
  2197. NMSUB c51, c51, b5, c31
  2198. NMSUB c52, c52, b5, c32
  2199. LD b4, 27 * SIZE(BO)
  2200. NMSUB c61, c61, b6, c31
  2201. NMSUB c62, c62, b6, c32
  2202. LD b5, 28 * SIZE(BO)
  2203. NMSUB c71, c71, b7, c31
  2204. NMSUB c72, c72, b7, c32
  2205. LD b6, 29 * SIZE(BO)
  2206. NMSUB c81, c81, b8, c31
  2207. NMSUB c82, c82, b8, c32
  2208. LD b7, 30 * SIZE(BO)
  2209. MUL c41, b4, c41
  2210. MUL c42, b4, c42
  2211. LD b8, 31 * SIZE(BO)
  2212. NMSUB c51, c51, b5, c41
  2213. NMSUB c52, c52, b5, c42
  2214. LD b5, 36 * SIZE(BO)
  2215. NMSUB c61, c61, b6, c41
  2216. NMSUB c62, c62, b6, c42
  2217. LD b6, 37 * SIZE(BO)
  2218. NMSUB c71, c71, b7, c41
  2219. NMSUB c72, c72, b7, c42
  2220. LD b7, 38 * SIZE(BO)
  2221. NMSUB c81, c81, b8, c41
  2222. NMSUB c82, c82, b8, c42
  2223. LD b8, 39 * SIZE(BO)
  2224. MUL c51, b5, c51
  2225. MUL c52, b5, c52
  2226. NMSUB c61, c61, b6, c51
  2227. NMSUB c62, c62, b6, c52
  2228. LD b6, 45 * SIZE(BO)
  2229. NMSUB c71, c71, b7, c51
  2230. NMSUB c72, c72, b7, c52
  2231. LD b7, 46 * SIZE(BO)
  2232. NMSUB c81, c81, b8, c51
  2233. NMSUB c82, c82, b8, c52
  2234. LD b8, 47 * SIZE(BO)
  2235. MUL c61, b6, c61
  2236. MUL c62, b6, c62
  2237. NMSUB c71, c71, b7, c61
  2238. NMSUB c72, c72, b7, c62
  2239. LD b7, 54 * SIZE(BO)
  2240. NMSUB c81, c81, b8, c61
  2241. NMSUB c82, c82, b8, c62
  2242. LD b8, 55 * SIZE(BO)
  2243. MUL c71, b7, c71
  2244. MUL c72, b7, c72
  2245. NMSUB c81, c81, b8, c71
  2246. NMSUB c82, c82, b8, c72
  2247. LD b8, 63 * SIZE(BO)
  2248. MUL c81, b8, c81
  2249. MUL c82, b8, c82
  2250. #endif
  2251. #ifdef RT
  2252. LD b1, 63 * SIZE(BO)
  2253. LD b2, 62 * SIZE(BO)
  2254. LD b3, 61 * SIZE(BO)
  2255. LD b4, 60 * SIZE(BO)
  2256. MUL c81, b1, c81
  2257. MUL c82, b1, c82
  2258. LD b5, 59 * SIZE(BO)
  2259. NMSUB c71, c71, b2, c81
  2260. NMSUB c72, c72, b2, c82
  2261. LD b6, 58 * SIZE(BO)
  2262. NMSUB c61, c61, b3, c81
  2263. NMSUB c62, c62, b3, c82
  2264. LD b7, 57 * SIZE(BO)
  2265. NMSUB c51, c51, b4, c81
  2266. NMSUB c52, c52, b4, c82
  2267. LD b8, 56 * SIZE(BO)
  2268. NMSUB c41, c41, b5, c81
  2269. NMSUB c42, c42, b5, c82
  2270. LD b2, 54 * SIZE(BO)
  2271. NMSUB c31, c31, b6, c81
  2272. NMSUB c32, c32, b6, c82
  2273. LD b3, 53 * SIZE(BO)
  2274. NMSUB c21, c21, b7, c81
  2275. NMSUB c22, c22, b7, c82
  2276. LD b4, 52 * SIZE(BO)
  2277. NMSUB c11, c11, b8, c81
  2278. NMSUB c12, c12, b8, c82
  2279. LD b5, 51 * SIZE(BO)
  2280. MUL c71, b2, c71
  2281. MUL c72, b2, c72
  2282. LD b6, 50 * SIZE(BO)
  2283. NMSUB c61, c61, b3, c71
  2284. NMSUB c62, c62, b3, c72
  2285. LD b7, 49 * SIZE(BO)
  2286. NMSUB c51, c51, b4, c71
  2287. NMSUB c52, c52, b4, c72
  2288. LD b8, 48 * SIZE(BO)
  2289. NMSUB c41, c41, b5, c71
  2290. NMSUB c42, c42, b5, c72
  2291. LD b3, 45 * SIZE(BO)
  2292. NMSUB c31, c31, b6, c71
  2293. NMSUB c32, c32, b6, c72
  2294. LD b4, 44 * SIZE(BO)
  2295. NMSUB c21, c21, b7, c71
  2296. NMSUB c22, c22, b7, c72
  2297. LD b5, 43 * SIZE(BO)
  2298. NMSUB c11, c11, b8, c71
  2299. NMSUB c12, c12, b8, c72
  2300. LD b6, 42 * SIZE(BO)
  2301. MUL c61, b3, c61
  2302. MUL c62, b3, c62
  2303. LD b7, 41 * SIZE(BO)
  2304. NMSUB c51, c51, b4, c61
  2305. NMSUB c52, c52, b4, c62
  2306. LD b8, 40 * SIZE(BO)
  2307. NMSUB c41, c41, b5, c61
  2308. NMSUB c42, c42, b5, c62
  2309. LD b4, 36 * SIZE(BO)
  2310. NMSUB c31, c31, b6, c61
  2311. NMSUB c32, c32, b6, c62
  2312. LD b5, 35 * SIZE(BO)
  2313. NMSUB c21, c21, b7, c61
  2314. NMSUB c22, c22, b7, c62
  2315. LD b6, 34 * SIZE(BO)
  2316. NMSUB c11, c11, b8, c61
  2317. NMSUB c12, c12, b8, c62
  2318. LD b7, 33 * SIZE(BO)
  2319. MUL c51, b4, c51
  2320. MUL c52, b4, c52
  2321. LD b8, 32 * SIZE(BO)
  2322. NMSUB c41, c41, b5, c51
  2323. NMSUB c42, c42, b5, c52
  2324. LD b5, 27 * SIZE(BO)
  2325. NMSUB c31, c31, b6, c51
  2326. NMSUB c32, c32, b6, c52
  2327. LD b6, 26 * SIZE(BO)
  2328. NMSUB c21, c21, b7, c51
  2329. NMSUB c22, c22, b7, c52
  2330. LD b7, 25 * SIZE(BO)
  2331. NMSUB c11, c11, b8, c51
  2332. NMSUB c12, c12, b8, c52
  2333. LD b8, 24 * SIZE(BO)
  2334. MUL c41, b5, c41
  2335. MUL c42, b5, c42
  2336. NMSUB c31, c31, b6, c41
  2337. NMSUB c32, c32, b6, c42
  2338. LD b6, 18 * SIZE(BO)
  2339. NMSUB c21, c21, b7, c41
  2340. NMSUB c22, c22, b7, c42
  2341. LD b7, 17 * SIZE(BO)
  2342. NMSUB c11, c11, b8, c41
  2343. NMSUB c12, c12, b8, c42
  2344. LD b8, 16 * SIZE(BO)
  2345. MUL c31, b6, c31
  2346. MUL c32, b6, c32
  2347. NMSUB c21, c21, b7, c31
  2348. NMSUB c22, c22, b7, c32
  2349. LD b7, 9 * SIZE(BO)
  2350. NMSUB c11, c11, b8, c31
  2351. NMSUB c12, c12, b8, c32
  2352. LD b8, 8 * SIZE(BO)
  2353. MUL c21, b7, c21
  2354. MUL c22, b7, c22
  2355. NMSUB c11, c11, b8, c21
  2356. NMSUB c12, c12, b8, c22
  2357. LD b8, 0 * SIZE(BO)
  2358. MUL c11, b8, c11
  2359. MUL c12, b8, c12
  2360. #endif
  2361. #if defined(LN) || defined(LT)
  2362. ST c11, 0 * SIZE(BO)
  2363. ST c21, 1 * SIZE(BO)
  2364. ST c31, 2 * SIZE(BO)
  2365. ST c41, 3 * SIZE(BO)
  2366. ST c51, 4 * SIZE(BO)
  2367. ST c61, 5 * SIZE(BO)
  2368. ST c71, 6 * SIZE(BO)
  2369. ST c81, 7 * SIZE(BO)
  2370. ST c12, 8 * SIZE(BO)
  2371. ST c22, 9 * SIZE(BO)
  2372. ST c32, 10 * SIZE(BO)
  2373. ST c42, 11 * SIZE(BO)
  2374. ST c52, 12 * SIZE(BO)
  2375. ST c62, 13 * SIZE(BO)
  2376. ST c72, 14 * SIZE(BO)
  2377. ST c82, 15 * SIZE(BO)
  2378. #else
  2379. ST c11, 0 * SIZE(AO)
  2380. ST c12, 1 * SIZE(AO)
  2381. ST c21, 2 * SIZE(AO)
  2382. ST c22, 3 * SIZE(AO)
  2383. ST c31, 4 * SIZE(AO)
  2384. ST c32, 5 * SIZE(AO)
  2385. ST c41, 6 * SIZE(AO)
  2386. ST c42, 7 * SIZE(AO)
  2387. ST c51, 8 * SIZE(AO)
  2388. ST c52, 9 * SIZE(AO)
  2389. ST c61, 10 * SIZE(AO)
  2390. ST c62, 11 * SIZE(AO)
  2391. ST c71, 12 * SIZE(AO)
  2392. ST c72, 13 * SIZE(AO)
  2393. ST c81, 14 * SIZE(AO)
  2394. ST c82, 15 * SIZE(AO)
  2395. #endif
  2396. ST c11, 0 * SIZE(CO1)
  2397. ST c12, 1 * SIZE(CO1)
  2398. ST c21, 0 * SIZE(CO2)
  2399. ST c22, 1 * SIZE(CO2)
  2400. ST c31, 0 * SIZE(CO3)
  2401. ST c32, 1 * SIZE(CO3)
  2402. ST c41, 0 * SIZE(CO4)
  2403. ST c42, 1 * SIZE(CO4)
  2404. ST c51, 0 * SIZE(CO5)
  2405. ST c52, 1 * SIZE(CO5)
  2406. ST c61, 0 * SIZE(CO6)
  2407. ST c62, 1 * SIZE(CO6)
  2408. ST c71, 0 * SIZE(CO7)
  2409. ST c72, 1 * SIZE(CO7)
  2410. ST c81, 0 * SIZE(CO8)
  2411. ST c82, 1 * SIZE(CO8)
  2412. MTC $0, a1
  2413. #ifndef LN
  2414. daddiu CO1, CO1, 2 * SIZE
  2415. daddiu CO2, CO2, 2 * SIZE
  2416. daddiu CO3, CO3, 2 * SIZE
  2417. daddiu CO4, CO4, 2 * SIZE
  2418. daddiu CO5, CO5, 2 * SIZE
  2419. daddiu CO6, CO6, 2 * SIZE
  2420. daddiu CO7, CO7, 2 * SIZE
  2421. daddiu CO8, CO8, 2 * SIZE
  2422. #endif
  2423. MOV c11, a1
  2424. MOV c21, a1
  2425. #ifdef RT
  2426. dsll TEMP, K, 1 + BASE_SHIFT
  2427. daddu AORIG, AORIG, TEMP
  2428. #endif
  2429. MOV c31, a1
  2430. MOV c41, a1
  2431. #if defined(LT) || defined(RN)
  2432. dsubu TEMP, K, KK
  2433. dsll L, TEMP, 1 + BASE_SHIFT
  2434. dsll TEMP, TEMP, 3 + BASE_SHIFT
  2435. daddu AO, AO, L
  2436. daddu BO, BO, TEMP
  2437. #endif
  2438. #ifdef LT
  2439. daddiu KK, KK, 2
  2440. #endif
  2441. #ifdef LN
  2442. daddiu KK, KK, -2
  2443. #endif
  2444. daddiu I, I, -1
  2445. MOV c51, a1
  2446. bgtz I, .L11
  2447. MOV c61, a1
  2448. .align 3
  2449. .L20:
  2450. andi I, M, 1
  2451. MOV c61, c11
  2452. blez I, .L29
  2453. MOV c71, c11
  2454. #if defined(LT) || defined(RN)
  2455. LD a1, 0 * SIZE(AO)
  2456. LD a2, 1 * SIZE(AO)
  2457. LD a3, 2 * SIZE(AO)
  2458. LD a4, 3 * SIZE(AO)
  2459. LD b1, 0 * SIZE(B)
  2460. LD b2, 1 * SIZE(B)
  2461. LD b3, 2 * SIZE(B)
  2462. LD b4, 3 * SIZE(B)
  2463. LD b5, 4 * SIZE(B)
  2464. LD b6, 8 * SIZE(B)
  2465. LD b7, 12 * SIZE(B)
  2466. dsra L, KK, 2
  2467. MOV c81, c11
  2468. blez L, .L25
  2469. move BO, B
  2470. #else
  2471. #ifdef LN
  2472. dsll TEMP, K, 0 + BASE_SHIFT
  2473. dsubu AORIG, AORIG, TEMP
  2474. #endif
  2475. dsll L, KK, 0 + BASE_SHIFT
  2476. dsll TEMP, KK, 3 + BASE_SHIFT
  2477. daddu AO, AORIG, L
  2478. daddu BO, B, TEMP
  2479. dsubu TEMP, K, KK
  2480. LD a1, 0 * SIZE(AO)
  2481. LD a2, 1 * SIZE(AO)
  2482. LD a3, 2 * SIZE(AO)
  2483. LD a4, 3 * SIZE(AO)
  2484. LD b1, 0 * SIZE(BO)
  2485. LD b2, 1 * SIZE(BO)
  2486. LD b3, 2 * SIZE(BO)
  2487. LD b4, 3 * SIZE(BO)
  2488. LD b5, 4 * SIZE(BO)
  2489. LD b6, 8 * SIZE(BO)
  2490. LD b7, 12 * SIZE(BO)
  2491. dsra L, TEMP, 2
  2492. MOV c81, c11
  2493. blez L, .L25
  2494. NOP
  2495. #endif
  2496. .align 3
  2497. .L22:
  2498. MADD c11, c11, a1, b1
  2499. LD b1, 16 * SIZE(BO)
  2500. MADD c21, c21, a1, b2
  2501. LD b2, 5 * SIZE(BO)
  2502. MADD c31, c31, a1, b3
  2503. LD b3, 6 * SIZE(BO)
  2504. MADD c41, c41, a1, b4
  2505. LD b4, 7 * SIZE(BO)
  2506. MADD c51, c51, a1, b5
  2507. LD b5, 20 * SIZE(BO)
  2508. MADD c61, c61, a1, b2
  2509. LD b2, 9 * SIZE(BO)
  2510. MADD c71, c71, a1, b3
  2511. LD b3, 10 * SIZE(BO)
  2512. MADD c81, c81, a1, b4
  2513. LD b4, 11 * SIZE(BO)
  2514. LD a1, 4 * SIZE(AO)
  2515. daddiu L, L, -1
  2516. MADD c11, c11, a2, b6
  2517. LD b6, 24 * SIZE(BO)
  2518. MADD c21, c21, a2, b2
  2519. LD b2, 13 * SIZE(BO)
  2520. MADD c31, c31, a2, b3
  2521. LD b3, 14 * SIZE(BO)
  2522. MADD c41, c41, a2, b4
  2523. LD b4, 15 * SIZE(BO)
  2524. MADD c51, c51, a2, b7
  2525. LD b7, 28 * SIZE(BO)
  2526. MADD c61, c61, a2, b2
  2527. LD b2, 17 * SIZE(BO)
  2528. MADD c71, c71, a2, b3
  2529. LD b3, 18 * SIZE(BO)
  2530. MADD c81, c81, a2, b4
  2531. LD b4, 19 * SIZE(BO)
  2532. LD a2, 5 * SIZE(AO)
  2533. daddiu AO, AO, 4 * SIZE
  2534. MADD c11, c11, a3, b1
  2535. LD b1, 32 * SIZE(BO)
  2536. MADD c21, c21, a3, b2
  2537. LD b2, 21 * SIZE(BO)
  2538. MADD c31, c31, a3, b3
  2539. LD b3, 22 * SIZE(BO)
  2540. MADD c41, c41, a3, b4
  2541. LD b4, 23 * SIZE(BO)
  2542. MADD c51, c51, a3, b5
  2543. LD b5, 36 * SIZE(BO)
  2544. MADD c61, c61, a3, b2
  2545. LD b2, 25 * SIZE(BO)
  2546. MADD c71, c71, a3, b3
  2547. LD b3, 26 * SIZE(BO)
  2548. MADD c81, c81, a3, b4
  2549. LD b4, 27 * SIZE(BO)
  2550. LD a3, 2 * SIZE(AO)
  2551. daddiu BO, BO, 32 * SIZE
  2552. MADD c11, c11, a4, b6
  2553. LD b6, 8 * SIZE(BO)
  2554. MADD c21, c21, a4, b2
  2555. LD b2, -3 * SIZE(BO)
  2556. MADD c31, c31, a4, b3
  2557. LD b3, -2 * SIZE(BO)
  2558. MADD c41, c41, a4, b4
  2559. LD b4, -1 * SIZE(BO)
  2560. MADD c51, c51, a4, b7
  2561. LD b7, 12 * SIZE(BO)
  2562. MADD c61, c61, a4, b2
  2563. LD b2, 1 * SIZE(BO)
  2564. MADD c71, c71, a4, b3
  2565. LD b3, 2 * SIZE(BO)
  2566. MADD c81, c81, a4, b4
  2567. LD b4, 3 * SIZE(BO)
  2568. bgtz L, .L22
  2569. LD a4, 3 * SIZE(AO)
  2570. .align 3
  2571. .L25:
  2572. #if defined(LT) || defined(RN)
  2573. andi L, KK, 3
  2574. #else
  2575. andi L, TEMP, 3
  2576. #endif
  2577. NOP
  2578. blez L, .L28
  2579. NOP
  2580. .align 3
  2581. .L26:
  2582. MADD c11, c11, a1, b1
  2583. LD b1, 8 * SIZE(BO)
  2584. MADD c21, c21, a1, b2
  2585. LD b2, 5 * SIZE(BO)
  2586. MADD c31, c31, a1, b3
  2587. LD b3, 6 * SIZE(BO)
  2588. MADD c41, c41, a1, b4
  2589. LD b4, 7 * SIZE(BO)
  2590. daddiu L, L, -1
  2591. MOV a2, a2
  2592. daddiu AO, AO, 1 * SIZE
  2593. daddiu BO, BO, 8 * SIZE
  2594. MADD c51, c51, a1, b5
  2595. LD b5, 4 * SIZE(BO)
  2596. MADD c61, c61, a1, b2
  2597. LD b2, 1 * SIZE(BO)
  2598. MADD c71, c71, a1, b3
  2599. LD b3, 2 * SIZE(BO)
  2600. MADD c81, c81, a1, b4
  2601. LD a1, 0 * SIZE(AO)
  2602. bgtz L, .L26
  2603. LD b4, 3 * SIZE(BO)
  2604. .L28:
  2605. #if defined(LN) || defined(RT)
  2606. #ifdef LN
  2607. daddiu TEMP, KK, -1
  2608. #else
  2609. daddiu TEMP, KK, -8
  2610. #endif
  2611. dsll L, TEMP, 0 + BASE_SHIFT
  2612. dsll TEMP, TEMP, 3 + BASE_SHIFT
  2613. daddu AO, AORIG, L
  2614. daddu BO, B, TEMP
  2615. #endif
  2616. #if defined(LN) || defined(LT)
  2617. LD b1, 0 * SIZE(BO)
  2618. LD b2, 1 * SIZE(BO)
  2619. LD b3, 2 * SIZE(BO)
  2620. LD b4, 3 * SIZE(BO)
  2621. LD b5, 4 * SIZE(BO)
  2622. LD b6, 5 * SIZE(BO)
  2623. LD b7, 6 * SIZE(BO)
  2624. LD b8, 7 * SIZE(BO)
  2625. SUB c11, b1, c11
  2626. SUB c21, b2, c21
  2627. SUB c31, b3, c31
  2628. SUB c41, b4, c41
  2629. SUB c51, b5, c51
  2630. SUB c61, b6, c61
  2631. SUB c71, b7, c71
  2632. SUB c81, b8, c81
  2633. #else
  2634. LD b1, 0 * SIZE(AO)
  2635. LD b2, 1 * SIZE(AO)
  2636. LD b3, 2 * SIZE(AO)
  2637. LD b4, 3 * SIZE(AO)
  2638. LD b5, 4 * SIZE(AO)
  2639. LD b6, 5 * SIZE(AO)
  2640. LD b7, 6 * SIZE(AO)
  2641. LD b8, 7 * SIZE(AO)
  2642. SUB c11, b1, c11
  2643. SUB c21, b2, c21
  2644. SUB c31, b3, c31
  2645. SUB c41, b4, c41
  2646. SUB c51, b5, c51
  2647. SUB c61, b6, c61
  2648. SUB c71, b7, c71
  2649. SUB c81, b8, c81
  2650. #endif
  2651. #if defined(LN) || defined(LT)
  2652. LD b1, 0 * SIZE(AO)
  2653. MUL c11, b1, c11
  2654. MUL c21, b1, c21
  2655. MUL c31, b1, c31
  2656. MUL c41, b1, c41
  2657. MUL c51, b1, c51
  2658. MUL c61, b1, c61
  2659. MUL c71, b1, c71
  2660. MUL c81, b1, c81
  2661. #endif
  2662. #ifdef RN
  2663. LD b1, 0 * SIZE(BO)
  2664. LD b2, 1 * SIZE(BO)
  2665. LD b3, 2 * SIZE(BO)
  2666. LD b4, 3 * SIZE(BO)
  2667. LD b5, 4 * SIZE(BO)
  2668. LD b6, 5 * SIZE(BO)
  2669. LD b7, 6 * SIZE(BO)
  2670. LD b8, 7 * SIZE(BO)
  2671. MUL c11, b1, c11
  2672. NMSUB c21, c21, b2, c11
  2673. NMSUB c31, c31, b3, c11
  2674. NMSUB c41, c41, b4, c11
  2675. NMSUB c51, c51, b5, c11
  2676. NMSUB c61, c61, b6, c11
  2677. NMSUB c71, c71, b7, c11
  2678. NMSUB c81, c81, b8, c11
  2679. LD b2, 9 * SIZE(BO)
  2680. LD b3, 10 * SIZE(BO)
  2681. LD b4, 11 * SIZE(BO)
  2682. LD b5, 12 * SIZE(BO)
  2683. LD b6, 13 * SIZE(BO)
  2684. LD b7, 14 * SIZE(BO)
  2685. LD b8, 15 * SIZE(BO)
  2686. MUL c21, b2, c21
  2687. NMSUB c31, c31, b3, c21
  2688. NMSUB c41, c41, b4, c21
  2689. NMSUB c51, c51, b5, c21
  2690. NMSUB c61, c61, b6, c21
  2691. NMSUB c71, c71, b7, c21
  2692. NMSUB c81, c81, b8, c21
  2693. LD b3, 18 * SIZE(BO)
  2694. LD b4, 19 * SIZE(BO)
  2695. LD b5, 20 * SIZE(BO)
  2696. LD b6, 21 * SIZE(BO)
  2697. LD b7, 22 * SIZE(BO)
  2698. LD b8, 23 * SIZE(BO)
  2699. MUL c31, b3, c31
  2700. NMSUB c41, c41, b4, c31
  2701. NMSUB c51, c51, b5, c31
  2702. NMSUB c61, c61, b6, c31
  2703. NMSUB c71, c71, b7, c31
  2704. NMSUB c81, c81, b8, c31
  2705. LD b4, 27 * SIZE(BO)
  2706. LD b5, 28 * SIZE(BO)
  2707. LD b6, 29 * SIZE(BO)
  2708. LD b7, 30 * SIZE(BO)
  2709. LD b8, 31 * SIZE(BO)
  2710. MUL c41, b4, c41
  2711. NMSUB c51, c51, b5, c41
  2712. NMSUB c61, c61, b6, c41
  2713. NMSUB c71, c71, b7, c41
  2714. NMSUB c81, c81, b8, c41
  2715. LD b5, 36 * SIZE(BO)
  2716. LD b6, 37 * SIZE(BO)
  2717. LD b7, 38 * SIZE(BO)
  2718. LD b8, 39 * SIZE(BO)
  2719. MUL c51, b5, c51
  2720. NMSUB c61, c61, b6, c51
  2721. NMSUB c71, c71, b7, c51
  2722. NMSUB c81, c81, b8, c51
  2723. LD b6, 45 * SIZE(BO)
  2724. LD b7, 46 * SIZE(BO)
  2725. LD b8, 47 * SIZE(BO)
  2726. MUL c61, b6, c61
  2727. NMSUB c71, c71, b7, c61
  2728. NMSUB c81, c81, b8, c61
  2729. LD b7, 54 * SIZE(BO)
  2730. LD b8, 55 * SIZE(BO)
  2731. MUL c71, b7, c71
  2732. NMSUB c81, c81, b8, c71
  2733. LD b8, 63 * SIZE(BO)
  2734. MUL c81, b8, c81
  2735. #endif
  2736. #ifdef RT
  2737. LD b1, 63 * SIZE(BO)
  2738. LD b2, 62 * SIZE(BO)
  2739. LD b3, 61 * SIZE(BO)
  2740. LD b4, 60 * SIZE(BO)
  2741. LD b5, 59 * SIZE(BO)
  2742. LD b6, 58 * SIZE(BO)
  2743. LD b7, 57 * SIZE(BO)
  2744. LD b8, 56 * SIZE(BO)
  2745. MUL c81, b1, c81
  2746. NMSUB c71, c71, b2, c81
  2747. NMSUB c61, c61, b3, c81
  2748. NMSUB c51, c51, b4, c81
  2749. NMSUB c41, c41, b5, c81
  2750. NMSUB c31, c31, b6, c81
  2751. NMSUB c21, c21, b7, c81
  2752. NMSUB c11, c11, b8, c81
  2753. LD b2, 54 * SIZE(BO)
  2754. LD b3, 53 * SIZE(BO)
  2755. LD b4, 52 * SIZE(BO)
  2756. LD b5, 51 * SIZE(BO)
  2757. LD b6, 50 * SIZE(BO)
  2758. LD b7, 49 * SIZE(BO)
  2759. LD b8, 48 * SIZE(BO)
  2760. MUL c71, b2, c71
  2761. NMSUB c61, c61, b3, c71
  2762. NMSUB c51, c51, b4, c71
  2763. NMSUB c41, c41, b5, c71
  2764. NMSUB c31, c31, b6, c71
  2765. NMSUB c21, c21, b7, c71
  2766. NMSUB c11, c11, b8, c71
  2767. LD b3, 45 * SIZE(BO)
  2768. LD b4, 44 * SIZE(BO)
  2769. LD b5, 43 * SIZE(BO)
  2770. LD b6, 42 * SIZE(BO)
  2771. LD b7, 41 * SIZE(BO)
  2772. LD b8, 40 * SIZE(BO)
  2773. MUL c61, b3, c61
  2774. NMSUB c51, c51, b4, c61
  2775. NMSUB c41, c41, b5, c61
  2776. NMSUB c31, c31, b6, c61
  2777. NMSUB c21, c21, b7, c61
  2778. NMSUB c11, c11, b8, c61
  2779. LD b4, 36 * SIZE(BO)
  2780. LD b5, 35 * SIZE(BO)
  2781. LD b6, 34 * SIZE(BO)
  2782. LD b7, 33 * SIZE(BO)
  2783. LD b8, 32 * SIZE(BO)
  2784. MUL c51, b4, c51
  2785. NMSUB c41, c41, b5, c51
  2786. NMSUB c31, c31, b6, c51
  2787. NMSUB c21, c21, b7, c51
  2788. NMSUB c11, c11, b8, c51
  2789. LD b5, 27 * SIZE(BO)
  2790. LD b6, 26 * SIZE(BO)
  2791. LD b7, 25 * SIZE(BO)
  2792. LD b8, 24 * SIZE(BO)
  2793. MUL c41, b5, c41
  2794. NMSUB c31, c31, b6, c41
  2795. NMSUB c21, c21, b7, c41
  2796. NMSUB c11, c11, b8, c41
  2797. LD b6, 18 * SIZE(BO)
  2798. LD b7, 17 * SIZE(BO)
  2799. LD b8, 16 * SIZE(BO)
  2800. MUL c31, b6, c31
  2801. NMSUB c21, c21, b7, c31
  2802. NMSUB c11, c11, b8, c31
  2803. LD b7, 9 * SIZE(BO)
  2804. LD b8, 8 * SIZE(BO)
  2805. MUL c21, b7, c21
  2806. NMSUB c11, c11, b8, c21
  2807. LD b8, 0 * SIZE(BO)
  2808. MUL c11, b8, c11
  2809. #endif
  2810. #ifdef LN
  2811. daddiu CO1, CO1, -1 * SIZE
  2812. daddiu CO2, CO2, -1 * SIZE
  2813. daddiu CO3, CO3, -1 * SIZE
  2814. daddiu CO4, CO4, -1 * SIZE
  2815. daddiu CO5, CO5, -1 * SIZE
  2816. daddiu CO6, CO6, -1 * SIZE
  2817. daddiu CO7, CO7, -1 * SIZE
  2818. daddiu CO8, CO8, -1 * SIZE
  2819. #endif
  2820. #if defined(LN) || defined(LT)
  2821. ST c11, 0 * SIZE(BO)
  2822. ST c21, 1 * SIZE(BO)
  2823. ST c31, 2 * SIZE(BO)
  2824. ST c41, 3 * SIZE(BO)
  2825. ST c51, 4 * SIZE(BO)
  2826. ST c61, 5 * SIZE(BO)
  2827. ST c71, 6 * SIZE(BO)
  2828. ST c81, 7 * SIZE(BO)
  2829. #else
  2830. ST c11, 0 * SIZE(AO)
  2831. ST c21, 1 * SIZE(AO)
  2832. ST c31, 2 * SIZE(AO)
  2833. ST c41, 3 * SIZE(AO)
  2834. ST c51, 4 * SIZE(AO)
  2835. ST c61, 5 * SIZE(AO)
  2836. ST c71, 6 * SIZE(AO)
  2837. ST c81, 7 * SIZE(AO)
  2838. #endif
  2839. ST c11, 0 * SIZE(CO1)
  2840. ST c21, 0 * SIZE(CO2)
  2841. ST c31, 0 * SIZE(CO3)
  2842. ST c41, 0 * SIZE(CO4)
  2843. ST c51, 0 * SIZE(CO5)
  2844. ST c61, 0 * SIZE(CO6)
  2845. ST c71, 0 * SIZE(CO7)
  2846. ST c81, 0 * SIZE(CO8)
  2847. #ifndef LN
  2848. daddiu CO1, CO1, 1 * SIZE
  2849. daddiu CO2, CO2, 1 * SIZE
  2850. daddiu CO3, CO3, 1 * SIZE
  2851. daddiu CO4, CO4, 1 * SIZE
  2852. daddiu CO5, CO5, 1 * SIZE
  2853. daddiu CO6, CO6, 1 * SIZE
  2854. daddiu CO7, CO7, 1 * SIZE
  2855. daddiu CO8, CO8, 1 * SIZE
  2856. #endif
  2857. #ifdef RT
  2858. dsll TEMP, K, BASE_SHIFT
  2859. daddu AORIG, AORIG, TEMP
  2860. #endif
  2861. #if defined(LT) || defined(RN)
  2862. dsubu TEMP, K, KK
  2863. dsll L, TEMP, 0 + BASE_SHIFT
  2864. dsll TEMP, TEMP, 3 + BASE_SHIFT
  2865. daddu AO, AO, L
  2866. daddu BO, BO, TEMP
  2867. #endif
  2868. #ifdef LT
  2869. daddiu KK, KK, 1
  2870. #endif
  2871. #ifdef LN
  2872. daddiu KK, KK, -1
  2873. #endif
  2874. .align 3
  2875. .L29:
  2876. #ifdef LN
  2877. dsll TEMP, K, 3 + BASE_SHIFT
  2878. daddu B, B, TEMP
  2879. #endif
  2880. #if defined(LT) || defined(RN)
  2881. move B, BO
  2882. #endif
  2883. #ifdef RN
  2884. daddiu KK, KK, 8
  2885. #endif
  2886. #ifdef RT
  2887. daddiu KK, KK, -8
  2888. #endif
  2889. bgtz J, .L10
  2890. NOP
  2891. .align 3
  2892. .L999:
  2893. LDARG $16, 0($sp)
  2894. LDARG $17, 8($sp)
  2895. LDARG $18, 16($sp)
  2896. LDARG $19, 24($sp)
  2897. LDARG $20, 32($sp)
  2898. LDARG $21, 40($sp)
  2899. ldc1 $f24, 48($sp)
  2900. ldc1 $f25, 56($sp)
  2901. ldc1 $f26, 64($sp)
  2902. ldc1 $f27, 72($sp)
  2903. ldc1 $f28, 80($sp)
  2904. LDARG $22, 88($sp)
  2905. LDARG $23, 96($sp)
  2906. LDARG $24, 104($sp)
  2907. LDARG $25, 112($sp)
  2908. #ifndef __64BIT__
  2909. ldc1 $f20,112($sp)
  2910. ldc1 $f21,120($sp)
  2911. ldc1 $f22,128($sp)
  2912. ldc1 $f23,136($sp)
  2913. #endif
  2914. j $31
  2915. daddiu $sp, $sp, 144
  2916. EPILOGUE