You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cgemm_kernel_loongson3a_4x2_ps.S 76 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026
  ## Build-time definitions for the kernel that follows: frame size, raw
  ## instruction-encoding macros, register aliases and numeric register indices.
  ## NOTE(review): the next line begins with ## rather than #, so it does not
  ## look like an active define; confirm whether REALNAME is meant to come from
  ## the build instead.
  1. ##define REALNAME gemm
  2. #define ASSEMBLER
  3. #include "common.h"
  ## FETCH expands to ld. Every use in this file targets $0, so the loaded
  ## value is discarded; presumably a software prefetch - TODO confirm.
  4. #define FETCH ld
  ## Frame size opened in the prologue; the visible code uses offsets 0..160.
  5. #define STACKSIZE 192
  ## gsLQC1 and gsSQC1 emit a raw .word. Field layout exactly as written below:
  ## opcode 0x32 or 0x3A shifted to bit 26, base at bit 21, ft at bit 16,
  ## bit 15 set, offset at bit 6, bit 5 set, fq in the low bits.
  ## All four arguments must be plain numbers: use the Rnn and Fnn defines,
  ## never the $-prefixed register names.
  ## NOTE(review): presumably the Loongson quad-word FP load and store, moving
  ## one register pair per instruction; every call site is annotated with two
  ## register names. Confirm against the Loongson ISA manual.
  6. #define gsLQC1(base,fq,ft,offset) .word(0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  7. #define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
  8. ##### Parameter registers ####
  9. #define M $4
  10. #define N $5
  11. #define K $6
  12. #define A $9
  13. #define B $10
  14. #define C $11
  15. #define LDC $8
  16. #### Pointer A, B, C ####
  ## Working pointers. $12 and $13 are also reachable by number as R12 and R13
  ## for the raw-encoded loads. PREA and PREB are only used as FETCH addresses.
  17. #define AO $12
  18. #define BO $13
  19. #define CO1 $14
  20. #define CO2 $15
  21. #define PREA $18
  22. #define PREB $19
  23. #### Used registers ####
  ## $f0..$f7 hold A operands, $f8..$f15 hold B operands, and $f16..$f31 are
  ## the sixteen accumulators cleared at .L241.
  24. #define A1 $f0
  25. #define A2 $f1
  26. #define A3 $f2
  27. #define A4 $f3
  28. #define A5 $f4
  29. #define A6 $f5
  30. #define A7 $f6
  31. #define A8 $f7
  32. #define B1 $f8
  33. #define B2 $f9
  34. #define B3 $f10
  35. #define B4 $f11
  36. #define B5 $f12
  37. #define B6 $f13
  38. #define B7 $f14
  39. #define B8 $f15
  40. #define C11 $f16
  41. #define C12 $f17
  42. #define C21 $f18
  43. #define C22 $f19
  44. #define C31 $f20
  45. #define C32 $f21
  46. #define C41 $f22
  47. #define C42 $f23
  48. #define C13 $f24
  49. #define C14 $f25
  50. #define C23 $f26
  51. #define C24 $f27
  52. #define C33 $f28
  53. #define C34 $f29
  54. #define C43 $f30
  55. #define C44 $f31
  ## Integer loop counters.
  56. #define I $2
  57. #define J $3
  58. #define L $7
  59. #### Alpha register ####
  ## ALPHA is the same register as B8. The kernel stores $f15 to 152($sp) at
  ## .L2, before any loop writes B8, and reloads it from there at write-back.
  60. #define ALPHA $f15
  ## Numeric FP register indices for the fq and ft fields of the raw-encoded
  ## load and store macros.
  61. #define F31 31
  62. #define F30 30
  63. #define F29 29
  64. #define F28 28
  65. #define F27 27
  66. #define F26 26
  67. #define F25 25
  68. #define F24 24
  69. #define F23 23
  70. #define F22 22
  71. #define F21 21
  72. #define F20 20
  73. #define F19 19
  74. #define F18 18
  75. #define F17 17
  76. #define F16 16
  77. #define F15 15
  78. #define F14 14
  79. #define F13 13
  80. #define F12 12
  81. #define F11 11
  82. #define F10 10
  83. #define F9 9
  84. #define F8 8
  85. #define F7 7
  86. #define F6 6
  87. #define F5 5
  88. #define F4 4
  89. #define F3 3
  90. #define F2 2
  91. #define F1 1
  92. #define F0 0
  ## Numeric integer register indices for the base field of the same macros.
  93. #define R12 12
  94. #define R13 13
  95. #define R14 14
  96. #define R15 15
  97. #define R16 16
  98. #define R17 17
  ## Extra integer registers used only by the TRMM build of this kernel.
  99. #if defined(TRMMKERNEL)
  100. #define OFFSET $23
  101. #define KK $24
  102. #define TEMP $25
  103. #endif
  104. PROLOGUE
  ## Entry sequence: read LDC from the caller stack, open a STACKSIZE-byte
  ## frame, then spill integer registers $16..$22 and part of the FP file.
  105. LDARG LDC, 0($sp)
  106. daddiu $sp,$sp,-STACKSIZE
  107. sd $16, 0($sp)
  108. sd $17, 8($sp)
  109. sd $18, 16($sp)
  110. sd $19, 24($sp)
  111. sd $20, 32($sp)
  112. sd $21, 40($sp)
  113. sd $22, 48($sp)
  ## NOTE(review): only $f24..$f28 are stored here (plus $f20..$f23 on builds
  ## without __64BIT__), yet the kernel also writes $f29, $f30 and $f31 as
  ## accumulators. Check the target ABI and the epilogue, which is outside
  ## the part of the file reviewed here.
  114. ST $f24, 56($sp)
  115. ST $f25, 64($sp)
  116. ST $f26, 72($sp)
  117. ST $f27, 80($sp)
  118. ST $f28, 88($sp)
  119. #if defined(TRMMKERNEL)
  120. sd $23, 96($sp)
  121. sd $24, 104($sp)
  122. sd $25, 112($sp)
  123. LDARG OFFSET, STACKSIZE+8($sp) # the slot at 8($sp) as it was before the frame was opened
  124. #endif
  125. #ifndef __64BIT__
  126. ST $f20,120($sp)
  127. ST $f21,128($sp)
  128. ST $f22,136($sp)
  129. ST $f23,144($sp)
  130. #endif
  131. .align 4
  132. .L2:
  ## Outer-loop setup: J = N >> 1, LDC is scaled by ZBASE_SHIFT, and $f15 and
  ## $f16 are stored to the frame; write-back reloads them as alpha_r and
  ## alpha_i. If J is not positive, control goes to .L1.
  133. dsra J, N, 1 # NR=2
  134. ST $f15, 152($sp) # reloaded at write-back as alpha_r
  135. #if defined(TRMMKERNEL) && !defined(LEFT)
  136. neg KK, OFFSET
  137. #endif
  138. dsll LDC, LDC, ZBASE_SHIFT# LDC*SIZE
  139. blez J, .L1
  140. ST $f16, 160($sp) # reloaded at write-back as alpha_i; sits in the branch delay slot if noreorder is in effect (not visible here)
  141. .L24:
  ## Start of one J iteration: restart AO at A, set CO1 = C and CO2 = C + LDC,
  ## derive the PREA address from K, and compute I = M >> 2. If I is not
  ## positive, control goes to .L22.
  142. #if defined(TRMMKERNEL) && defined(LEFT)
  143. move KK, OFFSET
  144. #endif
  145. dsra I, M, 2 # I = M >> 2, i.e. MR=4 (the earlier note said 8)
  146. move AO, A # Reset A
  147. dsll PREA, K, 1 + ZBASE_SHIFT
  148. move CO1, C
  149. daddu CO2, C, LDC
  150. daddu PREA, AO, PREA # PREA = AO + (K << (1 + ZBASE_SHIFT))
  151. blez I, .L22
  152. daddu C, CO2, LDC # C moves past both CO1 and CO2; placed right after the branch
  153. .align 4
  154. .L241:
  ## Start of one I iteration: zero the sixteen accumulators, preload the
  ## first B pair and the first two A pairs, touch the CO1 and CO2
  ## destinations with FETCH, and compute the main-loop pass count in L.
  ## The TRMM and non-TRMM builds differ only in how BO, AO and L are derived.
  155. #if defined(TRMMKERNEL)
  156. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  157. move BO, B
  158. #else
  ## Advance AO by KK << (2 + ZBASE_SHIFT) and start BO at
  ## B + (KK << (1 + ZBASE_SHIFT)).
  159. dsll L, KK, 2 + ZBASE_SHIFT
  160. dsll TEMP, KK, 1 + ZBASE_SHIFT
  161. daddu AO, AO, L
  162. daddu BO, B, TEMP
  163. #endif
  164. MTC $0, C11 # clear result registers: C11 is zeroed, the other fifteen are copied from it
  165. MOV C12, C11
  166. dsll PREB, K, ZBASE_SHIFT
  167. MOV C21, C11
  168. MOV C22, C11
  169. gsLQC1(R13, F9, F8, 0) # B1 B2
  170. MOV C31, C11
  171. MOV C32, C11
  172. gsLQC1(R12, F1, F0, 0) # A1 A2
  173. MOV C41, C11
  174. MOV C42, C11
  175. gsLQC1(R12, F3, F2, 1) # A3 A4
  176. MOV C13, C11
  177. MOV C14, C11
  178. MOV C23, C11
  179. MOV C24, C11
  180. MOV C33, C11
  181. MOV C34, C11
  182. MOV C43, C11
  183. MOV C44, C11
  184. PLU B3, B1, B1
  185. PLU B4, B2, B2
  186. daddu PREB, BO, PREB
  187. FETCH $0, 0 * SIZE(CO1)
  188. FETCH $0, 8 * SIZE(CO1)
  189. FETCH $0, 0 * SIZE(CO2)
  190. FETCH $0, 8 * SIZE(CO2)
  ## TEMP is the K count used by the loops below in the TRMM build:
  ## K - KK, KK + 4, or KK + 2 depending on LEFT and TRANSA.
  191. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  192. dsubu TEMP, K, KK
  193. #elif defined(LEFT)
  194. daddiu TEMP, KK, 4
  195. #else
  196. daddiu TEMP, KK, 2
  197. #endif
  198. dsra L, TEMP, 2 # L = TEMP >> 2 main-loop passes
  199. blez L, .L242
  200. NOP
  201. #else
  202. move BO, B # Reset B
  203. dsra L, K, 2 # L = K >> 2: each main-loop pass consumes 4 K-steps
  204. MTC $0, C11 # clear result registers: C11 is zeroed, the other fifteen are copied from it
  205. MOV C12, C11
  206. dsll PREB, K, ZBASE_SHIFT
  207. MOV C21, C11
  208. MOV C22, C11
  209. gsLQC1(R13, F9, F8, 0) # B1 B2
  210. MOV C31, C11
  211. MOV C32, C11
  212. gsLQC1(R12, F1, F0, 0) # A1 A2
  213. MOV C41, C11
  214. MOV C42, C11
  215. gsLQC1(R12, F3, F2, 1) # A3 A4
  216. MOV C13, C11
  217. MOV C14, C11
  218. FETCH $0, 0 * SIZE(CO1)
  219. MOV C23, C11
  220. MOV C24, C11
  221. FETCH $0, 0 * SIZE(CO2)
  222. MOV C33, C11
  223. MOV C34, C11
  224. MOV C43, C11
  225. MOV C44, C11
  226. daddu PREB, BO, PREB
  227. PLU B3, B1, B1
  228. PLU B4, B2, B2
  229. FETCH $0, 8 * SIZE(CO1)
  230. blez L, .L242
  231. FETCH $0, 8 * SIZE(CO2)
  232. #endif
  233. .L2410:
  ## Main loop, repeated L times. Each pass issues four groups of sixteen
  ## MADPS: groups 1 and 3 read A1..A4 with B1..B4, groups 2 and 4 read
  ## A5..A8 with B5..B8. While one register set is consumed, the raw-encoded
  ## loads refill the other. Per pass, BO advances by 4 * 4 * SIZE and AO by
  ## 8 * 4 * SIZE.
  ## PLU derives B3 and B4 from B1 and B2, and B7 and B8 from B5 and B6.
  ## NOTE(review): presumably a paired-single half swap; confirm in common.h.
  ## Statement order is load-latency sensitive: do not reorder.
  234. daddiu L, L, -1
  235. gsLQC1(R13, F13, F12, 1) # B5 B6 (F12 and F13; the earlier note said B3 B4)
  236. MADPS C11, C11, A1, B1
  237. MADPS C21, C21, A2, B1
  238. gsLQC1(R12, F5, F4, 2) # A5 A6
  239. MADPS C12, C12, A1, B2
  240. MADPS C22, C22, A2, B2
  241. gsLQC1(R12, F7, F6, 3) # A7 A8
  242. MADPS C31, C31, A3, B1
  243. MADPS C41, C41, A4, B1
  244. FETCH $0, 0 * SIZE(PREB)
  245. MADPS C32, C32, A3, B2
  246. MADPS C42, C42, A4, B2
  247. FETCH $0, 0 * SIZE(PREA)
  248. MADPS C13, C13, A1, B3
  249. MADPS C23, C23, A2, B3
  250. MADPS C33, C33, A3, B3
  251. MADPS C43, C43, A4, B3
  252. MADPS C14, C14, A1, B4
  253. MADPS C24, C24, A2, B4
  254. PLU B7, B5, B5
  255. PLU B8, B6, B6
  256. daddu PREB, PREB, 8 * SIZE
  257. MADPS C34, C34, A3, B4
  258. MADPS C44, C44, A4, B4
  259. gsLQC1(R13, F9, F8, 2) # B1 B2
  260. MADPS C11, C11, A5, B5
  261. MADPS C21, C21, A6, B5
  262. gsLQC1(R12, F1, F0, 4) # A1 A2
  263. MADPS C12, C12, A5, B6
  264. MADPS C22, C22, A6, B6
  265. gsLQC1(R12, F3, F2, 5) # A3 A4
  266. MADPS C31, C31, A7, B5
  267. MADPS C41, C41, A8, B5
  268. FETCH $0, 8 * SIZE(PREA)
  269. MADPS C32, C32, A7, B6
  270. MADPS C42, C42, A8, B6
  271. MADPS C13, C13, A5, B7
  272. MADPS C23, C23, A6, B7
  273. MADPS C33, C33, A7, B7
  274. MADPS C43, C43, A8, B7
  275. MADPS C14, C14, A5, B8
  276. MADPS C24, C24, A6, B8
  277. PLU B3, B1, B1
  278. PLU B4, B2, B2
  279. MADPS C34, C34, A7, B8
  280. MADPS C44, C44, A8, B8
  281. gsLQC1(R13, F13, F12, 3) # B5 B6 (F12 and F13; the earlier note said B3 B4)
  282. MADPS C11, C11, A1, B1
  283. MADPS C21, C21, A2, B1
  284. gsLQC1(R12, F5, F4, 6) # A5 A6
  285. MADPS C12, C12, A1, B2
  286. MADPS C22, C22, A2, B2
  287. gsLQC1(R12, F7, F6, 7) # A7 A8
  288. MADPS C31, C31, A3, B1
  289. MADPS C41, C41, A4, B1
  290. daddiu BO, BO, 4 * 4 * SIZE # 4KR*4NR
  291. FETCH $0, 16 * SIZE(PREA)
  292. MADPS C32, C32, A3, B2
  293. MADPS C42, C42, A4, B2
  294. daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR
  295. MADPS C13, C13, A1, B3
  296. MADPS C23, C23, A2, B3
  297. MADPS C33, C33, A3, B3
  298. MADPS C43, C43, A4, B3
  299. MADPS C14, C14, A1, B4
  300. MADPS C24, C24, A2, B4
  301. PLU B7, B5, B5
  302. PLU B8, B6, B6
  303. MADPS C34, C34, A3, B4
  304. MADPS C44, C44, A4, B4
  305. gsLQC1(R13, F9, F8, 0) # B1 B2
  306. MADPS C11, C11, A5, B5
  307. MADPS C21, C21, A6, B5
  308. gsLQC1(R12, F1, F0, 0) # A1 A2
  309. MADPS C12, C12, A5, B6
  310. MADPS C22, C22, A6, B6
  311. gsLQC1(R12, F3, F2, 1) # A3 A4
  312. MADPS C31, C31, A7, B5
  313. MADPS C41, C41, A8, B5
  314. FETCH $0, 24 * SIZE(PREA)
  315. MADPS C32, C32, A7, B6
  316. MADPS C42, C42, A8, B6
  317. MADPS C13, C13, A5, B7
  318. MADPS C23, C23, A6, B7
  319. daddu PREA, PREA, 32 * SIZE
  320. MADPS C33, C33, A7, B7
  321. MADPS C43, C43, A8, B7
  322. MADPS C14, C14, A5, B8
  323. MADPS C24, C24, A6, B8
  324. PLU B3, B1, B1
  325. PLU B4, B2, B2
  326. MADPS C34, C34, A7, B8
  327. bgtz L, .L2410
  328. MADPS C44, C44, A8, B8 # last MADPS of the pass, placed right after the loop branch
  329. .align 4
  330. .L242:
  ## Tail for bit 1 of the K count (K, or TEMP in the TRMM build): when set,
  ## run two more K-steps as two groups of sixteen MADPS, then reload A1..A4
  ## and B1..B4 for the code that follows. Skipped via .L247 when the bit is
  ## clear.
  331. #ifndef TRMMKERNEL
  332. andi L, K, 2
  333. #else
  334. andi L, TEMP, 2
  335. #endif
  336. blez L, .L247
  337. NOP
  338. gsLQC1(R13, F13, F12, 1) # B5 B6 (F12 and F13; the earlier note said B3 B4)
  339. MADPS C11, C11, A1, B1
  340. MADPS C21, C21, A2, B1
  341. gsLQC1(R12, F5, F4, 2) # A5 A6
  342. MADPS C12, C12, A1, B2
  343. MADPS C22, C22, A2, B2
  344. gsLQC1(R12, F7, F6, 3) # A7 A8
  345. MADPS C31, C31, A3, B1
  346. MADPS C41, C41, A4, B1
  347. daddiu BO, BO, 2 * 4 * SIZE # 2 K-steps, 4 SIZE units each (the earlier note said 4KR)
  348. MADPS C32, C32, A3, B2
  349. MADPS C42, C42, A4, B2
  350. daddiu AO, AO, 4 * 4 * SIZE # 2 K-steps, 8 SIZE units each
  351. MADPS C13, C13, A1, B3
  352. MADPS C23, C23, A2, B3
  353. MADPS C33, C33, A3, B3
  354. MADPS C43, C43, A4, B3
  355. MADPS C14, C14, A1, B4
  356. MADPS C24, C24, A2, B4
  357. PLU B7, B5, B5
  358. PLU B8, B6, B6
  359. MADPS C34, C34, A3, B4
  360. MADPS C44, C44, A4, B4
  361. gsLQC1(R13, F9, F8, 0) # B1 B2
  362. MADPS C11, C11, A5, B5
  363. MADPS C21, C21, A6, B5
  364. gsLQC1(R12, F1, F0, 0) # A1 A2
  365. MADPS C12, C12, A5, B6
  366. MADPS C22, C22, A6, B6
  367. gsLQC1(R12, F3, F2, 1) # A3 A4
  368. MADPS C31, C31, A7, B5
  369. MADPS C41, C41, A8, B5
  370. MADPS C32, C32, A7, B6
  371. MADPS C42, C42, A8, B6
  372. MADPS C13, C13, A5, B7
  373. MADPS C23, C23, A6, B7
  374. MADPS C33, C33, A7, B7
  375. MADPS C43, C43, A8, B7
  376. MADPS C14, C14, A5, B8
  377. MADPS C24, C24, A6, B8
  378. PLU B3, B1, B1
  379. PLU B4, B2, B2
  380. MADPS C34, C34, A7, B8
  381. MADPS C44, C44, A8, B8
  382. .align 4
  383. .L247:
  ## Tail for bit 0 of the K count (K, or TEMP in the TRMM build): when set,
  ## run one last K-step as sixteen MADPS on the A1..A4 and B1..B4 values that
  ## are already loaded. No further loads are issued. Skipped via .L240 when
  ## the bit is clear.
  384. #ifndef TRMMKERNEL
  385. andi L, K, 1
  386. #else
  387. andi L, TEMP, 1
  388. #endif
  389. blez L, .L240
  390. NOP
  391. MADPS C11, C11, A1, B1
  392. MADPS C21, C21, A2, B1
  393. MADPS C12, C12, A1, B2
  394. MADPS C22, C22, A2, B2
  395. MADPS C31, C31, A3, B1
  396. MADPS C41, C41, A4, B1
  397. daddiu BO, BO, 1 * 4 * SIZE # 1 K-step, 4 SIZE units (the earlier note said 4KR)
  398. MADPS C32, C32, A3, B2
  399. MADPS C42, C42, A4, B2
  400. daddiu AO, AO, 2 * 4 * SIZE # 1 K-step, 8 SIZE units
  401. MADPS C13, C13, A1, B3
  402. MADPS C23, C23, A2, B3
  403. MADPS C33, C33, A3, B3
  404. MADPS C43, C43, A4, B3
  405. MADPS C14, C14, A1, B4
  406. MADPS C24, C24, A2, B4
  407. MADPS C34, C34, A3, B4
  408. MADPS C44, C44, A4, B4
  409. .align 4
  410. .L240: # Write Back
  411. #ifndef TRMMKERNEL
  412. daddiu I, I, -1
  413. CVTU A1, C11
  414. CVTU A2, C21
  415. CVTU A3, C31
  416. CVTU A4, C41
  417. CVTU A5, C13
  418. CVTU A6, C23
  419. CVTU A7, C33
  420. CVTU A8, C43
  421. CVTU B1, C12
  422. CVTU B2, C22
  423. CVTU B3, C32
  424. CVTU B4, C42
  425. CVTU B5, C14
  426. CVTU B6, C24
  427. CVTU B7, C34
  428. CVTU B8, C44
  429. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  430. /* (a + bi) * (c + di) */
  431. SUB C11, C11, A1 # ac'+'bd
  432. SUB C21, C21, A2
  433. # LD A1, 0 * SIZE(A) # load alpha_r
  434. SUB C31, C31, A3
  435. LD A1, 152($sp) # load alpha_r
  436. SUB C41, C41, A4
  437. LD A2, 160($sp) # load alpha_i
  438. # LD A2, 0 * SIZE(A) # load alpha_i
  439. ADD C13, A5, C13 # ad'+'cb
  440. ADD C23, A6, C23
  441. ADD C33, A7, C33
  442. ADD C43, A8, C43
  443. SUB C12, C12, B1
  444. SUB C22, C22, B2
  445. SUB C32, C32, B3
  446. SUB C42, C42, B4
  447. ADD C14, B5, C14
  448. ADD C24, B6, C24
  449. ADD C34, B7, C34
  450. ADD C44, B8, C44
  451. LD B1, 0 * SIZE(CO1)
  452. LD B3, 2 * SIZE(CO1)
  453. LD B5, 4 * SIZE(CO1)
  454. LD B7, 6 * SIZE(CO1)
  455. LD B2, 1 * SIZE(CO1)
  456. LD B4, 3 * SIZE(CO1)
  457. LD B6, 5 * SIZE(CO1)
  458. LD B8, 7 * SIZE(CO1)
  459. MADD B1, B1, C11, A1 # A1 = alpha_r
  460. MADD B3, B3, C21, A1
  461. MADD B5, B5, C31, A1
  462. MADD B7, B7, C41, A1
  463. MADD B2, B2, C13, A1
  464. MADD B4, B4, C23, A1
  465. MADD B6, B6, C33, A1
  466. MADD B8, B8, C43, A1
  467. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  468. NMSUB B3, B3, C23, A2
  469. NMSUB B5, B5, C33, A2
  470. NMSUB B7, B7, C43, A2
  471. MADD B2, B2, C11, A2
  472. MADD B4, B4, C21, A2
  473. MADD B6, B6, C31, A2
  474. MADD B8, B8, C41, A2
  475. LD C13, 0 * SIZE(CO2)
  476. LD C23, 2 * SIZE(CO2)
  477. LD C33, 4 * SIZE(CO2)
  478. LD C43, 6 * SIZE(CO2)
  479. LD C11, 1 * SIZE(CO2)
  480. LD C21, 3 * SIZE(CO2)
  481. LD C31, 5 * SIZE(CO2)
  482. LD C41, 7 * SIZE(CO2)
  483. MADD C13, C13, C12, A1
  484. MADD C23, C23, C22, A1
  485. MADD C33, C33, C32, A1
  486. ST B1, 0 * SIZE(CO1)
  487. MADD C43, C43, C42, A1
  488. ST B3, 2 * SIZE(CO1)
  489. MADD C11, C11, C14, A1
  490. ST B5, 4 * SIZE(CO1)
  491. MADD C21, C21, C24, A1
  492. ST B7, 6 * SIZE(CO1)
  493. MADD C31, C31, C34, A1
  494. ST B2, 1 * SIZE(CO1)
  495. MADD C41, C41, C44, A1
  496. ST B4, 3 * SIZE(CO1)
  497. NMSUB C13, C13, C14, A2
  498. ST B6, 5 * SIZE(CO1)
  499. NMSUB C23, C23, C24, A2
  500. ST B8, 7 * SIZE(CO1)
  501. NMSUB C33, C33, C34, A2
  502. NMSUB C43, C43, C44, A2
  503. MADD C11, C11, C12, A2
  504. MADD C21, C21, C22, A2
  505. MADD C31, C31, C32, A2
  506. MADD C41, C41, C42, A2
  507. ST C13, 0 * SIZE(CO2)
  508. ST C23, 2 * SIZE(CO2)
  509. ST C33, 4 * SIZE(CO2)
  510. ST C43, 6 * SIZE(CO2)
  511. ST C11, 1 * SIZE(CO2)
  512. ST C21, 3 * SIZE(CO2)
  513. ST C31, 5 * SIZE(CO2)
  514. ST C41, 7 * SIZE(CO2)
  515. #endif
  516. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  517. /* (a + bi) * (c - di) */
  518. ADD C11, A1, C11 # ac'+'bd
  519. ADD C21, A2, C21
  520. # LD A1, 0 * SIZE(A) # load alpha_r
  521. ADD C31, A3, C31
  522. LD A1, 152($sp) # load alpha_r
  523. ADD C41, A4, C41
  524. LD A2, 160($sp) # load alpha_i
  525. # LD A2, 0 * SIZE(A) # load alpha_i
  526. SUB C13, A5, C13 # ad'+'cb
  527. SUB C23, A6, C23
  528. SUB C33, A7, C33
  529. SUB C43, A8, C43
  530. ADD C12, B1, C12
  531. ADD C22, B2, C22
  532. ADD C32, B3, C32
  533. ADD C42, B4, C42
  534. SUB C14, B5, C14
  535. SUB C24, B6, C24
  536. SUB C34, B7, C34
  537. SUB C44, B8, C44
  538. LD B1, 0 * SIZE(CO1)
  539. LD B3, 2 * SIZE(CO1)
  540. LD B5, 4 * SIZE(CO1)
  541. LD B7, 6 * SIZE(CO1)
  542. LD B2, 1 * SIZE(CO1)
  543. LD B4, 3 * SIZE(CO1)
  544. LD B6, 5 * SIZE(CO1)
  545. LD B8, 7 * SIZE(CO1)
  546. MADD B1, B1, C11, A1 # A1 = alpha_r
  547. MADD B3, B3, C21, A1
  548. MADD B5, B5, C31, A1
  549. MADD B7, B7, C41, A1
  550. MADD B2, B2, C13, A1
  551. MADD B4, B4, C23, A1
  552. MADD B6, B6, C33, A1
  553. MADD B8, B8, C43, A1
  554. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  555. NMSUB B3, B3, C23, A2
  556. NMSUB B5, B5, C33, A2
  557. NMSUB B7, B7, C43, A2
  558. MADD B2, B2, C11, A2
  559. MADD B4, B4, C21, A2
  560. MADD B6, B6, C31, A2
  561. MADD B8, B8, C41, A2
  562. LD C13, 0 * SIZE(CO2)
  563. LD C23, 2 * SIZE(CO2)
  564. LD C33, 4 * SIZE(CO2)
  565. LD C43, 6 * SIZE(CO2)
  566. LD C11, 1 * SIZE(CO2)
  567. LD C21, 3 * SIZE(CO2)
  568. LD C31, 5 * SIZE(CO2)
  569. LD C41, 7 * SIZE(CO2)
  570. MADD C13, C13, C12, A1
  571. MADD C23, C23, C22, A1
  572. MADD C33, C33, C32, A1
  573. ST B1, 0 * SIZE(CO1)
  574. MADD C43, C43, C42, A1
  575. ST B3, 2 * SIZE(CO1)
  576. MADD C11, C11, C14, A1
  577. ST B5, 4 * SIZE(CO1)
  578. MADD C21, C21, C24, A1
  579. ST B7, 6 * SIZE(CO1)
  580. MADD C31, C31, C34, A1
  581. ST B2, 1 * SIZE(CO1)
  582. MADD C41, C41, C44, A1
  583. ST B4, 3 * SIZE(CO1)
  584. NMSUB C13, C13, C14, A2
  585. ST B6, 5 * SIZE(CO1)
  586. NMSUB C23, C23, C24, A2
  587. ST B8, 7 * SIZE(CO1)
  588. NMSUB C33, C33, C34, A2
  589. NMSUB C43, C43, C44, A2
  590. MADD C11, C11, C12, A2
  591. MADD C21, C21, C22, A2
  592. MADD C31, C31, C32, A2
  593. MADD C41, C41, C42, A2
  594. ST C13, 0 * SIZE(CO2)
  595. ST C23, 2 * SIZE(CO2)
  596. ST C33, 4 * SIZE(CO2)
  597. ST C43, 6 * SIZE(CO2)
  598. ST C11, 1 * SIZE(CO2)
  599. ST C21, 3 * SIZE(CO2)
  600. ST C31, 5 * SIZE(CO2)
  601. ST C41, 7 * SIZE(CO2)
  602. #endif
  603. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  604. /* (a - bi) * (c + di) */
  605. ADD C11, A1, C11 # ac'+'bd
  606. ADD C21, A2, C21
  607. # LD A1, 0 * SIZE(A) # load alpha_r
  608. ADD C31, A3, C31
  609. LD A1, 152($sp) # load alpha_r
  610. # LD A2, 0 * SIZE(A) # load alpha_i
  611. ADD C41, A4, C41
  612. LD A2, 160($sp) # load alpha_i
  613. SUB C13, C13, A5 # ad'+'cb
  614. SUB C23, C23, A6
  615. SUB C33, C33, A7
  616. SUB C43, C43, A8
  617. ADD C12, B1, C12
  618. ADD C22, B2, C22
  619. ADD C32, B3, C32
  620. ADD C42, B4, C42
  621. SUB C14, C14, B5
  622. SUB C24, C24, B6
  623. SUB C34, C34, B7
  624. SUB C44, C44, B8
  625. LD B1, 0 * SIZE(CO1)
  626. LD B3, 2 * SIZE(CO1)
  627. LD B5, 4 * SIZE(CO1)
  628. LD B7, 6 * SIZE(CO1)
  629. LD B2, 1 * SIZE(CO1)
  630. LD B4, 3 * SIZE(CO1)
  631. LD B6, 5 * SIZE(CO1)
  632. LD B8, 7 * SIZE(CO1)
  633. MADD B1, B1, C11, A1 # A1 = alpha_r
  634. MADD B3, B3, C21, A1
  635. MADD B5, B5, C31, A1
  636. MADD B7, B7, C41, A1
  637. MADD B2, B2, C13, A1
  638. MADD B4, B4, C23, A1
  639. MADD B6, B6, C33, A1
  640. MADD B8, B8, C43, A1
  641. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  642. NMSUB B3, B3, C23, A2
  643. NMSUB B5, B5, C33, A2
  644. NMSUB B7, B7, C43, A2
  645. MADD B2, B2, C11, A2
  646. MADD B4, B4, C21, A2
  647. MADD B6, B6, C31, A2
  648. MADD B8, B8, C41, A2
  649. LD C13, 0 * SIZE(CO2)
  650. LD C23, 2 * SIZE(CO2)
  651. LD C33, 4 * SIZE(CO2)
  652. LD C43, 6 * SIZE(CO2)
  653. LD C11, 1 * SIZE(CO2)
  654. LD C21, 3 * SIZE(CO2)
  655. LD C31, 5 * SIZE(CO2)
  656. LD C41, 7 * SIZE(CO2)
  657. MADD C13, C13, C12, A1
  658. MADD C23, C23, C22, A1
  659. MADD C33, C33, C32, A1
  660. ST B1, 0 * SIZE(CO1)
  661. MADD C43, C43, C42, A1
  662. ST B3, 2 * SIZE(CO1)
  663. MADD C11, C11, C14, A1
  664. ST B5, 4 * SIZE(CO1)
  665. MADD C21, C21, C24, A1
  666. ST B7, 6 * SIZE(CO1)
  667. MADD C31, C31, C34, A1
  668. ST B2, 1 * SIZE(CO1)
  669. MADD C41, C41, C44, A1
  670. ST B4, 3 * SIZE(CO1)
  671. NMSUB C13, C13, C14, A2
  672. ST B6, 5 * SIZE(CO1)
  673. NMSUB C23, C23, C24, A2
  674. ST B8, 7 * SIZE(CO1)
  675. NMSUB C33, C33, C34, A2
  676. NMSUB C43, C43, C44, A2
  677. MADD C11, C11, C12, A2
  678. MADD C21, C21, C22, A2
  679. MADD C31, C31, C32, A2
  680. MADD C41, C41, C42, A2
  681. ST C13, 0 * SIZE(CO2)
  682. ST C23, 2 * SIZE(CO2)
  683. ST C33, 4 * SIZE(CO2)
  684. ST C43, 6 * SIZE(CO2)
  685. ST C11, 1 * SIZE(CO2)
  686. ST C21, 3 * SIZE(CO2)
  687. ST C31, 5 * SIZE(CO2)
  688. ST C41, 7 * SIZE(CO2)
  689. #endif
  690. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  691. /* (a - bi) * (c - di) */
  692. SUB C11, C11, A1 # ac'+'bd
  693. SUB C21, C21, A2
  694. SUB C31, C31, A3
  695. LD A1, 152($sp) # load alpha_r
  696. # LD A1, 0 * SIZE(A) # load alpha_r
  697. SUB C41, C41, A4
  698. LD A2, 160($sp)
  699. # LD A2, 0 * SIZE(A) # load alpha_i
  700. ADD C13, A5, C13 # ad'+'cb
  701. ADD C23, A6, C23
  702. ADD C33, A7, C33
  703. ADD C43, A8, C43
  704. SUB C12, C12, B1
  705. SUB C22, C22, B2
  706. SUB C32, C32, B3
  707. SUB C42, C42, B4
  708. ADD C14, B5, C14
  709. ADD C24, B6, C24
  710. ADD C34, B7, C34
  711. ADD C44, B8, C44
  712. NEG C13, C13
  713. NEG C23, C23
  714. NEG C33, C33
  715. NEG C43, C43
  716. NEG C14, C14
  717. NEG C24, C24
  718. NEG C34, C34
  719. NEG C44, C44
  720. LD B1, 0 * SIZE(CO1)
  721. LD B3, 2 * SIZE(CO1)
  722. LD B5, 4 * SIZE(CO1)
  723. LD B7, 6 * SIZE(CO1)
  724. LD B2, 1 * SIZE(CO1)
  725. LD B4, 3 * SIZE(CO1)
  726. LD B6, 5 * SIZE(CO1)
  727. LD B8, 7 * SIZE(CO1)
  728. MADD B1, B1, C11, A1 # A1 = alpha_r
  729. MADD B3, B3, C21, A1
  730. MADD B5, B5, C31, A1
  731. MADD B7, B7, C41, A1
  732. MADD B2, B2, C13, A1
  733. MADD B4, B4, C23, A1
  734. MADD B6, B6, C33, A1
  735. MADD B8, B8, C43, A1
  736. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  737. NMSUB B3, B3, C23, A2
  738. NMSUB B5, B5, C33, A2
  739. NMSUB B7, B7, C43, A2
  740. MADD B2, B2, C11, A2
  741. MADD B4, B4, C21, A2
  742. MADD B6, B6, C31, A2
  743. MADD B8, B8, C41, A2
  744. LD C13, 0 * SIZE(CO2)
  745. LD C43, 6 * SIZE(CO2)
  746. LD C23, 2 * SIZE(CO2)
  747. LD C33, 4 * SIZE(CO2)
  748. LD C11, 1 * SIZE(CO2)
  749. LD C21, 3 * SIZE(CO2)
  750. LD C31, 5 * SIZE(CO2)
  751. LD C41, 7 * SIZE(CO2)
  752. MADD C13, C13, C12, A1
  753. ST B1, 0 * SIZE(CO1)
  754. MADD C23, C23, C22, A1
  755. ST B3, 2 * SIZE(CO1)
  756. MADD C33, C33, C32, A1
  757. ST B5, 4 * SIZE(CO1)
  758. MADD C43, C43, C42, A1
  759. ST B7, 6 * SIZE(CO1)
  760. MADD C11, C11, C14, A1
  761. ST B2, 1 * SIZE(CO1)
  762. MADD C21, C21, C24, A1
  763. ST B4, 3 * SIZE(CO1)
  764. MADD C31, C31, C34, A1
  765. ST B6, 5 * SIZE(CO1)
  766. MADD C41, C41, C44, A1
  767. ST B8, 7 * SIZE(CO1)
  768. NMSUB C13, C13, C14, A2
  769. NMSUB C23, C23, C24, A2
  770. NMSUB C33, C33, C34, A2
  771. NMSUB C43, C43, C44, A2
  772. MADD C11, C11, C12, A2
  773. MADD C21, C21, C22, A2
  774. MADD C31, C31, C32, A2
  775. MADD C41, C41, C42, A2
  776. ST C13, 0 * SIZE(CO2)
  777. ST C23, 2 * SIZE(CO2)
  778. ST C33, 4 * SIZE(CO2)
  779. ST C43, 6 * SIZE(CO2)
  780. ST C11, 1 * SIZE(CO2)
  781. ST C21, 3 * SIZE(CO2)
  782. ST C31, 5 * SIZE(CO2)
  783. ST C41, 7 * SIZE(CO2)
  784. #endif
  785. #else
  786. daddiu I, I, -1
  787. CVTU A1, C11
  788. CVTU A2, C21
  789. CVTU A3, C31
  790. CVTU A4, C41
  791. CVTU A5, C13
  792. CVTU A6, C23
  793. CVTU A7, C33
  794. CVTU A8, C43
  795. CVTU B1, C12
  796. CVTU B2, C22
  797. CVTU B3, C32
  798. CVTU B4, C42
  799. CVTU B5, C14
  800. CVTU B6, C24
  801. CVTU B7, C34
  802. CVTU B8, C44
  803. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  804. /* (a + bi) * (c + di) */
  805. SUB C11, C11, A1 # ac'+'bd
  806. SUB C21, C21, A2
  807. SUB C31, C31, A3
  808. LD A1, 152($sp) # load alpha_r
  809. SUB C41, C41, A4
  810. # LD A1, 0 * SIZE(A) # load alpha_r
  811. LD A2, 160($sp) # load alpha_i
  812. ADD C13, A5, C13 # ad'+'cb
  813. ADD C23, A6, C23
  814. # LD A2, 0 * SIZE(A) # load alpha_i
  815. ADD C33, A7, C33
  816. ADD C43, A8, C43
  817. SUB C12, C12, B1
  818. SUB C22, C22, B2
  819. SUB C32, C32, B3
  820. SUB C42, C42, B4
  821. ADD C14, B5, C14
  822. ADD C24, B6, C24
  823. ADD C34, B7, C34
  824. ADD C44, B8, C44
  825. MUL B1, C11, A1 # A1 = alpha_r
  826. MUL B3, C21, A1
  827. MUL B5, C31, A1
  828. MUL B7, C41, A1
  829. MUL B2, C13, A1
  830. MUL B4, C23, A1
  831. MUL B6, C33, A1
  832. MUL B8, C43, A1
  833. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  834. NMSUB B3, B3, C23, A2
  835. NMSUB B5, B5, C33, A2
  836. NMSUB B7, B7, C43, A2
  837. MADD B2, B2, C11, A2
  838. MADD B4, B4, C21, A2
  839. MADD B6, B6, C31, A2
  840. MADD B8, B8, C41, A2
  841. ST B1, 0 * SIZE(CO1)
  842. MUL C13, C12, A1
  843. MUL C23, C22, A1
  844. ST B3, 2 * SIZE(CO1)
  845. MUL C33, C32, A1
  846. MUL C43, C42, A1
  847. ST B5, 4 * SIZE(CO1)
  848. MUL C11, C14, A1
  849. MUL C21, C24, A1
  850. ST B7, 6 * SIZE(CO1)
  851. MUL C31, C34, A1
  852. MUL C41, C44, A1
  853. ST B2, 1 * SIZE(CO1)
  854. NMSUB C13, C13, C14, A2
  855. NMSUB C23, C23, C24, A2
  856. ST B4, 3 * SIZE(CO1)
  857. NMSUB C33, C33, C34, A2
  858. NMSUB C43, C43, C44, A2
  859. ST B6, 5 * SIZE(CO1)
  860. MADD C11, C11, C12, A2
  861. MADD C21, C21, C22, A2
  862. ST B8, 7 * SIZE(CO1)
  863. MADD C31, C31, C32, A2
  864. MADD C41, C41, C42, A2
  865. ST C13, 0 * SIZE(CO2)
  866. ST C23, 2 * SIZE(CO2)
  867. ST C33, 4 * SIZE(CO2)
  868. ST C43, 6 * SIZE(CO2)
  869. ST C11, 1 * SIZE(CO2)
  870. ST C21, 3 * SIZE(CO2)
  871. ST C31, 5 * SIZE(CO2)
  872. ST C41, 7 * SIZE(CO2)
  873. #endif
  874. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  875. /* (a + bi) * (c - di) */
  876. ADD C11, A1, C11 # ac'+'bd
  877. ADD C21, A2, C21
  878. # LD A1, 0 * SIZE(A) # load alpha_r
  879. ADD C31, A3, C31
  880. LD A1, 152($sp) # load alpha_r
  881. ADD C41, A4, C41
  882. LD A2, 160($sp) # load alpha_i
  883. # LD A2, 0 * SIZE(A) # load alpha_r
  884. SUB C13, A5, C13 # ad'+'cb
  885. SUB C23, A6, C23
  886. SUB C33, A7, C33
  887. SUB C43, A8, C43
  888. ADD C12, B1, C12
  889. ADD C22, B2, C22
  890. ADD C32, B3, C32
  891. ADD C42, B4, C42
  892. SUB C14, B5, C14
  893. SUB C24, B6, C24
  894. SUB C34, B7, C34
  895. SUB C44, B8, C44
  896. MUL B1, C11, A1 # A1 = alpha_r
  897. MUL B3, C21, A1
  898. MUL B5, C31, A1
  899. MUL B7, C41, A1
  900. MUL B2, C13, A1
  901. MUL B4, C23, A1
  902. MUL B6, C33, A1
  903. MUL B8, C43, A1
  904. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  905. NMSUB B3, B3, C23, A2
  906. NMSUB B5, B5, C33, A2
  907. NMSUB B7, B7, C43, A2
  908. MADD B2, B2, C11, A2
  909. MADD B4, B4, C21, A2
  910. MADD B6, B6, C31, A2
  911. MADD B8, B8, C41, A2
  912. MUL C13, C12, A1
  913. MUL C23, C22, A1
  914. ST B1, 0 * SIZE(CO1)
  915. MUL C33, C32, A1
  916. MUL C43, C42, A1
  917. ST B3, 2 * SIZE(CO1)
  918. MUL C11, C14, A1
  919. MUL C21, C24, A1
  920. ST B5, 4 * SIZE(CO1)
  921. MUL C31, C34, A1
  922. MUL C41, C44, A1
  923. ST B7, 6 * SIZE(CO1)
  924. NMSUB C13, C13, C14, A2
  925. NMSUB C23, C23, C24, A2
  926. ST B2, 1 * SIZE(CO1)
  927. NMSUB C33, C33, C34, A2
  928. NMSUB C43, C43, C44, A2
  929. ST B4, 3 * SIZE(CO1)
  930. MADD C11, C11, C12, A2
  931. MADD C21, C21, C22, A2
  932. ST B6, 5 * SIZE(CO1)
  933. MADD C31, C31, C32, A2
  934. MADD C41, C41, C42, A2
  935. ST B8, 7 * SIZE(CO1)
  936. ST C13, 0 * SIZE(CO2)
  937. ST C23, 2 * SIZE(CO2)
  938. ST C33, 4 * SIZE(CO2)
  939. ST C43, 6 * SIZE(CO2)
  940. ST C11, 1 * SIZE(CO2)
  941. ST C21, 3 * SIZE(CO2)
  942. ST C31, 5 * SIZE(CO2)
  943. ST C41, 7 * SIZE(CO2)
  944. #endif
  945. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  946. /* (a - bi) * (c + di) */
  947. ADD C11, A1, C11 # ac'+'bd
  948. ADD C21, A2, C21
  949. # LD A1, 0 * SIZE(A) # load alpha_r
  950. ADD C31, A3, C31
  951. LD A1, 152($sp) # load alpha_r
  952. # LD A2, 0 * SIZE(A) # load alpha_r
  953. ADD C41, A4, C41
  954. LD A2, 160($sp) # load alpha_i
  955. SUB C13, C13, A5 # ad'+'cb
  956. SUB C23, C23, A6
  957. SUB C33, C33, A7
  958. SUB C43, C43, A8
  959. ADD C12, B1, C12
  960. ADD C22, B2, C22
  961. ADD C32, B3, C32
  962. ADD C42, B4, C42
  963. SUB C14, C14, B5
  964. SUB C24, C24, B6
  965. SUB C34, C34, B7
  966. SUB C44, C44, B8
  967. MUL B1, C11, A1 # A1 = alpha_r
  968. MUL B3, C21, A1
  969. MUL B5, C31, A1
  970. MUL B7, C41, A1
  971. MUL B2, C13, A1
  972. MUL B4, C23, A1
  973. MUL B6, C33, A1
  974. MUL B8, C43, A1
  975. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  976. NMSUB B3, B3, C23, A2
  977. NMSUB B5, B5, C33, A2
  978. NMSUB B7, B7, C43, A2
  979. MADD B2, B2, C11, A2
  980. MADD B4, B4, C21, A2
  981. MADD B6, B6, C31, A2
  982. MADD B8, B8, C41, A2
  983. MUL C13, C12, A1
  984. MUL C23, C22, A1
  985. ST B1, 0 * SIZE(CO1)
  986. MUL C33, C32, A1
  987. MUL C43, C42, A1
  988. ST B3, 2 * SIZE(CO1)
  989. MUL C11, C14, A1
  990. MUL C21, C24, A1
  991. ST B5, 4 * SIZE(CO1)
  992. MUL C31, C34, A1
  993. MUL C41, C44, A1
  994. ST B7, 6 * SIZE(CO1)
  995. NMSUB C13, C13, C14, A2
  996. NMSUB C23, C23, C24, A2
  997. ST B2, 1 * SIZE(CO1)
  998. NMSUB C33, C33, C34, A2
  999. NMSUB C43, C43, C44, A2
  1000. ST B4, 3 * SIZE(CO1)
  1001. MADD C11, C11, C12, A2
  1002. MADD C21, C21, C22, A2
  1003. ST B6, 5 * SIZE(CO1)
  1004. MADD C31, C31, C32, A2
  1005. MADD C41, C41, C42, A2
  1006. ST B8, 7 * SIZE(CO1)
  1007. ST C13, 0 * SIZE(CO2)
  1008. ST C23, 2 * SIZE(CO2)
  1009. ST C33, 4 * SIZE(CO2)
  1010. ST C43, 6 * SIZE(CO2)
  1011. ST C11, 1 * SIZE(CO2)
  1012. ST C21, 3 * SIZE(CO2)
  1013. ST C31, 5 * SIZE(CO2)
  1014. ST C41, 7 * SIZE(CO2)
  1015. #endif
  1016. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1017. /* (a - bi) * (c - di) */
  1018. SUB C11, C11, A1 # ac'+'bd
  1019. SUB C21, C21, A2
  1020. SUB C31, C31, A3
  1021. LD A1, 152($sp) # load alpha_r
  1022. # LD A1, 0 * SIZE(A) # load alpha_r
  1023. SUB C41, C41, A4
  1024. LD A2, 160($sp)
  1025. # LD A2, 0 * SIZE(A) # load alpha_i
  1026. ADD C13, A5, C13 # ad'+'cb
  1027. ADD C23, A6, C23
  1028. ADD C33, A7, C33
  1029. ADD C43, A8, C43
  1030. SUB C12, C12, B1
  1031. SUB C22, C22, B2
  1032. SUB C32, C32, B3
  1033. SUB C42, C42, B4
  1034. ADD C14, B5, C14
  1035. ADD C24, B6, C24
  1036. ADD C34, B7, C34
  1037. ADD C44, B8, C44
  1038. NEG C13, C13
  1039. NEG C23, C23
  1040. NEG C33, C33
  1041. NEG C43, C43
  1042. NEG C14, C14
  1043. NEG C24, C24
  1044. NEG C34, C34
  1045. NEG C44, C44
  1046. MUL B1, C11, A1 # A1 = alpha_r
  1047. MUL B3, C21, A1
  1048. MUL B5, C31, A1
  1049. MUL B7, C41, A1
  1050. MUL B2, C13, A1
  1051. MUL B4, C23, A1
  1052. MUL B6, C33, A1
  1053. MUL B8, C43, A1
  1054. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1055. NMSUB B3, B3, C23, A2
  1056. NMSUB B5, B5, C33, A2
  1057. NMSUB B7, B7, C43, A2
  1058. MADD B2, B2, C11, A2
  1059. MADD B4, B4, C21, A2
  1060. MADD B6, B6, C31, A2
  1061. MADD B8, B8, C41, A2
  1062. ST B1, 0 * SIZE(CO1)
  1063. MUL C13, C12, A1
  1064. MUL C23, C22, A1
  1065. ST B3, 2 * SIZE(CO1)
  1066. MUL C33, C32, A1
  1067. MUL C43, C42, A1
  1068. ST B5, 4 * SIZE(CO1)
  1069. MUL C11, C14, A1
  1070. MUL C21, C24, A1
  1071. ST B7, 6 * SIZE(CO1)
  1072. MUL C31, C34, A1
  1073. MUL C41, C44, A1
  1074. ST B2, 1 * SIZE(CO1)
  1075. NMSUB C13, C13, C14, A2
  1076. NMSUB C23, C23, C24, A2
  1077. ST B4, 3 * SIZE(CO1)
  1078. NMSUB C33, C33, C34, A2
  1079. NMSUB C43, C43, C44, A2
  1080. ST B6, 5 * SIZE(CO1)
  1081. MADD C11, C11, C12, A2
  1082. MADD C21, C21, C22, A2
  1083. ST B8, 7 * SIZE(CO1)
  1084. MADD C31, C31, C32, A2
  1085. MADD C41, C41, C42, A2
  1086. ST C13, 0 * SIZE(CO2)
  1087. ST C23, 2 * SIZE(CO2)
  1088. ST C33, 4 * SIZE(CO2)
  1089. ST C43, 6 * SIZE(CO2)
  1090. ST C11, 1 * SIZE(CO2)
  1091. ST C21, 3 * SIZE(CO2)
  1092. ST C31, 5 * SIZE(CO2)
  1093. ST C41, 7 * SIZE(CO2)
  1094. #endif
  1095. #if ( defined(LEFT) && defined(TRANSA)) || \
  1096. (!defined(LEFT) && !defined(TRANSA))
  1097. dsubu TEMP, K, KK
  1098. #ifdef LEFT
  1099. daddiu TEMP, TEMP, -4
  1100. #else
  1101. daddiu TEMP, TEMP, -2
  1102. #endif
  1103. dsll L, TEMP, 2 + ZBASE_SHIFT
  1104. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  1105. daddu AO, AO, L
  1106. daddu BO, BO, TEMP
  1107. #endif
  1108. #ifdef LEFT
  1109. daddiu KK, KK, 4
  1110. #endif
  1111. #endif
  1112. daddiu CO1, CO1, 8 * SIZE
  1113. bgtz I, .L241
  1114. daddiu CO2, CO2, 8 * SIZE
  /* .L22: after the 4-wide M tiles, test whether a 2-wide M remainder exists
     (bit 1 of M). If not, skip to .L21, which tests for a 1-wide remainder. */
  1115. .align 4
  1116. .L22:
  1117. andi I, M, 2 # I = M & 2 (MR=2 remainder tile)
  1118. blez I, .L21 # bit clear: no 2-wide tile to compute
  1119. NOP # branch delay slot
  /* .L221: set-up for the MR=2 x NR=2 tile.
     Both preprocessor arms do the same work: point BO at the B panel, zero the
     eight accumulators C11..C24, preload the first A and B operands, prefetch
     the C tile, and compute the K-loop trip count L = (k-length) / 4.
     The TRMM arm additionally offsets AO/BO by KK and derives the effective
     k-length in TEMP. If L == 0 control goes straight to the K remainder
     code at .L222.
     NOTE(review): R13/R12 are presumably register aliases of BO/AO and
     F8/F9, F0/F1 the registers behind B1/B2, A1/A2 -- confirm against the
     macro definitions at the top of the file. */
  1120. .align 4
  1121. .L221:
  1122. #if defined(TRMMKERNEL)
  1123. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1124. move BO, B
  1125. #else
  1126. dsll TEMP, KK, 1 + ZBASE_SHIFT # TEMP = KK * 2 complex elements (MR = NR = 2)
  1127. daddu AO, AO, TEMP
  1128. daddu BO, B, TEMP
  1129. #endif
  1130. MTC $0, C11 # clear result registers
  1131. MOV C12, C11
  1132. MOV C21, C11
  1133. MOV C22, C11
  1134. gsLQC1(R13, F9, F8, 0) # B1 B2
  1135. gsLQC1(R12, F1, F0, 0) # A1 A2
  1136. MOV C13, C11
  1137. MOV C14, C11
  1138. MOV C23, C11
  1139. FETCH $0, 0 * SIZE(CO1)
  1140. FETCH $0, 8 * SIZE(CO1)
  1141. MOV C24, C11
  1142. FETCH $0, 0 * SIZE(CO2)
  1143. FETCH $0, 8 * SIZE(CO2)
  1144. PLU B3, B1, B1 # B3/B4 derived from B1/B2 (presumably lane-swapped copies)
  1145. PLU B4, B2, B2
  1146. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1147. dsubu TEMP, K, KK
  1148. #elif defined(LEFT)
  1149. daddiu TEMP, KK, 2 # MR=2
  1150. #else
  1151. daddiu TEMP, KK, 2 # NR=2
  1152. #endif
  1153. dsra L, TEMP, 2 # L = TEMP / 4 (K loop is unrolled by 4)
  1154. blez L, .L222
  1155. NOP # branch delay slot
  1156. #else
  1157. move BO, B # Reset B
  1158. dsra L, K, 2 # L = K / 4 (K loop is unrolled by 4)
  1159. MTC $0, C11 # clear result registers
  1160. MOV C12, C11
  1161. MOV C21, C11
  1162. MOV C22, C11
  1163. gsLQC1(R13, F9, F8, 0) # B1 B2
  1164. gsLQC1(R12, F1, F0, 0) # A1 A2
  1165. MOV C13, C11
  1166. MOV C14, C11
  1167. MOV C23, C11
  1168. FETCH $0, 0 * SIZE(CO1)
  1169. FETCH $0, 8 * SIZE(CO1)
  1170. MOV C24, C11
  1171. FETCH $0, 0 * SIZE(CO2)
  1172. FETCH $0, 8 * SIZE(CO2)
  1173. PLU B3, B1, B1
  1174. blez L, .L222
  1175. PLU B4, B2, B2 # executed in the branch delay slot on both paths
  1176. #endif
  /* .L2210: main K loop of the 2x2 tile, four k-steps per iteration.
     Each k-step issues eight MADPS updates (C11..C24) from one A pair and
     one B pair plus the PLU-derived B3/B4 (or B7/B8). Operands for the next
     k-step are loaded while the current one is being accumulated, so the
     load / PLU / MADPS interleaving must not be reordered.
     AO and BO both advance by 16 * SIZE per iteration; the loads issued after
     the pointer update use offset 0 and therefore fetch the first operands
     of the next iteration.
     NOTE(review): MADPS d, c, a, b is presumably the paired-single
     d = c + a * b; the 4th gsLQC1 argument is presumably an offset in
     16-byte units -- confirm against the macro definitions. */
  1177. .L2210:
  1178. daddiu L, L, -1
  1179. gsLQC1(R13, F13, F12, 1) # next B pair (consumed below as B5/B6)
  1180. MADPS C11, C11, A1, B1
  1181. MADPS C21, C21, A2, B1
  1182. gsLQC1(R12, F3, F2, 1) # A3 A4
  1183. MADPS C12, C12, A1, B2
  1184. MADPS C22, C22, A2, B2
  1185. MADPS C13, C13, A1, B3
  1186. MADPS C23, C23, A2, B3
  1187. MADPS C14, C14, A1, B4
  1188. MADPS C24, C24, A2, B4
  1189. gsLQC1(R12, F5, F4, 2) # A5 A6
  1190. PLU B7, B5, B5
  1191. PLU B8, B6, B6
  1192. gsLQC1(R13, F9, F8, 2) # B1 B2
  1193. MADPS C11, C11, A3, B5
  1194. MADPS C21, C21, A4, B5
  1195. MADPS C12, C12, A3, B6
  1196. MADPS C22, C22, A4, B6
  1197. MADPS C13, C13, A3, B7
  1198. MADPS C23, C23, A4, B7
  1199. MADPS C14, C14, A3, B8
  1200. MADPS C24, C24, A4, B8
  1201. gsLQC1(R12, F7, F6, 3) # A7 A8
  1202. PLU B3, B1, B1
  1203. PLU B4, B2, B2
  1204. gsLQC1(R13, F13, F12, 3) # next B pair (consumed below as B5/B6)
  1205. MADPS C11, C11, A5, B1
  1206. MADPS C21, C21, A6, B1
  1207. MADPS C12, C12, A5, B2
  1208. MADPS C22, C22, A6, B2
  1209. daddiu BO, BO, 4 * 4 * SIZE # advance B past the 4 k-steps consumed here
  1210. daddiu AO, AO, 4 * 4 * SIZE # advance A past the 4 k-steps consumed here
  1211. MADPS C13, C13, A5, B3
  1212. MADPS C23, C23, A6, B3
  1213. MADPS C14, C14, A5, B4
  1214. MADPS C24, C24, A6, B4
  1215. gsLQC1(R12, F1, F0, 0) # A1 A2
  1216. PLU B7, B5, B5
  1217. PLU B8, B6, B6
  1218. gsLQC1(R13, F9, F8, 0) # B1 B2
  1219. MADPS C11, C11, A7, B5
  1220. MADPS C21, C21, A8, B5
  1221. MADPS C12, C12, A7, B6
  1222. MADPS C22, C22, A8, B6
  1223. MADPS C13, C13, A7, B7
  1224. MADPS C23, C23, A8, B7
  1225. MADPS C14, C14, A7, B8
  1226. MADPS C24, C24, A8, B8
  1227. PLU B3, B1, B1
  1228. bgtz L, .L2210
  1229. PLU B4, B2, B2 # branch delay slot: runs whether or not the loop repeats
  /* .L222: K remainder of 2 for the 2x2 tile.
     Tests bit 1 of the k-length (K, or TEMP in the TRMM build) and, when set,
     performs two more k-steps using the operands already held in A1/A2,
     B1..B4 plus one extra A pair and B pair. AO/BO advance by 8 * SIZE and
     A1/A2, B1..B4 are reloaded so .L227 finds them ready. */
  1230. .align 4
  1231. .L222:
  1232. #ifndef TRMMKERNEL
  1233. andi L, K, 2
  1234. #else
  1235. andi L, TEMP, 2
  1236. #endif
  1237. blez L, .L227
  1238. NOP # branch delay slot
  1239. gsLQC1(R13, F13, F12, 1) # next B pair (consumed below as B5/B6)
  1240. MADPS C11, C11, A1, B1
  1241. MADPS C21, C21, A2, B1
  1242. gsLQC1(R12, F3, F2, 1) # A3 A4
  1243. MADPS C12, C12, A1, B2
  1244. MADPS C22, C22, A2, B2
  1245. MADPS C13, C13, A1, B3
  1246. MADPS C23, C23, A2, B3
  1247. MADPS C14, C14, A1, B4
  1248. MADPS C24, C24, A2, B4
  1249. PLU B7, B5, B5
  1250. PLU B8, B6, B6
  1251. daddiu BO, BO, 2 * 4 * SIZE
  1252. daddiu AO, AO, 2 * 4 * SIZE
  1253. MADPS C11, C11, A3, B5
  1254. MADPS C21, C21, A4, B5
  1255. gsLQC1(R13, F9, F8, 0) # B1 B2
  1256. MADPS C12, C12, A3, B6
  1257. MADPS C22, C22, A4, B6
  1258. gsLQC1(R12, F1, F0, 0) # A1 A2
  1259. MADPS C13, C13, A3, B7
  1260. MADPS C23, C23, A4, B7
  1261. MADPS C14, C14, A3, B8
  1262. MADPS C24, C24, A4, B8
  1263. PLU B3, B1, B1
  1264. PLU B4, B2, B2
  /* .L227: K remainder of 1 for the 2x2 tile.
     Tests bit 0 of the k-length (K, or TEMP in the TRMM build) and, when set,
     performs one final k-step with the operands already in A1/A2 and B1..B4.
     No new operands are loaded; AO/BO are only advanced by 4 * SIZE.
     Falls through (or branches) to the write-back code at .L220. */
  1265. .align 4
  1266. .L227:
  1267. #ifndef TRMMKERNEL
  1268. andi L, K, 1
  1269. #else
  1270. andi L, TEMP, 1
  1271. #endif
  1272. blez L, .L220
  1273. NOP # branch delay slot
  1274. MADPS C11, C11, A1, B1
  1275. MADPS C21, C21, A2, B1
  1276. daddiu BO, BO, 4 * SIZE
  1277. daddiu AO, AO, 4 * SIZE
  1278. MADPS C12, C12, A1, B2
  1279. MADPS C22, C22, A2, B2
  1280. MADPS C13, C13, A1, B3
  1281. MADPS C23, C23, A2, B3
  1282. MADPS C14, C14, A1, B4
  1283. MADPS C24, C24, A2, B4
  /* .L220: write-back of the 2x2 tile.
     Structure (identical in the GEMM and TRMM halves):
       1. CVTU copies one half of each paired-single accumulator into the
          scalar temporaries A1..A8 (presumably the upper lane -- confirm
          against the CVTU macro).
       2. Exactly one of four conjugation variants (selected by the NN/NR/RN/RR
          style build macros) combines accumulator and temporary with ADD/SUB
          to form the real parts (C11, C21, C12, C22) and imaginary parts
          (C13, C23, C14, C24) of the products.
       3. alpha_r / alpha_i are reloaded from the stack at 152($sp) / 160($sp)
          into A1 / A2 and the result is scaled by the complex alpha.
     GEMM half (#ifndef TRMMKERNEL): the old C values are loaded from CO1/CO2
     and updated with MADD/NMSUB. TRMM half (#else): C is overwritten, so the
     first alpha_r term uses MUL instead of a load + MADD, and the AO/BO/KK
     bookkeeping follows the stores.
     Real values live at even offsets (0, 2) and imaginary values at odd
     offsets (1, 3) of CO1 (first column) and CO2 (second column).
     NOTE(review): comments assume MADD d, a, b, c = a + b * c and
     NMSUB d, a, b, c = a - b * c -- confirm against the macro definitions. */
  1284. .align 4
  1285. .L220: # Write Back
  1286. #ifndef TRMMKERNEL
  1287. daddiu I, I, -1
  1288. CVTU A1, C11
  1289. CVTU A2, C21
  1290. CVTU A3, C13
  1291. CVTU A4, C23
  1292. CVTU A5, C12
  1293. CVTU A6, C22
  1294. CVTU A7, C14
  1295. CVTU A8, C24
  1296. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1297. /* (a + bi) * (c + di) */
  1298. SUB C11, C11, A1 # ac'+'bd
  1299. SUB C21, C21, A2
  1300. ADD C13, A3, C13 # ad'+'cb
  1301. ADD C23, A4, C23
  1302. # LD A1, 0 * SIZE(A) # load alpha_r
  1303. LD A1, 152($sp) # load alpha_r
  1304. LD A2, 160($sp) # load alpha_i
  1305. # LD A2, 0 * SIZE(A) # load alpha_i
  1306. SUB C12, C12, A5
  1307. SUB C22, C22, A6
  1308. ADD C14, A7, C14
  1309. ADD C24, A8, C24
  1310. LD B1, 0 * SIZE(CO1)
  1311. LD B3, 2 * SIZE(CO1)
  1312. LD B2, 1 * SIZE(CO1)
  1313. LD B4, 3 * SIZE(CO1)
  1314. MADD B1, B1, C11, A1 # A1 = alpha_r
  1315. MADD B3, B3, C21, A1
  1316. MADD B2, B2, C13, A1
  1317. MADD B4, B4, C23, A1
  1318. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1319. NMSUB B3, B3, C23, A2
  1320. MADD B2, B2, C11, A2
  1321. MADD B4, B4, C21, A2
  1322. LD B5, 0 * SIZE(CO2)
  1323. LD B7, 2 * SIZE(CO2)
  1324. LD B6, 1 * SIZE(CO2)
  1325. LD B8, 3 * SIZE(CO2)
  1326. MADD B5, B5, C12, A1
  1327. MADD B7, B7, C22, A1
  1328. ST B1, 0 * SIZE(CO1)
  1329. ST B3, 2 * SIZE(CO1)
  1330. MADD B6, B6, C14, A1
  1331. MADD B8, B8, C24, A1
  1332. ST B2, 1 * SIZE(CO1)
  1333. ST B4, 3 * SIZE(CO1)
  1334. NMSUB B5, B5, C14, A2
  1335. NMSUB B7, B7, C24, A2
  1336. MADD B6, B6, C12, A2
  1337. MADD B8, B8, C22, A2
  1338. ST B5, 0 * SIZE(CO2)
  1339. ST B7, 2 * SIZE(CO2)
  1340. ST B6, 1 * SIZE(CO2)
  1341. ST B8, 3 * SIZE(CO2)
  1342. #endif
  1343. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1344. /* (a + bi) * (c - di) */
  1345. ADD C11, A1, C11 # ac'+'bd
  1346. ADD C21, A2, C21
  1347. SUB C13, A3, C13 # ad'+'cb
  1348. SUB C23, A4, C23
  1349. # LD A1, 0 * SIZE(A) # load alpha_r
  1350. LD A1, 152($sp) # load alpha_r
  1351. LD A2, 160($sp) # load alpha_i
  1352. # LD A2, 0 * SIZE(A) # load alpha_i
  1353. ADD C12, A5, C12
  1354. ADD C22, A6, C22
  1355. SUB C14, A7, C14
  1356. SUB C24, A8, C24
  1357. LD B1, 0 * SIZE(CO1)
  1358. LD B3, 2 * SIZE(CO1)
  1359. LD B2, 1 * SIZE(CO1)
  1360. LD B4, 3 * SIZE(CO1)
  1361. MADD B1, B1, C11, A1 # A1 = alpha_r
  1362. MADD B3, B3, C21, A1
  1363. MADD B2, B2, C13, A1
  1364. MADD B4, B4, C23, A1
  1365. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1366. NMSUB B3, B3, C23, A2
  1367. MADD B2, B2, C11, A2
  1368. MADD B4, B4, C21, A2
  1369. LD B5, 0 * SIZE(CO2)
  1370. LD B7, 2 * SIZE(CO2)
  1371. LD B6, 1 * SIZE(CO2)
  1372. LD B8, 3 * SIZE(CO2)
  1373. MADD B5, B5, C12, A1
  1374. MADD B7, B7, C22, A1
  1375. ST B1, 0 * SIZE(CO1)
  1376. ST B3, 2 * SIZE(CO1)
  1377. MADD B6, B6, C14, A1
  1378. MADD B8, B8, C24, A1
  1379. ST B2, 1 * SIZE(CO1)
  1380. ST B4, 3 * SIZE(CO1)
  1381. NMSUB B5, B5, C14, A2
  1382. NMSUB B7, B7, C24, A2
  1383. MADD B6, B6, C12, A2
  1384. MADD B8, B8, C22, A2
  1385. ST B5, 0 * SIZE(CO2)
  1386. ST B7, 2 * SIZE(CO2)
  1387. ST B6, 1 * SIZE(CO2)
  1388. ST B8, 3 * SIZE(CO2)
  1389. #endif
  1390. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1391. /* (a - bi) * (c + di) */
  1392. ADD C11, A1, C11 # ac'+'bd
  1393. ADD C21, A2, C21
  1394. SUB C13, C13, A3 # ad'+'cb
  1395. SUB C23, C23, A4
  1396. # LD A1, 0 * SIZE(A) # load alpha_r
  1397. LD A1, 152($sp) # load alpha_r
  1398. # LD A2, 0 * SIZE(A) # load alpha_i
  1399. LD A2, 160($sp) # load alpha_i
  1400. ADD C12, A5, C12
  1401. ADD C22, A6, C22
  1402. SUB C14, C14, A7
  1403. SUB C24, C24, A8
  1404. LD B1, 0 * SIZE(CO1)
  1405. LD B3, 2 * SIZE(CO1)
  1406. LD B2, 1 * SIZE(CO1)
  1407. LD B4, 3 * SIZE(CO1)
  1408. MADD B1, B1, C11, A1 # A1 = alpha_r
  1409. MADD B3, B3, C21, A1
  1410. MADD B2, B2, C13, A1
  1411. MADD B4, B4, C23, A1
  1412. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1413. NMSUB B3, B3, C23, A2
  1414. MADD B2, B2, C11, A2
  1415. MADD B4, B4, C21, A2
  1416. LD B5, 0 * SIZE(CO2)
  1417. LD B7, 2 * SIZE(CO2)
  1418. LD B6, 1 * SIZE(CO2)
  1419. LD B8, 3 * SIZE(CO2)
  1420. MADD B5, B5, C12, A1
  1421. MADD B7, B7, C22, A1
  1422. ST B1, 0 * SIZE(CO1)
  1423. ST B3, 2 * SIZE(CO1)
  1424. MADD B6, B6, C14, A1
  1425. MADD B8, B8, C24, A1
  1426. ST B2, 1 * SIZE(CO1)
  1427. ST B4, 3 * SIZE(CO1)
  1428. NMSUB B5, B5, C14, A2
  1429. NMSUB B7, B7, C24, A2
  1430. MADD B6, B6, C12, A2
  1431. MADD B8, B8, C22, A2
  1432. ST B5, 0 * SIZE(CO2)
  1433. ST B7, 2 * SIZE(CO2)
  1434. ST B6, 1 * SIZE(CO2)
  1435. ST B8, 3 * SIZE(CO2)
  1436. #endif
  1437. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1438. /* (a - bi) * (c - di) */
  1439. SUB C11, C11, A1 # ac'+'bd
  1440. SUB C21, C21, A2
  1441. ADD C13, A3, C13 # ad'+'cb
  1442. ADD C23, A4, C23
  1443. LD A1, 152($sp) # load alpha_r
  1444. # LD A1, 0 * SIZE(A) # load alpha_r
  1445. LD A2, 160($sp) # load alpha_i
  1446. # LD A2, 0 * SIZE(A) # load alpha_i
  1447. SUB C12, C12, A5
  1448. SUB C22, C22, A6
  1449. ADD C14, A7, C14
  1450. ADD C24, A8, C24
  1451. NEG C13, C13 # both operands conjugated: imaginary parts change sign
  1452. NEG C23, C23
  1453. NEG C14, C14
  1454. NEG C24, C24
  1455. LD B1, 0 * SIZE(CO1)
  1456. LD B3, 2 * SIZE(CO1)
  1457. LD B2, 1 * SIZE(CO1)
  1458. LD B4, 3 * SIZE(CO1)
  1459. MADD B1, B1, C11, A1 # A1 = alpha_r
  1460. MADD B3, B3, C21, A1
  1461. MADD B2, B2, C13, A1
  1462. MADD B4, B4, C23, A1
  1463. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1464. NMSUB B3, B3, C23, A2
  1465. MADD B2, B2, C11, A2
  1466. MADD B4, B4, C21, A2
  1467. LD B5, 0 * SIZE(CO2)
  1468. LD B7, 2 * SIZE(CO2)
  1469. LD B6, 1 * SIZE(CO2)
  1470. LD B8, 3 * SIZE(CO2)
  1471. MADD B5, B5, C12, A1
  1472. MADD B7, B7, C22, A1
  1473. ST B1, 0 * SIZE(CO1)
  1474. ST B3, 2 * SIZE(CO1)
  1475. MADD B6, B6, C14, A1
  1476. MADD B8, B8, C24, A1
  1477. ST B2, 1 * SIZE(CO1)
  1478. ST B4, 3 * SIZE(CO1)
  1479. NMSUB B5, B5, C14, A2
  1480. NMSUB B7, B7, C24, A2
  1481. MADD B6, B6, C12, A2
  1482. MADD B8, B8, C22, A2
  1483. ST B5, 0 * SIZE(CO2)
  1484. ST B7, 2 * SIZE(CO2)
  1485. ST B6, 1 * SIZE(CO2)
  1486. ST B8, 3 * SIZE(CO2)
  1487. #endif
  1488. #else
  /* TRMM build: same four variants, but C is written without being read. */
  1489. daddiu I, I, -1
  1490. CVTU A1, C11
  1491. CVTU A2, C21
  1492. CVTU A3, C13
  1493. CVTU A4, C23
  1494. CVTU A5, C12
  1495. CVTU A6, C22
  1496. CVTU A7, C14
  1497. CVTU A8, C24
  1498. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  1499. /* (a + bi) * (c + di) */
  1500. SUB C11, C11, A1 # ac'+'bd
  1501. SUB C21, C21, A2
  1502. ADD C13, A3, C13 # ad'+'cb
  1503. ADD C23, A4, C23
  1504. # LD A1, 0 * SIZE(A) # load alpha_r
  1505. LD A1, 152($sp) # load alpha_r
  1506. LD A2, 160($sp) # load alpha_i
  1507. # LD A2, 0 * SIZE(A) # load alpha_i
  1508. SUB C12, C12, A5
  1509. SUB C22, C22, A6
  1510. ADD C14, A7, C14
  1511. ADD C24, A8, C24
  1512. MUL B1, C11, A1 # A1 = alpha_r
  1513. MUL B3, C21, A1
  1514. MUL B2, C13, A1
  1515. MUL B4, C23, A1
  1516. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1517. NMSUB B3, B3, C23, A2
  1518. MADD B2, B2, C11, A2
  1519. MADD B4, B4, C21, A2
  1520. MUL B5, C12, A1
  1521. MUL B7, C22, A1
  1522. ST B1, 0 * SIZE(CO1)
  1523. ST B3, 2 * SIZE(CO1)
  1524. MUL B6, C14, A1
  1525. MUL B8, C24, A1
  1526. ST B2, 1 * SIZE(CO1)
  1527. ST B4, 3 * SIZE(CO1)
  1528. NMSUB B5, B5, C14, A2
  1529. NMSUB B7, B7, C24, A2
  1530. MADD B6, B6, C12, A2
  1531. MADD B8, B8, C22, A2
  1532. ST B5, 0 * SIZE(CO2)
  1533. ST B7, 2 * SIZE(CO2)
  1534. ST B6, 1 * SIZE(CO2)
  1535. ST B8, 3 * SIZE(CO2)
  1536. #endif
  1537. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  1538. /* (a + bi) * (c - di) */
  1539. ADD C11, A1, C11 # ac'+'bd
  1540. ADD C21, A2, C21
  1541. SUB C13, A3, C13 # ad'+'cb
  1542. SUB C23, A4, C23
  1543. # LD A1, 0 * SIZE(A) # load alpha_r
  1544. LD A1, 152($sp) # load alpha_r
  1545. LD A2, 160($sp) # load alpha_i
  1546. # LD A2, 0 * SIZE(A) # load alpha_i
  1547. ADD C12, A5, C12
  1548. ADD C22, A6, C22
  1549. SUB C14, A7, C14
  1550. SUB C24, A8, C24
  1551. MUL B1, C11, A1 # A1 = alpha_r
  1552. MUL B3, C21, A1
  1553. MUL B2, C13, A1
  1554. MUL B4, C23, A1
  1555. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1556. NMSUB B3, B3, C23, A2
  1557. MADD B2, B2, C11, A2
  1558. MADD B4, B4, C21, A2
  1559. MUL B5, C12, A1
  1560. MUL B7, C22, A1
  1561. ST B1, 0 * SIZE(CO1)
  1562. ST B3, 2 * SIZE(CO1)
  1563. MUL B6, C14, A1
  1564. MUL B8, C24, A1
  1565. ST B2, 1 * SIZE(CO1)
  1566. ST B4, 3 * SIZE(CO1)
  1567. NMSUB B5, B5, C14, A2
  1568. NMSUB B7, B7, C24, A2
  1569. MADD B6, B6, C12, A2
  1570. MADD B8, B8, C22, A2
  1571. ST B5, 0 * SIZE(CO2)
  1572. ST B7, 2 * SIZE(CO2)
  1573. ST B6, 1 * SIZE(CO2)
  1574. ST B8, 3 * SIZE(CO2)
  1575. #endif
  1576. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  1577. /* (a - bi) * (c + di) */
  1578. ADD C11, A1, C11 # ac'+'bd
  1579. ADD C21, A2, C21
  1580. SUB C13, C13, A3 # ad'+'cb
  1581. SUB C23, C23, A4
  1582. # LD A1, 0 * SIZE(A) # load alpha_r
  1583. LD A1, 152($sp) # load alpha_r
  1584. # LD A2, 0 * SIZE(A) # load alpha_i
  1585. LD A2, 160($sp) # load alpha_i
  1586. ADD C12, A5, C12
  1587. ADD C22, A6, C22
  1588. SUB C14, C14, A7
  1589. SUB C24, C24, A8
  1590. MUL B1, C11, A1 # A1 = alpha_r
  1591. MUL B3, C21, A1
  1592. MUL B2, C13, A1
  1593. MUL B4, C23, A1
  1594. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1595. NMSUB B3, B3, C23, A2
  1596. MADD B2, B2, C11, A2
  1597. MADD B4, B4, C21, A2
  1598. MUL B5, C12, A1
  1599. MUL B7, C22, A1
  1600. ST B1, 0 * SIZE(CO1)
  1601. ST B3, 2 * SIZE(CO1)
  1602. MUL B6, C14, A1
  1603. MUL B8, C24, A1
  1604. ST B2, 1 * SIZE(CO1)
  1605. ST B4, 3 * SIZE(CO1)
  1606. NMSUB B5, B5, C14, A2
  1607. NMSUB B7, B7, C24, A2
  1608. MADD B6, B6, C12, A2
  1609. MADD B8, B8, C22, A2
  1610. ST B5, 0 * SIZE(CO2)
  1611. ST B7, 2 * SIZE(CO2)
  1612. ST B6, 1 * SIZE(CO2)
  1613. ST B8, 3 * SIZE(CO2)
  1614. #endif
  1615. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  1616. /* (a - bi) * (c - di) */
  1617. SUB C11, C11, A1 # ac'+'bd
  1618. SUB C21, C21, A2
  1619. ADD C13, A3, C13 # ad'+'cb
  1620. ADD C23, A4, C23
  1621. LD A1, 152($sp) # load alpha_r
  1622. # LD A1, 0 * SIZE(A) # load alpha_r
  1623. LD A2, 160($sp) # load alpha_i
  1624. # LD A2, 0 * SIZE(A) # load alpha_i
  1625. SUB C12, C12, A5
  1626. SUB C22, C22, A6
  1627. ADD C14, A7, C14
  1628. ADD C24, A8, C24
  1629. NEG C13, C13 # both operands conjugated: imaginary parts change sign
  1630. NEG C23, C23
  1631. NEG C14, C14
  1632. NEG C24, C24
  1633. MUL B1, C11, A1 # A1 = alpha_r
  1634. MUL B3, C21, A1
  1635. MUL B2, C13, A1
  1636. MUL B4, C23, A1
  1637. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  1638. NMSUB B3, B3, C23, A2
  1639. MADD B2, B2, C11, A2
  1640. MADD B4, B4, C21, A2
  1641. MUL B5, C12, A1
  1642. MUL B7, C22, A1
  1643. ST B1, 0 * SIZE(CO1)
  1644. ST B3, 2 * SIZE(CO1)
  1645. MUL B6, C14, A1
  1646. MUL B8, C24, A1
  1647. ST B2, 1 * SIZE(CO1)
  1648. ST B4, 3 * SIZE(CO1)
  1649. NMSUB B5, B5, C14, A2
  1650. NMSUB B7, B7, C24, A2
  1651. MADD B6, B6, C12, A2
  1652. MADD B8, B8, C22, A2
  1653. ST B5, 0 * SIZE(CO2)
  1654. ST B7, 2 * SIZE(CO2)
  1655. ST B6, 1 * SIZE(CO2)
  1656. ST B8, 3 * SIZE(CO2)
  1657. #endif
  /* TRMM bookkeeping: skip the part of the A/B panels this tile did not
     consume. Both LEFT arms subtract 2 because MR = NR = 2 here, and one
     shifted TEMP serves for both AO and BO for the same reason. */
  1658. #if ( defined(LEFT) && defined(TRANSA)) || \
  1659. (!defined(LEFT) && !defined(TRANSA))
  1660. dsubu TEMP, K, KK
  1661. #ifdef LEFT
  1662. daddiu TEMP, TEMP, -2
  1663. #else
  1664. daddiu TEMP, TEMP, -2
  1665. #endif
  1666. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
  1667. daddu AO, AO, TEMP
  1668. daddu BO, BO, TEMP
  1669. #endif
  1670. #ifdef LEFT
  1671. daddiu KK, KK, 2
  1672. #endif
  1673. #endif
  1674. daddiu CO1, CO1, 4 * SIZE # step both C column pointers past the 2 complex values just stored
  1675. daddiu CO2, CO2, 4 * SIZE
  /* .L21: test whether a final 1-wide M remainder exists (bit 0 of M).
     If not, branch to .L20 (defined outside this excerpt). */
  1676. .align 4
  1677. .L21:
  1678. andi I, M, 1 # I = M & 1 (MR=1 remainder tile)
  1679. blez I, .L20
  1680. NOP # branch delay slot
  /* .L211: set-up for the MR=1 x NR=2 tile.
     Points BO at the B panel, zeroes the four accumulators C11..C14, preloads
     the first A and B operands, prefetches the two C locations, and computes
     the K-loop trip count L = (k-length) / 4. The TRMM arm additionally
     offsets AO by KK * 1 and BO by KK * 2 complex elements and derives the
     effective k-length in TEMP. If L == 0 control goes to .L212.
     NOTE(review): R13/R12 are presumably register aliases of BO/AO --
     confirm against the macro definitions at the top of the file. */
  1681. .align 4
  1682. .L211:
  1683. #if defined(TRMMKERNEL)
  1684. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  1685. move BO, B
  1686. #else
  1687. dsll L, KK, ZBASE_SHIFT # MR=1
  1688. dsll TEMP, KK, 1 + ZBASE_SHIFT # NR=2
  1689. daddu AO, AO, L
  1690. daddu BO, B, TEMP
  1691. #endif
  1692. MTC $0, C11 # clear result registers
  1693. MOV C12, C11
  1694. gsLQC1(R13, F9, F8, 0) # B1 B2
  1695. gsLQC1(R12, F1, F0, 0) # A1 A2
  1696. MOV C13, C11
  1697. MOV C14, C11
  1698. FETCH $0, 0 * SIZE(CO1)
  1699. FETCH $0, 0 * SIZE(CO2)
  1700. PLU B3, B1, B1
  1701. PLU B4, B2, B2
  1702. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1703. dsubu TEMP, K, KK
  1704. #elif defined(LEFT)
  1705. daddiu TEMP, KK, 1 # MR=1
  1706. #else
  1707. daddiu TEMP, KK, 2 # NR=2
  1708. #endif
  1709. dsra L, TEMP, 2 # L = TEMP / 4 (K loop is unrolled by 4)
  1710. blez L, .L212
  1711. NOP # branch delay slot
  1712. #else
  1713. move BO, B # Reset B
  1714. dsra L, K, 2 # L = K / 4 (K loop is unrolled by 4)
  1715. MTC $0, C11 # clear result registers
  1716. MOV C12, C11
  1717. gsLQC1(R13, F9, F8, 0) # B1 B2
  1718. gsLQC1(R12, F1, F0, 0) # A1 A2
  1719. MOV C13, C11
  1720. MOV C14, C11
  1721. FETCH $0, 0 * SIZE(CO1)
  1722. FETCH $0, 0 * SIZE(CO2)
  1723. PLU B3, B1, B1
  1724. blez L, .L212
  1725. PLU B4, B2, B2 # executed in the branch delay slot on both paths
  1726. #endif
  /* .L2110: main K loop of the 1x2 tile, four k-steps per iteration.
     Each k-step issues four MADPS updates (C11..C14) from a single A operand
     (A1, A2, A3, A4 in turn) and one B pair plus its PLU-derived partners.
     Per iteration BO advances by 16 * SIZE and AO by 8 * SIZE (half as much,
     since only one A element is consumed per k-step). Loads issued after the
     pointer update use offset 0 and fetch operands for the next iteration, so
     the load / PLU / MADPS interleaving must not be reordered. */
  1727. .L2110:
  1728. daddiu L, L, -1
  1729. gsLQC1(R13, F13, F12, 1) # next B pair (consumed below as B5/B6)
  1730. MADPS C11, C11, A1, B1
  1731. MADPS C12, C12, A1, B2
  1732. MADPS C13, C13, A1, B3
  1733. MADPS C14, C14, A1, B4
  1734. PLU B7, B5, B5
  1735. PLU B8, B6, B6
  1736. gsLQC1(R13, F9, F8, 2) # B1 B2
  1737. MADPS C11, C11, A2, B5
  1738. MADPS C12, C12, A2, B6
  1739. gsLQC1(R12, F3, F2, 1) # A3 A4
  1740. MADPS C13, C13, A2, B7
  1741. MADPS C14, C14, A2, B8
  1742. PLU B3, B1, B1
  1743. PLU B4, B2, B2
  1744. gsLQC1(R13, F13, F12, 3) # next B pair (consumed below as B5/B6)
  1745. MADPS C11, C11, A3, B1
  1746. MADPS C12, C12, A3, B2
  1747. daddiu BO, BO, 4 * 4 * SIZE # advance B past the 4 k-steps consumed here
  1748. daddiu AO, AO, 2 * 4 * SIZE # advance A past the 4 k-steps consumed here
  1749. MADPS C13, C13, A3, B3
  1750. MADPS C14, C14, A3, B4
  1751. PLU B7, B5, B5
  1752. PLU B8, B6, B6
  1753. gsLQC1(R13, F9, F8, 0) # B1 B2
  1754. MADPS C11, C11, A4, B5
  1755. MADPS C12, C12, A4, B6
  1756. gsLQC1(R12, F1, F0, 0) # A1 A2
  1757. MADPS C13, C13, A4, B7
  1758. MADPS C14, C14, A4, B8
  1759. PLU B3, B1, B1
  1760. bgtz L, .L2110
  1761. PLU B4, B2, B2 # branch delay slot: runs whether or not the loop repeats
  1762. .align 4
/*
 * .L212: 1x2 tile, handles two leftover K steps when bit 1 of the K count is
 * set (K in the GEMM build, TEMP in the TRMM build). Consumes A1 and A2, then
 * reloads A1/A2 and B1/B2 and rebuilds B3/B4 for the single-step tail at
 * .L217.
 */
1763. .L212:
1764. #ifndef TRMMKERNEL
1765. andi L, K, 2
1766. #else
1767. andi L, TEMP, 2
1768. #endif
1769. blez L, .L217
1770. NOP
1771. gsLQC1(R13, F13, F12, 1) # B3 B4
1772. MADPS C11, C11, A1, B1
1773. MADPS C12, C12, A1, B2
1774. MADPS C13, C13, A1, B3
1775. MADPS C14, C14, A1, B4
1776. PLU B7, B5, B5
1777. PLU B8, B6, B6
/* BO is advanced after the B load above, so the later offset-0 load fetches
   the next pair. */
1778. daddiu BO, BO, 2 * 4 * SIZE
1779. MADPS C11, C11, A2, B5
1780. MADPS C12, C12, A2, B6
1781. daddiu AO, AO, 4 * SIZE
1782. MADPS C13, C13, A2, B7
1783. MADPS C14, C14, A2, B8
1784. gsLQC1(R12, F1, F0, 0) # A5 A6
1785. gsLQC1(R13, F9, F8, 0) # B1 B2
1786. PLU B3, B1, B1
1787. PLU B4, B2, B2
  1788. .align 4
/*
 * .L217: 1x2 tile, handles the last single K step when bit 0 of the K count
 * is set (K in the GEMM build, TEMP in the TRMM build). Uses the A1 and
 * B1..B4 values already in registers and advances BO by 4 * SIZE and AO by
 * 2 * SIZE.
 */
1789. .L217:
1790. #ifndef TRMMKERNEL
1791. andi L, K, 1
1792. #else
1793. andi L, TEMP, 1
1794. #endif
1795. blez L, .L210
1796. NOP
1797. MADPS C11, C11, A1, B1
1798. daddiu BO, BO, 4 * SIZE
1799. MADPS C12, C12, A1, B2
1800. daddiu AO, AO, 2 * SIZE
1801. MADPS C13, C13, A1, B3
1802. MADPS C14, C14, A1, B4
  1803. .align 4
/*
 * .L210: write-back of the 1x2 tile.
 * CVTU copies part of each accumulator into A1/A3/A5/A7 (presumably the upper
 * half of the paired-single value - confirm against the macro definition).
 * Exactly one of the four conjugation cases below then combines the halves by
 * SUB/ADD so that C11/C13 hold the real/imaginary result for column CO1 and
 * C12/C14 the same for column CO2.
 * alpha_r and alpha_i are read from 152($sp) and 160($sp) into A4 and A2;
 * the trailing "A1 = alpha_r" remarks are stale for this tile, which uses A4.
 * GEMM build:  C += alpha * result   (C is loaded, updated and stored).
 * TRMM build:  C  = alpha * result   (MUL replaces LD + MADD; C is not read).
 * NOTE(review): NMSUB d, a, b, c is presumably d = a - b * c - confirm.
 * I is decremented but not tested again in this section; control falls
 * through towards .L20.
 */
1804. .L210: # Write Back
1805. #ifndef TRMMKERNEL
1806. daddiu I, I, -1
1807. CVTU A1, C11
1808. CVTU A3, C13
1809. CVTU A5, C12
1810. CVTU A7, C14
1811. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1812. /* (a + bi) * (c + di) */
1813. SUB C11, C11, A1 # ac'+'bd
1814. ADD C13, A3, C13 # ad'+'cb
1815. # LD A1, 0 * SIZE(A) # load alpha_r
1816. LD A4, 152($sp) # load alpha_r
1817. LD A2, 160($sp) # load alpha_i
1818. # LD A2, 0 * SIZE(A) # load alpha_i
1819. SUB C12, C12, A5
1820. ADD C14, A7, C14
1821. LD B1, 0 * SIZE(CO1)
1822. LD B2, 1 * SIZE(CO1)
1823. MADD B1, B1, C11, A4 # A1 = alpha_r
1824. MADD B2, B2, C13, A4
1825. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1826. MADD B2, B2, C11, A2
1827. LD B5, 0 * SIZE(CO2)
1828. LD B6, 1 * SIZE(CO2)
1829. MADD B5, B5, C12, A4
1830. ST B1, 0 * SIZE(CO1)
1831. MADD B6, B6, C14, A4
1832. ST B2, 1 * SIZE(CO1)
1833. NMSUB B5, B5, C14, A2
1834. MADD B6, B6, C12, A2
1835. ST B5, 0 * SIZE(CO2)
1836. ST B6, 1 * SIZE(CO2)
1837. #endif
1838. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1839. /* (a + bi) * (c - di) */
1840. ADD C11, A1, C11 # ac'+'bd
1841. SUB C13, A3, C13 # ad'+'cb
1842. # LD A1, 0 * SIZE(A) # load alpha_r
1843. LD A4, 152($sp) # load alpha_r
1844. LD A2, 160($sp) # load alpha_i
1845. # LD A2, 0 * SIZE(A) # load alpha_r
1846. ADD C12, A5, C12
1847. SUB C14, A7, C14
1848. LD B1, 0 * SIZE(CO1)
1849. LD B2, 1 * SIZE(CO1)
1850. MADD B1, B1, C11, A4 # A1 = alpha_r
1851. MADD B2, B2, C13, A4
1852. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1853. MADD B2, B2, C11, A2
1854. LD B5, 0 * SIZE(CO2)
1855. LD B6, 1 * SIZE(CO2)
1856. MADD B5, B5, C12, A4
1857. ST B1, 0 * SIZE(CO1)
1858. MADD B6, B6, C14, A4
1859. ST B2, 1 * SIZE(CO1)
1860. NMSUB B5, B5, C14, A2
1861. MADD B6, B6, C12, A2
1862. ST B5, 0 * SIZE(CO2)
1863. ST B6, 1 * SIZE(CO2)
1864. #endif
1865. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1866. /* (a - bi) * (c + di) */
1867. ADD C11, A1, C11 # ac'+'bd
1868. SUB C13, C13, A3 # ad'+'cb
1869. # LD A1, 0 * SIZE(A) # load alpha_r
1870. LD A4, 152($sp) # load alpha_r
1871. # LD A2, 0 * SIZE(A) # load alpha_r
1872. LD A2, 160($sp) # load alpha_i
1873. ADD C12, A5, C12
1874. SUB C14, C14, A7
1875. LD B1, 0 * SIZE(CO1)
1876. LD B2, 1 * SIZE(CO1)
1877. MADD B1, B1, C11, A4 # A1 = alpha_r
1878. MADD B2, B2, C13, A4
1879. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1880. MADD B2, B2, C11, A2
1881. LD B5, 0 * SIZE(CO2)
1882. LD B6, 1 * SIZE(CO2)
1883. MADD B5, B5, C12, A4
1884. ST B1, 0 * SIZE(CO1)
1885. MADD B6, B6, C14, A4
1886. ST B2, 1 * SIZE(CO1)
1887. NMSUB B5, B5, C14, A2
1888. MADD B6, B6, C12, A2
1889. ST B5, 0 * SIZE(CO2)
1890. ST B6, 1 * SIZE(CO2)
1891. #endif
1892. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1893. /* (a - bi) * (c - di) */
1894. SUB C11, C11, A1 # ac'+'bd
1895. ADD C13, A3, C13 # ad'+'cb
1896. LD A4, 152($sp) # load alpha_r
1897. # LD A1, 0 * SIZE(A) # load alpha_r
1898. LD A2, 160($sp)
1899. # LD A2, 0 * SIZE(A) # load alpha_i
1900. SUB C12, C12, A5
1901. ADD C14, A7, C14
1902. NEG C13, C13
1903. LD B1, 0 * SIZE(CO1)
1904. LD B2, 1 * SIZE(CO1)
1905. NEG C14, C14
1906. MADD B1, B1, C11, A4 # A1 = alpha_r
1907. MADD B2, B2, C13, A4
1908. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1909. MADD B2, B2, C11, A2
1910. LD B5, 0 * SIZE(CO2)
1911. LD B6, 1 * SIZE(CO2)
1912. MADD B5, B5, C12, A4
1913. ST B1, 0 * SIZE(CO1)
1914. MADD B6, B6, C14, A4
1915. ST B2, 1 * SIZE(CO1)
1916. NMSUB B5, B5, C14, A2
1917. MADD B6, B6, C12, A2
1918. ST B5, 0 * SIZE(CO2)
1919. ST B6, 1 * SIZE(CO2)
1920. #endif
1921. #else
/* TRMM build: same four conjugation cases, but the result overwrites C. */
1922. daddiu I, I, -1
1923. CVTU A1, C11
1924. CVTU A3, C13
1925. CVTU A5, C12
1926. CVTU A7, C14
1927. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
1928. /* (a + bi) * (c + di) */
1929. SUB C11, C11, A1 # ac'+'bd
1930. ADD C13, A3, C13 # ad'+'cb
1931. # LD A1, 0 * SIZE(A) # load alpha_r
1932. LD A4, 152($sp) # load alpha_r
1933. LD A2, 160($sp) # load alpha_i
1934. # LD A2, 0 * SIZE(A) # load alpha_i
1935. SUB C12, C12, A5
1936. ADD C14, A7, C14
1937. MUL B1, C11, A4 # A1 = alpha_r
1938. MUL B2, C13, A4
1939. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1940. MADD B2, B2, C11, A2
1941. MUL B5, C12, A4
1942. ST B1, 0 * SIZE(CO1)
1943. MUL B6, C14, A4
1944. ST B2, 1 * SIZE(CO1)
1945. NMSUB B5, B5, C14, A2
1946. MADD B6, B6, C12, A2
1947. ST B5, 0 * SIZE(CO2)
1948. ST B6, 1 * SIZE(CO2)
1949. #endif
1950. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
1951. /* (a + bi) * (c - di) */
1952. ADD C11, A1, C11 # ac'+'bd
1953. SUB C13, A3, C13 # ad'+'cb
1954. # LD A1, 0 * SIZE(A) # load alpha_r
1955. LD A4, 152($sp) # load alpha_r
1956. LD A2, 160($sp) # load alpha_i
1957. # LD A2, 0 * SIZE(A) # load alpha_r
1958. ADD C12, A5, C12
1959. SUB C14, A7, C14
1960. MUL B1, C11, A4 # A1 = alpha_r
1961. MUL B2, C13, A4
1962. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1963. MADD B2, B2, C11, A2
1964. MUL B5, C12, A4
1965. ST B1, 0 * SIZE(CO1)
1966. MUL B6, C14, A4
1967. ST B2, 1 * SIZE(CO1)
1968. NMSUB B5, B5, C14, A2
1969. MADD B6, B6, C12, A2
1970. ST B5, 0 * SIZE(CO2)
1971. ST B6, 1 * SIZE(CO2)
1972. #endif
1973. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
1974. /* (a - bi) * (c + di) */
1975. ADD C11, A1, C11 # ac'+'bd
1976. SUB C13, C13, A3 # ad'+'cb
1977. # LD A1, 0 * SIZE(A) # load alpha_r
1978. LD A4, 152($sp) # load alpha_r
1979. # LD A2, 0 * SIZE(A) # load alpha_r
1980. LD A2, 160($sp) # load alpha_i
1981. ADD C12, A5, C12
1982. SUB C14, C14, A7
1983. MUL B1, C11, A4 # A1 = alpha_r
1984. MUL B2, C13, A4
1985. NMSUB B1, B1, C13, A2 # A2 = alpha_i
1986. MADD B2, B2, C11, A2
1987. MUL B5, C12, A4
1988. ST B1, 0 * SIZE(CO1)
1989. MUL B6, C14, A4
1990. ST B2, 1 * SIZE(CO1)
1991. NMSUB B5, B5, C14, A2
1992. MADD B6, B6, C12, A2
1993. ST B5, 0 * SIZE(CO2)
1994. ST B6, 1 * SIZE(CO2)
1995. #endif
1996. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
1997. /* (a - bi) * (c - di) */
1998. SUB C11, C11, A1 # ac'+'bd
1999. ADD C13, A3, C13 # ad'+'cb
2000. LD A4, 152($sp) # load alpha_r
2001. # LD A1, 0 * SIZE(A) # load alpha_r
2002. LD A2, 160($sp)
2003. # LD A2, 0 * SIZE(A) # load alpha_i
2004. SUB C12, C12, A5
2005. ADD C14, A7, C14
2006. NEG C13, C13
2007. NEG C14, C14
2008. MUL B1, C11, A4 # A1 = alpha_r
2009. MUL B2, C13, A4
2010. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2011. MADD B2, B2, C11, A2
2012. MUL B5, C12, A4
2013. ST B1, 0 * SIZE(CO1)
2014. MUL B6, C14, A4
2015. ST B2, 1 * SIZE(CO1)
2016. NMSUB B5, B5, C14, A2
2017. MADD B6, B6, C12, A2
2018. ST B5, 0 * SIZE(CO2)
2019. ST B6, 1 * SIZE(CO2)
2020. #endif
/* TRMM bookkeeping: advance AO and BO over the K steps this tile did not
   consume (K - KK minus 1 for LEFT, minus 2 otherwise), scaled by 1 element
   of A and 2 elements of B per step; for LEFT, KK then grows by the tile
   height (1). */
2021. #if ( defined(LEFT) && defined(TRANSA)) || \
2022. (!defined(LEFT) && !defined(TRANSA))
2023. dsubu TEMP, K, KK
2024. #ifdef LEFT
2025. daddiu TEMP, TEMP, -1
2026. #else
2027. daddiu TEMP, TEMP, -2
2028. #endif
2029. dsll L, TEMP, ZBASE_SHIFT
2030. dsll TEMP, TEMP, 1 + ZBASE_SHIFT
2031. daddu AO, AO, L
2032. daddu BO, BO, TEMP
2033. #endif
2034. #ifdef LEFT
2035. daddiu KK, KK, 1
2036. #endif
2037. #endif
/* Step both column pointers past the element just written (2 * SIZE each). */
2038. daddiu CO1, CO1, 2 * SIZE
2039. daddiu CO2, CO2, 2 * SIZE
  2040. .align 4
/*
 * .L20: end of one pass of the two-column panel loop.
 * Decrements J, makes the current BO the new B base, and in a right-side
 * TRMM build advances KK by the panel width (2). Loops back to .L24 (defined
 * before this chunk) while J > 0.
 */
2041. .L20:
2042. daddiu J, J, -1
2043. move B, BO
2044. #if defined(TRMMKERNEL) && !defined(LEFT)
2045. daddiu KK, KK, 2
2046. #endif
2047. bgtz J, .L24
2048. NOP
  2049. .align 4
/*
 * .L1 / .L14: single-column panel, entered only when N is odd (N & 1);
 * otherwise control goes to .L999 (defined outside this chunk).
 * I = M >> 2 is the number of 4-row tiles; the trailing "MR=8" remark is
 * stale. AO is reset to A, CO1 to C, and a left-side TRMM build reloads KK
 * from OFFSET.
 * C is advanced by LDC in the branch delay slot, so the update happens
 * whether or not the branch to .L12 is taken.
 */
2050. .L1:
2051. andi J, N, 1
2052. blez J, .L999
2053. NOP
2054. .L14:
2055. dsra I, M, 2 # MR=8
2056. move AO, A # Reset A
2057. #if defined(TRMMKERNEL) && defined(LEFT)
2058. move KK, OFFSET
2059. #endif
2060. move CO1, C
2061. blez I, .L12
2062. daddu C, CO1, LDC
  2063. .align 4
/*
 * .L141: set-up for one 4x1 tile (four rows of C in the single column CO1).
 * Clears the eight accumulators C11..C41 and C13..C43, preloads B1/B2 and
 * A1..A4, derives B3/B4 with PLU, and computes L, the number of 4-way
 * unrolled K iterations.
 * NOTE(review): FETCH is presumably a prefetch of the C tile - confirm.
 */
2064. .L141:
2065. #if defined(TRMMKERNEL)
/* TRMM build: either start at the top of the packed B panel, or skip the
   first KK steps of K (4 elements of A and 1 element of B per step). */
2066. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2067. move BO, B
2068. #else
2069. dsll L, KK, 2 + ZBASE_SHIFT
2070. dsll TEMP, KK, ZBASE_SHIFT
2071. daddu AO, AO, L
2072. daddu BO, B, TEMP
2073. #endif
2074. MTC $0, C11 # CLEAR REAULTS REGISTERS
2075. MOV C21, C11
2076. gsLQC1(R13, F9, F8, 0) # B1 B2
2077. gsLQC1(R12, F1, F0, 0) # A1 A2
2078. MOV C31, C11
2079. MOV C41, C11
2080. gsLQC1(R12, F3, F2, 1) # A3 A4
2081. MOV C13, C11
2082. MOV C23, C11
2083. FETCH $0, 0 * SIZE(CO1)
2084. MOV C33, C11
2085. MOV C43, C11
2086. FETCH $0, 8 * SIZE(CO1)
2087. PLU B3, B1, B1
2088. PLU B4, B2, B2
/* TEMP = number of K steps this tile actually runs in the TRMM build. */
2089. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2090. dsubu TEMP, K, KK
2091. #elif defined(LEFT)
2092. daddiu TEMP, KK, 4 # define Mr=4
2093. #else
2094. daddiu TEMP, KK, 1 # define NR=1
2095. #endif
2096. dsra L, TEMP, 2
2097. blez L, .L142
2098. NOP
2099. #else
/* GEMM build: same set-up with K used directly; the final PLU sits in the
   branch delay slot. The "K=64" remark below is stale: the shift by 2 yields
   K / 4. */
2100. move BO, B # Reset B
2101. dsra L, K, 2 # UnRoll K=64
2102. MTC $0, C11 # CLEAR REAULTS REGISTERS
2103. MOV C21, C11
2104. gsLQC1(R13, F9, F8, 0) # B1 B2
2105. gsLQC1(R12, F1, F0, 0) # A1 A2
2106. MOV C31, C11
2107. MOV C41, C11
2108. gsLQC1(R12, F3, F2, 1) # A3 A4
2109. MOV C13, C11
2110. MOV C23, C11
2111. FETCH $0, 0 * SIZE(CO1)
2112. MOV C33, C11
2113. MOV C43, C11
2114. FETCH $0, 8 * SIZE(CO1)
2115. PLU B3, B1, B1
2116. blez L, .L142
2117. PLU B4, B2, B2
2118. #endif
/*
 * .L1410: main loop of the 4x1 tile, four K steps per iteration.
 * Each step multiplies four A operands by one B value and by its PLU
 * counterpart, accumulating into C11..C41 and C13..C43. The register sets
 * A1..A4 and A5..A8 alternate so loads for the next step overlap the
 * multiply-adds of the current one.
 * Per iteration BO advances by 2 * 4 * SIZE and AO by 8 * 4 * SIZE; the
 * trailing "4KR*4NR" / "4KR*8MR" remarks do not match this tile shape.
 */
2119. .L1410:
2120. daddiu L, L, -1
2121. MADPS C11, C11, A1, B1
2122. MADPS C21, C21, A2, B1
2123. gsLQC1(R12, F5, F4, 2) # A5 A6
2124. gsLQC1(R12, F7, F6, 3) # A7 A8
2125. MADPS C31, C31, A3, B1
2126. MADPS C41, C41, A4, B1
2127. MADPS C13, C13, A1, B3
2128. MADPS C23, C23, A2, B3
2129. MADPS C33, C33, A3, B3
2130. MADPS C43, C43, A4, B3
2131. gsLQC1(R13, F13, F12, 1) # B3 B4
2132. gsLQC1(R12, F1, F0, 4) # A1 A2
2133. MADPS C11, C11, A5, B2
2134. MADPS C21, C21, A6, B2
2135. gsLQC1(R12, F3, F2, 5) # A3 A4
2136. MADPS C31, C31, A7, B2
2137. MADPS C41, C41, A8, B2
/* BO is bumped only after the last B load of this iteration has been issued. */
2138. daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR
2139. MADPS C13, C13, A5, B4
2140. MADPS C23, C23, A6, B4
2141. MADPS C33, C33, A7, B4
2142. MADPS C43, C43, A8, B4
2143. PLU B7, B5, B5
2144. PLU B8, B6, B6
2145. MADPS C11, C11, A1, B5
2146. MADPS C21, C21, A2, B5
2147. gsLQC1(R12, F5, F4, 6) # A5 A6
2148. gsLQC1(R12, F7, F6, 7) # A7 A8
2149. MADPS C31, C31, A3, B5
2150. MADPS C41, C41, A4, B5
/* AO is bumped after the offset-6/7 loads; the offset-0/1 loads below then
   fetch A for the next iteration. */
2151. daddiu AO, AO, 8 * 4 * SIZE # 4KR*8MR
2152. MADPS C13, C13, A1, B7
2153. MADPS C23, C23, A2, B7
2154. MADPS C33, C33, A3, B7
2155. MADPS C43, C43, A4, B7
2156. gsLQC1(R13, F9, F8, 0) # B1 B2
2157. gsLQC1(R12, F1, F0, 0) # A1 A2
2158. MADPS C11, C11, A5, B6
2159. MADPS C21, C21, A6, B6
2160. gsLQC1(R12, F3, F2, 1) # A3 A4
2161. MADPS C31, C31, A7, B6
2162. MADPS C41, C41, A8, B6
2163. MADPS C13, C13, A5, B8
2164. MADPS C23, C23, A6, B8
2165. MADPS C33, C33, A7, B8
2166. MADPS C43, C43, A8, B8
2167. PLU B3, B1, B1
/* Loop while L > 0; the PLU after the branch executes in its delay slot. */
2168. bgtz L, .L1410
2169. PLU B4, B2, B2
  2170. .align 4
/*
 * .L142: 4x1 tile, handles two leftover K steps when bit 1 of the K count is
 * set (K in the GEMM build, TEMP in the TRMM build). The first step uses
 * A1..A4 with B1/B3, the second A5..A8 with B2/B4. A1..A4, B1 and B3 are
 * then refreshed for the single-step tail at .L147.
 */
2171. .L142:
2172. #ifndef TRMMKERNEL
2173. andi L, K, 2
2174. #else
2175. andi L, TEMP, 2
2176. #endif
2177. blez L, .L147
2178. NOP
2179. MADPS C11, C11, A1, B1
2180. MADPS C21, C21, A2, B1
2181. gsLQC1(R12, F5, F4, 2) # A5 A6
2182. gsLQC1(R12, F7, F6, 3) # A7 A8
2183. MADPS C31, C31, A3, B1
2184. MADPS C41, C41, A4, B1
2185. daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR
2186. MADPS C13, C13, A1, B3
2187. MADPS C23, C23, A2, B3
2188. MADPS C33, C33, A3, B3
2189. MADPS C43, C43, A4, B3
/* NOTE(review): this load names the pair (F13, F8), not the (F13, F12) used
   in .L1410. Only B1 is read afterwards in this tile (at .L147), so this
   looks like a deliberate reload of B1 for the tail; the "B3 B4" remark is
   stale - confirm. */
2190. gsLQC1(R13, F13, F8, 1) # B3 B4
2191. gsLQC1(R12, F1, F0, 0) # A1 A2
2192. MADPS C11, C11, A5, B2
2193. MADPS C21, C21, A6, B2
2194. gsLQC1(R12, F3, F2, 1) # A3 A4
2195. MADPS C31, C31, A7, B2
2196. MADPS C41, C41, A8, B2
2197. daddiu BO, BO, 4 * SIZE # 4KR*4NR
2198. MADPS C13, C13, A5, B4
2199. MADPS C23, C23, A6, B4
2200. MADPS C33, C33, A7, B4
2201. MADPS C43, C43, A8, B4
2202. PLU B3, B1, B1
  2203. .align 4
/*
 * .L147: 4x1 tile, handles the last single K step when bit 0 of the K count
 * is set (K in the GEMM build, TEMP in the TRMM build). Uses the A1..A4 and
 * B1/B3 values already in registers and advances BO by 2 * SIZE and AO by
 * 2 * 4 * SIZE.
 */
2204. .L147:
2205. #ifndef TRMMKERNEL
2206. andi L, K, 1
2207. #else
2208. andi L, TEMP, 1
2209. #endif
2210. blez L, .L140
2211. NOP
2212. MADPS C11, C11, A1, B1
2213. MADPS C21, C21, A2, B1
2214. daddiu BO, BO, 2 * SIZE
2215. MADPS C31, C31, A3, B1
2216. MADPS C41, C41, A4, B1
2217. daddiu AO, AO, 2 * 4 * SIZE
2218. MADPS C13, C13, A1, B3
2219. MADPS C23, C23, A2, B3
2220. MADPS C33, C33, A3, B3
2221. MADPS C43, C43, A4, B3
  2222. .align 4
/*
 * .L140: write-back of the 4x1 tile, then loop to the next 4-row tile.
 * CVTU copies part of each accumulator into A1..A8 (presumably the upper half
 * of the paired-single value - confirm against the macro definition).
 * Exactly one of the four conjugation cases below then combines the halves by
 * SUB/ADD so that C11..C41 hold the real parts and C13..C43 the imaginary
 * parts of the four results.
 * alpha_r and alpha_i are read from 152($sp) and 160($sp) into A1 and A2,
 * after the old A1/A2 values have been consumed.
 * GEMM build:  C += alpha * result   (C is loaded, updated and stored).
 * TRMM build:  C  = alpha * result   (MUL replaces LD + MADD; C is not read).
 * NOTE(review): NMSUB d, a, b, c is presumably d = a - b * c - confirm.
 * NOTE(review): the CVTU B1..B8 conversions read C12..C44, which the visible
 * 4x1 compute code neither clears nor accumulates, and B1..B8 are overwritten
 * (by LD or MUL) before being read; they look redundant - confirm before
 * removing.
 */
2223. .L140: # Write Back
2224. #ifndef TRMMKERNEL
2225. daddiu I, I, -1
2226. CVTU A1, C11
2227. CVTU A2, C21
2228. CVTU A3, C31
2229. CVTU A4, C41
2230. CVTU A5, C13
2231. CVTU A6, C23
2232. CVTU A7, C33
2233. CVTU A8, C43
2234. CVTU B1, C12
2235. CVTU B2, C22
2236. CVTU B3, C32
2237. CVTU B4, C42
2238. CVTU B5, C14
2239. CVTU B6, C24
2240. CVTU B7, C34
2241. CVTU B8, C44
2242. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2243. /* (a + bi) * (c + di) */
2244. SUB C11, C11, A1 # ac'+'bd
2245. SUB C21, C21, A2
2246. # LD A1, 0 * SIZE(A) # load alpha_r
2247. SUB C31, C31, A3
2248. LD A1, 152($sp) # load alpha_r
2249. SUB C41, C41, A4
2250. LD A2, 160($sp) # load alpha_i
2251. # LD A2, 0 * SIZE(A) # load alpha_i
2252. ADD C13, A5, C13 # ad'+'cb
2253. ADD C23, A6, C23
2254. ADD C33, A7, C33
2255. ADD C43, A8, C43
2256. LD B1, 0 * SIZE(CO1)
2257. LD B3, 2 * SIZE(CO1)
2258. LD B5, 4 * SIZE(CO1)
2259. LD B7, 6 * SIZE(CO1)
2260. LD B2, 1 * SIZE(CO1)
2261. LD B4, 3 * SIZE(CO1)
2262. LD B6, 5 * SIZE(CO1)
2263. LD B8, 7 * SIZE(CO1)
2264. MADD B1, B1, C11, A1 # A1 = alpha_r
2265. MADD B3, B3, C21, A1
2266. MADD B5, B5, C31, A1
2267. MADD B7, B7, C41, A1
2268. MADD B2, B2, C13, A1
2269. MADD B4, B4, C23, A1
2270. MADD B6, B6, C33, A1
2271. MADD B8, B8, C43, A1
2272. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2273. NMSUB B3, B3, C23, A2
2274. NMSUB B5, B5, C33, A2
2275. NMSUB B7, B7, C43, A2
2276. MADD B2, B2, C11, A2
2277. MADD B4, B4, C21, A2
2278. MADD B6, B6, C31, A2
2279. MADD B8, B8, C41, A2
2280. ST B1, 0 * SIZE(CO1)
2281. ST B3, 2 * SIZE(CO1)
2282. ST B5, 4 * SIZE(CO1)
2283. ST B7, 6 * SIZE(CO1)
2284. ST B2, 1 * SIZE(CO1)
2285. ST B4, 3 * SIZE(CO1)
2286. ST B6, 5 * SIZE(CO1)
2287. ST B8, 7 * SIZE(CO1)
2288. #endif
2289. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2290. /* (a + bi) * (c - di) */
2291. ADD C11, A1, C11 # ac'+'bd
2292. ADD C21, A2, C21
2293. # LD A1, 0 * SIZE(A) # load alpha_r
2294. ADD C31, A3, C31
2295. LD A1, 152($sp) # load alpha_r
2296. ADD C41, A4, C41
2297. LD A2, 160($sp) # load alpha_i
2298. # LD A2, 0 * SIZE(A) # load alpha_r
2299. SUB C13, A5, C13 # ad'+'cb
2300. SUB C23, A6, C23
2301. SUB C33, A7, C33
2302. SUB C43, A8, C43
2303. LD B1, 0 * SIZE(CO1)
2304. LD B3, 2 * SIZE(CO1)
2305. LD B5, 4 * SIZE(CO1)
2306. LD B7, 6 * SIZE(CO1)
2307. LD B2, 1 * SIZE(CO1)
2308. LD B4, 3 * SIZE(CO1)
2309. LD B6, 5 * SIZE(CO1)
2310. LD B8, 7 * SIZE(CO1)
2311. MADD B1, B1, C11, A1 # A1 = alpha_r
2312. MADD B3, B3, C21, A1
2313. MADD B5, B5, C31, A1
2314. MADD B7, B7, C41, A1
2315. MADD B2, B2, C13, A1
2316. MADD B4, B4, C23, A1
2317. MADD B6, B6, C33, A1
2318. MADD B8, B8, C43, A1
2319. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2320. NMSUB B3, B3, C23, A2
2321. NMSUB B5, B5, C33, A2
2322. NMSUB B7, B7, C43, A2
2323. MADD B2, B2, C11, A2
2324. MADD B4, B4, C21, A2
2325. MADD B6, B6, C31, A2
2326. MADD B8, B8, C41, A2
2327. ST B1, 0 * SIZE(CO1)
2328. ST B3, 2 * SIZE(CO1)
2329. ST B5, 4 * SIZE(CO1)
2330. ST B7, 6 * SIZE(CO1)
2331. ST B2, 1 * SIZE(CO1)
2332. ST B4, 3 * SIZE(CO1)
2333. ST B6, 5 * SIZE(CO1)
2334. ST B8, 7 * SIZE(CO1)
2335. #endif
2336. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2337. /* (a - bi) * (c + di) */
2338. ADD C11, A1, C11 # ac'+'bd
2339. ADD C21, A2, C21
2340. # LD A1, 0 * SIZE(A) # load alpha_r
2341. ADD C31, A3, C31
2342. LD A1, 152($sp) # load alpha_r
2343. # LD A2, 0 * SIZE(A) # load alpha_r
2344. ADD C41, A4, C41
2345. LD A2, 160($sp) # load alpha_i
2346. SUB C13, C13, A5 # ad'+'cb
2347. SUB C23, C23, A6
2348. SUB C33, C33, A7
2349. SUB C43, C43, A8
2350. LD B1, 0 * SIZE(CO1)
2351. LD B3, 2 * SIZE(CO1)
2352. LD B5, 4 * SIZE(CO1)
2353. LD B7, 6 * SIZE(CO1)
2354. LD B2, 1 * SIZE(CO1)
2355. LD B4, 3 * SIZE(CO1)
2356. LD B6, 5 * SIZE(CO1)
2357. LD B8, 7 * SIZE(CO1)
2358. MADD B1, B1, C11, A1 # A1 = alpha_r
2359. MADD B3, B3, C21, A1
2360. MADD B5, B5, C31, A1
2361. MADD B7, B7, C41, A1
2362. MADD B2, B2, C13, A1
2363. MADD B4, B4, C23, A1
2364. MADD B6, B6, C33, A1
2365. MADD B8, B8, C43, A1
2366. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2367. NMSUB B3, B3, C23, A2
2368. NMSUB B5, B5, C33, A2
2369. NMSUB B7, B7, C43, A2
2370. MADD B2, B2, C11, A2
2371. MADD B4, B4, C21, A2
2372. MADD B6, B6, C31, A2
2373. MADD B8, B8, C41, A2
2374. ST B1, 0 * SIZE(CO1)
2375. ST B3, 2 * SIZE(CO1)
2376. ST B5, 4 * SIZE(CO1)
2377. ST B7, 6 * SIZE(CO1)
2378. ST B2, 1 * SIZE(CO1)
2379. ST B4, 3 * SIZE(CO1)
2380. ST B6, 5 * SIZE(CO1)
2381. ST B8, 7 * SIZE(CO1)
2382. #endif
2383. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2384. /* (a - bi) * (c - di) */
2385. SUB C11, C11, A1 # AC'+'BD
2386. SUB C21, C21, A2
2387. SUB C31, C31, A3
2388. LD A1, 152($sp) # LOAD ALPHA_R
2389. # LD A1, 0 * SIZE(A) # LOAD ALPHA_R
2390. SUB C41, C41, A4
2391. LD A2, 160($sp)
2392. # LD A2, 0 * SIZE(A) # LOAD ALPHA_I
2393. ADD C13, A5, C13 # AD'+'CB
2394. ADD C23, A6, C23
2395. ADD C33, A7, C33
2396. ADD C43, A8, C43
2397. NEG C13, C13 # AD'+'CB
2398. NEG C23, C23
2399. NEG C33, C33
2400. NEG C43, C43
2401. LD B1, 0 * SIZE(CO1)
2402. LD B3, 2 * SIZE(CO1)
2403. LD B5, 4 * SIZE(CO1)
2404. LD B7, 6 * SIZE(CO1)
2405. LD B2, 1 * SIZE(CO1)
2406. LD B4, 3 * SIZE(CO1)
2407. LD B6, 5 * SIZE(CO1)
2408. LD B8, 7 * SIZE(CO1)
2409. MADD B1, B1, C11, A1 # A1 = ALPHA_R
2410. MADD B3, B3, C21, A1
2411. MADD B5, B5, C31, A1
2412. MADD B7, B7, C41, A1
2413. MADD B2, B2, C13, A1
2414. MADD B4, B4, C23, A1
2415. MADD B6, B6, C33, A1
2416. MADD B8, B8, C43, A1
2417. NMSUB B1, B1, C13, A2 # A2 = ALPHA_I
2418. NMSUB B3, B3, C23, A2
2419. NMSUB B5, B5, C33, A2
2420. NMSUB B7, B7, C43, A2
2421. MADD B2, B2, C11, A2
2422. MADD B4, B4, C21, A2
2423. MADD B6, B6, C31, A2
2424. MADD B8, B8, C41, A2
2425. ST B1, 0 * SIZE(CO1)
2426. ST B3, 2 * SIZE(CO1)
2427. ST B5, 4 * SIZE(CO1)
2428. ST B7, 6 * SIZE(CO1)
2429. ST B2, 1 * SIZE(CO1)
2430. ST B4, 3 * SIZE(CO1)
2431. ST B6, 5 * SIZE(CO1)
2432. ST B8, 7 * SIZE(CO1)
2433. #endif
2434. #else
/* TRMM build: same four conjugation cases, but the result overwrites C. */
2435. daddiu I, I, -1
2436. CVTU A1, C11
2437. CVTU A2, C21
2438. CVTU A3, C31
2439. CVTU A4, C41
2440. CVTU A5, C13
2441. CVTU A6, C23
2442. CVTU A7, C33
2443. CVTU A8, C43
2444. CVTU B1, C12
2445. CVTU B2, C22
2446. CVTU B3, C32
2447. CVTU B4, C42
2448. CVTU B5, C14
2449. CVTU B6, C24
2450. CVTU B7, C34
2451. CVTU B8, C44
2452. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
2453. /* (a + bi) * (c + di) */
2454. SUB C11, C11, A1 # ac'+'bd
2455. SUB C21, C21, A2
2456. # LD A1, 0 * SIZE(A) # load alpha_r
2457. SUB C31, C31, A3
2458. LD A1, 152($sp) # load alpha_r
2459. SUB C41, C41, A4
2460. LD A2, 160($sp) # load alpha_i
2461. # LD A2, 0 * SIZE(A) # load alpha_i
2462. ADD C13, A5, C13 # ad'+'cb
2463. ADD C23, A6, C23
2464. ADD C33, A7, C33
2465. ADD C43, A8, C43
2466. MUL B1, C11, A1 # A1 = alpha_r
2467. MUL B3, C21, A1
2468. MUL B5, C31, A1
2469. MUL B7, C41, A1
2470. MUL B2, C13, A1
2471. MUL B4, C23, A1
2472. MUL B6, C33, A1
2473. MUL B8, C43, A1
2474. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2475. NMSUB B3, B3, C23, A2
2476. NMSUB B5, B5, C33, A2
2477. NMSUB B7, B7, C43, A2
2478. MADD B2, B2, C11, A2
2479. MADD B4, B4, C21, A2
2480. MADD B6, B6, C31, A2
2481. MADD B8, B8, C41, A2
2482. ST B1, 0 * SIZE(CO1)
2483. ST B3, 2 * SIZE(CO1)
2484. ST B5, 4 * SIZE(CO1)
2485. ST B7, 6 * SIZE(CO1)
2486. ST B2, 1 * SIZE(CO1)
2487. ST B4, 3 * SIZE(CO1)
2488. ST B6, 5 * SIZE(CO1)
2489. ST B8, 7 * SIZE(CO1)
2490. #endif
2491. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
2492. /* (a + bi) * (c - di) */
2493. ADD C11, A1, C11 # ac'+'bd
2494. ADD C21, A2, C21
2495. # LD A1, 0 * SIZE(A) # load alpha_r
2496. ADD C31, A3, C31
2497. LD A1, 152($sp) # load alpha_r
2498. ADD C41, A4, C41
2499. LD A2, 160($sp) # load alpha_i
2500. # LD A2, 0 * SIZE(A) # load alpha_r
2501. SUB C13, A5, C13 # ad'+'cb
2502. SUB C23, A6, C23
2503. SUB C33, A7, C33
2504. SUB C43, A8, C43
2505. MUL B1, C11, A1 # A1 = alpha_r
2506. MUL B3, C21, A1
2507. MUL B5, C31, A1
2508. MUL B7, C41, A1
2509. MUL B2, C13, A1
2510. MUL B4, C23, A1
2511. MUL B6, C33, A1
2512. MUL B8, C43, A1
2513. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2514. NMSUB B3, B3, C23, A2
2515. NMSUB B5, B5, C33, A2
2516. NMSUB B7, B7, C43, A2
2517. MADD B2, B2, C11, A2
2518. MADD B4, B4, C21, A2
2519. MADD B6, B6, C31, A2
2520. MADD B8, B8, C41, A2
2521. ST B1, 0 * SIZE(CO1)
2522. ST B3, 2 * SIZE(CO1)
2523. ST B5, 4 * SIZE(CO1)
2524. ST B7, 6 * SIZE(CO1)
2525. ST B2, 1 * SIZE(CO1)
2526. ST B4, 3 * SIZE(CO1)
2527. ST B6, 5 * SIZE(CO1)
2528. ST B8, 7 * SIZE(CO1)
2529. #endif
2530. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
2531. /* (a - bi) * (c + di) */
2532. ADD C11, A1, C11 # ac'+'bd
2533. ADD C21, A2, C21
2534. # LD A1, 0 * SIZE(A) # load alpha_r
2535. ADD C31, A3, C31
2536. LD A1, 152($sp) # load alpha_r
2537. # LD A2, 0 * SIZE(A) # load alpha_r
2538. ADD C41, A4, C41
2539. LD A2, 160($sp) # load alpha_i
2540. SUB C13, C13, A5 # ad'+'cb
2541. SUB C23, C23, A6
2542. SUB C33, C33, A7
2543. SUB C43, C43, A8
2544. MUL B1, C11, A1 # A1 = alpha_r
2545. MUL B3, C21, A1
2546. MUL B5, C31, A1
2547. MUL B7, C41, A1
2548. MUL B2, C13, A1
2549. MUL B4, C23, A1
2550. MUL B6, C33, A1
2551. MUL B8, C43, A1
2552. NMSUB B1, B1, C13, A2 # A2 = alpha_i
2553. NMSUB B3, B3, C23, A2
2554. NMSUB B5, B5, C33, A2
2555. NMSUB B7, B7, C43, A2
2556. MADD B2, B2, C11, A2
2557. MADD B4, B4, C21, A2
2558. MADD B6, B6, C31, A2
2559. MADD B8, B8, C41, A2
2560. ST B1, 0 * SIZE(CO1)
2561. ST B3, 2 * SIZE(CO1)
2562. ST B5, 4 * SIZE(CO1)
2563. ST B7, 6 * SIZE(CO1)
2564. ST B2, 1 * SIZE(CO1)
2565. ST B4, 3 * SIZE(CO1)
2566. ST B6, 5 * SIZE(CO1)
2567. ST B8, 7 * SIZE(CO1)
2568. #endif
2569. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
2570. /* (a - bi) * (c - di) */
2571. SUB C11, C11, A1 # AC'+'BD
2572. SUB C21, C21, A2
2573. SUB C31, C31, A3
2574. LD A1, 152($sp) # LOAD ALPHA_R
2575. # LD A1, 0 * SIZE(A) # LOAD ALPHA_R
2576. SUB C41, C41, A4
2577. LD A2, 160($sp)
2578. # LD A2, 0 * SIZE(A) # LOAD ALPHA_I
2579. ADD C13, A5, C13 # AD'+'CB
2580. ADD C23, A6, C23
2581. ADD C33, A7, C33
2582. ADD C43, A8, C43
2583. NEG C13, C13 # AD'+'CB
2584. NEG C23, C23
2585. NEG C33, C33
2586. NEG C43, C43
2587. MUL B1, C11, A1 # A1 = ALPHA_R
2588. MUL B3, C21, A1
2589. MUL B5, C31, A1
2590. MUL B7, C41, A1
2591. MUL B2, C13, A1
2592. MUL B4, C23, A1
2593. MUL B6, C33, A1
2594. MUL B8, C43, A1
2595. NMSUB B1, B1, C13, A2 # A2 = ALPHA_I
2596. NMSUB B3, B3, C23, A2
2597. NMSUB B5, B5, C33, A2
2598. NMSUB B7, B7, C43, A2
2599. MADD B2, B2, C11, A2
2600. MADD B4, B4, C21, A2
2601. MADD B6, B6, C31, A2
2602. MADD B8, B8, C41, A2
2603. ST B1, 0 * SIZE(CO1)
2604. ST B3, 2 * SIZE(CO1)
2605. ST B5, 4 * SIZE(CO1)
2606. ST B7, 6 * SIZE(CO1)
2607. ST B2, 1 * SIZE(CO1)
2608. ST B4, 3 * SIZE(CO1)
2609. ST B6, 5 * SIZE(CO1)
2610. ST B8, 7 * SIZE(CO1)
2611. #endif
/* TRMM bookkeeping: advance AO and BO over the K steps this tile did not
   consume (K - KK minus 4 for LEFT, minus 1 otherwise), scaled by 4 elements
   of A and 1 element of B per step; for LEFT, KK then grows by the tile
   height (4). */
2612. #if ( defined(LEFT) && defined(TRANSA)) || \
2613. (!defined(LEFT) && !defined(TRANSA))
2614. dsubu TEMP, K, KK
2615. #ifdef LEFT
2616. daddiu TEMP, TEMP, -4
2617. #else
2618. daddiu TEMP, TEMP, -1
2619. #endif
2620. dsll L, TEMP, 2 + ZBASE_SHIFT
2621. dsll TEMP, TEMP, ZBASE_SHIFT
2622. daddu AO, AO, L
2623. daddu BO, BO, TEMP
2624. #endif
2625. #ifdef LEFT
2626. daddiu KK, KK, 4
2627. #endif
2628. #endif
/* Next 4-row tile while I > 0. The CO1 bump is in the branch delay slot, so
   it executes whether or not the branch is taken. */
2629. bgtz I, .L141
2630. daddiu CO1, CO1, 8 * SIZE
  2631. .align 4
/*
 * .L12: test for a leftover 2-row tile in the single-column panel (M & 2);
 * skip to .L11 (defined outside this chunk) when there is none. The trailing
 * "MR=4" remark is stale: this path handles two rows.
 */
2632. .L12:
2633. andi I, M, 2 # MR=4
2634. blez I, .L11
2635. NOP
  2636. .align 4
/*
 * .L121: set-up for the 2x1 tile (two rows of C in the single column CO1).
 * Clears the four accumulators C11, C21, C13 and C23, preloads B1/B2 and
 * A1/A2, derives B3/B4 with PLU, and computes L, the number of 4-way
 * unrolled K iterations.
 */
2637. .L121:
2638. #if defined(TRMMKERNEL)
/* TRMM build: either start at the top of the packed B panel, or skip the
   first KK steps of K (2 elements of A and 1 element of B per step). */
2639. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
2640. move BO, B
2641. #else
2642. dsll L, KK, 1 + ZBASE_SHIFT
2643. dsll TEMP, KK, ZBASE_SHIFT
2644. daddu AO, AO, L
2645. daddu BO, B, TEMP
2646. #endif
2647. MTC $0, C11 # CLEAR REAULTS REGISTERS
2648. MOV C21, C11
2649. gsLQC1(R13, F9, F8, 0) # B1 B2
2650. gsLQC1(R12, F1, F0, 0) # A1 A2
2651. MOV C13, C11
2652. MOV C23, C11
2653. FETCH $0, 0 * SIZE(CO1)
2654. FETCH $0, 8 * SIZE(CO1)
2655. PLU B3, B1, B1
2656. PLU B4, B2, B2
/* TEMP = number of K steps this tile actually runs in the TRMM build. */
2657. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2658. dsubu TEMP, K, KK
2659. #elif defined(LEFT)
2660. daddiu TEMP, KK, 2
2661. #else
2662. daddiu TEMP, KK, 1
2663. #endif
2664. dsra L, TEMP, 2
2665. blez L, .L122
2666. NOP
2667. #else
/* GEMM build: same set-up with K used directly; the final PLU sits in the
   branch delay slot. The "K=64" remark below is stale: the shift by 2 yields
   K / 4. */
2668. move BO, B # Reset B
2669. dsra L, K, 2 # UnRoll K=64
2670. MTC $0, C11 # CLEAR REAULTS REGISTERS
2671. MOV C21, C11
2672. gsLQC1(R13, F9, F8, 0) # B1 B2
2673. gsLQC1(R12, F1, F0, 0) # A1 A2
2674. MOV C13, C11
2675. MOV C23, C11
2676. FETCH $0, 0 * SIZE(CO1)
2677. FETCH $0, 8 * SIZE(CO1)
2678. PLU B3, B1, B1
2679. blez L, .L122
2680. PLU B4, B2, B2
2681. #endif
/*
 * .L1210: main loop of the 2x1 tile, four K steps per iteration.
 * The steps use the A pairs (A1,A2), (A3,A4), (A5,A6), (A7,A8) against
 * B1/B3, B2/B4, B5/B7 and B6/B8 respectively, accumulating into C11, C21,
 * C13 and C23.
 * Per iteration BO advances by 2 * 4 * SIZE and AO by 4 * 4 * SIZE; the
 * trailing "4KR*4NR" / "4KR*8MR" remarks do not match this tile shape.
 */
2682. .L1210:
2683. daddiu L, L, -1
2684. gsLQC1(R13, F13, F12, 1) # B3 B4
2685. MADPS C11, C11, A1, B1
2686. MADPS C21, C21, A2, B1
2687. gsLQC1(R12, F3, F2, 1) # A3 A4
2688. MADPS C13, C13, A1, B3
2689. MADPS C23, C23, A2, B3
2690. gsLQC1(R12, F5, F4, 2) # A5 A6
2691. PLU B7, B5, B5
2692. PLU B8, B6, B6
/* BO is bumped after this iteration's last B load; the later offset-0 load
   fetches B for the next iteration. */
2693. daddiu BO, BO, 2 * 4 * SIZE # 4KR*4NR
2694. MADPS C11, C11, A3, B2
2695. MADPS C21, C21, A4, B2
2696. gsLQC1(R12, F7, F6, 3) # A7 A8
2697. MADPS C13, C13, A3, B4
2698. MADPS C23, C23, A4, B4
2699. MADPS C11, C11, A5, B5
2700. MADPS C21, C21, A6, B5
2701. daddiu AO, AO, 4 * 4 * SIZE # 4KR*8MR
2702. gsLQC1(R13, F9, F8, 0) # B1 B2
2703. MADPS C13, C13, A5, B7
2704. MADPS C23, C23, A6, B7
2705. gsLQC1(R12, F1, F0, 0) # A1 A2
2706. MADPS C11, C11, A7, B6
2707. MADPS C21, C21, A8, B6
2708. MADPS C13, C13, A7, B8
2709. MADPS C23, C23, A8, B8
2710. PLU B3, B1, B1
/* Loop while L > 0; the PLU after the branch executes in its delay slot. */
2711. bgtz L, .L1210
2712. PLU B4, B2, B2
  2713. .align 4
/*
 * .L122: 2x1 tile, handles two leftover K steps when bit 1 of the K count is
 * set (K in the GEMM build, TEMP in the TRMM build). The first step uses
 * A1/A2 with B1/B3, the second A3/A4 with B2/B4. A1/A2, B1/B2 and B3 are
 * then refreshed for the single-step tail at .L127.
 * NOTE(review): the result of "PLU B7, B5, B5" is not read in the visible
 * remainder of this tile - looks like a leftover; confirm.
 */
2714. .L122:
2715. #ifndef TRMMKERNEL
2716. andi L, K, 2
2717. #else
2718. andi L, TEMP, 2
2719. #endif
2720. blez L, .L127
2721. NOP
2722. MADPS C11, C11, A1, B1
2723. MADPS C21, C21, A2, B1
2724. gsLQC1(R12, F3, F2, 1) # A3 A4
2725. MADPS C13, C13, A1, B3
2726. MADPS C23, C23, A2, B3
2727. PLU B7, B5, B5
2728. daddiu BO, BO, 1 * 4 * SIZE
2729. daddiu AO, AO, 2 * 4 * SIZE
2730. MADPS C11, C11, A3, B2
2731. MADPS C21, C21, A4, B2
2732. MADPS C13, C13, A3, B4
2733. MADPS C23, C23, A4, B4
2734. gsLQC1(R13, F9, F8, 0)
2735. gsLQC1(R12, F1, F0, 0)
2736. PLU B3, B1, B1
  2737. .align 4
/*
 * .L127: 2x1 tile, handles the last single K step when bit 0 of the K count
 * is set (K in the GEMM build, TEMP in the TRMM build). Uses the A1/A2 and
 * B1/B3 values already in registers and advances BO by 2 * SIZE and AO by
 * 4 * SIZE.
 */
2738. .L127:
2739. #ifndef TRMMKERNEL
2740. andi L, K, 1
2741. #else
2742. andi L, TEMP, 1
2743. #endif
2744. blez L, .L120
2745. NOP
2746. MADPS C11, C11, A1, B1
2747. MADPS C21, C21, A2, B1
2748. daddiu BO, BO, 2 * SIZE
2749. daddiu AO, AO, 4 * SIZE
2750. MADPS C13, C13, A1, B3
2751. MADPS C23, C23, A2, B3
  2752. .align 4
  /*
   * .L120 -- write-back of the accumulators C11, C21, C13, C23.
   *
   * Four values are stored at CO1 + {0,1,2,3} * SIZE; CO1 and CO2 are then
   * both advanced by 4 * SIZE.
   *
   * Common steps in both preprocessor paths:
   *   1. I is decremented.
   *   2. CVTU copies a derived value of each accumulator into A1..A4
   *      (presumably the upper single of the paired-single register --
   *      CVTU is a macro defined outside this chunk, TODO confirm).
   *   3. One ADD/SUB group, selected by the NN..CC conjugation macros,
   *      combines each accumulator with its CVTU value.
   *   4. alpha_r / alpha_i are loaded from 152($sp) / 160($sp).
   *
   * Assuming MADD d,a,b,c = a + b*c and NMSUB d,a,b,c = a - b*c (macros
   * defined outside this chunk), the stored values are
   *   non-TRMM:  C[0] += C11*alpha_r - C13*alpha_i,  C[1] += C13*alpha_r + C11*alpha_i
   *   TRMM:      C[0]  = C11*alpha_r - C13*alpha_i,  C[1]  = C13*alpha_r + C11*alpha_i
   * and likewise C[2], C[3] from C21, C23.
   */
  2753. .L120: # Write Back
  2754. #ifndef TRMMKERNEL
  /* Non-TRMM path: the old contents of C are loaded and accumulated into. */
  2755. daddiu I, I, -1
  2756. CVTU A1, C11
  2757. CVTU A2, C21
  2758. CVTU A3, C13
  2759. CVTU A4, C23
  2760. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  2761. /* (a + bi) * (c + di) */
  2762. SUB C11, C11, A1 # ac'+'bd
  2763. SUB C21, C21, A2
  2764. ADD C13, A3, C13 # ad'+'cb
  2765. ADD C23, A4, C23
  2766. # LD A1, 0 * SIZE(A) # load alpha_r
  2767. LD A1, 152($sp) # load alpha_r
  2768. LD A2, 160($sp) # load alpha_i
  2769. # LD A2, 0 * SIZE(A) # load alpha_i
  2770. LD B1, 0 * SIZE(CO1)
  2771. LD B3, 2 * SIZE(CO1)
  2772. LD B2, 1 * SIZE(CO1)
  2773. LD B4, 3 * SIZE(CO1)
  2774. MADD B1, B1, C11, A1 # A1 = alpha_r
  2775. MADD B3, B3, C21, A1
  2776. MADD B2, B2, C13, A1
  2777. MADD B4, B4, C23, A1
  2778. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2779. NMSUB B3, B3, C23, A2
  2780. MADD B2, B2, C11, A2
  2781. MADD B4, B4, C21, A2
  2782. ST B1, 0 * SIZE(CO1)
  2783. ST B3, 2 * SIZE(CO1)
  2784. ST B2, 1 * SIZE(CO1)
  2785. ST B4, 3 * SIZE(CO1)
  2786. #endif
  2787. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2788. /* (a + bi) * (c - di) */
  2789. ADD C11, A1, C11 # ac'+'bd
  2790. ADD C21, A2, C21
  2791. SUB C13, A3, C13 # ad'+'cb
  2792. SUB C23, A4, C23
  2793. # LD A1, 0 * SIZE(A) # load alpha_r
  2794. LD A1, 152($sp) # load alpha_r
  2795. LD A2, 160($sp) # load alpha_i
  2796. # LD A2, 0 * SIZE(A) # load alpha_i
  2797. LD B1, 0 * SIZE(CO1)
  2798. LD B3, 2 * SIZE(CO1)
  2799. LD B2, 1 * SIZE(CO1)
  2800. LD B4, 3 * SIZE(CO1)
  2801. MADD B1, B1, C11, A1 # A1 = alpha_r
  2802. MADD B3, B3, C21, A1
  2803. MADD B2, B2, C13, A1
  2804. MADD B4, B4, C23, A1
  2805. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2806. NMSUB B3, B3, C23, A2
  2807. MADD B2, B2, C11, A2
  2808. MADD B4, B4, C21, A2
  2809. ST B1, 0 * SIZE(CO1)
  2810. ST B3, 2 * SIZE(CO1)
  2811. ST B2, 1 * SIZE(CO1)
  2812. ST B4, 3 * SIZE(CO1)
  2813. #endif
  2814. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2815. /* (a - bi) * (c + di) */
  2816. ADD C11, A1, C11 # ac'+'bd
  2817. ADD C21, A2, C21
  2818. SUB C13, C13, A3 # ad'+'cb
  2819. SUB C23, C23, A4
  2820. # LD A1, 0 * SIZE(A) # load alpha_r
  2821. LD A1, 152($sp) # load alpha_r
  2822. # LD A2, 0 * SIZE(A) # load alpha_i
  2823. LD A2, 160($sp) # load alpha_i
  2824. LD B1, 0 * SIZE(CO1)
  2825. LD B3, 2 * SIZE(CO1)
  2826. LD B2, 1 * SIZE(CO1)
  2827. LD B4, 3 * SIZE(CO1)
  2828. MADD B1, B1, C11, A1 # A1 = alpha_r
  2829. MADD B3, B3, C21, A1
  2830. MADD B2, B2, C13, A1
  2831. MADD B4, B4, C23, A1
  2832. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2833. NMSUB B3, B3, C23, A2
  2834. MADD B2, B2, C11, A2
  2835. MADD B4, B4, C21, A2
  2836. ST B1, 0 * SIZE(CO1)
  2837. ST B3, 2 * SIZE(CO1)
  2838. ST B2, 1 * SIZE(CO1)
  2839. ST B4, 3 * SIZE(CO1)
  2840. #endif
  2841. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2842. /* (a - bi) * (c - di) */
  2843. SUB C11, C11, A1 # ac'+'bd
  2844. SUB C21, C21, A2
  2845. ADD C13, A3, C13 # ad'+'cb
  2846. ADD C23, A4, C23
  2847. LD A1, 152($sp) # load alpha_r
  2848. # LD A1, 0 * SIZE(A) # load alpha_r
  2849. LD A2, 160($sp) # load alpha_i
  2850. # LD A2, 0 * SIZE(A) # load alpha_i
  /* Only this conjugation case negates C13/C23 before scaling by alpha. */
  2851. NEG C13, C13 # ad'+'cb
  2852. NEG C23, C23
  2853. LD B1, 0 * SIZE(CO1)
  2854. LD B3, 2 * SIZE(CO1)
  2855. LD B2, 1 * SIZE(CO1)
  2856. LD B4, 3 * SIZE(CO1)
  2857. MADD B1, B1, C11, A1 # A1 = alpha_r
  2858. MADD B3, B3, C21, A1
  2859. MADD B2, B2, C13, A1
  2860. MADD B4, B4, C23, A1
  2861. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2862. NMSUB B3, B3, C23, A2
  2863. MADD B2, B2, C11, A2
  2864. MADD B4, B4, C21, A2
  2865. ST B1, 0 * SIZE(CO1)
  2866. ST B3, 2 * SIZE(CO1)
  2867. ST B2, 1 * SIZE(CO1)
  2868. ST B4, 3 * SIZE(CO1)
  2869. #endif
  2870. #else
  /* TRMM path: C is not loaded; MUL starts each result from alpha_r * value. */
  2871. daddiu I, I, -1
  2872. CVTU A1, C11
  2873. CVTU A2, C21
  2874. CVTU A3, C13
  2875. CVTU A4, C23
  2876. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  2877. /* (a + bi) * (c + di) */
  2878. SUB C11, C11, A1 # ac'+'bd
  2879. SUB C21, C21, A2
  2880. ADD C13, A3, C13 # ad'+'cb
  2881. ADD C23, A4, C23
  2882. # LD A1, 0 * SIZE(A) # load alpha_r
  2883. LD A1, 152($sp) # load alpha_r
  2884. LD A2, 160($sp) # load alpha_i
  2885. # LD A2, 0 * SIZE(A) # load alpha_i
  2886. MUL B1, C11, A1 # A1 = alpha_r
  2887. MUL B3, C21, A1
  2888. MUL B2, C13, A1
  2889. MUL B4, C23, A1
  2890. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2891. NMSUB B3, B3, C23, A2
  2892. MADD B2, B2, C11, A2
  2893. MADD B4, B4, C21, A2
  2894. ST B1, 0 * SIZE(CO1)
  2895. ST B3, 2 * SIZE(CO1)
  2896. ST B2, 1 * SIZE(CO1)
  2897. ST B4, 3 * SIZE(CO1)
  2898. #endif
  2899. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  2900. /* (a + bi) * (c - di) */
  2901. ADD C11, A1, C11 # ac'+'bd
  2902. ADD C21, A2, C21
  2903. SUB C13, A3, C13 # ad'+'cb
  2904. SUB C23, A4, C23
  2905. # LD A1, 0 * SIZE(A) # load alpha_r
  2906. LD A1, 152($sp) # load alpha_r
  2907. LD A2, 160($sp) # load alpha_i
  2908. # LD A2, 0 * SIZE(A) # load alpha_i
  2909. MUL B1, C11, A1 # A1 = alpha_r
  2910. MUL B3, C21, A1
  2911. MUL B2, C13, A1
  2912. MUL B4, C23, A1
  2913. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2914. NMSUB B3, B3, C23, A2
  2915. MADD B2, B2, C11, A2
  2916. MADD B4, B4, C21, A2
  2917. ST B1, 0 * SIZE(CO1)
  2918. ST B3, 2 * SIZE(CO1)
  2919. ST B2, 1 * SIZE(CO1)
  2920. ST B4, 3 * SIZE(CO1)
  2921. #endif
  2922. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  2923. /* (a - bi) * (c + di) */
  2924. ADD C11, A1, C11 # ac'+'bd
  2925. ADD C21, A2, C21
  2926. SUB C13, C13, A3 # ad'+'cb
  2927. SUB C23, C23, A4
  2928. # LD A1, 0 * SIZE(A) # load alpha_r
  2929. LD A1, 152($sp) # load alpha_r
  2930. # LD A2, 0 * SIZE(A) # load alpha_i
  2931. LD A2, 160($sp) # load alpha_i
  2932. MUL B1, C11, A1 # A1 = alpha_r
  2933. MUL B3, C21, A1
  2934. MUL B2, C13, A1
  2935. MUL B4, C23, A1
  2936. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2937. NMSUB B3, B3, C23, A2
  2938. MADD B2, B2, C11, A2
  2939. MADD B4, B4, C21, A2
  2940. ST B1, 0 * SIZE(CO1)
  2941. ST B3, 2 * SIZE(CO1)
  2942. ST B2, 1 * SIZE(CO1)
  2943. ST B4, 3 * SIZE(CO1)
  2944. #endif
  2945. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  2946. /* (a - bi) * (c - di) */
  2947. SUB C11, C11, A1 # ac'+'bd
  2948. SUB C21, C21, A2
  2949. ADD C13, A3, C13 # ad'+'cb
  2950. ADD C23, A4, C23
  2951. LD A1, 152($sp) # load alpha_r
  2952. # LD A1, 0 * SIZE(A) # load alpha_r
  2953. LD A2, 160($sp) # load alpha_i
  2954. # LD A2, 0 * SIZE(A) # load alpha_i
  /* Only this conjugation case negates C13/C23 before scaling by alpha. */
  2955. NEG C13, C13 # ad'+'cb
  2956. NEG C23, C23
  2957. MUL B1, C11, A1 # A1 = alpha_r
  2958. MUL B3, C21, A1
  2959. MUL B2, C13, A1
  2960. MUL B4, C23, A1
  2961. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  2962. NMSUB B3, B3, C23, A2
  2963. MADD B2, B2, C11, A2
  2964. MADD B4, B4, C21, A2
  2965. ST B1, 0 * SIZE(CO1)
  2966. ST B3, 2 * SIZE(CO1)
  2967. ST B2, 1 * SIZE(CO1)
  2968. ST B4, 3 * SIZE(CO1)
  2969. #endif
  /*
   * TRMM pointer fix-up: TEMP = K - KK - (2 if LEFT, else 1);
   * AO += TEMP << (1 + ZBASE_SHIFT); BO += TEMP << ZBASE_SHIFT.
   * Presumably this skips the part of the A/B panels the shortened K loop
   * did not consume -- TODO confirm against the panel layout.
   */
  2970. #if ( defined(LEFT) && defined(TRANSA)) || \
  2971. (!defined(LEFT) && !defined(TRANSA))
  2972. dsubu TEMP, K, KK
  2973. #ifdef LEFT
  2974. daddiu TEMP, TEMP, -2
  2975. #else
  2976. daddiu TEMP, TEMP, -1
  2977. #endif
  2978. dsll L, TEMP, 1 + ZBASE_SHIFT
  2979. dsll TEMP, TEMP, ZBASE_SHIFT
  2980. daddu AO, AO, L
  2981. daddu BO, BO, TEMP
  2982. #endif
  2983. #ifdef LEFT
  2984. daddiu KK, KK, 2
  2985. #endif
  2986. #endif
  /* Step both output pointers past the four values just written. */
  2987. daddiu CO1, CO1, 4 * SIZE
  2988. daddiu CO2, CO2, 4 * SIZE
  2989. .align 4
  /*
   * .L11 -- test for a final odd row: I = M & 1.  When I is zero the
   * single-row tile (.L111 .. .L110) is skipped by branching to .L10.
   */
  2990. .L11:
  2991. andi I, M, 1
  2992. blez I, .L10
  2993. NOP
  2994. .align 4
  /*
   * .L111 -- set-up for the single-row tile that uses accumulators C11/C13.
   *
   * Both preprocessor paths
   *   - point BO at the B panel (TRMM may also offset AO/BO by KK),
   *   - zero C11 (MTC $0) and copy it to C13,
   *   - preload A1/A2 and B1/B2 with gsLQC1 (base registers R12 / R13 --
   *     presumably AO / BO; the aliases are defined outside this chunk),
   *   - derive B3/B4 from B1/B2 with PLU (presumably the same pair with its
   *     two singles swapped -- macro defined outside this chunk),
   *   - compute L = (trip count) >> 2 and skip the main loop when L <= 0.
   */
  2995. .L111:
  2996. #if defined(TRMMKERNEL)
  2997. #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
  2998. move BO, B
  2999. #else
  /* Offset both panels by KK << ZBASE_SHIFT bytes before starting. */
  3000. dsll TEMP, KK, ZBASE_SHIFT
  3001. daddu AO, AO, TEMP
  3002. daddu BO, B, TEMP
  3003. #endif
  3004. MTC $0, C11 # CLEAR RESULTS REGISTERS
  3005. gsLQC1(R13, F9, F8, 0) # B1 B2
  3006. gsLQC1(R12, F1, F0, 0) # A1 A2
  3007. MOV C13, C11
  3008. FETCH $0, 0 * SIZE(CO1)
  3009. PLU B3, B1, B1
  3010. PLU B4, B2, B2
  /* TRMM trip count: TEMP = K - KK, or KK + 1 (both remaining branches are identical). */
  3011. #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3012. dsubu TEMP, K, KK
  3013. #elif defined(LEFT)
  3014. daddiu TEMP, KK, 1
  3015. #else
  3016. daddiu TEMP, KK, 1
  3017. #endif
  3018. dsra L, TEMP, 2
  3019. blez L, .L112
  3020. NOP
  3021. #else
  3022. move BO, B # Reset B
  3023. dsra L, K, 2 # L = K >> 2, the .L1110 iteration count
  3024. MTC $0, C11 # CLEAR RESULTS REGISTERS
  3025. gsLQC1(R13, F9, F8, 0) # B1 B2
  3026. gsLQC1(R12, F1, F0, 0) # A1 A2
  3027. MOV C13, C11
  3028. FETCH $0, 0 * SIZE(CO1)
  3029. PLU B3, B1, B1
  /*
   * The PLU after this branch takes the place of the NOP used elsewhere,
   * i.e. it sits in the branch delay slot (assuming noreorder mode --
   * TODO confirm) and runs whether or not the branch is taken.
   */
  3030. blez L, .L112
  3031. PLU B4, B2, B2
  3032. #endif
  /*
   * .L1110 -- main K loop for the single-row tile, run L times.
   *
   * Each iteration issues eight MADPS into C11/C13 (A1..A4 against B1..B8)
   * and advances AO and BO by 8 * SIZE.  The K&2 and K&1 remainders below
   * advance by 4 * SIZE and 2 * SIZE, so one iteration covers four K steps.
   * The first quad-word of A and B for the next iteration is reloaded
   * before the loop branch, so A1/A2/B1..B4 are live on exit.
   */
  3033. .L1110:
  3034. daddiu L, L, -1
  3035. gsLQC1(R13, F13, F12, 1) # 2nd quad-word of B; read below as B5/B6 (old note said B3 B4 -- TODO confirm aliases)
  3036. MADPS C11, C11, A1, B1
  3037. gsLQC1(R12, F3, F2, 1) # A3 A4
  3038. MADPS C13, C13, A1, B3
  3039. daddiu BO, BO, 2 * 4 * SIZE # BO += 8 * SIZE per iteration
  3040. PLU B7, B5, B5
  3041. PLU B8, B6, B6
  3042. daddiu AO, AO, 2 * 4 * SIZE # AO += 8 * SIZE per iteration
  3043. MADPS C11, C11, A2, B2
  3044. MADPS C13, C13, A2, B4
  3045. MADPS C11, C11, A3, B5
  3046. MADPS C13, C13, A3, B7
  3047. gsLQC1(R13, F9, F8, 0) # B1 B2
  3048. MADPS C11, C11, A4, B6
  3049. gsLQC1(R12, F1, F0, 0) # A1 A2
  3050. MADPS C13, C13, A4, B8
  3051. PLU B3, B1, B1
  /* Loop branch; the PLU below occupies the slot a NOP fills elsewhere (delay slot, assuming noreorder). */
  3052. bgtz L, .L1110
  3053. PLU B4, B2, B2
  3054. .align 4
  /*
   * .L112 -- K remainder, bit 1: L = (K or, for TRMM, TEMP) & 2.
   * When set, two more K steps are accumulated from the preloaded A1/A2 and
   * B1..B4, AO/BO advance by 4 * SIZE, and A1/A2, B1/B2 are reloaded.
   * Only B3 is rebuilt with PLU afterwards; .L117 reads A1, B1 and B3 only.
   */
  3055. .L112:
  3056. #ifndef TRMMKERNEL
  3057. andi L, K, 2
  3058. #else
  3059. andi L, TEMP, 2
  3060. #endif
  3061. blez L, .L117
  3062. NOP
  3063. MADPS C11, C11, A1, B1
  3064. MADPS C13, C13, A1, B3
  3065. daddiu BO, BO, 4 * SIZE
  3066. daddiu AO, AO, 4 * SIZE
  3067. MADPS C11, C11, A2, B2
  3068. MADPS C13, C13, A2, B4
  3069. gsLQC1(R13, F9, F8, 0)
  3070. gsLQC1(R12, F1, F0, 0)
  3071. PLU B3, B1, B1
  3072. .align 4
  /*
   * .L117 -- K remainder, bit 0: L = (K or, for TRMM, TEMP) & 1.
   * When set, one last K step is accumulated from A1, B1 and B3, and AO/BO
   * advance by 2 * SIZE; otherwise control goes straight to the write-back.
   */
  3073. .L117:
  3074. #ifndef TRMMKERNEL
  3075. andi L, K, 1
  3076. #else
  3077. andi L, TEMP, 1
  3078. #endif
  3079. blez L, .L110
  3080. NOP
  3081. daddiu BO, BO, 2 * SIZE
  3082. daddiu AO, AO, 2 * SIZE
  3083. MADPS C11, C11, A1, B1
  3084. MADPS C13, C13, A1, B3
  3085. .align 4
  /*
   * .L110 -- write-back of the accumulators C11 and C13.
   *
   * Two values are stored at CO1 + {0,1} * SIZE; CO1 and CO2 are then both
   * advanced by 2 * SIZE.
   *
   * Common steps in both preprocessor paths:
   *   1. I is decremented.
   *   2. CVTU copies a derived value of C11/C13 into A1/A3 (presumably the
   *      upper single of the paired-single register -- CVTU is a macro
   *      defined outside this chunk, TODO confirm).
   *   3. One ADD/SUB group, selected by the NN..CC conjugation macros,
   *      combines each accumulator with its CVTU value.
   *   4. alpha_r is loaded into A4 and alpha_i into A2, from 152($sp) and
   *      160($sp).
   *
   * Assuming MADD d,a,b,c = a + b*c and NMSUB d,a,b,c = a - b*c (macros
   * defined outside this chunk), the stored values are
   *   non-TRMM:  C[0] += C11*alpha_r - C13*alpha_i,  C[1] += C13*alpha_r + C11*alpha_i
   *   TRMM:      C[0]  = C11*alpha_r - C13*alpha_i,  C[1]  = C13*alpha_r + C11*alpha_i
   */
  3086. .L110: # Write Back
  3087. #ifndef TRMMKERNEL
  /* Non-TRMM path: the old contents of C are loaded and accumulated into. */
  3088. daddiu I, I, -1
  3089. CVTU A1, C11
  3090. CVTU A3, C13
  3091. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  3092. /* (a + bi) * (c + di) */
  3093. SUB C11, C11, A1 # ac'+'bd
  3094. ADD C13, A3, C13 # ad'+'cb
  3095. # LD A1, 0 * SIZE(A) # load alpha_r
  3096. LD A4, 152($sp) # load alpha_r
  3097. LD A2, 160($sp) # load alpha_i
  3098. # LD A2, 0 * SIZE(A) # load alpha_i
  3099. LD B1, 0 * SIZE(CO1)
  3100. LD B2, 1 * SIZE(CO1)
  3101. MADD B1, B1, C11, A4 # A4 = alpha_r
  3102. MADD B2, B2, C13, A4
  3103. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3104. MADD B2, B2, C11, A2
  3105. ST B1, 0 * SIZE(CO1)
  3106. ST B2, 1 * SIZE(CO1)
  3107. #endif
  3108. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  3109. /* (a + bi) * (c - di) */
  3110. ADD C11, A1, C11 # ac'+'bd
  3111. SUB C13, A3, C13 # ad'+'cb
  3112. LD A4, 152($sp) # load alpha_r
  3113. LD A2, 160($sp) # load alpha_i
  3114. LD B1, 0 * SIZE(CO1)
  3115. LD B2, 1 * SIZE(CO1)
  3116. MADD B1, B1, C11, A4 # A4 = alpha_r
  3117. MADD B2, B2, C13, A4
  3118. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3119. MADD B2, B2, C11, A2
  3120. ST B1, 0 * SIZE(CO1)
  3121. ST B2, 1 * SIZE(CO1)
  3122. #endif
  3123. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3124. /* (a - bi) * (c + di) */
  3125. ADD C11, A1, C11 # ac'+'bd
  3126. SUB C13, C13, A3 # ad'+'cb
  3127. LD A4, 152($sp) # load alpha_r
  3128. LD A2, 160($sp) # load alpha_i
  3129. LD B1, 0 * SIZE(CO1)
  3130. LD B2, 1 * SIZE(CO1)
  3131. MADD B1, B1, C11, A4 # A4 = alpha_r
  3132. MADD B2, B2, C13, A4
  3133. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3134. MADD B2, B2, C11, A2
  3135. ST B1, 0 * SIZE(CO1)
  3136. ST B2, 1 * SIZE(CO1)
  3137. #endif
  3138. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  3139. /* (a - bi) * (c - di) */
  3140. SUB C11, C11, A1 # ac'+'bd
  3141. ADD C13, A3, C13 # ad'+'cb
  /* Only this conjugation case negates C13 before scaling by alpha. */
  3142. NEG C13, C13
  3143. LD A4, 152($sp) # load alpha_r
  3144. LD A2, 160($sp) # load alpha_i
  3145. LD B1, 0 * SIZE(CO1)
  3146. LD B2, 1 * SIZE(CO1)
  3147. MADD B1, B1, C11, A4 # A4 = alpha_r
  3148. MADD B2, B2, C13, A4
  3149. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3150. MADD B2, B2, C11, A2
  3151. ST B1, 0 * SIZE(CO1)
  3152. ST B2, 1 * SIZE(CO1)
  3153. #endif
  3154. #else
  /* TRMM path: C is not loaded; MUL starts each result from alpha_r * value. */
  3155. daddiu I, I, -1
  3156. CVTU A1, C11
  3157. CVTU A3, C13
  3158. #if defined(NN) || defined(NT) || defined(TN) || defined(TT)
  3159. /* (a + bi) * (c + di) */
  3160. SUB C11, C11, A1 # ac'+'bd
  3161. ADD C13, A3, C13 # ad'+'cb
  3162. # LD A1, 0 * SIZE(A) # load alpha_r
  3163. LD A4, 152($sp) # load alpha_r
  3164. LD A2, 160($sp) # load alpha_i
  3165. # LD A2, 0 * SIZE(A) # load alpha_i
  3166. MUL B1, C11, A4 # A4 = alpha_r
  3167. MUL B2, C13, A4
  3168. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3169. MADD B2, B2, C11, A2
  3170. ST B1, 0 * SIZE(CO1)
  3171. ST B2, 1 * SIZE(CO1)
  3172. #endif
  3173. #if defined(NR) || defined(NC) || defined(TR) || defined(TC)
  3174. /* (a + bi) * (c - di) */
  3175. ADD C11, A1, C11 # ac'+'bd
  3176. SUB C13, A3, C13 # ad'+'cb
  3177. LD A4, 152($sp) # load alpha_r
  3178. LD A2, 160($sp) # load alpha_i
  3179. MUL B1, C11, A4 # A4 = alpha_r
  3180. MUL B2, C13, A4
  3181. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3182. MADD B2, B2, C11, A2
  3183. ST B1, 0 * SIZE(CO1)
  3184. ST B2, 1 * SIZE(CO1)
  3185. #endif
  3186. #if defined(RN) || defined(RT) || defined(CN) || defined(CT)
  3187. /* (a - bi) * (c + di) */
  3188. ADD C11, A1, C11 # ac'+'bd
  3189. SUB C13, C13, A3 # ad'+'cb
  3190. LD A4, 152($sp) # load alpha_r
  3191. LD A2, 160($sp) # load alpha_i
  3192. MUL B1, C11, A4 # A4 = alpha_r
  3193. MUL B2, C13, A4
  3194. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3195. MADD B2, B2, C11, A2
  3196. ST B1, 0 * SIZE(CO1)
  3197. ST B2, 1 * SIZE(CO1)
  3198. #endif
  3199. #if defined(RR) || defined(RC) || defined(CR) || defined(CC)
  3200. /* (a - bi) * (c - di) */
  3201. SUB C11, C11, A1 # ac'+'bd
  3202. ADD C13, A3, C13 # ad'+'cb
  /* Only this conjugation case negates C13 before scaling by alpha. */
  3203. NEG C13, C13
  3204. LD A4, 152($sp) # load alpha_r
  3205. LD A2, 160($sp) # load alpha_i
  3206. MUL B1, C11, A4 # A4 = alpha_r
  3207. MUL B2, C13, A4
  3208. NMSUB B1, B1, C13, A2 # A2 = alpha_i
  3209. MADD B2, B2, C11, A2
  3210. ST B1, 0 * SIZE(CO1)
  3211. ST B2, 1 * SIZE(CO1)
  3212. #endif
  /*
   * TRMM pointer fix-up: TEMP = K - KK - 1 (the LEFT and non-LEFT branches
   * are identical here); AO and BO both advance by TEMP << ZBASE_SHIFT.
   * Presumably this skips the part of the A/B panels the shortened K loop
   * did not consume -- TODO confirm against the panel layout.
   */
  3213. #if ( defined(LEFT) && defined(TRANSA)) || \
  3214. (!defined(LEFT) && !defined(TRANSA))
  3215. dsubu TEMP, K, KK
  3216. #ifdef LEFT
  3217. daddiu TEMP, TEMP, -1
  3218. #else
  3219. daddiu TEMP, TEMP, -1
  3220. #endif
  3221. dsll TEMP, TEMP, ZBASE_SHIFT
  3222. daddu AO, AO, TEMP
  3223. daddu BO, BO, TEMP
  3224. #endif
  3225. #ifdef LEFT
  3226. daddiu KK, KK, 1
  3227. #endif
  3228. #endif
  /* Step both output pointers past the two values just written. */
  3229. daddiu CO1, CO1, 2 * SIZE
  3230. daddiu CO2, CO2, 2 * SIZE
  3231. .align 4
  /*
   * .L10 -- end of this pass over the rows: B is set to the current BO,
   * and for a TRMM build without LEFT, KK is incremented by 1.
   * Execution then falls through into the epilogue at .L999.
   */
  3232. .L10:
  3233. move B, BO
  3234. #if defined(TRMMKERNEL) && !defined(LEFT)
  3235. daddiu KK, KK, 1
  3236. #endif
  /*
   * .L999 -- function epilogue: reload the registers saved in the stack
   * frame, release the frame and return.
   *
   * Frame offsets read here:
   *     0 ..  48($sp)   $16 .. $22
   *    56 ..  88($sp)   $f24 .. $f28
   *    96 .. 112($sp)   $23 .. $25   (TRMMKERNEL builds only)
   *   120 .. 144($sp)   $f20 .. $f23 (only when __64BIT__ is not defined)
   * The write-back code above reads 152($sp) / 160($sp) of the same frame as
   * alpha_r / alpha_i; those slots are not restored into any register here.
   */
  3237. .L999:
  3238. ld $16, 0($sp)
  3239. ld $17, 8($sp)
  3240. ld $18, 16($sp)
  3241. ld $19, 24($sp)
  3242. ld $20, 32($sp)
  3243. ld $21, 40($sp)
  3244. ld $22, 48($sp)
  3245. LD $f24, 56($sp)
  3246. LD $f25, 64($sp)
  3247. LD $f26, 72($sp)
  3248. LD $f27, 80($sp)
  3249. LD $f28, 88($sp)
  3250. #if defined(TRMMKERNEL)
  3251. ld $23, 96($sp)
  3252. ld $24, 104($sp)
  3253. ld $25, 112($sp)
  3254. #endif
  3255. #ifndef __64BIT__
  3256. LD $f20,120($sp)
  3257. LD $f21,128($sp)
  3258. LD $f22,136($sp)
  3259. LD $f23,144($sp)
  3260. #endif
  /* Pop the frame and return through $31; the nop follows the jump (delay slot, assuming noreorder). */
  3261. daddiu $sp,$sp,STACKSIZE
  3262. j $31
  3263. nop
  3264. EPILOGUE