You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

trsm_kernel_4x4_LN.S 59 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
7377837793780378137823783378437853786378737883789379037913792379337943795379637973798379938003801380238033804380538063807380838093810381138123813381438153816381738183819382038213822382338243825382638273828382938303831383238333834383538363837383838393840384138423843384438453846384738483849385038513852385338543855385638573858385938603861386238633864386538663867386838693870387138723873387438753876387738783879388038813882388338843885388638873888388938903891389238933894389538963897389838993900390139023903390439053906390739083909391039113912391339143915391639173918391939203921392239233924392539263927392839293930393139323933393439353936393739383939394039413942394339443945394639473948394939503951395239533954395539563957395839593960396139623963396439653966396739683969397039713972397339743975397639773978397939803981398239833984398539863987398839893990399139923993399439953996399739983999400040014002400340044005400640074008400940104011401240134014401540164017401840194020402140224023402440254026402740284029403040314032403340344035403640374038403940404041404240434044404540464047404840494050405140524053405440554056405740584059406040614062406340644065406640674068
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /*
   * Build configuration and symbolic register names for the kernel body
   * that follows. Nothing here emits code. Every name is a cpp alias for
   * an Alpha integer ($n) or floating-point ($fn) register, or a constant.
   */
  /* ASSEMBLER is set before common.h is pulled in. Presumably common.h
     keys off it to supply the assembler-side macros the kernel uses but
     does not define here (PROLOGUE, LD, ST, ADD, MUL, SXADDQ, SIZE,
     BASE_SHIFT) -- TODO confirm against common.h. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* Refuse to build unless at least one CPU generation macro is defined. */
  40. #if !defined(EV4) && !defined(EV5) && !defined(EV6)
  41. #error "Architecture is not specified."
  42. #endif
  /* Per-CPU tuning. PREFETCHSIZE is 56 on EV6 and EV5 and stays undefined
     on EV4. UNOP expands to the unop instruction on EV6 only and to
     nothing on EV5 and EV4. */
  43. #ifdef EV6
  44. #define PREFETCHSIZE 56
  45. #define UNOP unop
  46. #endif
  47. #ifdef EV5
  48. #define PREFETCHSIZE 56
  49. #define UNOP
  50. #endif
  51. #ifdef EV4
  52. #define UNOP
  53. #endif
  /* Stack frame size in bytes. The prologue lowers $sp by this amount,
     stores $f2-$f9 at offsets 0 through 56 of the new frame, and reads
     the stack-passed arguments at STACKSIZE + 0, + 8 and + 16. */
  54. #define STACKSIZE 80
  /* Problem sizes and matrix pointers. M, N, K, A and B are used as they
     arrive in registers. C and LDC are loaded from the stack by the
     prologue. */
  55. #define M $16
  56. #define N $17
  57. #define K $18
  58. #define A $20
  59. #define B $21
  60. #define C $22
  61. #define LDC $23
  /* Four output pointers spaced LDC apart, set up at the top of the J
     loop as C1 = C, C2 = C + LDC, C3 = C2 + LDC, C4 = C3 + LDC. */
  62. #define C1 $19
  63. #define C2 $24
  64. #define C3 $25
  65. #define C4 $27
  /* AO and BO are the running read pointers into A and B.
     NOTE(review): $at is normally the register the assembler keeps for
     its own expansions. Presumably the build turns that off (for example
     .set noat inside PROLOGUE or common.h) -- confirm. */
  66. #define AO $at
  67. #define BO $5
  /* Loop counters. J starts as N >> 2. I receives the M & 1 and M & 2
     remainder tests. L is the inner-loop count and steps down by 2. */
  68. #define I $6
  69. #define J $7
  70. #define L $8
  /* Floating-point operands. a1-a4 and b1-b4 are loaded from AO and BO in
     the multiply loops and reused as coefficient temporaries in the solve
     stage. */
  71. #define a1 $f16
  72. #define a2 $f17
  73. #define a3 $f18
  74. #define a4 $f19
  75. #define b1 $f20
  76. #define b2 $f21
  77. #define b3 $f22
  78. #define b4 $f23
  /* t1-t4 hold products that are added into the accumulators one step
     later. */
  79. #define t1 $f24
  80. #define t2 $f25
  81. #define t3 $f26
  82. #define t4 $f27
  /* Extra operand registers. b5 is used by the unrolled inner loops. a5
     and a6 do not appear in the portion of the kernel reviewed here. */
  83. #define a5 $f28
  84. #define a6 $f30
  85. #define b5 $f29
  /* NOTE(review): alpha names the same register as a6 ($f30). Neither is
     referenced in the reviewed portion -- confirm they are never live at
     the same time in the rest of the file. */
  86. #define alpha $f30
  /* Accumulators c01-c16 occupy $f0-$f15. Of these, $f2-$f9 (c03-c10) are
     the registers the prologue saves to the stack frame. */
  87. #define c01 $f0
  88. #define c02 $f1
  89. #define c03 $f2
  90. #define c04 $f3
  91. #define c05 $f4
  92. #define c06 $f5
  93. #define c07 $f6
  94. #define c08 $f7
  95. #define c09 $f8
  96. #define c10 $f9
  97. #define c11 $f10
  98. #define c12 $f11
  99. #define c13 $f12
  100. #define c14 $f13
  101. #define c15 $f14
  102. #define c16 $f15
  /* Integer scratch and bookkeeping. TMP1 and TMP2 hold shifted sizes and
     address offsets. KK is derived from OFFSET (M + OFFSET for LN, OFFSET
     for LT, -OFFSET for RN, N - OFFSET for RT). AORIG keeps the base of A
     for the LN and RT variants. OFFSET is the third stack-passed argument.
     The prologue also uses $0-$2 under their raw names for the early
     M, N, K <= 0 check, before KK is assigned. */
  103. #define TMP1 $0
  104. #define TMP2 $1
  105. #define KK $2
  106. #define AORIG $3
  107. #define OFFSET $4
  108. PROLOGUE
  109. PROFCODE
  110. .frame $sp, STACKSIZE, $26, 0
  111. lda $sp, -STACKSIZE($sp)
  112. ldq C, 0 + STACKSIZE($sp)
  113. ldq LDC, 8 + STACKSIZE($sp)
  114. ldq OFFSET, 16 + STACKSIZE($sp)
  115. SXADDQ LDC, 0, LDC
  116. stt $f2, 0($sp)
  117. stt $f3, 8($sp)
  118. stt $f4, 16($sp)
  119. stt $f5, 24($sp)
  120. stt $f6, 32($sp)
  121. stt $f7, 40($sp)
  122. stt $f8, 48($sp)
  123. stt $f9, 56($sp)
  124. cmple M, 0, $0
  125. cmple N, 0, $1
  126. cmple K, 0, $2
  127. or $0, $1, $0
  128. or $0, $2, $0
  129. bne $0, $L999
  130. #ifdef LN
  131. mulq M, K, TMP1
  132. SXADDQ TMP1, A, A
  133. SXADDQ M, C, C
  134. #endif
  135. #ifdef RN
  136. negq OFFSET, KK
  137. #endif
  138. #ifdef RT
  139. mulq N, K, TMP1
  140. SXADDQ TMP1, B, B
  141. mulq N, LDC, TMP1
  142. addq TMP1, C, C
  143. subq N, OFFSET, KK
  144. #endif
  145. sra N, 2, J
  146. ble J, $L40
  147. .align 4
  148. $L01:
  149. #ifdef RT
  150. sll K, 2 + BASE_SHIFT, TMP1
  151. subq B, TMP1, B
  152. s4addq LDC, 0, TMP1
  153. subq C, TMP1, C
  154. #endif
  155. mov C, C1
  156. addq C, LDC, C2
  157. addq C2, LDC, C3
  158. #ifndef RT
  159. s4addq LDC, C, C
  160. #endif
  161. fclr t1
  162. addq C3, LDC, C4
  163. fclr t2
  164. #ifdef LN
  165. addq M, OFFSET, KK
  166. #endif
  167. #ifdef LT
  168. mov OFFSET, KK
  169. #endif
  170. #if defined(LN) || defined(RT)
  171. mov A, AORIG
  172. #else
  173. mov A, AO
  174. #endif
  175. fclr t3
  176. fclr t4
  177. and M, 1, I
  178. ble I, $L20
  179. #if defined(LT) || defined(RN)
  180. LD a1, 0 * SIZE(AO)
  181. fclr c01
  182. LD a2, 1 * SIZE(AO)
  183. fclr c05
  184. LD b1, 0 * SIZE(B)
  185. lda L, -2(KK)
  186. LD b2, 1 * SIZE(B)
  187. lda AO, 1 * SIZE(AO)
  188. LD b3, 2 * SIZE(B)
  189. fclr c09
  190. LD b4, 3 * SIZE(B)
  191. fclr c13
  192. lda BO, 4 * SIZE(B)
  193. ble KK, $L38
  194. ble L, $L35
  195. #else
  196. #ifdef LN
  197. sll K, BASE_SHIFT + 0, TMP1
  198. subq AORIG, TMP1, AORIG
  199. #endif
  200. sll KK, BASE_SHIFT + 0, TMP1
  201. addq AORIG, TMP1, AO
  202. sll KK, BASE_SHIFT + 2, TMP2
  203. addq B, TMP2, BO
  204. subq K, KK, TMP1
  205. LD a1, 0 * SIZE(AO)
  206. fclr c01
  207. LD a2, 1 * SIZE(AO)
  208. fclr c05
  209. LD b1, 0 * SIZE(BO)
  210. lda L, -2(TMP1)
  211. LD b2, 1 * SIZE(BO)
  212. lda AO, 1 * SIZE(AO)
  213. LD b3, 2 * SIZE(BO)
  214. fclr c09
  215. LD b4, 3 * SIZE(BO)
  216. fclr c13
  217. lda BO, 4 * SIZE(BO)
  218. ble TMP1, $L38
  219. ble L, $L35
  220. #endif
  221. .align 4
  222. $L32:
  223. ADD c01, t1, c01
  224. lda L, -2(L)
  225. MUL a1, b1, t1
  226. LD b1, 0 * SIZE(BO)
  227. ADD c05, t2, c05
  228. lda AO, 2 * SIZE(AO)
  229. MUL a1, b2, t2
  230. LD b2, 1 * SIZE(BO)
  231. ADD c09, t3, c09
  232. LD b5, 3 * SIZE(BO)
  233. MUL a1, b3, t3
  234. LD b3, 2 * SIZE(BO)
  235. ADD c13, t4, c13
  236. MUL a1, b4, t4
  237. LD a1, -1 * SIZE(AO)
  238. ADD c01, t1, c01
  239. MUL a2, b1, t1
  240. LD b1, 4 * SIZE(BO)
  241. lda BO, 8 * SIZE(BO)
  242. ADD c05, t2, c05
  243. MUL a2, b2, t2
  244. LD b2, -3 * SIZE(BO)
  245. ADD c09, t3, c09
  246. LD b4, -1 * SIZE(BO)
  247. MUL a2, b3, t3
  248. LD b3, -2 * SIZE(BO)
  249. ADD c13, t4, c13
  250. MUL a2, b5, t4
  251. LD a2, 0 * SIZE(AO)
  252. bgt L, $L32
  253. .align 4
  254. $L35:
  255. ADD c01, t1, c01
  256. MUL a1, b1, t1
  257. #if defined(LT) || defined(RN)
  258. blbs KK, $L37
  259. #else
  260. blbs TMP1, $L37
  261. #endif
  262. .align 4
  263. ADD c05, t2, c05
  264. LD b1, 0 * SIZE(BO)
  265. MUL a1, b2, t2
  266. LD b2, 1 * SIZE(BO)
  267. ADD c09, t3, c09
  268. MUL a1, b3, t3
  269. LD b3, 2 * SIZE(BO)
  270. ADD c13, t4, c13
  271. MUL a1, b4, t4
  272. LD a1, 0 * SIZE(AO)
  273. lda AO, 1 * SIZE(AO)
  274. ADD c01, t1, c01
  275. LD b4, 3 * SIZE(BO)
  276. MUL a1, b1, t1
  277. lda BO, 4 * SIZE(BO)
  278. .align 4
  279. $L37:
  280. ADD c05, t2, c05
  281. MUL a1, b2, t2
  282. ADD c09, t3, c09
  283. MUL a1, b3, t3
  284. ADD c13, t4, c13
  285. lda AO, 1 * SIZE(AO)
  286. MUL a1, b4, t4
  287. lda BO, 4 * SIZE(BO)
  288. ADD c01, t1, c01
  289. ADD c05, t2, c05
  290. ADD c09, t3, c09
  291. ADD c13, t4, c13
  292. $L38:
  293. #if defined(LN) || defined(RT)
  294. #ifdef LN
  295. subq KK, 1, TMP1
  296. #else
  297. subq KK, 4, TMP1
  298. #endif
  299. sll TMP1, BASE_SHIFT + 0, TMP2
  300. addq AORIG, TMP2, AO
  301. sll TMP1, BASE_SHIFT + 2, TMP2
  302. addq B, TMP2, BO
  303. #else
  304. lda AO, -1 * SIZE(AO)
  305. lda BO, -4 * SIZE(BO)
  306. #endif
  307. #if defined(LN) || defined(LT)
  308. LD a1, 0 * SIZE(BO)
  309. LD a2, 1 * SIZE(BO)
  310. LD a3, 2 * SIZE(BO)
  311. LD a4, 3 * SIZE(BO)
  312. SUB a1, c01, c01
  313. SUB a2, c05, c05
  314. SUB a3, c09, c09
  315. SUB a4, c13, c13
  316. #else
  317. LD a1, 0 * SIZE(AO)
  318. LD a2, 1 * SIZE(AO)
  319. LD a3, 2 * SIZE(AO)
  320. LD a4, 3 * SIZE(AO)
  321. SUB a1, c01, c01
  322. SUB a2, c05, c05
  323. SUB a3, c09, c09
  324. SUB a4, c13, c13
  325. #endif
  326. #if defined(LN) || defined(LT)
  327. LD a1, 0 * SIZE(AO)
  328. MUL a1, c01, c01
  329. MUL a1, c05, c05
  330. MUL a1, c09, c09
  331. MUL a1, c13, c13
  332. #endif
  333. #ifdef RN
  334. LD a1, 0 * SIZE(BO)
  335. LD a2, 1 * SIZE(BO)
  336. LD a3, 2 * SIZE(BO)
  337. LD a4, 3 * SIZE(BO)
  338. MUL a1, c01, c01
  339. MUL a2, c01, t1
  340. SUB c05, t1, c05
  341. MUL a3, c01, t1
  342. SUB c09, t1, c09
  343. MUL a4, c01, t1
  344. SUB c13, t1, c13
  345. LD b1, 5 * SIZE(BO)
  346. LD b2, 6 * SIZE(BO)
  347. LD b3, 7 * SIZE(BO)
  348. MUL b1, c05, c05
  349. MUL b2, c05, t1
  350. SUB c09, t1, c09
  351. MUL b3, c05, t1
  352. SUB c13, t1, c13
  353. LD a1, 10 * SIZE(BO)
  354. LD a2, 11 * SIZE(BO)
  355. LD a3, 15 * SIZE(BO)
  356. MUL a1, c09, c09
  357. MUL a2, c09, t1
  358. SUB c13, t1, c13
  359. MUL a3, c13, c13
  360. #endif
  361. #ifdef RT
  362. LD a1, 15 * SIZE(BO)
  363. LD a2, 14 * SIZE(BO)
  364. LD a3, 13 * SIZE(BO)
  365. LD a4, 12 * SIZE(BO)
  366. MUL a1, c13, c13
  367. MUL a2, c13, t1
  368. SUB c09, t1, c09
  369. MUL a3, c13, t1
  370. SUB c05, t1, c05
  371. MUL a4, c13, t1
  372. SUB c01, t1, c01
  373. LD b1, 10 * SIZE(BO)
  374. LD b2, 9 * SIZE(BO)
  375. LD b3, 8 * SIZE(BO)
  376. MUL b1, c09, c09
  377. MUL b2, c09, t1
  378. SUB c05, t1, c05
  379. MUL b3, c09, t1
  380. SUB c01, t1, c01
  381. LD a1, 5 * SIZE(BO)
  382. LD a2, 4 * SIZE(BO)
  383. LD a3, 0 * SIZE(BO)
  384. MUL a1, c05, c05
  385. MUL a2, c05, t1
  386. SUB c01, t1, c01
  387. MUL a3, c01, c01
  388. #endif
  389. #if defined(LN) || defined(LT)
  390. ST c01, 0 * SIZE(BO)
  391. ST c05, 1 * SIZE(BO)
  392. ST c09, 2 * SIZE(BO)
  393. ST c13, 3 * SIZE(BO)
  394. #else
  395. ST c01, 0 * SIZE(AO)
  396. ST c05, 1 * SIZE(AO)
  397. ST c09, 2 * SIZE(AO)
  398. ST c13, 3 * SIZE(AO)
  399. #endif
  400. #ifdef LN
  401. lda C1, -1 * SIZE(C1)
  402. lda C2, -1 * SIZE(C2)
  403. lda C3, -1 * SIZE(C3)
  404. lda C4, -1 * SIZE(C4)
  405. #endif
  406. ST c01, 0 * SIZE(C1)
  407. ST c05, 0 * SIZE(C2)
  408. ST c09, 0 * SIZE(C3)
  409. ST c13, 0 * SIZE(C4)
  410. #ifdef RT
  411. sll K, 0 + BASE_SHIFT, TMP1
  412. addq AORIG, TMP1, AORIG
  413. #endif
  414. #if defined(LT) || defined(RN)
  415. subq K, KK, TMP1
  416. sll TMP1, BASE_SHIFT + 0, TMP2
  417. addq AO, TMP2, AO
  418. sll TMP1, BASE_SHIFT + 2, TMP2
  419. addq BO, TMP2, BO
  420. #endif
  421. #ifdef LT
  422. addq KK, 1, KK
  423. #endif
  424. #ifdef LN
  425. subq KK, 1, KK
  426. #endif
  427. .align 4
  428. $L20:
  429. and M, 2, I
  430. ble I, $L30
  431. #if defined(LT) || defined(RN)
  432. LD a1, 0 * SIZE(AO)
  433. fclr c09
  434. LD a2, 1 * SIZE(AO)
  435. fclr c13
  436. LD a3, 2 * SIZE(AO)
  437. fclr c10
  438. LD a4, 3 * SIZE(AO)
  439. fclr c14
  440. LD b1, 0 * SIZE(B)
  441. lda L, -2(KK)
  442. LD b2, 1 * SIZE(B)
  443. lda AO, 2 * SIZE(AO)
  444. LD b3, 2 * SIZE(B)
  445. fclr c01
  446. LD b4, 3 * SIZE(B)
  447. fclr c05
  448. lda BO, 4 * SIZE(B)
  449. fclr c02
  450. fclr c06
  451. ble KK, $L28
  452. ble L, $L25
  453. #else
  454. #ifdef LN
  455. sll K, BASE_SHIFT + 1, TMP1
  456. subq AORIG, TMP1, AORIG
  457. #endif
  458. sll KK, BASE_SHIFT + 1, TMP1
  459. addq AORIG, TMP1, AO
  460. sll KK, BASE_SHIFT + 2, TMP2
  461. addq B, TMP2, BO
  462. subq K, KK, TMP1
  463. LD a1, 0 * SIZE(AO)
  464. fclr c09
  465. LD a2, 1 * SIZE(AO)
  466. fclr c13
  467. LD a3, 2 * SIZE(AO)
  468. fclr c10
  469. LD a4, 3 * SIZE(AO)
  470. fclr c14
  471. LD b1, 0 * SIZE(BO)
  472. lda L, -2(TMP1)
  473. LD b2, 1 * SIZE(BO)
  474. lda AO, 2 * SIZE(AO)
  475. LD b3, 2 * SIZE(BO)
  476. fclr c01
  477. LD b4, 3 * SIZE(BO)
  478. fclr c05
  479. lda BO, 4 * SIZE(BO)
  480. fclr c02
  481. fclr c06
  482. ble TMP1, $L28
  483. ble L, $L25
  484. #endif
  485. .align 4
  486. $L22:
  487. ADD c09, t1, c09
  488. unop
  489. MUL a1, b1, t1
  490. unop
  491. ADD c10, t2, c10
  492. unop
  493. MUL a2, b1, t2
  494. LD b1, 0 * SIZE(BO)
  495. ADD c13, t3, c13
  496. unop
  497. MUL a1, b2, t3
  498. lda BO, 8 * SIZE(BO)
  499. ADD c14, t4, c14
  500. unop
  501. MUL a2, b2, t4
  502. LD b2, -7 * SIZE(BO)
  503. ADD c01, t1, c01
  504. unop
  505. MUL a1, b3, t1
  506. unop
  507. ADD c02, t2, c02
  508. unop
  509. MUL a2, b3, t2
  510. LD b3, -6 * SIZE(BO)
  511. ADD c05, t3, c05
  512. unop
  513. MUL a1, b4, t3
  514. LD a1, 2 * SIZE(AO)
  515. ADD c06, t4, c06
  516. MUL a2, b4, t4
  517. LD b5, -5 * SIZE(BO)
  518. ADD c09, t1, c09
  519. unop
  520. MUL a3, b1, t1
  521. LD a2, 3 * SIZE(AO)
  522. ADD c10, t2, c10
  523. unop
  524. MUL a4, b1, t2
  525. LD b1, -4 * SIZE(BO)
  526. ADD c13, t3, c13
  527. unop
  528. MUL a3, b2, t3
  529. lda AO, 4 * SIZE(AO)
  530. ADD c14, t4, c14
  531. MUL a4, b2, t4
  532. LD b2, -3 * SIZE(BO)
  533. ADD c01, t1, c01
  534. lda L, -2(L)
  535. MUL a3, b3, t1
  536. LD b4, -1 * SIZE(BO)
  537. ADD c02, t2, c02
  538. unop
  539. MUL a4, b3, t2
  540. LD b3, -2 * SIZE(BO)
  541. ADD c05, t3, c05
  542. unop
  543. MUL a3, b5, t3
  544. LD a3, 0 * SIZE(AO)
  545. ADD c06, t4, c06
  546. MUL a4, b5, t4
  547. LD a4, 1 * SIZE(AO)
  548. bgt L, $L22
  549. .align 4
  550. $L25:
  551. ADD c09, t1, c09
  552. MUL a1, b1, t1
  553. #if defined(LT) || defined(RN)
  554. blbs KK, $L27
  555. #else
  556. blbs TMP1, $L27
  557. #endif
  558. ADD c10, t2, c10
  559. unop
  560. MUL a2, b1, t2
  561. LD b1, 0 * SIZE(BO)
  562. ADD c13, t3, c13
  563. unop
  564. MUL a1, b2, t3
  565. unop
  566. ADD c14, t4, c14
  567. unop
  568. MUL a2, b2, t4
  569. LD b2, 1 * SIZE(BO)
  570. ADD c01, t1, c01
  571. unop
  572. MUL a1, b3, t1
  573. lda AO, 2 * SIZE(AO)
  574. ADD c02, t2, c02
  575. unop
  576. MUL a2, b3, t2
  577. LD b3, 2 * SIZE(BO)
  578. ADD c05, t3, c05
  579. unop
  580. MUL a1, b4, t3
  581. LD a1, -2 * SIZE(AO)
  582. ADD c06, t4, c06
  583. unop
  584. MUL a2, b4, t4
  585. LD a2, -1 * SIZE(AO)
  586. ADD c09, t1, c09
  587. LD b4, 3 * SIZE(BO)
  588. MUL a1, b1, t1
  589. lda BO, 4 * SIZE(BO)
  590. .align 4
  591. $L27:
  592. ADD c10, t2, c10
  593. MUL a2, b1, t2
  594. ADD c13, t3, c13
  595. MUL a1, b2, t3
  596. ADD c14, t4, c14
  597. MUL a2, b2, t4
  598. ADD c01, t1, c01
  599. MUL a1, b3, t1
  600. ADD c02, t2, c02
  601. MUL a2, b3, t2
  602. ADD c05, t3, c05
  603. MUL a1, b4, t3
  604. ADD c06, t4, c06
  605. lda AO, 2 * SIZE(AO)
  606. MUL a2, b4, t4
  607. lda BO, 4 * SIZE(BO)
  608. ADD c09, t1, c09
  609. ADD c10, t2, c10
  610. ADD c13, t3, c13
  611. ADD c14, t4, c14
  612. .align 4
  /* $L28: solve step for the 2-wide tile with four columns (accumulators
     c01/c02, c05/c06, c09/c10, c13/c14), followed by write-back.
     Exactly one of LN / LT / RN / RT is expected to be defined.
     NOTE(review): SIZE and BASE_SHIFT are defined outside this chunk;
     presumably SIZE == 1 << BASE_SHIFT -- confirm. */
  613. $L28:
  /* Position AO / BO on the data used by the solve. */
  614. #if defined(LN) || defined(RT)
  615. #ifdef LN
  616. subq KK, 2, TMP1
  617. #else
  618. subq KK, 4, TMP1
  619. #endif
  /* AO = AORIG + (TMP1 << (BASE_SHIFT + 1)); BO = B + (TMP1 << (BASE_SHIFT + 2)). */
  620. sll TMP1, BASE_SHIFT + 1, TMP2
  621. addq AORIG, TMP2, AO
  622. sll TMP1, BASE_SHIFT + 2, TMP2
  623. addq B, TMP2, BO
  624. #else
  /* LT / RN: step back over the last 2 * SIZE of AO and 4 * SIZE of BO. */
  625. lda AO, -2 * SIZE(AO)
  626. lda BO, -4 * SIZE(BO)
  627. #endif
  /* Load the eight stored values and combine them with the accumulated
     products via SUB (presumably c = loaded - c, following Alpha's
     src1, src2, dest operand order -- confirm against the SUB macro).
     LN/LT read from BO in the order c01,c05,c09,c13,c02,c06,c10,c14;
     RN/RT read from AO in the order c01,c02,c05,c06,c09,c10,c13,c14. */
  628. #if defined(LN) || defined(LT)
  629. LD a1, 0 * SIZE(BO)
  630. LD a2, 1 * SIZE(BO)
  631. LD a3, 2 * SIZE(BO)
  632. LD a4, 3 * SIZE(BO)
  633. LD b1, 4 * SIZE(BO)
  634. LD b2, 5 * SIZE(BO)
  635. LD b3, 6 * SIZE(BO)
  636. LD b4, 7 * SIZE(BO)
  637. SUB a1, c01, c01
  638. SUB a2, c05, c05
  639. SUB a3, c09, c09
  640. SUB a4, c13, c13
  641. SUB b1, c02, c02
  642. SUB b2, c06, c06
  643. SUB b3, c10, c10
  644. SUB b4, c14, c14
  645. #else
  646. LD a1, 0 * SIZE(AO)
  647. LD a2, 1 * SIZE(AO)
  648. LD a3, 2 * SIZE(AO)
  649. LD a4, 3 * SIZE(AO)
  650. LD b1, 4 * SIZE(AO)
  651. LD b2, 5 * SIZE(AO)
  652. LD b3, 6 * SIZE(AO)
  653. LD b4, 7 * SIZE(AO)
  654. SUB a1, c01, c01
  655. SUB a2, c02, c02
  656. SUB a3, c05, c05
  657. SUB a4, c06, c06
  658. SUB b1, c09, c09
  659. SUB b2, c10, c10
  660. SUB b3, c13, c13
  661. SUB b4, c14, c14
  662. #endif
  /* LN: 2x2 solve using AO[3], AO[2], AO[0].  The second row (c02,c06,c10,
     c14) is scaled by AO[3], its contribution (times AO[2]) is removed from
     the first row, then the first row is scaled by AO[0].
     NOTE(review): the scaling is a MUL, not a divide, so the values at
     AO[3] / AO[0] are presumably stored already inverted -- confirm
     against the packing code. */
  663. #ifdef LN
  664. LD a1, 3 * SIZE(AO)
  665. LD a2, 2 * SIZE(AO)
  666. LD a3, 0 * SIZE(AO)
  667. MUL a1, c02, c02
  668. MUL a1, c06, c06
  669. MUL a1, c10, c10
  670. MUL a1, c14, c14
  671. MUL a2, c02, t1
  672. MUL a2, c06, t2
  673. MUL a2, c10, t3
  674. MUL a2, c14, t4
  675. SUB c01, t1, c01
  676. SUB c05, t2, c05
  677. SUB c09, t3, c09
  678. SUB c13, t4, c13
  679. MUL a3, c01, c01
  680. MUL a3, c05, c05
  681. MUL a3, c09, c09
  682. MUL a3, c13, c13
  683. #endif
  /* LT: same 2x2 solve in the opposite direction, using AO[0], AO[1], AO[3]:
     first row scaled first, then eliminated from the second row. */
  684. #ifdef LT
  685. LD a1, 0 * SIZE(AO)
  686. LD a2, 1 * SIZE(AO)
  687. LD a3, 3 * SIZE(AO)
  688. MUL a1, c01, c01
  689. MUL a1, c05, c05
  690. MUL a1, c09, c09
  691. MUL a1, c13, c13
  692. MUL a2, c01, t1
  693. MUL a2, c05, t2
  694. MUL a2, c09, t3
  695. MUL a2, c13, t4
  696. SUB c02, t1, c02
  697. SUB c06, t2, c06
  698. SUB c10, t3, c10
  699. SUB c14, t4, c14
  700. MUL a3, c02, c02
  701. MUL a3, c06, c06
  702. MUL a3, c10, c10
  703. MUL a3, c14, c14
  704. #endif
  /* RN: 4-column solve using BO[0..3], BO[5..7], BO[10..11], BO[15]
     (offsets 0, 5, 10, 15 are the diagonal of a 4x4 array with stride 4).
     Columns are processed in the order c01/c02 -> c05/c06 -> c09/c10 ->
     c13/c14, each scaled and then eliminated from the later columns. */
  705. #ifdef RN
  706. LD a1, 0 * SIZE(BO)
  707. LD a2, 1 * SIZE(BO)
  708. LD a3, 2 * SIZE(BO)
  709. LD a4, 3 * SIZE(BO)
  710. MUL a1, c01, c01
  711. MUL a1, c02, c02
  712. MUL a2, c01, t1
  713. MUL a2, c02, t2
  714. SUB c05, t1, c05
  715. SUB c06, t2, c06
  716. MUL a3, c01, t1
  717. MUL a3, c02, t2
  718. SUB c09, t1, c09
  719. SUB c10, t2, c10
  720. MUL a4, c01, t1
  721. MUL a4, c02, t2
  722. SUB c13, t1, c13
  723. SUB c14, t2, c14
  724. LD b1, 5 * SIZE(BO)
  725. LD b2, 6 * SIZE(BO)
  726. LD b3, 7 * SIZE(BO)
  727. MUL b1, c05, c05
  728. MUL b1, c06, c06
  729. MUL b2, c05, t1
  730. MUL b2, c06, t2
  731. SUB c09, t1, c09
  732. SUB c10, t2, c10
  733. MUL b3, c05, t1
  734. MUL b3, c06, t2
  735. SUB c13, t1, c13
  736. SUB c14, t2, c14
  737. LD a1, 10 * SIZE(BO)
  738. LD a2, 11 * SIZE(BO)
  739. LD a3, 15 * SIZE(BO)
  740. MUL a1, c09, c09
  741. MUL a1, c10, c10
  742. MUL a2, c09, t1
  743. MUL a2, c10, t2
  744. SUB c13, t1, c13
  745. SUB c14, t2, c14
  746. MUL a3, c13, c13
  747. MUL a3, c14, c14
  748. #endif
  /* RT: 4-column solve in the reverse column order (c13/c14 first, c01/c02
     last), using BO[15..12], BO[10..8], BO[5..4], BO[0]. */
  749. #ifdef RT
  750. LD a1, 15 * SIZE(BO)
  751. LD a2, 14 * SIZE(BO)
  752. LD a3, 13 * SIZE(BO)
  753. LD a4, 12 * SIZE(BO)
  754. MUL a1, c13, c13
  755. MUL a1, c14, c14
  756. MUL a2, c13, t1
  757. MUL a2, c14, t2
  758. SUB c09, t1, c09
  759. SUB c10, t2, c10
  760. MUL a3, c13, t1
  761. MUL a3, c14, t2
  762. SUB c05, t1, c05
  763. SUB c06, t2, c06
  764. MUL a4, c13, t1
  765. MUL a4, c14, t2
  766. SUB c01, t1, c01
  767. SUB c02, t2, c02
  768. LD b1, 10 * SIZE(BO)
  769. LD b2, 9 * SIZE(BO)
  770. LD b3, 8 * SIZE(BO)
  771. MUL b1, c09, c09
  772. MUL b1, c10, c10
  773. MUL b2, c09, t1
  774. MUL b2, c10, t2
  775. SUB c05, t1, c05
  776. SUB c06, t2, c06
  777. MUL b3, c09, t1
  778. MUL b3, c10, t2
  779. SUB c01, t1, c01
  780. SUB c02, t2, c02
  781. LD a1, 5 * SIZE(BO)
  782. LD a2, 4 * SIZE(BO)
  783. LD a3, 0 * SIZE(BO)
  784. MUL a1, c05, c05
  785. MUL a1, c06, c06
  786. MUL a2, c05, t1
  787. MUL a2, c06, t2
  788. SUB c01, t1, c01
  789. SUB c02, t2, c02
  790. MUL a3, c01, c01
  791. MUL a3, c02, c02
  792. #endif
  /* Write the solved values back to the buffer they were loaded from
     (BO for LN/LT, AO for RN/RT), in the same element order as the loads. */
  793. #if defined(LN) || defined(LT)
  794. ST c01, 0 * SIZE(BO)
  795. ST c05, 1 * SIZE(BO)
  796. ST c09, 2 * SIZE(BO)
  797. ST c13, 3 * SIZE(BO)
  798. ST c02, 4 * SIZE(BO)
  799. ST c06, 5 * SIZE(BO)
  800. ST c10, 6 * SIZE(BO)
  801. ST c14, 7 * SIZE(BO)
  802. #else
  803. ST c01, 0 * SIZE(AO)
  804. ST c02, 1 * SIZE(AO)
  805. ST c05, 2 * SIZE(AO)
  806. ST c06, 3 * SIZE(AO)
  807. ST c09, 4 * SIZE(AO)
  808. ST c10, 5 * SIZE(AO)
  809. ST c13, 6 * SIZE(AO)
  810. ST c14, 7 * SIZE(AO)
  811. #endif
  /* Store the tile through C1..C4.  LN moves the four pointers back by two
     elements before the store; every other variant moves them forward by
     two elements after it. */
  812. #ifdef LN
  813. lda C1, -2 * SIZE(C1)
  814. lda C2, -2 * SIZE(C2)
  815. lda C3, -2 * SIZE(C3)
  816. lda C4, -2 * SIZE(C4)
  817. #endif
  818. ST c01, 0 * SIZE(C1)
  819. ST c02, 1 * SIZE(C1)
  820. ST c05, 0 * SIZE(C2)
  821. ST c06, 1 * SIZE(C2)
  822. ST c09, 0 * SIZE(C3)
  823. ST c10, 1 * SIZE(C3)
  824. ST c13, 0 * SIZE(C4)
  825. ST c14, 1 * SIZE(C4)
  826. #ifndef LN
  827. lda C1, 2 * SIZE(C1)
  828. lda C2, 2 * SIZE(C2)
  829. lda C3, 2 * SIZE(C3)
  830. lda C4, 2 * SIZE(C4)
  831. #endif
  /* Clear the product temporaries: the next tile's pipelined loop starts
     with an ADD of t1..t4 before any MUL has produced them. */
  832. fclr t1
  833. fclr t2
  834. fclr t3
  835. fclr t4
  /* Pointer / counter bookkeeping for the next tile. */
  836. #ifdef RT
  837. sll K, 1 + BASE_SHIFT, TMP1
  838. addq AORIG, TMP1, AORIG
  839. #endif
  840. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps: AO by << (BASE_SHIFT + 1),
     BO by << (BASE_SHIFT + 2). */
  841. subq K, KK, TMP1
  842. sll TMP1, BASE_SHIFT + 1, TMP2
  843. addq AO, TMP2, AO
  844. sll TMP1, BASE_SHIFT + 2, TMP2
  845. addq BO, TMP2, BO
  846. #endif
  847. #ifdef LT
  848. addq KK, 2, KK
  849. #endif
  850. #ifdef LN
  851. subq KK, 2, KK
  852. #endif
  853. .align 4
  /* $L30: entry to the 4-wide tile loop.  I = M >> 2 is the number of
     full 4-row tiles; if there are none, skip to $L39. */
  854. $L30:
  855. sra M, 2, I
  856. ble I, $L39
  857. .align 4
  /* $L11: set up one 4x4 tile.  Both preprocessor arms preload a1..a4 from
     AO and b1..b4 from the B side, clear all sixteen accumulators
     c01..c16 (the fclr's are interleaved with the loads), touch C1..C4,
     and compute L = count - 2, where count is KK (LT/RN) or K - KK
     (LN/RT).  count <= 0 skips the product loop entirely ($L18);
     L <= 0 skips the unrolled loop ($L15). */
  858. $L11:
  859. #if defined(LT) || defined(RN)
  860. LD a1, 0 * SIZE(AO)
  861. fclr c11
  862. LD a2, 1 * SIZE(AO)
  863. fclr c12
  864. LD a3, 2 * SIZE(AO)
  865. fclr c16
  866. LD a4, 3 * SIZE(AO)
  867. fclr c15
  868. LD b1, 0 * SIZE(B)
  869. fclr c01
  870. LD b2, 1 * SIZE(B)
  871. fclr c02
  872. LD b3, 2 * SIZE(B)
  873. fclr c06
  874. LD b4, 3 * SIZE(B)
  875. fclr c05
  /* Loads into $f31 discard their result; on Alpha a load to the zero
     register serves as a prefetch hint, here for the C rows that are
     stored at the end of the tile. */
  876. lds $f31, 4 * SIZE(C1)
  877. fclr c03
  878. lda L, -2(KK)
  879. fclr c04
  880. lds $f31, 7 * SIZE(C2)
  881. fclr c08
  882. lda BO, 4 * SIZE(B)
  883. fclr c13
  884. lds $f31, 4 * SIZE(C3)
  885. fclr c09
  886. lda AO, 4 * SIZE(AO)
  887. fclr c10
  888. lds $f31, 7 * SIZE(C4)
  889. fclr c14
  890. fclr c07
  891. ble KK, $L18
  892. #else
  /* LN / RT: derive AO and BO from AORIG / B and KK instead of using the
     running pointers.  LN first moves AORIG back by K << (BASE_SHIFT + 2). */
  893. #ifdef LN
  894. sll K, BASE_SHIFT + 2, TMP1
  895. subq AORIG, TMP1, AORIG
  896. #endif
  897. sll KK, BASE_SHIFT + 2, TMP1
  898. addq AORIG, TMP1, AO
  899. addq B, TMP1, BO
  /* TMP1 = K - KK is the iteration count; it stays live until $L15, which
     tests its low bit. */
  900. subq K, KK, TMP1
  901. LD a1, 0 * SIZE(AO)
  902. fclr c11
  903. LD a2, 1 * SIZE(AO)
  904. fclr c12
  905. LD a3, 2 * SIZE(AO)
  906. fclr c16
  907. LD a4, 3 * SIZE(AO)
  908. fclr c15
  909. LD b1, 0 * SIZE(BO)
  910. fclr c01
  911. LD b2, 1 * SIZE(BO)
  912. fclr c02
  913. LD b3, 2 * SIZE(BO)
  914. fclr c06
  915. LD b4, 3 * SIZE(BO)
  916. fclr c05
  917. lds $f31, 4 * SIZE(C1)
  918. fclr c03
  919. lda L, -2(TMP1)
  920. fclr c04
  921. lds $f31, 7 * SIZE(C2)
  922. fclr c08
  923. lda BO, 4 * SIZE(BO)
  924. fclr c13
  925. lds $f31, 4 * SIZE(C3)
  926. fclr c09
  927. lda AO, 4 * SIZE(AO)
  928. fclr c10
  929. lds $f31, 7 * SIZE(C4)
  930. fclr c14
  931. fclr c07
  932. ble TMP1, $L18
  933. #endif
  934. ble L, $L15
  935. .align 5
  /* $L12: unrolled inner loop of the 4x4 tile.  One pass covers two steps
     of the count (L is decremented by 2, AO and BO each advance by
     8 * SIZE).  The sections marked 1-4 handle the first step with
     a1..a4 / b1..b4; sections 5-8 handle the second step with the
     operands loaded during the first (a5, a6, b5 and the reloaded a2, a4,
     b2, b3, b4).  Every ADD consumes the product of an earlier MUL, and
     the unop / UNOP fillers pad issue slots, so neither the order nor the
     padding should be altered without re-checking the schedule. */
  936. $L12:
  937. /* 1 */
  938. ADD c11, t1, c11
  /* Unless built for EV4, issue loads to the integer zero register $31
     at PREFETCHSIZE * SIZE ahead of AO / BO (prefetch hints); EV4 builds
     substitute a unop to keep the slot layout. */
  939. #ifndef EV4
  940. ldq $31, PREFETCHSIZE * SIZE(AO)
  941. #else
  942. unop
  943. #endif
  944. MUL b1, a1, t1
  945. #ifndef EV4
  946. ldl $31, PREFETCHSIZE * SIZE(BO)
  947. #else
  948. unop
  949. #endif
  950. ADD c12, t2, c12
  951. unop
  952. MUL b1, a2, t2
  953. unop
  954. ADD c16, t3, c16
  955. unop
  956. MUL b2, a2, t3
  957. LD a5, 0 * SIZE(AO)
  958. ADD c15, t4, c15
  959. unop
  960. MUL b2, a1, t4
  961. LD b5, 0 * SIZE(BO)
  962. /* 2 */
  963. ADD c01, t1, c01
  964. UNOP
  965. MUL b1, a3, t1
  966. UNOP
  967. ADD c02, t2, c02
  968. UNOP
  969. MUL b1, a4, t2
  970. UNOP
  971. ADD c06, t3, c06
  972. unop
  973. MUL b2, a4, t3
  974. unop
  975. ADD c05, t4, c05
  976. unop
  977. MUL b4, a1, t4
  978. unop
  979. /* 3 */
  980. ADD c03, t1, c03
  981. unop
  982. MUL b3, a1, t1
  983. unop
  984. ADD c04, t2, c04
  985. unop
  986. MUL b3, a2, t2
  987. unop
  988. ADD c08, t3, c08
  989. unop
  990. MUL b4, a2, t3
  991. LD a2, 1 * SIZE(AO)
  992. ADD c13, t4, c13
  993. unop
  994. MUL b2, a3, t4
  995. LD b2, 1 * SIZE(BO)
  996. /* 4 */
  997. ADD c09, t1, c09
  998. unop
  999. MUL b3, a3, t1
  1000. LD a6, 2 * SIZE(AO)
  1001. ADD c10, t2, c10
  1002. unop
  1003. MUL b3, a4, t2
  1004. LD b3, 2 * SIZE(BO)
  1005. ADD c14, t3, c14
  1006. unop
  1007. MUL b4, a4, t3
  1008. LD a4, 3 * SIZE(AO)
  1009. ADD c07, t4, c07
  1010. unop
  1011. MUL b4, a3, t4
  1012. LD b4, 3 * SIZE(BO)
  1013. /* 5 */
  1014. ADD c11, t1, c11
  1015. unop
  1016. MUL b5, a5, t1
  1017. LD a1, 4 * SIZE(AO)
  1018. ADD c12, t2, c12
  /* Loop counter: two steps consumed per pass. */
  1019. lda L, -2(L)
  1020. MUL b5, a2, t2
  1021. LD b1, 4 * SIZE(BO)
  1022. ADD c16, t3, c16
  1023. unop
  1024. MUL b2, a2, t3
  1025. unop
  1026. ADD c15, t4, c15
  1027. unop
  1028. MUL b2, a5, t4
  1029. unop
  1030. /* 6 */
  1031. ADD c01, t1, c01
  1032. unop
  1033. MUL b5, a6, t1
  1034. unop
  1035. ADD c02, t2, c02
  1036. unop
  1037. MUL b5, a4, t2
  1038. unop
  1039. ADD c06, t3, c06
  1040. unop
  1041. MUL b2, a4, t3
  1042. unop
  1043. ADD c05, t4, c05
  1044. unop
  1045. MUL b4, a5, t4
  1046. unop
  1047. /* 7 */
  1048. ADD c03, t1, c03
  /* AO / BO advance here; the loads below therefore use negative offsets. */
  1049. lda AO, 8 * SIZE(AO)
  1050. MUL b3, a5, t1
  1051. unop
  1052. ADD c04, t2, c04
  1053. lda BO, 8 * SIZE(BO)
  1054. MUL b3, a2, t2
  1055. unop
  1056. ADD c08, t3, c08
  1057. unop
  1058. MUL b4, a2, t3
  1059. LD a2, -3 * SIZE(AO)
  1060. ADD c13, t4, c13
  1061. unop
  1062. MUL b2, a6, t4
  1063. LD b2, -3 * SIZE(BO)
  1064. /* 8 */
  1065. ADD c09, t1, c09
  1066. unop
  1067. MUL b3, a6, t1
  1068. LD a3, -2 * SIZE(AO)
  1069. ADD c10, t2, c10
  1070. unop
  1071. MUL b3, a4, t2
  1072. LD b3, -2 * SIZE(BO)
  1073. ADD c14, t3, c14
  1074. unop
  1075. MUL b4, a4, t3
  1076. LD a4, -1 * SIZE(AO)
  1077. ADD c07, t4, c07
  1078. MUL b4, a6, t4
  1079. LD b4, -1 * SIZE(BO)
  1080. bgt L, $L12
  1081. .align 4
  /* $L15: tail of the 4x4 product loop.  The first ADD/MUL pair is common
     to both paths.  blbs branches when the low bit of the count (KK for
     LT/RN, TMP1 for LN/RT) is set, i.e. the count is odd and only the
     final step at $L17 remains.  For an even count the code below runs
     one more full step, reloading a1..a4 / b1..b4 and advancing AO and BO
     by 4 * SIZE each, and then falls through into $L17. */
  1082. $L15:
  1083. ADD c11, t1, c11
  1084. MUL b1, a1, t1
  1085. #if defined(LT) || defined(RN)
  1086. blbs KK, $L17
  1087. #else
  1088. blbs TMP1, $L17
  1089. #endif
  1090. .align 4
  1091. ADD c12, t2, c12
  1092. MUL b1, a2, t2
  1093. ADD c16, t3, c16
  1094. MUL b2, a2, t3
  1095. ADD c15, t4, c15
  1096. MUL b2, a1, t4
  1097. ADD c01, t1, c01
  1098. MUL b1, a3, t1
  1099. ADD c02, t2, c02
  1100. unop
  1101. MUL b1, a4, t2
  1102. LD b1, 0 * SIZE(BO)
  1103. ADD c06, t3, c06
  1104. MUL b2, a4, t3
  1105. ADD c05, t4, c05
  1106. MUL b4, a1, t4
  1107. ADD c03, t1, c03
  1108. unop
  1109. MUL b3, a1, t1
  1110. LD a1, 0 * SIZE(AO)
  1111. ADD c04, t2, c04
  1112. unop
  1113. MUL b3, a2, t2
  1114. unop
  1115. ADD c08, t3, c08
  1116. unop
  1117. MUL b4, a2, t3
  1118. LD a2, 1 * SIZE(AO)
  1119. ADD c13, t4, c13
  1120. unop
  1121. MUL b2, a3, t4
  1122. LD b2, 1 * SIZE(BO)
  1123. ADD c09, t1, c09
  1124. unop
  1125. MUL b3, a3, t1
  /* AO advances mid-sequence; the a3 / a4 reloads below use negative offsets. */
  1126. lda AO, 4 * SIZE(AO)
  1127. ADD c10, t2, c10
  1128. unop
  1129. MUL b3, a4, t2
  1130. LD b3, 2 * SIZE(BO)
  1131. ADD c14, t3, c14
  1132. unop
  1133. MUL b4, a4, t3
  1134. LD a4, -1 * SIZE(AO)
  1135. ADD c07, t4, c07
  1136. unop
  1137. MUL b4, a3, t4
  1138. LD a3, -2 * SIZE(AO)
  /* Same ADD/MUL pair as at the top of $L15, now with the freshly loaded
     a1 / b1, so that $L17 can be entered from either path. */
  1139. ADD c11, t1, c11
  1140. LD b4, 3 * SIZE(BO)
  1141. MUL b1, a1, t1
  1142. lda BO, 4 * SIZE(BO)
  1143. .align 4
  /* $L17: final step of the 4x4 product loop.  No operands are reloaded;
     the remaining MULs are issued with the current a1..a4 / b1..b4, AO and
     BO are advanced by 4 * SIZE each, and the last four ADDs drain the
     products still pending in t1..t4. */
  1144. $L17:
  1145. ADD c12, t2, c12
  1146. MUL b1, a2, t2
  1147. ADD c16, t3, c16
  1148. MUL b2, a2, t3
  1149. ADD c15, t4, c15
  1150. MUL b2, a1, t4
  1151. ADD c01, t1, c01
  1152. MUL b1, a3, t1
  1153. ADD c02, t2, c02
  1154. MUL b1, a4, t2
  1155. ADD c06, t3, c06
  1156. MUL b2, a4, t3
  1157. ADD c05, t4, c05
  1158. MUL b4, a1, t4
  1159. ADD c03, t1, c03
  1160. MUL b3, a1, t1
  1161. ADD c04, t2, c04
  1162. MUL b3, a2, t2
  1163. ADD c08, t3, c08
  1164. MUL b4, a2, t3
  1165. ADD c13, t4, c13
  1166. MUL b2, a3, t4
  1167. ADD c09, t1, c09
  1168. MUL b3, a3, t1
  1169. ADD c10, t2, c10
  1170. MUL b3, a4, t2
  1171. ADD c14, t3, c14
  1172. MUL b4, a4, t3
  1173. ADD c07, t4, c07
  1174. lda AO, 4 * SIZE(AO)
  1175. MUL b4, a3, t4
  1176. lda BO, 4 * SIZE(BO)
  /* Drain the last four products. */
  1177. ADD c11, t1, c11
  1178. ADD c12, t2, c12
  1179. ADD c16, t3, c16
  1180. ADD c15, t4, c15
  1181. .align 4
  /* $L18: solve step and write-back for the 4x4 tile (accumulators
     c01..c16; c01-c04 are stored through C1, c05-c08 through C2, c09-c12
     through C3 and c13-c16 through C4).  Exactly one of LN / LT / RN / RT
     is expected to be defined.
     NOTE(review): all scaling is done with MUL, never a divide, so the
     diagonal entries read from AO / BO are presumably stored already
     inverted -- confirm against the packing code. */
  1182. $L18:
  1183. #if defined(LN) || defined(RT)
  /* Both arms of this #ifdef are identical (KK - 4); the split only
     mirrors the layout of the narrower tiles, where the two differ. */
  1184. #ifdef LN
  1185. subq KK, 4, TMP1
  1186. #else
  1187. subq KK, 4, TMP1
  1188. #endif
  /* AO = AORIG + (TMP1 << (BASE_SHIFT + 2)); BO = B + the same offset. */
  1189. sll TMP1, BASE_SHIFT + 2, TMP2
  1190. addq AORIG, TMP2, AO
  1191. sll TMP1, BASE_SHIFT + 2, TMP2
  1192. addq B, TMP2, BO
  1193. #else
  /* LT / RN: step back over the last 4 * SIZE of AO and BO. */
  1194. lda AO, -4 * SIZE(AO)
  1195. lda BO, -4 * SIZE(BO)
  1196. #endif
  /* Load the sixteen stored values and combine them with the accumulated
     products via SUB (presumably c = loaded - c -- confirm against the
     SUB macro).  LN/LT read BO with element i*4+j paired with the j-th
     column's i-th row (c01,c05,c09,c13, c02,...); RN/RT read AO in plain
     c01..c16 order. */
  1197. #if defined(LN) || defined(LT)
  1198. LD a1, 0 * SIZE(BO)
  1199. LD a2, 1 * SIZE(BO)
  1200. LD a3, 2 * SIZE(BO)
  1201. LD a4, 3 * SIZE(BO)
  1202. LD b1, 4 * SIZE(BO)
  1203. LD b2, 5 * SIZE(BO)
  1204. LD b3, 6 * SIZE(BO)
  1205. LD b4, 7 * SIZE(BO)
  1206. SUB a1, c01, c01
  1207. SUB a2, c05, c05
  1208. SUB a3, c09, c09
  1209. SUB a4, c13, c13
  1210. SUB b1, c02, c02
  1211. SUB b2, c06, c06
  1212. SUB b3, c10, c10
  1213. SUB b4, c14, c14
  1214. LD a1, 8 * SIZE(BO)
  1215. LD a2, 9 * SIZE(BO)
  1216. LD a3, 10 * SIZE(BO)
  1217. LD a4, 11 * SIZE(BO)
  1218. LD b1, 12 * SIZE(BO)
  1219. LD b2, 13 * SIZE(BO)
  1220. LD b3, 14 * SIZE(BO)
  1221. LD b4, 15 * SIZE(BO)
  1222. SUB a1, c03, c03
  1223. SUB a2, c07, c07
  1224. SUB a3, c11, c11
  1225. SUB a4, c15, c15
  1226. SUB b1, c04, c04
  1227. SUB b2, c08, c08
  1228. SUB b3, c12, c12
  1229. SUB b4, c16, c16
  1230. #else
  1231. LD a1, 0 * SIZE(AO)
  1232. LD a2, 1 * SIZE(AO)
  1233. LD a3, 2 * SIZE(AO)
  1234. LD a4, 3 * SIZE(AO)
  1235. LD b1, 4 * SIZE(AO)
  1236. LD b2, 5 * SIZE(AO)
  1237. LD b3, 6 * SIZE(AO)
  1238. LD b4, 7 * SIZE(AO)
  1239. SUB a1, c01, c01
  1240. SUB a2, c02, c02
  1241. SUB a3, c03, c03
  1242. SUB a4, c04, c04
  1243. SUB b1, c05, c05
  1244. SUB b2, c06, c06
  1245. SUB b3, c07, c07
  1246. SUB b4, c08, c08
  1247. LD a1, 8 * SIZE(AO)
  1248. LD a2, 9 * SIZE(AO)
  1249. LD a3, 10 * SIZE(AO)
  1250. LD a4, 11 * SIZE(AO)
  1251. LD b1, 12 * SIZE(AO)
  1252. LD b2, 13 * SIZE(AO)
  1253. LD b3, 14 * SIZE(AO)
  1254. LD b4, 15 * SIZE(AO)
  1255. SUB a1, c09, c09
  1256. SUB a2, c10, c10
  1257. SUB a3, c11, c11
  1258. SUB a4, c12, c12
  1259. SUB b1, c13, c13
  1260. SUB b2, c14, c14
  1261. SUB b3, c15, c15
  1262. SUB b4, c16, c16
  1263. #endif
  /* LN: 4x4 solve working from the last row to the first, using
     AO[15..12], AO[10..8], AO[5..4], AO[0].  Each stage scales one row
     (four columns at once) and removes its contribution from the rows
     above it. */
  1264. #ifdef LN
  /* Row 4 (c04,c08,c12,c16). */
  1265. LD a1, 15 * SIZE(AO)
  1266. LD a2, 14 * SIZE(AO)
  1267. LD a3, 13 * SIZE(AO)
  1268. LD a4, 12 * SIZE(AO)
  1269. MUL a1, c04, c04
  1270. MUL a1, c08, c08
  1271. MUL a1, c12, c12
  1272. MUL a1, c16, c16
  1273. MUL a2, c04, t1
  1274. MUL a2, c08, t2
  1275. MUL a2, c12, t3
  1276. MUL a2, c16, t4
  1277. SUB c03, t1, c03
  1278. SUB c07, t2, c07
  1279. SUB c11, t3, c11
  1280. SUB c15, t4, c15
  1281. MUL a3, c04, t1
  1282. MUL a3, c08, t2
  1283. MUL a3, c12, t3
  1284. MUL a3, c16, t4
  1285. SUB c02, t1, c02
  1286. SUB c06, t2, c06
  1287. SUB c10, t3, c10
  1288. SUB c14, t4, c14
  1289. MUL a4, c04, t1
  1290. MUL a4, c08, t2
  1291. MUL a4, c12, t3
  1292. MUL a4, c16, t4
  1293. SUB c01, t1, c01
  1294. SUB c05, t2, c05
  1295. SUB c09, t3, c09
  1296. SUB c13, t4, c13
  /* Row 3 (c03,c07,c11,c15). */
  1297. LD b1, 10 * SIZE(AO)
  1298. LD b2, 9 * SIZE(AO)
  1299. LD b3, 8 * SIZE(AO)
  1300. MUL b1, c03, c03
  1301. MUL b1, c07, c07
  1302. MUL b1, c11, c11
  1303. MUL b1, c15, c15
  1304. MUL b2, c03, t1
  1305. MUL b2, c07, t2
  1306. MUL b2, c11, t3
  1307. MUL b2, c15, t4
  1308. SUB c02, t1, c02
  1309. SUB c06, t2, c06
  1310. SUB c10, t3, c10
  1311. SUB c14, t4, c14
  1312. MUL b3, c03, t1
  1313. MUL b3, c07, t2
  1314. MUL b3, c11, t3
  1315. MUL b3, c15, t4
  1316. SUB c01, t1, c01
  1317. SUB c05, t2, c05
  1318. SUB c09, t3, c09
  1319. SUB c13, t4, c13
  /* Row 2 (c02,c06,c10,c14), then row 1 (c01,c05,c09,c13). */
  1320. LD a1, 5 * SIZE(AO)
  1321. LD a2, 4 * SIZE(AO)
  1322. LD a3, 0 * SIZE(AO)
  1323. MUL a1, c02, c02
  1324. MUL a1, c06, c06
  1325. MUL a1, c10, c10
  1326. MUL a1, c14, c14
  1327. MUL a2, c02, t1
  1328. MUL a2, c06, t2
  1329. MUL a2, c10, t3
  1330. MUL a2, c14, t4
  1331. SUB c01, t1, c01
  1332. SUB c05, t2, c05
  1333. SUB c09, t3, c09
  1334. SUB c13, t4, c13
  1335. MUL a3, c01, c01
  1336. MUL a3, c05, c05
  1337. MUL a3, c09, c09
  1338. MUL a3, c13, c13
  1339. #endif
  /* LT: 4x4 solve working from the first row to the last, using
     AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  1340. #ifdef LT
  /* Row 1 (c01,c05,c09,c13). */
  1341. LD a1, 0 * SIZE(AO)
  1342. LD a2, 1 * SIZE(AO)
  1343. LD a3, 2 * SIZE(AO)
  1344. LD a4, 3 * SIZE(AO)
  1345. MUL a1, c01, c01
  1346. MUL a1, c05, c05
  1347. MUL a1, c09, c09
  1348. MUL a1, c13, c13
  1349. MUL a2, c01, t1
  1350. MUL a2, c05, t2
  1351. MUL a2, c09, t3
  1352. MUL a2, c13, t4
  1353. SUB c02, t1, c02
  1354. SUB c06, t2, c06
  1355. SUB c10, t3, c10
  1356. SUB c14, t4, c14
  1357. MUL a3, c01, t1
  1358. MUL a3, c05, t2
  1359. MUL a3, c09, t3
  1360. MUL a3, c13, t4
  1361. SUB c03, t1, c03
  1362. SUB c07, t2, c07
  1363. SUB c11, t3, c11
  1364. SUB c15, t4, c15
  1365. MUL a4, c01, t1
  1366. MUL a4, c05, t2
  1367. MUL a4, c09, t3
  1368. MUL a4, c13, t4
  1369. SUB c04, t1, c04
  1370. SUB c08, t2, c08
  1371. SUB c12, t3, c12
  1372. SUB c16, t4, c16
  /* Row 2 (c02,c06,c10,c14). */
  1373. LD b1, 5 * SIZE(AO)
  1374. LD b2, 6 * SIZE(AO)
  1375. LD b3, 7 * SIZE(AO)
  1376. MUL b1, c02, c02
  1377. MUL b1, c06, c06
  1378. MUL b1, c10, c10
  1379. MUL b1, c14, c14
  1380. MUL b2, c02, t1
  1381. MUL b2, c06, t2
  1382. MUL b2, c10, t3
  1383. MUL b2, c14, t4
  1384. SUB c03, t1, c03
  1385. SUB c07, t2, c07
  1386. SUB c11, t3, c11
  1387. SUB c15, t4, c15
  1388. MUL b3, c02, t1
  1389. MUL b3, c06, t2
  1390. MUL b3, c10, t3
  1391. MUL b3, c14, t4
  1392. SUB c04, t1, c04
  1393. SUB c08, t2, c08
  1394. SUB c12, t3, c12
  1395. SUB c16, t4, c16
  /* Row 3 (c03,c07,c11,c15), then row 4 (c04,c08,c12,c16). */
  1396. LD a1, 10 * SIZE(AO)
  1397. LD a2, 11 * SIZE(AO)
  1398. LD a3, 15 * SIZE(AO)
  1399. MUL a1, c03, c03
  1400. MUL a1, c07, c07
  1401. MUL a1, c11, c11
  1402. MUL a1, c15, c15
  1403. MUL a2, c03, t1
  1404. MUL a2, c07, t2
  1405. MUL a2, c11, t3
  1406. MUL a2, c15, t4
  1407. SUB c04, t1, c04
  1408. SUB c08, t2, c08
  1409. SUB c12, t3, c12
  1410. SUB c16, t4, c16
  1411. MUL a3, c04, c04
  1412. MUL a3, c08, c08
  1413. MUL a3, c12, c12
  1414. MUL a3, c16, c16
  1415. #endif
  /* RN: 4x4 solve over columns, first to last, using BO[0..3], BO[5..7],
     BO[10..11], BO[15].  Each stage scales one column (four rows at once)
     and removes its contribution from the later columns. */
  1416. #ifdef RN
  /* Column 1 (c01..c04). */
  1417. LD a1, 0 * SIZE(BO)
  1418. LD a2, 1 * SIZE(BO)
  1419. LD a3, 2 * SIZE(BO)
  1420. LD a4, 3 * SIZE(BO)
  1421. MUL a1, c01, c01
  1422. MUL a1, c02, c02
  1423. MUL a1, c03, c03
  1424. MUL a1, c04, c04
  1425. MUL a2, c01, t1
  1426. MUL a2, c02, t2
  1427. MUL a2, c03, t3
  1428. MUL a2, c04, t4
  1429. SUB c05, t1, c05
  1430. SUB c06, t2, c06
  1431. SUB c07, t3, c07
  1432. SUB c08, t4, c08
  1433. MUL a3, c01, t1
  1434. MUL a3, c02, t2
  1435. MUL a3, c03, t3
  1436. MUL a3, c04, t4
  1437. SUB c09, t1, c09
  1438. SUB c10, t2, c10
  1439. SUB c11, t3, c11
  1440. SUB c12, t4, c12
  1441. MUL a4, c01, t1
  1442. MUL a4, c02, t2
  1443. MUL a4, c03, t3
  1444. MUL a4, c04, t4
  1445. SUB c13, t1, c13
  1446. SUB c14, t2, c14
  1447. SUB c15, t3, c15
  1448. SUB c16, t4, c16
  /* Column 2 (c05..c08). */
  1449. LD b1, 5 * SIZE(BO)
  1450. LD b2, 6 * SIZE(BO)
  1451. LD b3, 7 * SIZE(BO)
  1452. MUL b1, c05, c05
  1453. MUL b1, c06, c06
  1454. MUL b1, c07, c07
  1455. MUL b1, c08, c08
  1456. MUL b2, c05, t1
  1457. MUL b2, c06, t2
  1458. MUL b2, c07, t3
  1459. MUL b2, c08, t4
  1460. SUB c09, t1, c09
  1461. SUB c10, t2, c10
  1462. SUB c11, t3, c11
  1463. SUB c12, t4, c12
  1464. MUL b3, c05, t1
  1465. MUL b3, c06, t2
  1466. MUL b3, c07, t3
  1467. MUL b3, c08, t4
  1468. SUB c13, t1, c13
  1469. SUB c14, t2, c14
  1470. SUB c15, t3, c15
  1471. SUB c16, t4, c16
  /* Column 3 (c09..c12), then column 4 (c13..c16). */
  1472. LD a1, 10 * SIZE(BO)
  1473. LD a2, 11 * SIZE(BO)
  1474. LD a3, 15 * SIZE(BO)
  1475. MUL a1, c09, c09
  1476. MUL a1, c10, c10
  1477. MUL a1, c11, c11
  1478. MUL a1, c12, c12
  1479. MUL a2, c09, t1
  1480. MUL a2, c10, t2
  1481. MUL a2, c11, t3
  1482. MUL a2, c12, t4
  1483. SUB c13, t1, c13
  1484. SUB c14, t2, c14
  1485. SUB c15, t3, c15
  1486. SUB c16, t4, c16
  1487. MUL a3, c13, c13
  1488. MUL a3, c14, c14
  1489. MUL a3, c15, c15
  1490. MUL a3, c16, c16
  1491. #endif
  /* RT: 4x4 solve over columns, last to first, using BO[15..12],
     BO[10..8], BO[5..4], BO[0]. */
  1492. #ifdef RT
  /* Column 4 (c13..c16). */
  1493. LD a1, 15 * SIZE(BO)
  1494. LD a2, 14 * SIZE(BO)
  1495. LD a3, 13 * SIZE(BO)
  1496. LD a4, 12 * SIZE(BO)
  1497. MUL a1, c13, c13
  1498. MUL a1, c14, c14
  1499. MUL a1, c15, c15
  1500. MUL a1, c16, c16
  1501. MUL a2, c13, t1
  1502. MUL a2, c14, t2
  1503. MUL a2, c15, t3
  1504. MUL a2, c16, t4
  1505. SUB c09, t1, c09
  1506. SUB c10, t2, c10
  1507. SUB c11, t3, c11
  1508. SUB c12, t4, c12
  1509. MUL a3, c13, t1
  1510. MUL a3, c14, t2
  1511. MUL a3, c15, t3
  1512. MUL a3, c16, t4
  1513. SUB c05, t1, c05
  1514. SUB c06, t2, c06
  1515. SUB c07, t3, c07
  1516. SUB c08, t4, c08
  1517. MUL a4, c13, t1
  1518. MUL a4, c14, t2
  1519. MUL a4, c15, t3
  1520. MUL a4, c16, t4
  1521. SUB c01, t1, c01
  1522. SUB c02, t2, c02
  1523. SUB c03, t3, c03
  1524. SUB c04, t4, c04
  /* Column 3 (c09..c12). */
  1525. LD b1, 10 * SIZE(BO)
  1526. LD b2, 9 * SIZE(BO)
  1527. LD b3, 8 * SIZE(BO)
  1528. MUL b1, c09, c09
  1529. MUL b1, c10, c10
  1530. MUL b1, c11, c11
  1531. MUL b1, c12, c12
  1532. MUL b2, c09, t1
  1533. MUL b2, c10, t2
  1534. MUL b2, c11, t3
  1535. MUL b2, c12, t4
  1536. SUB c05, t1, c05
  1537. SUB c06, t2, c06
  1538. SUB c07, t3, c07
  1539. SUB c08, t4, c08
  1540. MUL b3, c09, t1
  1541. MUL b3, c10, t2
  1542. MUL b3, c11, t3
  1543. MUL b3, c12, t4
  1544. SUB c01, t1, c01
  1545. SUB c02, t2, c02
  1546. SUB c03, t3, c03
  1547. SUB c04, t4, c04
  /* Column 2 (c05..c08), then column 1 (c01..c04). */
  1548. LD a1, 5 * SIZE(BO)
  1549. LD a2, 4 * SIZE(BO)
  1550. LD a3, 0 * SIZE(BO)
  1551. MUL a1, c05, c05
  1552. MUL a1, c06, c06
  1553. MUL a1, c07, c07
  1554. MUL a1, c08, c08
  1555. MUL a2, c05, t1
  1556. MUL a2, c06, t2
  1557. MUL a2, c07, t3
  1558. MUL a2, c08, t4
  1559. SUB c01, t1, c01
  1560. SUB c02, t2, c02
  1561. SUB c03, t3, c03
  1562. SUB c04, t4, c04
  1563. MUL a3, c01, c01
  1564. MUL a3, c02, c02
  1565. MUL a3, c03, c03
  1566. MUL a3, c04, c04
  1567. #endif
  /* Write the solved values back to the buffer they were loaded from
     (BO for LN/LT, AO for RN/RT), in the same element order as the loads. */
  1568. #if defined(LN) || defined(LT)
  1569. ST c01, 0 * SIZE(BO)
  1570. ST c05, 1 * SIZE(BO)
  1571. ST c09, 2 * SIZE(BO)
  1572. ST c13, 3 * SIZE(BO)
  1573. ST c02, 4 * SIZE(BO)
  1574. ST c06, 5 * SIZE(BO)
  1575. ST c10, 6 * SIZE(BO)
  1576. ST c14, 7 * SIZE(BO)
  1577. ST c03, 8 * SIZE(BO)
  1578. ST c07, 9 * SIZE(BO)
  1579. ST c11, 10 * SIZE(BO)
  1580. ST c15, 11 * SIZE(BO)
  1581. ST c04, 12 * SIZE(BO)
  1582. ST c08, 13 * SIZE(BO)
  1583. ST c12, 14 * SIZE(BO)
  1584. ST c16, 15 * SIZE(BO)
  1585. #else
  1586. ST c01, 0 * SIZE(AO)
  1587. ST c02, 1 * SIZE(AO)
  1588. ST c03, 2 * SIZE(AO)
  1589. ST c04, 3 * SIZE(AO)
  1590. ST c05, 4 * SIZE(AO)
  1591. ST c06, 5 * SIZE(AO)
  1592. ST c07, 6 * SIZE(AO)
  1593. ST c08, 7 * SIZE(AO)
  1594. ST c09, 8 * SIZE(AO)
  1595. ST c10, 9 * SIZE(AO)
  1596. ST c11, 10 * SIZE(AO)
  1597. ST c12, 11 * SIZE(AO)
  1598. ST c13, 12 * SIZE(AO)
  1599. ST c14, 13 * SIZE(AO)
  1600. ST c15, 14 * SIZE(AO)
  1601. ST c16, 15 * SIZE(AO)
  1602. #endif
  /* Store the tile through C1..C4.  LN moves the four pointers back by
     four elements before the store; every other variant moves them forward
     by four elements after it. */
  1603. #ifdef LN
  1604. lda C1, -4 * SIZE(C1)
  1605. lda C2, -4 * SIZE(C2)
  1606. lda C3, -4 * SIZE(C3)
  1607. lda C4, -4 * SIZE(C4)
  1608. #endif
  1609. ST c01, 0 * SIZE(C1)
  1610. ST c02, 1 * SIZE(C1)
  1611. ST c03, 2 * SIZE(C1)
  1612. ST c04, 3 * SIZE(C1)
  1613. ST c05, 0 * SIZE(C2)
  1614. ST c06, 1 * SIZE(C2)
  1615. ST c07, 2 * SIZE(C2)
  1616. ST c08, 3 * SIZE(C2)
  1617. ST c09, 0 * SIZE(C3)
  1618. ST c10, 1 * SIZE(C3)
  1619. ST c11, 2 * SIZE(C3)
  1620. ST c12, 3 * SIZE(C3)
  1621. ST c13, 0 * SIZE(C4)
  1622. ST c14, 1 * SIZE(C4)
  1623. ST c15, 2 * SIZE(C4)
  1624. ST c16, 3 * SIZE(C4)
  1625. #ifndef LN
  1626. lda C1, 4 * SIZE(C1)
  1627. lda C2, 4 * SIZE(C2)
  1628. lda C3, 4 * SIZE(C3)
  1629. lda C4, 4 * SIZE(C4)
  1630. #endif
  /* Clear the product temporaries: the next tile's pipelined loop starts
     with an ADD of t1..t4 before any MUL has produced them. */
  1631. fclr t1
  1632. fclr t2
  1633. fclr t3
  1634. fclr t4
  /* Pointer / counter bookkeeping for the next tile. */
  1635. #ifdef RT
  1636. sll K, 2 + BASE_SHIFT, TMP1
  1637. addq AORIG, TMP1, AORIG
  1638. #endif
  1639. #if defined(LT) || defined(RN)
  /* Skip the remaining K - KK steps; AO and BO use the same byte offset
     here because both sides are four elements wide. */
  1640. subq K, KK, TMP1
  1641. sll TMP1, BASE_SHIFT + 2, TMP1
  1642. addq AO, TMP1, AO
  1643. addq BO, TMP1, BO
  1644. #endif
  1645. #ifdef LT
  1646. addq KK, 4, KK
  1647. #endif
  1648. #ifdef LN
  1649. subq KK, 4, KK
  1650. #endif
  /* Next 4-row tile, if any remain. */
  1651. lda I, -1(I)
  1652. bgt I, $L11
  1653. .align 4
  /* $L39: end of one 4-column panel.  Updates B and KK for the active
     variant, decrements the panel counter J and loops back to $L01
     (defined outside this chunk) while J > 0. */
  1654. $L39:
  /* LN: B advances by K << (2 + BASE_SHIFT). */
  1655. #ifdef LN
  1656. sll K, 2 + BASE_SHIFT, TMP1
  1657. addq B, TMP1, B
  1658. #endif
  /* LT / RN: BO already points past this panel, so it becomes the new B. */
  1659. #if defined(LT) || defined(RN)
  1660. mov BO, B
  1661. #endif
  1662. #ifdef RN
  1663. addq KK, 4, KK
  1664. #endif
  1665. #ifdef RT
  1666. subq KK, 4, KK
  1667. #endif
  1668. lda J, -1(J)
  1669. bgt J, $L01
  1670. .align 4
  /* $L40: handle a remaining 2-column panel (bit 1 of N); if the bit is
     clear, skip to $L80 (outside this chunk).  Sets up C1 / C2, KK and the
     A pointer, then -- if bit 0 of M is set -- prepares a tile one row
     wide and two columns wide (accumulators c01, c05 plus the secondary
     sums c02, c06); otherwise continues at $L60. */
  1671. $L40:
  1672. and N, 2, J
  1673. ble J, $L80
  /* RT: B moves back by K << (1 + BASE_SHIFT) and C moves back by 2 * LDC. */
  1674. #ifdef RT
  1675. sll K, 1 + BASE_SHIFT, TMP1
  1676. subq B, TMP1, B
  1677. addq LDC, LDC, TMP1
  1678. subq C, TMP1, C
  1679. #endif
  /* C1 = C, C2 = C + LDC; for non-RT builds C then moves on by 2 * LDC. */
  1680. mov C, C1
  1681. addq C, LDC, C2
  1682. fclr t1
  1683. #ifndef RT
  1684. addq C2, LDC, C
  1685. #endif
  1686. fclr t2
  /* Initial KK: M + OFFSET for LN, OFFSET for LT (RN / RT leave KK alone). */
  1687. #ifdef LN
  1688. addq M, OFFSET, KK
  1689. #endif
  1690. #ifdef LT
  1691. mov OFFSET, KK
  1692. #endif
  1693. #if defined(LN) || defined(RT)
  1694. mov A, AORIG
  1695. #else
  1696. mov A, AO
  1697. #endif
  1698. fclr t3
  1699. fclr t4
  /* Odd row of M? */
  1700. and M, 1, I
  1701. ble I, $L60
  1702. #if defined(LT) || defined(RN)
  /* Preload operands for two steps: a1 / b1,b2 for the first and
     a2 / b3,b4 for the second.  L = KK - 2. */
  1703. LD a1, 0 * SIZE(AO)
  1704. fclr c01
  1705. LD a2, 1 * SIZE(AO)
  1706. fclr c05
  1707. LD b1, 0 * SIZE(B)
  1708. fclr c02
  1709. LD b2, 1 * SIZE(B)
  1710. fclr c06
  1711. lda L, -2(KK)
  1712. LD b3, 2 * SIZE(B)
  1713. lda AO, 1 * SIZE(AO)
  1714. LD b4, 3 * SIZE(B)
  1715. lda BO, 2 * SIZE(B)
  1716. ble KK, $L78
  1717. ble L, $L75
  1718. #else
  /* LN / RT: derive AO / BO from AORIG / B and KK.  LN first moves AORIG
     back by K << BASE_SHIFT.  The iteration count is TMP1 = K - KK. */
  1719. #ifdef LN
  1720. sll K, BASE_SHIFT + 0, TMP1
  1721. subq AORIG, TMP1, AORIG
  1722. #endif
  1723. sll KK, BASE_SHIFT + 0, TMP1
  1724. addq AORIG, TMP1, AO
  1725. sll KK, BASE_SHIFT + 1, TMP1
  1726. addq B, TMP1, BO
  1727. subq K, KK, TMP1
  1728. LD a1, 0 * SIZE(AO)
  1729. fclr c01
  1730. LD a2, 1 * SIZE(AO)
  1731. fclr c05
  1732. LD b1, 0 * SIZE(BO)
  1733. fclr c02
  1734. LD b2, 1 * SIZE(BO)
  1735. fclr c06
  1736. lda L, -2(TMP1)
  1737. LD b3, 2 * SIZE(BO)
  1738. lda AO, 1 * SIZE(AO)
  1739. LD b4, 3 * SIZE(BO)
  1740. lda BO, 2 * SIZE(BO)
  1741. ble TMP1, $L78
  1742. ble L, $L75
  1743. #endif
  1744. .align 4
  /* $L72: inner loop of the 1-row, 2-column tile, two steps per pass
     (L -= 2, AO += 2 * SIZE, BO += 4 * SIZE).  The first step (a1 with
     b1, b2) accumulates into c01 / c05; the second (a2 with b3, b4)
     accumulates into the separate sums c02 / c06, which $L77 later folds
     into c01 / c05.  Operands for the next pass are reloaded as soon as
     each has been consumed. */
  1745. $L72:
  1746. ADD c01, t1, c01
  1747. lda L, -2(L)
  1748. MUL a1, b1, t1
  1749. LD b1, 2 * SIZE(BO)
  1750. ADD c05, t2, c05
  1751. MUL a1, b2, t2
  1752. LD a1, 1 * SIZE(AO)
  1753. LD b2, 3 * SIZE(BO)
  1754. ADD c02, t3, c02
  1755. lda AO, 2 * SIZE(AO)
  1756. MUL a2, b3, t3
  1757. LD b3, 4 * SIZE(BO)
  1758. ADD c06, t4, c06
  1759. MUL a2, b4, t4
  /* AO has already advanced, so offset 0 here is the element after the
     a1 reloaded above. */
  1760. LD a2, 0 * SIZE(AO)
  1761. LD b4, 5 * SIZE(BO)
  1762. lda BO, 4 * SIZE(BO)
  1763. unop
  1764. unop
  1765. bgt L, $L72
  1766. .align 4
  /* $L75: tail of the 1-row, 2-column product loop.  blbs branches when
     the low bit of the count (KK for LT/RN, TMP1 for LN/RT) is set, i.e.
     an odd count, leaving only the final step at $L77.  For an even count
     one extra step is executed here with freshly loaded a1 / b1 / b2
     before falling through into $L77. */
  1767. $L75:
  1768. ADD c01, t1, c01
  1769. MUL a1, b1, t1
  1770. #if defined(LT) || defined(RN)
  1771. blbs KK, $L77
  1772. #else
  1773. blbs TMP1, $L77
  1774. #endif
  1775. .align 4
  1776. ADD c05, t2, c05
  1777. MUL a1, b2, t2
  1778. LD a1, 0 * SIZE(AO)
  1779. LD b1, 0 * SIZE(BO)
  1780. ADD c01, t1, c01
  1781. LD b2, 1 * SIZE(BO)
  1782. lda AO, 1 * SIZE(AO)
  1783. MUL a1, b1, t1
  1784. lda BO, 2 * SIZE(BO)
  1785. .align 4
  /* $L77: final step of the 1-row, 2-column tile.  Issues the last MUL,
     flushes the pending t3 / t4 into the secondary sums c02 / c06, folds
     those into c01 / c05, advances AO by 1 * SIZE and BO by 2 * SIZE, and
     finally adds the last two products t1 / t2. */
  1786. $L77:
  1787. ADD c05, t2, c05
  1788. MUL a1, b2, t2
  1789. ADD c02, t3, c02
  1790. ADD c06, t4, c06
  1791. ADD c01, c02, c01
  1792. lda AO, 1 * SIZE(AO)
  1793. ADD c05, c06, c05
  1794. lda BO, 2 * SIZE(BO)
  1795. ADD c01, t1, c01
  1796. ADD c05, t2, c05
  1797. .align 4
  /* $L78: solve step and write-back for the 1-row, 2-column tile
     (c01 goes to C1, c05 to C2).  Exactly one of LN / LT / RN / RT is
     expected to be defined. */
  1798. $L78:
  1799. #if defined(LN) || defined(RT)
  1800. #ifdef LN
  1801. subq KK, 1, TMP1
  1802. #else
  1803. subq KK, 2, TMP1
  1804. #endif
  /* AO = AORIG + (TMP1 << BASE_SHIFT); BO = B + (TMP1 << (BASE_SHIFT + 1)). */
  1805. sll TMP1, BASE_SHIFT + 0, TMP2
  1806. addq AORIG, TMP2, AO
  1807. sll TMP1, BASE_SHIFT + 1, TMP2
  1808. addq B, TMP2, BO
  1809. #else
  /* LT / RN: step back over the last 1 * SIZE of AO and 2 * SIZE of BO. */
  1810. lda AO, -1 * SIZE(AO)
  1811. lda BO, -2 * SIZE(BO)
  1812. #endif
  /* Load the two stored values (from BO for LN/LT, from AO for RN/RT) and
     combine them with the accumulated products via SUB (presumably
     c = loaded - c -- confirm against the SUB macro). */
  1813. #if defined(LN) || defined(LT)
  1814. LD a1, 0 * SIZE(BO)
  1815. LD a2, 1 * SIZE(BO)
  1816. SUB a1, c01, c01
  1817. SUB a2, c05, c05
  1818. #else
  1819. LD a1, 0 * SIZE(AO)
  1820. LD a2, 1 * SIZE(AO)
  1821. SUB a1, c01, c01
  1822. SUB a2, c05, c05
  1823. #endif
  /* LN / LT: with a single row the solve is just a scale by AO[0].
     NOTE(review): a MUL rather than a divide, so AO[0] is presumably
     stored already inverted -- confirm against the packing code. */
  1824. #if defined(LN) || defined(LT)
  1825. LD a1, 0 * SIZE(AO)
  1826. MUL a1, c01, c01
  1827. MUL a1, c05, c05
  1828. #endif
  /* RN: 2-column solve, first column then second, using BO[0], BO[1], BO[3]. */
  1829. #ifdef RN
  1830. LD a1, 0 * SIZE(BO)
  1831. LD a2, 1 * SIZE(BO)
  1832. LD a3, 3 * SIZE(BO)
  1833. MUL a1, c01, c01
  1834. MUL a2, c01, t1
  1835. SUB c05, t1, c05
  1836. MUL a3, c05, c05
  1837. #endif
  /* RT: 2-column solve, second column then first, using BO[3], BO[2], BO[0]. */
  1838. #ifdef RT
  1839. LD a1, 3 * SIZE(BO)
  1840. LD a2, 2 * SIZE(BO)
  1841. LD a3, 0 * SIZE(BO)
  1842. MUL a1, c05, c05
  1843. MUL a2, c05, t1
  1844. SUB c01, t1, c01
  1845. MUL a3, c01, c01
  1846. #endif
  /* Write the solved values back to the buffer they were loaded from. */
  1847. #if defined(LN) || defined(LT)
  1848. ST c01, 0 * SIZE(BO)
  1849. ST c05, 1 * SIZE(BO)
  1850. #else
  1851. ST c01, 0 * SIZE(AO)
  1852. ST c05, 1 * SIZE(AO)
  1853. #endif
  /* Store through C1 / C2; LN moves both pointers back by one element first.
     NOTE(review): unlike the 2-wide and 4-wide tiles in this file, there
     is no "#ifndef LN" block advancing C1 / C2 after the store, so a
     non-LN build would leave the pointers on the element just written.
     Presumably this file is only ever assembled with LN defined --
     confirm against the build configuration. */
  1854. #ifdef LN
  1855. lda C1, -1 * SIZE(C1)
  1856. lda C2, -1 * SIZE(C2)
  1857. #endif
  1858. ST c01, 0 * SIZE(C1)
  1859. ST c05, 0 * SIZE(C2)
  /* Clear the product temporaries for the next tile's pipelined loop. */
  1860. fclr t1
  1861. fclr t2
  1862. fclr t3
  1863. fclr t4
  /* Pointer / counter bookkeeping for the next tile. */
  1864. #ifdef RT
  1865. sll K, 0 + BASE_SHIFT, TMP1
  1866. addq AORIG, TMP1, AORIG
  1867. #endif
  1868. #if defined(LT) || defined(RN)
  1869. subq K, KK, TMP1
  1870. sll TMP1, BASE_SHIFT + 0, TMP2
  1871. addq AO, TMP2, AO
  1872. sll TMP1, BASE_SHIFT + 1, TMP2
  1873. addq BO, TMP2, BO
  1874. #endif
  1875. #ifdef LT
  1876. addq KK, 1, KK
  1877. #endif
  1878. #ifdef LN
  1879. subq KK, 1, KK
  1880. #endif
  1881. .align 4
  /* $L60: if bit 1 of M is set, set up a 2-row, 2-column tile
     (accumulators c01, c02, c05, c06); otherwise skip to $L70 (outside
     this chunk).  Operands for two steps are preloaded: a1, a2 / b1, b2
     for the first and a3, a4 / b3, b4 for the second.  L = count - 2,
     where count is KK (LT/RN) or K - KK (LN/RT); count <= 0 skips the
     product loop ($L68) and L <= 0 skips the unrolled loop ($L65). */
  1882. $L60:
  1883. and M, 2, I
  1884. ble I, $L70
  1885. #if defined(LT) || defined(RN)
  1886. LD a1, 0 * SIZE(AO)
  1887. fclr c01
  1888. LD a2, 1 * SIZE(AO)
  1889. fclr c05
  1890. LD a3, 2 * SIZE(AO)
  1891. fclr c02
  1892. LD a4, 3 * SIZE(AO)
  1893. fclr c06
  1894. LD b1, 0 * SIZE(B)
  1895. lda L, -2(KK)
  1896. LD b2, 1 * SIZE(B)
  1897. lda AO, 2 * SIZE(AO)
  1898. LD b3, 2 * SIZE(B)
  1899. LD b4, 3 * SIZE(B)
  1900. lda BO, 2 * SIZE(B)
  1901. ble KK, $L68
  1902. ble L, $L65
  1903. #else
  /* LN / RT: derive AO / BO from AORIG / B and KK.  LN first moves AORIG
     back by K << (BASE_SHIFT + 1).  TMP1 = K - KK stays live until $L65,
     which tests its low bit. */
  1904. #ifdef LN
  1905. sll K, BASE_SHIFT + 1, TMP1
  1906. subq AORIG, TMP1, AORIG
  1907. #endif
  1908. sll KK, BASE_SHIFT + 1, TMP1
  1909. addq AORIG, TMP1, AO
  1910. sll KK, BASE_SHIFT + 1, TMP1
  1911. addq B, TMP1, BO
  1912. subq K, KK, TMP1
  1913. LD a1, 0 * SIZE(AO)
  1914. fclr c01
  1915. LD a2, 1 * SIZE(AO)
  1916. fclr c05
  1917. LD a3, 2 * SIZE(AO)
  1918. fclr c02
  1919. LD a4, 3 * SIZE(AO)
  1920. fclr c06
  1921. LD b1, 0 * SIZE(BO)
  1922. lda L, -2(TMP1)
  1923. LD b2, 1 * SIZE(BO)
  1924. lda AO, 2 * SIZE(AO)
  1925. LD b3, 2 * SIZE(BO)
  1926. LD b4, 3 * SIZE(BO)
  1927. lda BO, 2 * SIZE(BO)
  1928. ble TMP1, $L68
  1929. ble L, $L65
  1930. #endif
  1931. .align 4
  /* $L62: inner loop of the 2x2 tile, two steps per pass (L -= 2, AO and
     BO each advance by 4 * SIZE).  The first step multiplies a1, a2 by
     b1, b2; the second multiplies a3, a4 by b3, b4; both accumulate into
     c01, c02, c05, c06.  Each ADD consumes a product from an earlier MUL,
     and operands are reloaded right after their last use, so the
     instruction order must be preserved. */
  1932. $L62:
  1933. ADD c01, t1, c01
  1934. unop
  1935. MUL a1, b1, t1
  1936. unop
  1937. ADD c02, t2, c02
  /* AO advances early; the a1 / a2 reloads below use negative offsets. */
  1938. lda AO, 4 * SIZE(AO)
  1939. MUL a2, b1, t2
  1940. LD b1, 2 * SIZE(BO)
  1941. ADD c05, t3, c05
  1942. lda L, -2(L)
  1943. MUL a1, b2, t3
  1944. LD a1, -2 * SIZE(AO)
  1945. ADD c06, t4, c06
  1946. unop
  1947. MUL a2, b2, t4
  1948. LD a2, -1 * SIZE(AO)
  1949. ADD c01, t1, c01
  1950. LD b2, 3 * SIZE(BO)
  1951. MUL a3, b3, t1
  /* BO advances here; the b3 / b4 reloads below use the new base. */
  1952. lda BO, 4 * SIZE(BO)
  1953. ADD c02, t2, c02
  1954. unop
  1955. MUL a4, b3, t2
  1956. LD b3, 0 * SIZE(BO)
  1957. ADD c05, t3, c05
  1958. unop
  1959. MUL a3, b4, t3
  1960. LD a3, 0 * SIZE(AO)
  1961. ADD c06, t4, c06
  1962. MUL a4, b4, t4
  1963. LD b4, 1 * SIZE(BO)
  1964. unop
  1965. LD a4, 1 * SIZE(AO)
  1966. unop
  1967. unop
  1968. bgt L, $L62
  1969. .align 4
  /* $L65: tail of the 2x2 product loop.  blbs branches when the low bit
     of the count (KK for LT/RN, TMP1 for LN/RT) is set, i.e. an odd count,
     leaving only the final step at $L67.  For an even count one extra
     step is executed here, reloading a1, a2, b1, b2 and advancing AO and
     BO by 2 * SIZE each, before falling through into $L67. */
  1970. $L65:
  1971. ADD c01, t1, c01
  1972. MUL a1, b1, t1
  1973. #if defined(LT) || defined(RN)
  1974. blbs KK, $L67
  1975. #else
  1976. blbs TMP1, $L67
  1977. #endif
  1978. .align 4
  1979. ADD c02, t2, c02
  1980. unop
  1981. MUL a2, b1, t2
  1982. LD b1, 0 * SIZE(BO)
  1983. ADD c05, t3, c05
  /* BO advances here, so the b2 reload below uses offset -1. */
  1984. lda BO, 2 * SIZE(BO)
  1985. MUL a1, b2, t3
  1986. LD a1, 0 * SIZE(AO)
  1987. ADD c06, t4, c06
  1988. unop
  1989. MUL a2, b2, t4
  1990. LD a2, 1 * SIZE(AO)
  1991. ADD c01, t1, c01
  1992. LD b2, -1 * SIZE(BO)
  1993. MUL a1, b1, t1
  1994. lda AO, 2 * SIZE(AO)
  1995. .align 4
  /* $L67: final step of the 2x2 tile.  Issues the three remaining MULs
     with the current a1, a2, b1, b2, advances AO and BO by 2 * SIZE each,
     and drains the last four products from t1..t4 into c01, c02, c05,
     c06. */
  1996. $L67:
  1997. ADD c02, t2, c02
  1998. MUL a2, b1, t2
  1999. ADD c05, t3, c05
  2000. MUL a1, b2, t3
  2001. ADD c06, t4, c06
  2002. lda AO, 2 * SIZE(AO)
  2003. MUL a2, b2, t4
  2004. lda BO, 2 * SIZE(BO)
  /* Drain the last four products. */
  2005. ADD c01, t1, c01
  2006. ADD c02, t2, c02
  2007. ADD c05, t3, c05
  2008. ADD c06, t4, c06
  2009. .align 4
  /* $L68: solve and write-back stage for the c01/c02/c05/c06 tile.
     Exactly one of LN, LT, RN, RT is assumed to be defined (the
     selection is made outside this view). */
  2010. $L68:
  /* LN/RT: recompute AO and BO from AORIG and B at step (KK - 2), two
     elements per step (assumes BASE_SHIFT = log2(SIZE) - TODO confirm).
     Both arms subtract 2 because this tile is two wide in each
     direction.  Other variants: step both pointers back two elements. */
  2011. #if defined(LN) || defined(RT)
  2012. #ifdef LN
  2013. subq KK, 2, TMP1
  2014. #else
  2015. subq KK, 2, TMP1
  2016. #endif
  2017. sll TMP1, BASE_SHIFT + 1, TMP2
  2018. addq AORIG, TMP2, AO
  2019. sll TMP1, BASE_SHIFT + 1, TMP2
  2020. addq B, TMP2, BO
  2021. #else
  2022. lda AO, -2 * SIZE(AO)
  2023. lda BO, -2 * SIZE(BO)
  2024. #endif
  /* Replace every accumulator by (stored value - accumulator).
     LN/LT read from BO in the order c01, c05, c02, c06;
     RN/RT read from AO in the order c01, c02, c05, c06. */
  2025. #if defined(LN) || defined(LT)
  2026. LD a1, 0 * SIZE(BO)
  2027. LD a2, 1 * SIZE(BO)
  2028. LD a3, 2 * SIZE(BO)
  2029. LD a4, 3 * SIZE(BO)
  2030. SUB a1, c01, c01
  2031. SUB a2, c05, c05
  2032. SUB a3, c02, c02
  2033. SUB a4, c06, c06
  2034. #else
  2035. LD a1, 0 * SIZE(AO)
  2036. LD a2, 1 * SIZE(AO)
  2037. LD a3, 2 * SIZE(AO)
  2038. LD a4, 3 * SIZE(AO)
  2039. SUB a1, c01, c01
  2040. SUB a2, c02, c02
  2041. SUB a3, c05, c05
  2042. SUB a4, c06, c06
  2043. #endif
  /* LN: 2x2 substitution with AO[3], AO[2], AO[0]; c02/c06 are scaled
     first and then eliminated from c01/c05.  The diagonal entries are
     multiplied, not divided - presumably they are stored already
     inverted; confirm against the packing code. */
  2044. #ifdef LN
  2045. LD a1, 3 * SIZE(AO)
  2046. LD a2, 2 * SIZE(AO)
  2047. LD a3, 0 * SIZE(AO)
  2048. MUL a1, c02, c02
  2049. MUL a1, c06, c06
  2050. MUL a2, c02, t1
  2051. MUL a2, c06, t2
  2052. SUB c01, t1, c01
  2053. SUB c05, t2, c05
  2054. MUL a3, c01, c01
  2055. MUL a3, c05, c05
  2056. #endif
  /* LT: same idea in the opposite order, using AO[0], AO[1], AO[3];
     c01/c05 are scaled first and eliminated from c02/c06. */
  2057. #ifdef LT
  2058. LD a1, 0 * SIZE(AO)
  2059. LD a2, 1 * SIZE(AO)
  2060. LD a3, 3 * SIZE(AO)
  2061. MUL a1, c01, c01
  2062. MUL a1, c05, c05
  2063. MUL a2, c01, t1
  2064. MUL a2, c05, t2
  2065. SUB c02, t1, c02
  2066. SUB c06, t2, c06
  2067. MUL a3, c02, c02
  2068. MUL a3, c06, c06
  2069. #endif
  /* RN: coefficients BO[0], BO[1], BO[3]; c01/c02 are scaled first and
     eliminated from c05/c06. */
  2070. #ifdef RN
  2071. LD a1, 0 * SIZE(BO)
  2072. LD a2, 1 * SIZE(BO)
  2073. LD a3, 3 * SIZE(BO)
  2074. MUL a1, c01, c01
  2075. MUL a1, c02, c02
  2076. MUL a2, c01, t1
  2077. MUL a2, c02, t2
  2078. SUB c05, t1, c05
  2079. SUB c06, t2, c06
  2080. MUL a3, c05, c05
  2081. MUL a3, c06, c06
  2082. #endif
  /* RT: coefficients BO[3], BO[2], BO[0]; c05/c06 are scaled first and
     eliminated from c01/c02. */
  2083. #ifdef RT
  2084. LD a1, 3 * SIZE(BO)
  2085. LD a2, 2 * SIZE(BO)
  2086. LD a3, 0 * SIZE(BO)
  2087. MUL a1, c05, c05
  2088. MUL a1, c06, c06
  2089. MUL a2, c05, t1
  2090. MUL a2, c06, t2
  2091. SUB c01, t1, c01
  2092. SUB c02, t2, c02
  2093. MUL a3, c01, c01
  2094. MUL a3, c02, c02
  2095. #endif
  /* Write the solved values back over the buffer they were read from,
     keeping the same element order as the loads above. */
  2096. #if defined(LN) || defined(LT)
  2097. ST c01, 0 * SIZE(BO)
  2098. ST c05, 1 * SIZE(BO)
  2099. ST c02, 2 * SIZE(BO)
  2100. ST c06, 3 * SIZE(BO)
  2101. #else
  2102. ST c01, 0 * SIZE(AO)
  2103. ST c02, 1 * SIZE(AO)
  2104. ST c05, 2 * SIZE(AO)
  2105. ST c06, 3 * SIZE(AO)
  2106. #endif
  /* Store to the output pointers: c01, c02 through C1 and c05, c06
     through C2.  LN moves both pointers back two elements before the
     store; every other variant advances them afterwards. */
  2107. #ifdef LN
  2108. lda C1, -2 * SIZE(C1)
  2109. lda C2, -2 * SIZE(C2)
  2110. #endif
  2111. ST c01, 0 * SIZE(C1)
  2112. ST c02, 1 * SIZE(C1)
  2113. ST c05, 0 * SIZE(C2)
  2114. ST c06, 1 * SIZE(C2)
  2115. #ifndef LN
  2116. lda C1, 2 * SIZE(C1)
  2117. lda C2, 2 * SIZE(C2)
  2118. #endif
  /* Clear the product temporaries; the next tile's first ADD consumes
     them before any MUL has written them. */
  2119. fclr t1
  2120. fclr t2
  2121. fclr t3
  2122. fclr t4
  /* RT: AORIG moves forward by K steps of two elements. */
  2123. #ifdef RT
  2124. sll K, 1 + BASE_SHIFT, TMP1
  2125. addq AORIG, TMP1, AORIG
  2126. #endif
  /* LT/RN: skip the remaining (K - KK) steps in both AO and BO, two
     elements per step. */
  2127. #if defined(LT) || defined(RN)
  2128. subq K, KK, TMP1
  2129. sll TMP1, BASE_SHIFT + 1, TMP2
  2130. addq AO, TMP2, AO
  2131. sll TMP1, BASE_SHIFT + 1, TMP2
  2132. addq BO, TMP2, BO
  2133. #endif
  /* KK follows the tile position: +2 for LT, -2 for LN. */
  2134. #ifdef LT
  2135. addq KK, 2, KK
  2136. #endif
  2137. #ifdef LN
  2138. subq KK, 2, KK
  2139. #endif
  2140. .align 4
  /* $L70: tiles that use all eight accumulators c01..c08 (four A
     values times two B values per step).  I = M >> 2 is the number of
     such tiles; with none, go to $L79. */
  2141. $L70:
  2142. sra M, 2, I
  2143. ble I, $L79
  2144. .align 4
  /* $L51: per-tile setup.  Preloads a1..a4 (first step of A) and
     b1..b4 (B values for the first two steps), clears c01..c08 and
     sets L = count - 2.  No steps at all goes straight to the solve at
     $L58; a non-positive L skips the main loop and goes to the drain
     at $L55. */
  2145. $L51:
  /* LT/RN: AO continues from where it is, B is read from its start,
     and the step count is KK.  AO and BO end up one step ahead. */
  2146. #if defined(LT) || defined(RN)
  2147. LD a1, 0 * SIZE(AO)
  2148. fclr c03
  2149. LD a2, 1 * SIZE(AO)
  2150. fclr c07
  2151. LD a3, 2 * SIZE(AO)
  2152. fclr c04
  2153. LD a4, 3 * SIZE(AO)
  2154. fclr c08
  2155. LD b1, 0 * SIZE(B)
  2156. fclr c01
  2157. LD b2, 1 * SIZE(B)
  2158. fclr c05
  2159. LD b3, 2 * SIZE(B)
  2160. fclr c02
  2161. LD b4, 3 * SIZE(B)
  2162. fclr c06
  2163. lda L, -2(KK)
  2164. lda BO, 2 * SIZE(B)
  2165. lda AO, 4 * SIZE(AO)
  2166. ble KK, $L58
  2167. ble L, $L55
  2168. #else
  /* LN/RT: for LN, first move AORIG back by K steps of four elements.
     Then start AO at step KK of AORIG (four elements per step) and BO
     at step KK of B (two per step); the step count is TMP1 = K - KK. */
  2169. #ifdef LN
  2170. sll K, BASE_SHIFT + 2, TMP1
  2171. subq AORIG, TMP1, AORIG
  2172. #endif
  2173. sll KK, BASE_SHIFT + 2, TMP1
  2174. addq AORIG, TMP1, AO
  2175. sll KK, BASE_SHIFT + 1, TMP1
  2176. addq B, TMP1, BO
  2177. subq K, KK, TMP1
  2178. LD a1, 0 * SIZE(AO)
  2179. fclr c03
  2180. LD a2, 1 * SIZE(AO)
  2181. fclr c07
  2182. LD a3, 2 * SIZE(AO)
  2183. fclr c04
  2184. LD a4, 3 * SIZE(AO)
  2185. fclr c08
  2186. LD b1, 0 * SIZE(BO)
  2187. fclr c01
  2188. LD b2, 1 * SIZE(BO)
  2189. fclr c05
  2190. LD b3, 2 * SIZE(BO)
  2191. fclr c02
  2192. LD b4, 3 * SIZE(BO)
  2193. fclr c06
  2194. lda L, -2(TMP1)
  2195. lda BO, 2 * SIZE(BO)
  2196. lda AO, 4 * SIZE(AO)
  2197. ble TMP1, $L58
  2198. ble L, $L55
  2199. #endif
  2200. .align 4
  /* $L52: main multiply-accumulate loop for the c01..c08 tile.  Each
     trip covers two steps (L drops by 2, AO advances eight elements,
     BO four).  Every ADD retires the product issued one round earlier
     into t1..t4, so the instruction order is part of the algorithm.
     unop entries are no-op filler; presumably they keep the issue
     slots paired - confirm before reordering anything. */
  2201. $L52:
  /* First step, b1 column: products go to c05..c08. */
  2202. ADD c05, t1, c05
  2203. unop
  2204. MUL a1, b1, t1
  2205. unop
  2206. ADD c06, t2, c06
  2207. lda L, -2(L)
  2208. MUL a2, b1, t2
  2209. unop
  2210. ADD c07, t3, c07
  2211. unop
  2212. MUL a3, b1, t3
  2213. unop
  2214. ADD c08, t4, c08
  2215. unop
  2216. MUL a4, b1, t4
  2217. LD b1, 2 * SIZE(BO)
  /* First step, b2 column: products go to c01..c04.  a1..a3 are
     reloaded for the second step as soon as they are free. */
  2218. ADD c01, t1, c01
  2219. unop
  2220. MUL a1, b2, t1
  2221. LD a1, 0 * SIZE(AO)
  2222. ADD c02, t2, c02
  2223. lda BO, 4 * SIZE(BO)
  2224. MUL a2, b2, t2
  2225. LD a2, 1 * SIZE(AO)
  2226. ADD c03, t3, c03
  2227. unop
  2228. MUL a3, b2, t3
  2229. LD a3, 2 * SIZE(AO)
  2230. ADD c04, t4, c04
  2231. unop
  2232. MUL a4, b2, t4
  /* The fourth A value of the second step goes into a5, so that a4 can
     be reloaded for the next trip while a5 is still in use. */
  2233. LD a5, 3 * SIZE(AO)
  /* Second step, b3 column. */
  2234. ADD c05, t1, c05
  2235. unop
  2236. MUL a1, b3, t1
  2237. LD b2, -1 * SIZE(BO)
  2238. ADD c06, t2, c06
  2239. unop
  2240. MUL a2, b3, t2
  2241. unop
  2242. ADD c07, t3, c07
  2243. unop
  2244. MUL a3, b3, t3
  2245. lda AO, 8 * SIZE(AO)
  2246. ADD c08, t4, c08
  2247. unop
  2248. MUL a5, b3, t4
  2249. LD b3, 0 * SIZE(BO)
  /* Second step, b4 column; a1..a4 and b4 are reloaded for the next
     trip using negative offsets from the already advanced AO. */
  2250. ADD c01, t1, c01
  2251. unop
  2252. MUL a1, b4, t1
  2253. LD a1, -4 * SIZE(AO)
  2254. ADD c02, t2, c02
  2255. unop
  2256. MUL a2, b4, t2
  2257. LD a2, -3 * SIZE(AO)
  2258. ADD c03, t3, c03
  2259. LD a4, -1 * SIZE(AO)
  2260. MUL a3, b4, t3
  2261. LD a3, -2 * SIZE(AO)
  2262. ADD c04, t4, c04
  2263. MUL a5, b4, t4
  2264. LD b4, 1 * SIZE(BO)
  2265. bgt L, $L52
  2266. .align 4
  /* $L55 / $L57: drain of the c01..c08 loop.  Either one step (odd
     count) or two steps (even count) remain at this point. */
  2267. $L55:
  2268. ADD c05, t1, c05
  2269. MUL a1, b1, t1
  /* blbs branches when bit 0 is set: with an odd count the step just
     started is the last, so jump to $L57 to finish it. */
  2270. #if defined(LT) || defined(RN)
  2271. blbs KK, $L57
  2272. #else
  2273. blbs TMP1, $L57
  2274. #endif
  2275. .align 4
  /* Even count: complete this step, reload a1..a4 and b1, b2 (AO
     advances four elements, BO two) and start the final step. */
  2276. ADD c06, t2, c06
  2277. MUL a2, b1, t2
  2278. ADD c07, t3, c07
  2279. MUL a3, b1, t3
  2280. ADD c08, t4, c08
  2281. unop
  2282. MUL a4, b1, t4
  2283. LD b1, 0 * SIZE(BO)
  2284. ADD c01, t1, c01
  2285. unop
  2286. MUL a1, b2, t1
  2287. LD a1, 0 * SIZE(AO)
  2288. ADD c02, t2, c02
  2289. unop
  2290. MUL a2, b2, t2
  2291. LD a2, 1 * SIZE(AO)
  2292. ADD c03, t3, c03
  2293. unop
  2294. MUL a3, b2, t3
  2295. LD a3, 2 * SIZE(AO)
  2296. ADD c04, t4, c04
  2297. MUL a4, b2, t4
  2298. LD a4, 3 * SIZE(AO)
  2299. lda AO, 4 * SIZE(AO)
  2300. ADD c05, t1, c05
  2301. LD b2, 1 * SIZE(BO)
  2302. MUL a1, b1, t1
  2303. lda BO, 2 * SIZE(BO)
  2304. .align 4
  /* $L57: finish the final step and retire all outstanding products.
     AO and BO are advanced once more (four and two elements); $L58
     steps them back on the LT/RN path. */
  2305. $L57:
  2306. ADD c06, t2, c06
  2307. MUL a2, b1, t2
  2308. ADD c07, t3, c07
  2309. MUL a3, b1, t3
  2310. ADD c08, t4, c08
  2311. MUL a4, b1, t4
  2312. ADD c01, t1, c01
  2313. MUL a1, b2, t1
  2314. ADD c02, t2, c02
  2315. MUL a2, b2, t2
  2316. ADD c03, t3, c03
  2317. MUL a3, b2, t3
  2318. ADD c04, t4, c04
  2319. lda AO, 4 * SIZE(AO)
  2320. MUL a4, b2, t4
  2321. lda BO, 2 * SIZE(BO)
  2322. ADD c05, t1, c05
  2323. ADD c06, t2, c06
  2324. ADD c07, t3, c07
  2325. ADD c08, t4, c08
  2326. .align 4
  /* $L58: solve and write-back stage for the c01..c08 tile (c01..c04
     in one output column, c05..c08 in the other).  Exactly one of LN,
     LT, RN, RT is assumed to be defined outside this view. */
  2327. $L58:
  /* LN/RT: recompute AO and BO from AORIG and B.  The step offset is
     KK - 4 for LN and KK - 2 for RT; A has four elements per step and
     B has two (assumes BASE_SHIFT = log2(SIZE) - TODO confirm).
     Other variants: step AO back four elements and BO back two. */
  2328. #if defined(LN) || defined(RT)
  2329. #ifdef LN
  2330. subq KK, 4, TMP1
  2331. #else
  2332. subq KK, 2, TMP1
  2333. #endif
  2334. sll TMP1, BASE_SHIFT + 2, TMP2
  2335. addq AORIG, TMP2, AO
  2336. sll TMP1, BASE_SHIFT + 1, TMP2
  2337. addq B, TMP2, BO
  2338. #else
  2339. lda AO, -4 * SIZE(AO)
  2340. lda BO, -2 * SIZE(BO)
  2341. #endif
  /* Replace every accumulator by (stored value - accumulator).
     LN/LT read BO in the interleaved order c01, c05, c02, c06, c03,
     c07, c04, c08; RN/RT read AO in the order c01..c08. */
  2342. #if defined(LN) || defined(LT)
  2343. LD a1, 0 * SIZE(BO)
  2344. LD a2, 1 * SIZE(BO)
  2345. LD a3, 2 * SIZE(BO)
  2346. LD a4, 3 * SIZE(BO)
  2347. LD b1, 4 * SIZE(BO)
  2348. LD b2, 5 * SIZE(BO)
  2349. LD b3, 6 * SIZE(BO)
  2350. LD b4, 7 * SIZE(BO)
  2351. SUB a1, c01, c01
  2352. SUB a2, c05, c05
  2353. SUB a3, c02, c02
  2354. SUB a4, c06, c06
  2355. SUB b1, c03, c03
  2356. SUB b2, c07, c07
  2357. SUB b3, c04, c04
  2358. SUB b4, c08, c08
  2359. #else
  2360. LD a1, 0 * SIZE(AO)
  2361. LD a2, 1 * SIZE(AO)
  2362. LD a3, 2 * SIZE(AO)
  2363. LD a4, 3 * SIZE(AO)
  2364. LD b1, 4 * SIZE(AO)
  2365. LD b2, 5 * SIZE(AO)
  2366. LD b3, 6 * SIZE(AO)
  2367. LD b4, 7 * SIZE(AO)
  2368. SUB a1, c01, c01
  2369. SUB a2, c02, c02
  2370. SUB a3, c03, c03
  2371. SUB a4, c04, c04
  2372. SUB b1, c05, c05
  2373. SUB b2, c06, c06
  2374. SUB b3, c07, c07
  2375. SUB b4, c08, c08
  2376. #endif
  /* LN: 4x4 substitution running from the last row to the first.
     Coefficients come from AO[15..12], AO[10..8], AO[5..4] and AO[0].
     The diagonal entries (15, 10, 5, 0) are multiplied, not divided -
     presumably stored already inverted; confirm against the packing
     code. */
  2377. #ifdef LN
  2378. LD a1, 15 * SIZE(AO)
  2379. LD a2, 14 * SIZE(AO)
  2380. LD a3, 13 * SIZE(AO)
  2381. LD a4, 12 * SIZE(AO)
  2382. MUL a1, c04, c04
  2383. MUL a1, c08, c08
  2384. MUL a2, c04, t1
  2385. MUL a2, c08, t2
  2386. SUB c03, t1, c03
  2387. SUB c07, t2, c07
  2388. MUL a3, c04, t1
  2389. MUL a3, c08, t2
  2390. SUB c02, t1, c02
  2391. SUB c06, t2, c06
  2392. MUL a4, c04, t1
  2393. MUL a4, c08, t2
  2394. SUB c01, t1, c01
  2395. SUB c05, t2, c05
  2396. LD b1, 10 * SIZE(AO)
  2397. LD b2, 9 * SIZE(AO)
  2398. LD b3, 8 * SIZE(AO)
  2399. MUL b1, c03, c03
  2400. MUL b1, c07, c07
  2401. MUL b2, c03, t1
  2402. MUL b2, c07, t2
  2403. SUB c02, t1, c02
  2404. SUB c06, t2, c06
  2405. MUL b3, c03, t1
  2406. MUL b3, c07, t2
  2407. SUB c01, t1, c01
  2408. SUB c05, t2, c05
  2409. LD a1, 5 * SIZE(AO)
  2410. LD a2, 4 * SIZE(AO)
  2411. LD a3, 0 * SIZE(AO)
  2412. MUL a1, c02, c02
  2413. MUL a1, c06, c06
  2414. MUL a2, c02, t1
  2415. MUL a2, c06, t2
  2416. SUB c01, t1, c01
  2417. SUB c05, t2, c05
  2418. MUL a3, c01, c01
  2419. MUL a3, c05, c05
  2420. #endif
  /* LT: 4x4 substitution running from the first row to the last.
     Coefficients come from AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  2421. #ifdef LT
  2422. LD a1, 0 * SIZE(AO)
  2423. LD a2, 1 * SIZE(AO)
  2424. LD a3, 2 * SIZE(AO)
  2425. LD a4, 3 * SIZE(AO)
  2426. MUL a1, c01, c01
  2427. MUL a1, c05, c05
  2428. MUL a2, c01, t1
  2429. MUL a2, c05, t2
  2430. SUB c02, t1, c02
  2431. SUB c06, t2, c06
  2432. MUL a3, c01, t1
  2433. MUL a3, c05, t2
  2434. SUB c03, t1, c03
  2435. SUB c07, t2, c07
  2436. MUL a4, c01, t1
  2437. MUL a4, c05, t2
  2438. SUB c04, t1, c04
  2439. SUB c08, t2, c08
  2440. LD b1, 5 * SIZE(AO)
  2441. LD b2, 6 * SIZE(AO)
  2442. LD b3, 7 * SIZE(AO)
  2443. MUL b1, c02, c02
  2444. MUL b1, c06, c06
  2445. MUL b2, c02, t1
  2446. MUL b2, c06, t2
  2447. SUB c03, t1, c03
  2448. SUB c07, t2, c07
  2449. MUL b3, c02, t1
  2450. MUL b3, c06, t2
  2451. SUB c04, t1, c04
  2452. SUB c08, t2, c08
  2453. LD a1, 10 * SIZE(AO)
  2454. LD a2, 11 * SIZE(AO)
  2455. LD a3, 15 * SIZE(AO)
  2456. MUL a1, c03, c03
  2457. MUL a1, c07, c07
  2458. MUL a2, c03, t1
  2459. MUL a2, c07, t2
  2460. SUB c04, t1, c04
  2461. SUB c08, t2, c08
  2462. MUL a3, c04, c04
  2463. MUL a3, c08, c08
  2464. #endif
  /* RN: 2x2 substitution across the two columns with BO[0], BO[1],
     BO[3]; c01..c04 are scaled first and eliminated from c05..c08. */
  2465. #ifdef RN
  2466. LD a1, 0 * SIZE(BO)
  2467. LD a2, 1 * SIZE(BO)
  2468. LD a3, 3 * SIZE(BO)
  2469. MUL a1, c01, c01
  2470. MUL a1, c02, c02
  2471. MUL a1, c03, c03
  2472. MUL a1, c04, c04
  2473. MUL a2, c01, t1
  2474. MUL a2, c02, t2
  2475. MUL a2, c03, t3
  2476. MUL a2, c04, t4
  2477. SUB c05, t1, c05
  2478. SUB c06, t2, c06
  2479. SUB c07, t3, c07
  2480. SUB c08, t4, c08
  2481. MUL a3, c05, c05
  2482. MUL a3, c06, c06
  2483. MUL a3, c07, c07
  2484. MUL a3, c08, c08
  2485. #endif
  /* RT: same with BO[3], BO[2], BO[0]; c05..c08 are scaled first and
     eliminated from c01..c04. */
  2486. #ifdef RT
  2487. LD a1, 3 * SIZE(BO)
  2488. LD a2, 2 * SIZE(BO)
  2489. LD a3, 0 * SIZE(BO)
  2490. MUL a1, c05, c05
  2491. MUL a1, c06, c06
  2492. MUL a1, c07, c07
  2493. MUL a1, c08, c08
  2494. MUL a2, c05, t1
  2495. MUL a2, c06, t2
  2496. MUL a2, c07, t3
  2497. MUL a2, c08, t4
  2498. SUB c01, t1, c01
  2499. SUB c02, t2, c02
  2500. SUB c03, t3, c03
  2501. SUB c04, t4, c04
  2502. MUL a3, c01, c01
  2503. MUL a3, c02, c02
  2504. MUL a3, c03, c03
  2505. MUL a3, c04, c04
  2506. #endif
  /* Write the solved values back over the buffer they were read from,
     in the same element order as the loads above. */
  2507. #if defined(LN) || defined(LT)
  2508. ST c01, 0 * SIZE(BO)
  2509. ST c05, 1 * SIZE(BO)
  2510. ST c02, 2 * SIZE(BO)
  2511. ST c06, 3 * SIZE(BO)
  2512. ST c03, 4 * SIZE(BO)
  2513. ST c07, 5 * SIZE(BO)
  2514. ST c04, 6 * SIZE(BO)
  2515. ST c08, 7 * SIZE(BO)
  2516. #else
  2517. ST c01, 0 * SIZE(AO)
  2518. ST c02, 1 * SIZE(AO)
  2519. ST c03, 2 * SIZE(AO)
  2520. ST c04, 3 * SIZE(AO)
  2521. ST c05, 4 * SIZE(AO)
  2522. ST c06, 5 * SIZE(AO)
  2523. ST c07, 6 * SIZE(AO)
  2524. ST c08, 7 * SIZE(AO)
  2525. #endif
  /* Store to the output pointers: c01..c04 through C1 and c05..c08
     through C2.  LN moves both pointers back four elements first;
     every other variant advances them afterwards. */
  2526. #ifdef LN
  2527. lda C1, -4 * SIZE(C1)
  2528. lda C2, -4 * SIZE(C2)
  2529. #endif
  2530. ST c01, 0 * SIZE(C1)
  2531. ST c02, 1 * SIZE(C1)
  2532. ST c03, 2 * SIZE(C1)
  2533. ST c04, 3 * SIZE(C1)
  2534. ST c05, 0 * SIZE(C2)
  2535. ST c06, 1 * SIZE(C2)
  2536. ST c07, 2 * SIZE(C2)
  2537. ST c08, 3 * SIZE(C2)
  2538. #ifndef LN
  2539. lda C1, 4 * SIZE(C1)
  2540. lda C2, 4 * SIZE(C2)
  2541. #endif
  /* Clear the product temporaries; the next tile's first ADD consumes
     them before any MUL has written them. */
  2542. fclr t1
  2543. fclr t2
  2544. fclr t3
  2545. fclr t4
  /* RT: AORIG moves forward by K steps of four elements. */
  2546. #ifdef RT
  2547. sll K, 2 + BASE_SHIFT, TMP1
  2548. addq AORIG, TMP1, AORIG
  2549. #endif
  /* LT/RN: skip the remaining (K - KK) steps: four elements per step
     in AO, two per step in BO. */
  2550. #if defined(LT) || defined(RN)
  2551. subq K, KK, TMP1
  2552. sll TMP1, BASE_SHIFT + 2, TMP2
  2553. addq AO, TMP2, AO
  2554. sll TMP1, BASE_SHIFT + 1, TMP2
  2555. addq BO, TMP2, BO
  2556. #endif
  /* KK follows the tile position: +4 for LT, -4 for LN. */
  2557. #ifdef LT
  2558. addq KK, 4, KK
  2559. #endif
  2560. #ifdef LN
  2561. subq KK, 4, KK
  2562. #endif
  /* Next tile of four while I stays positive. */
  2563. lda I, -1(I)
  2564. bgt I, $L51
  2565. .align 4
  /* $L79: end of the two-column panel.  B is moved past the panel:
     LN adds K steps of two elements, LT/RN take over the running BO.
     KK changes by 2: up for RN, down for RT. */
  2566. $L79:
  2567. #ifdef LN
  2568. sll K, 1 + BASE_SHIFT, TMP1
  2569. addq B, TMP1, B
  2570. #endif
  2571. #if defined(LT) || defined(RN)
  2572. mov BO, B
  2573. #endif
  2574. #ifdef RN
  2575. addq KK, 2, KK
  2576. #endif
  2577. #ifdef RT
  2578. subq KK, 2, KK
  2579. #endif
  2580. .align 4
  /* $L80: single remaining column, handled only when bit 0 of N is
     set; otherwise go straight to the exit at $L999. */
  2581. $L80:
  2582. and N, 1, J
  2583. ble J, $L999
  /* RT: B is moved back by K elements and C back by LDC before the
     column is processed.  Every other variant advances C by LDC after
     C1 has been taken from it. */
  2584. #ifdef RT
  2585. sll K, BASE_SHIFT, TMP1
  2586. subq B, TMP1, B
  2587. subq C, LDC, C
  2588. #endif
  2589. mov C, C1
  2590. #ifndef RT
  2591. addq C, LDC, C
  2592. #endif
  /* Initial KK for this panel: M + OFFSET for LN, OFFSET for LT.
     RN/RT keep the KK carried over from the previous panel. */
  2593. #ifdef LN
  2594. addq M, OFFSET, KK
  2595. #endif
  2596. #ifdef LT
  2597. mov OFFSET, KK
  2598. #endif
  /* LN/RT address A through AORIG; LT/RN walk AO directly. */
  2599. #if defined(LN) || defined(RT)
  2600. mov A, AORIG
  2601. #else
  2602. mov A, AO
  2603. #endif
  /* Single-row tile, done only when bit 0 of M is set. */
  2604. and M, 1, I
  2605. ble I, $L100
  /* Setup: preload four consecutive A values and four consecutive B
     values (four steps), clear t1..t4 and c01..c04, and set
     L = count >> 2.  With no full group of four, go to $L115. */
  2606. #if defined(LT) || defined(RN)
  2607. LD a1, 0 * SIZE(AO)
  2608. fclr t1
  2609. LD a2, 1 * SIZE(AO)
  2610. fclr t2
  2611. LD a3, 2 * SIZE(AO)
  2612. fclr t3
  2613. LD a4, 3 * SIZE(AO)
  2614. fclr t4
  2615. LD b1, 0 * SIZE(B)
  2616. fclr c01
  2617. LD b2, 1 * SIZE(B)
  2618. fclr c02
  2619. LD b3, 2 * SIZE(B)
  2620. fclr c03
  2621. LD b4, 3 * SIZE(B)
  2622. fclr c04
  2623. sra KK, 2, L
  2624. mov B, BO
  2625. unop
  2626. ble L, $L115
  2627. #else
  /* LN/RT: for LN, first move AORIG back by K elements.  Then AO and
     BO both start at step KK (one element per step); the step count
     is TMP1 = K - KK. */
  2628. #ifdef LN
  2629. sll K, BASE_SHIFT + 0, TMP1
  2630. subq AORIG, TMP1, AORIG
  2631. #endif
  2632. sll KK, BASE_SHIFT + 0, TMP1
  2633. addq AORIG, TMP1, AO
  2634. sll KK, BASE_SHIFT + 0, TMP1
  2635. addq B, TMP1, BO
  2636. subq K, KK, TMP1
  2637. LD a1, 0 * SIZE(AO)
  2638. fclr t1
  2639. LD a2, 1 * SIZE(AO)
  2640. fclr t2
  2641. LD a3, 2 * SIZE(AO)
  2642. fclr t3
  2643. LD a4, 3 * SIZE(AO)
  2644. fclr t4
  2645. LD b1, 0 * SIZE(BO)
  2646. fclr c01
  2647. LD b2, 1 * SIZE(BO)
  2648. fclr c02
  2649. LD b3, 2 * SIZE(BO)
  2650. fclr c03
  2651. LD b4, 3 * SIZE(BO)
  2652. fclr c04
  2653. sra TMP1, 2, L
  2654. unop
  2655. ble L, $L115
  2656. #endif
  2657. .align 4
  /* $L112: dot-product loop for the single-element tile, four steps
     per trip.  Each step has its own partial sum (c01..c04) and its
     own product temporary (t1..t4); the sums are combined at $L118.
     Operands for the next trip are loaded right after each MUL. */
  2658. $L112:
  2659. ADD c01, t1, c01
  2660. MUL a1, b1, t1
  2661. LD a1, 4 * SIZE(AO)
  2662. LD b1, 4 * SIZE(BO)
  2663. ADD c02, t2, c02
  2664. MUL a2, b2, t2
  2665. LD a2, 5 * SIZE(AO)
  2666. LD b2, 5 * SIZE(BO)
  2667. ADD c03, t3, c03
  2668. MUL a3, b3, t3
  2669. LD a3, 6 * SIZE(AO)
  2670. LD b3, 6 * SIZE(BO)
  2671. ADD c04, t4, c04
  2672. MUL a4, b4, t4
  2673. LD a4, 7 * SIZE(AO)
  2674. LD b4, 7 * SIZE(BO)
  2675. lda L, -1(L)
  2676. lda AO, 4 * SIZE(AO)
  2677. lda BO, 4 * SIZE(BO)
  2678. bgt L, $L112
  2679. .align 4
  /* $L115: remainder count = step count & 3 (KK for LT/RN, otherwise
     TMP1); zero means go straight to $L118. */
  2680. $L115:
  2681. #if defined(LT) || defined(RN)
  2682. and KK, 3, L
  2683. #else
  2684. and TMP1, 3, L
  2685. #endif
  2686. ble L, $L118
  2687. .align 4
  /* $L116: one step per trip, using only a1, b1 and c01. */
  2688. $L116:
  2689. ADD c01, t1, c01
  2690. MUL a1, b1, t1
  2691. LD a1, 1 * SIZE(AO)
  2692. LD b1, 1 * SIZE(BO)
  2693. lda L, -1(L)
  2694. lda AO, 1 * SIZE(AO)
  2695. lda BO, 1 * SIZE(BO)
  2696. bgt L, $L116
  2697. .align 4
  /* $L118: finish the single-element tile.  Retire the outstanding
     products, then reduce the four partial sums into c01. */
  2698. $L118:
  2699. ADD c01, t1, c01
  2700. ADD c02, t2, c02
  2701. ADD c03, t3, c03
  2702. ADD c04, t4, c04
  2703. ADD c01, c02, c01
  2704. ADD c03, c04, c03
  2705. ADD c01, c03, c01
  /* LN/RT: point AO and BO at step (KK - 1), one element per step. */
  2706. #if defined(LN) || defined(RT)
  2707. subq KK, 1, TMP1
  2708. sll TMP1, BASE_SHIFT + 0, TMP2
  2709. addq AORIG, TMP2, AO
  2710. addq B, TMP2, BO
  2711. #endif
  /* c01 = stored value - c01; the value comes from BO for LN/LT and
     from AO for RN/RT. */
  2712. #if defined(LN) || defined(LT)
  2713. LD a1, 0 * SIZE(BO)
  2714. SUB a1, c01, c01
  2715. #else
  2716. LD a1, 0 * SIZE(AO)
  2717. SUB a1, c01, c01
  2718. #endif
  /* Scale by the single coefficient: AO[0] for LN/LT, BO[0] for RN/RT.
     It is multiplied, not divided - presumably stored already
     inverted; confirm against the packing code. */
  2719. #if defined(LN) || defined(LT)
  2720. LD a1, 0 * SIZE(AO)
  2721. MUL a1, c01, c01
  2722. #endif
  2723. #if defined(RN) || defined(RT)
  2724. LD a1, 0 * SIZE(BO)
  2725. MUL a1, c01, c01
  2726. #endif
  /* Write the result back to the buffer it was read from, then to C1.
     LN moves C1 back one element first; other variants advance it
     afterwards. */
  2727. #if defined(LN) || defined(LT)
  2728. ST c01, 0 * SIZE(BO)
  2729. #else
  2730. ST c01, 0 * SIZE(AO)
  2731. #endif
  2732. #ifdef LN
  2733. lda C1, -1 * SIZE(C1)
  2734. #endif
  2735. ST c01, 0 * SIZE(C1)
  2736. #ifndef LN
  2737. lda C1, 1 * SIZE(C1)
  2738. #endif
  /* RT: SXADDQ is a macro defined outside this view; presumably it
     adds K scaled by the element size to AORIG - TODO confirm. */
  2739. #ifdef RT
  2740. SXADDQ K, AORIG, AORIG
  2741. #endif
  /* LT/RN: skip the remaining (K - KK) steps in both AO and BO. */
  2742. #if defined(LT) || defined(RN)
  2743. subq K, KK, TMP1
  2744. sll TMP1, BASE_SHIFT + 0, TMP2
  2745. addq AO, TMP2, AO
  2746. addq BO, TMP2, BO
  2747. #endif
  /* KK follows the tile position: +1 for LT, -1 for LN. */
  2748. #ifdef LT
  2749. addq KK, 1, KK
  2750. #endif
  2751. #ifdef LN
  2752. subq KK, 1, KK
  2753. #endif
  2754. .align 4
  /* $L100: two-row tile of the single-column panel, done only when
     bit 1 of M is set; otherwise continue at $L110. */
  2755. $L100:
  2756. and M, 2, I
  2757. ble I, $L110
  /* Setup: preload a1..a4 (two steps of A, two elements each) and
     b1..b4 (four steps of B), clear t1..t4 and c01..c04, and set
     L = count >> 2.  With no full group of four, go to $L105. */
  2758. #if defined(LT) || defined(RN)
  2759. LD a1, 0 * SIZE(AO)
  2760. fclr t1
  2761. LD a2, 1 * SIZE(AO)
  2762. fclr t2
  2763. LD a3, 2 * SIZE(AO)
  2764. fclr t3
  2765. LD a4, 3 * SIZE(AO)
  2766. fclr t4
  2767. LD b1, 0 * SIZE(B)
  2768. fclr c01
  2769. LD b2, 1 * SIZE(B)
  2770. fclr c02
  2771. LD b3, 2 * SIZE(B)
  2772. fclr c03
  2773. LD b4, 3 * SIZE(B)
  2774. fclr c04
  2775. sra KK, 2, L
  2776. mov B, BO
  2777. ble L, $L105
  2778. #else
  /* LN/RT: for LN, first move AORIG back by K steps of two elements.
     AO starts at step KK with two elements per step, BO at step KK
     with one; the step count is TMP1 = K - KK. */
  2779. #ifdef LN
  2780. sll K, BASE_SHIFT + 1, TMP1
  2781. subq AORIG, TMP1, AORIG
  2782. #endif
  2783. sll KK, BASE_SHIFT + 1, TMP1
  2784. addq AORIG, TMP1, AO
  2785. sll KK, BASE_SHIFT + 0, TMP1
  2786. addq B, TMP1, BO
  2787. subq K, KK, TMP1
  2788. LD a1, 0 * SIZE(AO)
  2789. fclr t1
  2790. LD a2, 1 * SIZE(AO)
  2791. fclr t2
  2792. LD a3, 2 * SIZE(AO)
  2793. fclr t3
  2794. LD a4, 3 * SIZE(AO)
  2795. fclr t4
  2796. LD b1, 0 * SIZE(BO)
  2797. fclr c01
  2798. LD b2, 1 * SIZE(BO)
  2799. fclr c02
  2800. LD b3, 2 * SIZE(BO)
  2801. fclr c03
  2802. LD b4, 3 * SIZE(BO)
  2803. fclr c04
  2804. sra TMP1, 2, L
  2805. ble L, $L105
  2806. #endif
  2807. .align 5
  /* $L102: multiply-accumulate loop for the two-row tile, four steps
     per trip (AO advances eight elements, BO four).  The first and
     third steps accumulate into c01/c02, the second and fourth into
     c03/c04; the two pairs are added together at $L108. */
  2808. $L102:
  2809. ADD c01, t1, c01
  2810. lda L, -1(L)
  2811. MUL a1, b1, t1
  2812. LD a1, 4 * SIZE(AO)
  2813. ADD c02, t2, c02
  2814. MUL a2, b1, t2
  2815. LD a2, 5 * SIZE(AO)
  2816. LD b1, 4 * SIZE(BO)
  2817. ADD c03, t3, c03
  2818. lda BO, 4 * SIZE(BO)
  2819. MUL a3, b2, t3
  2820. LD a3, 6 * SIZE(AO)
  2821. ADD c04, t4, c04
  2822. MUL a4, b2, t4
  /* a5 stands in for a4 during the fourth step, so that a4 can be
     reloaded for the next trip before the last MUL issues. */
  2823. LD a5, 7 * SIZE(AO)
  2824. LD b2, 1 * SIZE(BO)
  2825. ADD c01, t1, c01
  2826. MUL a1, b3, t1
  2827. LD a1, 8 * SIZE(AO)
  2828. lda AO, 8 * SIZE(AO)
  2829. ADD c02, t2, c02
  2830. MUL a2, b3, t2
  2831. LD b3, 2 * SIZE(BO)
  2832. LD a2, 1 * SIZE(AO)
  2833. ADD c03, t3, c03
  2834. LD a4, 3 * SIZE(AO)
  2835. MUL a3, b4, t3
  2836. LD a3, 2 * SIZE(AO)
  2837. ADD c04, t4, c04
  2838. MUL a5, b4, t4
  2839. LD b4, 3 * SIZE(BO)
  2840. bgt L, $L102
  2841. .align 4
  /* $L105: remainder count = step count & 3 (KK for LT/RN, otherwise
     TMP1); zero means go straight to $L108. */
  2842. $L105:
  2843. #if defined(LT) || defined(RN)
  2844. and KK, 3, L
  2845. #else
  2846. and TMP1, 3, L
  2847. #endif
  2848. ble L, $L108
  2849. .align 4
  /* $L106: one step per trip, a1*b1 into c01 and a2*b1 into c02;
     AO advances two elements and BO one. */
  2850. $L106:
  2851. ADD c01, t1, c01
  2852. lda L, -1(L)
  2853. MUL a1, b1, t1
  2854. LD a1, 2 * SIZE(AO)
  2855. ADD c02, t2, c02
  2856. MUL a2, b1, t2
  2857. LD a2, 3 * SIZE(AO)
  2858. LD b1, 1 * SIZE(BO)
  2859. lda AO, 2 * SIZE(AO)
  2860. unop
  2861. lda BO, 1 * SIZE(BO)
  2862. bgt L, $L106
  2863. .align 4
  /* $L108: finish the two-row tile of the single-column panel.
     Retire the outstanding products and merge the alternating partial
     sums: c01 += c03, c02 += c04. */
  2864. $L108:
  2865. ADD c01, t1, c01
  2866. ADD c02, t2, c02
  2867. ADD c03, t3, c03
  2868. ADD c04, t4, c04
  2869. ADD c01, c03, c01
  2870. ADD c02, c04, c02
  /* LN/RT: recompute AO and BO.  The step offset is KK - 2 for LN and
     KK - 1 for RT; A has two elements per step, B has one. */
  2871. #if defined(LN) || defined(RT)
  2872. #ifdef LN
  2873. subq KK, 2, TMP1
  2874. #else
  2875. subq KK, 1, TMP1
  2876. #endif
  2877. sll TMP1, BASE_SHIFT + 1, TMP2
  2878. addq AORIG, TMP2, AO
  2879. sll TMP1, BASE_SHIFT + 0, TMP2
  2880. addq B, TMP2, BO
  2881. #endif
  /* Replace each accumulator by (stored value - accumulator); the
     values come from BO for LN/LT and from AO for RN/RT. */
  2882. #if defined(LN) || defined(LT)
  2883. LD a1, 0 * SIZE(BO)
  2884. LD a2, 1 * SIZE(BO)
  2885. SUB a1, c01, c01
  2886. SUB a2, c02, c02
  2887. #else
  2888. LD a1, 0 * SIZE(AO)
  2889. LD a2, 1 * SIZE(AO)
  2890. SUB a1, c01, c01
  2891. SUB a2, c02, c02
  2892. #endif
  /* LN: 2x2 substitution with AO[3], AO[2], AO[0]; c02 first, then
     c01.  Diagonal entries are multiplied, not divided - presumably
     stored already inverted; confirm against the packing code. */
  2893. #ifdef LN
  2894. LD a1, 3 * SIZE(AO)
  2895. LD a2, 2 * SIZE(AO)
  2896. LD a3, 0 * SIZE(AO)
  2897. MUL a1, c02, c02
  2898. MUL a2, c02, t1
  2899. SUB c01, t1, c01
  2900. MUL a3, c01, c01
  2901. #endif
  /* LT: 2x2 substitution with AO[0], AO[1], AO[3]; c01 first, then
     c02. */
  2902. #ifdef LT
  2903. LD a1, 0 * SIZE(AO)
  2904. LD a2, 1 * SIZE(AO)
  2905. LD a3, 3 * SIZE(AO)
  2906. MUL a1, c01, c01
  2907. MUL a2, c01, t1
  2908. SUB c02, t1, c02
  2909. MUL a3, c02, c02
  2910. #endif
  /* RN/RT: only one column, so both values are scaled by BO[0]. */
  2911. #if defined(RN) || defined(RT)
  2912. LD a1, 0 * SIZE(BO)
  2913. MUL a1, c01, c01
  2914. MUL a1, c02, c02
  2915. #endif
  /* Write the results back to the buffer they were read from, then to
     C1.  LN moves C1 back two elements first; other variants advance
     it afterwards. */
  2916. #if defined(LN) || defined(LT)
  2917. ST c01, 0 * SIZE(BO)
  2918. ST c02, 1 * SIZE(BO)
  2919. #else
  2920. ST c01, 0 * SIZE(AO)
  2921. ST c02, 1 * SIZE(AO)
  2922. #endif
  2923. #ifdef LN
  2924. lda C1, -2 * SIZE(C1)
  2925. #endif
  2926. ST c01, 0 * SIZE(C1)
  2927. ST c02, 1 * SIZE(C1)
  2928. #ifndef LN
  2929. lda C1, 2 * SIZE(C1)
  2930. #endif
  2931. fclr t1
  2932. fclr t2
  2933. fclr t3
  2934. fclr t4
  /* RT: AORIG moves forward by K steps of two elements. */
  2935. #ifdef RT
  2936. sll K, 1 + BASE_SHIFT, TMP1
  2937. addq AORIG, TMP1, AORIG
  2938. #endif
  /* LT/RN: skip the remaining (K - KK) steps: two elements per step
     in AO, one per step in BO. */
  2939. #if defined(LT) || defined(RN)
  2940. subq K, KK, TMP1
  2941. sll TMP1, BASE_SHIFT + 1, TMP2
  2942. addq AO, TMP2, AO
  2943. sll TMP1, BASE_SHIFT + 0, TMP2
  2944. addq BO, TMP2, BO
  2945. #endif
  /* KK follows the tile position: +2 for LT, -2 for LN. */
  2946. #ifdef LT
  2947. addq KK, 2, KK
  2948. #endif
  2949. #ifdef LN
  2950. subq KK, 2, KK
  2951. #endif
  2952. .align 4
  /* $L110: four-row tiles of the single-column panel.  I = M >> 2 is
     the number of tiles; with none, go to $L119. */
  2953. $L110:
  2954. sra M, 2, I
  2955. ble I, $L119
  2956. .align 4
  /* $L91: per-tile setup.  Preloads a1..a4 (one step of A) and b1..b4
     (four steps of B), clears t1..t4 and c01..c04, and sets
     L = count >> 2.  With no full group of four, go to $L95. */
  2957. $L91:
  2958. #if defined(LT) || defined(RN)
  2959. LD a1, 0 * SIZE(AO)
  2960. fclr t1
  2961. LD a2, 1 * SIZE(AO)
  2962. fclr t2
  2963. LD a3, 2 * SIZE(AO)
  2964. fclr t3
  2965. LD a4, 3 * SIZE(AO)
  2966. fclr t4
  2967. LD b1, 0 * SIZE(B)
  2968. fclr c01
  2969. LD b2, 1 * SIZE(B)
  2970. fclr c02
  2971. LD b3, 2 * SIZE(B)
  2972. fclr c03
  2973. LD b4, 3 * SIZE(B)
  2974. fclr c04
  2975. sra KK, 2, L
  2976. mov B, BO
  2977. ble L, $L95
  2978. #else
  /* LN/RT: for LN, first move AORIG back by K steps of four elements.
     AO starts at step KK with four elements per step, BO at step KK
     with one; the step count is TMP1 = K - KK. */
  2979. #ifdef LN
  2980. sll K, BASE_SHIFT + 2, TMP1
  2981. subq AORIG, TMP1, AORIG
  2982. #endif
  2983. sll KK, BASE_SHIFT + 2, TMP1
  2984. addq AORIG, TMP1, AO
  2985. sll KK, BASE_SHIFT + 0, TMP1
  2986. addq B, TMP1, BO
  2987. subq K, KK, TMP1
  2988. LD a1, 0 * SIZE(AO)
  2989. fclr t1
  2990. LD a2, 1 * SIZE(AO)
  2991. fclr t2
  2992. LD a3, 2 * SIZE(AO)
  2993. fclr t3
  2994. LD a4, 3 * SIZE(AO)
  2995. fclr t4
  2996. LD b1, 0 * SIZE(BO)
  2997. fclr c01
  2998. LD b2, 1 * SIZE(BO)
  2999. fclr c02
  3000. LD b3, 2 * SIZE(BO)
  3001. fclr c03
  3002. LD b4, 3 * SIZE(BO)
  3003. fclr c04
  3004. sra TMP1, 2, L
  3005. unop
  3006. ble L, $L95
  3007. #endif
  3008. .align 5
  /* $L92: multiply-accumulate loop for the four-row tile, four steps
     per trip (AO advances sixteen elements, BO four).  Each step
     multiplies a1..a4 by a single B value (b1, b2, b3, b4 in turn) and
     adds into c01..c04.  Every ADD retires the product issued one
     round earlier, so the instruction order matters. */
  3009. $L92:
  /* Step 1: a1..a4 times b1. */
  3010. ADD c01, t1, c01
  3011. unop
  3012. MUL a1, b1, t1
  3013. LD a1, 4 * SIZE(AO)
  3014. ADD c02, t2, c02
  3015. lda L, -1(L)
  3016. MUL a2, b1, t2
  3017. LD a2, 5 * SIZE(AO)
  3018. ADD c03, t3, c03
  3019. unop
  3020. MUL a3, b1, t3
  3021. LD a3, 6 * SIZE(AO)
  3022. ADD c04, t4, c04
  3023. MUL a4, b1, t4
  3024. LD a4, 7 * SIZE(AO)
  3025. LD b1, 4 * SIZE(BO)
  /* Step 2: times b2. */
  3026. ADD c01, t1, c01
  3027. unop
  3028. MUL a1, b2, t1
  3029. LD a1, 8 * SIZE(AO)
  3030. ADD c02, t2, c02
  3031. unop
  3032. MUL a2, b2, t2
  3033. LD a2, 9 * SIZE(AO)
  3034. ADD c03, t3, c03
  3035. unop
  3036. MUL a3, b2, t3
  3037. LD a3, 10 * SIZE(AO)
  3038. ADD c04, t4, c04
  3039. MUL a4, b2, t4
  3040. LD a4, 11 * SIZE(AO)
  3041. LD b2, 5 * SIZE(BO)
  /* Step 3: times b3.  The fourth A value of step 4 is loaded into a5
     so that a4 can be reloaded for the next trip early. */
  3042. ADD c01, t1, c01
  3043. unop
  3044. MUL a1, b3, t1
  3045. LD a1, 12 * SIZE(AO)
  3046. ADD c02, t2, c02
  3047. unop
  3048. MUL a2, b3, t2
  3049. LD a2, 13 * SIZE(AO)
  3050. ADD c03, t3, c03
  3051. unop
  3052. MUL a3, b3, t3
  3053. LD a3, 14 * SIZE(AO)
  3054. ADD c04, t4, c04
  3055. MUL a4, b3, t4
  3056. LD a5, 15 * SIZE(AO)
  3057. LD b3, 6 * SIZE(BO)
  /* Step 4: times b4, interleaved with the pointer advance and the
     reloads for the next trip. */
  3058. ADD c01, t1, c01
  3059. MUL a1, b4, t1
  3060. LD a1, 16 * SIZE(AO)
  3061. lda AO, 16 * SIZE(AO)
  3062. ADD c02, t2, c02
  3063. lda BO, 4 * SIZE(BO)
  3064. MUL a2, b4, t2
  3065. LD a2, 1 * SIZE(AO)
  3066. ADD c03, t3, c03
  3067. LD a4, 3 * SIZE(AO)
  3068. MUL a3, b4, t3
  3069. LD a3, 2 * SIZE(AO)
  3070. ADD c04, t4, c04
  3071. MUL a5, b4, t4
  3072. LD b4, 3 * SIZE(BO)
  3073. bgt L, $L92
  3074. .align 4
  /* $L95: remainder count = step count & 3 (KK for LT/RN, otherwise
     TMP1); zero means go straight to $L98. */
  3075. $L95:
  3076. #if defined(LT) || defined(RN)
  3077. and KK, 3, L
  3078. #else
  3079. and TMP1, 3, L
  3080. #endif
  3081. unop
  3082. ble L, $L98
  3083. .align 4
  /* $L96: one step per trip, a1..a4 times b1; AO advances four
     elements and BO one. */
  3084. $L96:
  3085. ADD c01, t1, c01
  3086. lda L, -1(L)
  3087. MUL a1, b1, t1
  3088. LD a1, 4 * SIZE(AO)
  3089. ADD c02, t2, c02
  3090. lda BO, 1 * SIZE(BO)
  3091. MUL a2, b1, t2
  3092. LD a2, 5 * SIZE(AO)
  3093. ADD c03, t3, c03
  3094. unop
  3095. MUL a3, b1, t3
  3096. LD a3, 6 * SIZE(AO)
  3097. ADD c04, t4, c04
  3098. MUL a4, b1, t4
  3099. LD a4, 7 * SIZE(AO)
  3100. LD b1, 0 * SIZE(BO)
  3101. lda AO, 4 * SIZE(AO)
  3102. bgt L, $L96
  3103. .align 4
  /* $L98: solve and write-back stage for the four-row tile of the
     single-column panel.  First retire the outstanding products. */
  3104. $L98:
  3105. ADD c01, t1, c01
  3106. ADD c02, t2, c02
  3107. ADD c03, t3, c03
  3108. ADD c04, t4, c04
  /* LN/RT: recompute AO and BO.  The step offset is KK - 4 for LN and
     KK - 1 for RT; A has four elements per step, B has one. */
  3109. #if defined(LN) || defined(RT)
  3110. #ifdef LN
  3111. subq KK, 4, TMP1
  3112. #else
  3113. subq KK, 1, TMP1
  3114. #endif
  3115. sll TMP1, BASE_SHIFT + 2, TMP2
  3116. addq AORIG, TMP2, AO
  3117. sll TMP1, BASE_SHIFT + 0, TMP2
  3118. addq B, TMP2, BO
  3119. #endif
  /* Replace each accumulator by (stored value - accumulator); the
     values come from BO for LN/LT and from AO for RN/RT. */
  3120. #if defined(LN) || defined(LT)
  3121. LD a1, 0 * SIZE(BO)
  3122. LD a2, 1 * SIZE(BO)
  3123. LD a3, 2 * SIZE(BO)
  3124. LD a4, 3 * SIZE(BO)
  3125. SUB a1, c01, c01
  3126. SUB a2, c02, c02
  3127. SUB a3, c03, c03
  3128. SUB a4, c04, c04
  3129. #else
  3130. LD a1, 0 * SIZE(AO)
  3131. LD a2, 1 * SIZE(AO)
  3132. LD a3, 2 * SIZE(AO)
  3133. LD a4, 3 * SIZE(AO)
  3134. SUB a1, c01, c01
  3135. SUB a2, c02, c02
  3136. SUB a3, c03, c03
  3137. SUB a4, c04, c04
  3138. #endif
  /* LN: 4x4 substitution from c04 down to c01, with coefficients
     AO[15..12], AO[10..8], AO[5..4], AO[0].  Diagonal entries are
     multiplied, not divided - presumably stored already inverted;
     confirm against the packing code. */
  3139. #ifdef LN
  3140. LD a1, 15 * SIZE(AO)
  3141. LD a2, 14 * SIZE(AO)
  3142. LD a3, 13 * SIZE(AO)
  3143. LD a4, 12 * SIZE(AO)
  3144. MUL a1, c04, c04
  3145. MUL a2, c04, t1
  3146. SUB c03, t1, c03
  3147. MUL a3, c04, t1
  3148. SUB c02, t1, c02
  3149. MUL a4, c04, t1
  3150. SUB c01, t1, c01
  3151. LD b1, 10 * SIZE(AO)
  3152. LD b2, 9 * SIZE(AO)
  3153. LD b3, 8 * SIZE(AO)
  3154. MUL b1, c03, c03
  3155. MUL b2, c03, t1
  3156. SUB c02, t1, c02
  3157. MUL b3, c03, t1
  3158. SUB c01, t1, c01
  3159. LD a1, 5 * SIZE(AO)
  3160. LD a2, 4 * SIZE(AO)
  3161. LD a3, 0 * SIZE(AO)
  3162. MUL a1, c02, c02
  3163. MUL a2, c02, t1
  3164. SUB c01, t1, c01
  3165. MUL a3, c01, c01
  3166. #endif
  /* LT: 4x4 substitution from c01 up to c04, with coefficients
     AO[0..3], AO[5..7], AO[10..11], AO[15]. */
  3167. #ifdef LT
  3168. LD a1, 0 * SIZE(AO)
  3169. LD a2, 1 * SIZE(AO)
  3170. LD a3, 2 * SIZE(AO)
  3171. LD a4, 3 * SIZE(AO)
  3172. MUL a1, c01, c01
  3173. MUL a2, c01, t1
  3174. SUB c02, t1, c02
  3175. MUL a3, c01, t1
  3176. SUB c03, t1, c03
  3177. MUL a4, c01, t1
  3178. SUB c04, t1, c04
  3179. LD b1, 5 * SIZE(AO)
  3180. LD b2, 6 * SIZE(AO)
  3181. LD b3, 7 * SIZE(AO)
  3182. MUL b1, c02, c02
  3183. MUL b2, c02, t1
  3184. SUB c03, t1, c03
  3185. MUL b3, c02, t1
  3186. SUB c04, t1, c04
  3187. LD a1, 10 * SIZE(AO)
  3188. LD a2, 11 * SIZE(AO)
  3189. LD a3, 15 * SIZE(AO)
  3190. MUL a1, c03, c03
  3191. MUL a2, c03, t1
  3192. SUB c04, t1, c04
  3193. MUL a3, c04, c04
  3194. #endif
  /* RN/RT: only one column, so all four values are scaled by BO[0]. */
  3195. #if defined(RN) || defined(RT)
  3196. LD a1, 0 * SIZE(BO)
  3197. MUL a1, c01, c01
  3198. MUL a1, c02, c02
  3199. MUL a1, c03, c03
  3200. MUL a1, c04, c04
  3201. #endif
  /* Write the results back to the buffer they were read from, then to
     C1.  LN moves C1 back four elements first; other variants advance
     it afterwards. */
  3202. #if defined(LN) || defined(LT)
  3203. ST c01, 0 * SIZE(BO)
  3204. ST c02, 1 * SIZE(BO)
  3205. ST c03, 2 * SIZE(BO)
  3206. ST c04, 3 * SIZE(BO)
  3207. #else
  3208. ST c01, 0 * SIZE(AO)
  3209. ST c02, 1 * SIZE(AO)
  3210. ST c03, 2 * SIZE(AO)
  3211. ST c04, 3 * SIZE(AO)
  3212. #endif
  3213. #ifdef LN
  3214. lda C1, -4 * SIZE(C1)
  3215. #endif
  3216. ST c01, 0 * SIZE(C1)
  3217. ST c02, 1 * SIZE(C1)
  3218. ST c03, 2 * SIZE(C1)
  3219. ST c04, 3 * SIZE(C1)
  3220. #ifndef LN
  3221. lda C1, 4 * SIZE(C1)
  3222. #endif
  3223. fclr t1
  3224. fclr t2
  3225. fclr t3
  3226. fclr t4
  /* RT: AORIG moves forward by K steps of four elements. */
  3227. #ifdef RT
  3228. sll K, 2 + BASE_SHIFT, TMP1
  3229. addq AORIG, TMP1, AORIG
  3230. #endif
  /* LT/RN: skip the remaining (K - KK) steps: four elements per step
     in AO, one per step in BO. */
  3231. #if defined(LT) || defined(RN)
  3232. subq K, KK, TMP1
  3233. sll TMP1, BASE_SHIFT + 2, TMP2
  3234. addq AO, TMP2, AO
  3235. sll TMP1, BASE_SHIFT + 0, TMP2
  3236. addq BO, TMP2, BO
  3237. #endif
  /* KK follows the tile position: +4 for LT, -4 for LN. */
  3238. #ifdef LT
  3239. addq KK, 4, KK
  3240. #endif
  3241. #ifdef LN
  3242. subq KK, 4, KK
  3243. #endif
  /* Next tile of four while I stays positive. */
  3244. lda I, -1(I)
  3245. bgt I, $L91
  3246. .align 4
  /* $L119: end of the single-column panel.  LN advances B through the
     SXADDQ macro (defined outside this view; presumably B += K scaled
     by the element size - TODO confirm); LT/RN take over the running
     BO.  KK changes by 1: up for RN, down for RT. */
  3247. $L119:
  3248. #ifdef LN
  3249. SXADDQ K, B, B
  3250. #endif
  3251. #if defined(LT) || defined(RN)
  3252. mov BO, B
  3253. #endif
  3254. #ifdef RN
  3255. addq KK, 1, KK
  3256. #endif
  3257. #ifdef RT
  3258. subq KK, 1, KK
  3259. #endif
  3260. .align 4
  /* $L999: common exit.  Reloads $f2..$f9 from the first 64 bytes of
     the stack frame (presumably saved there by the prologue, which is
     outside this view), clears $0, releases STACKSIZE bytes of stack
     and returns. */
  3261. $L999:
  3262. ldt $f2, 0($sp)
  3263. ldt $f3, 8($sp)
  3264. ldt $f4, 16($sp)
  3265. ldt $f5, 24($sp)
  3266. ldt $f6, 32($sp)
  3267. ldt $f7, 40($sp)
  3268. ldt $f8, 48($sp)
  3269. ldt $f9, 56($sp)
  /* $0 = 0; presumably the integer return value - confirm against the
     calling convention in use. */
  3270. clr $0
  3271. lda $sp, STACKSIZE($sp)
  3272. ret
  3273. EPILOGUE