
trsm_kernel_LT_4x4_sse.S 69 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16

#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
#define OLD_A 20 + STACK(%esi)
#define OLD_B 24 + STACK(%esi)
#define OLD_C 28 + STACK(%esi)
#define OLD_LDC 32 + STACK(%esi)
#define STACK_OFFT 36 + STACK(%esi)

#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET 44(%esp)
#define KK 48(%esp)
#define KKK 52(%esp)
#define AORIG 56(%esp)
#define BORIG 60(%esp)
#define BUFFER 128(%esp)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
#endif

#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE 96
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0
#define PREFETCHSIZE 96
#endif

#define B %edi
#define AA %edx
#define BB %ecx
#define LDC %ebp
#define CO1 %esi

#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd movlps
#endif

#ifdef HAVE_SSE2
#define xorps pxor
#endif
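
/* Each KERNELn macro below is one unrolled update of the 4x4 inner loop:
   it multiplies a 4-element column of A (held in xmm0 or xmm1) by four
   broadcast values of B and accumulates into xmm4..xmm7, reloading the
   operands for the next step; KERNEL1 also issues the A prefetch. */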
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
addps %xmm2, %xmm5; \
movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm5; \
movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
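
/* Prologue: save the callee-saved registers, reserve the local buffer
   and align the stack to 1024 bytes, then copy the arguments into the
   fixed %esp-relative slots defined above. */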
PROLOGUE
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
PROFCODE
movl %esp, %esi
subl $128 + LOCAL_BUFFER_SIZE, %esp
andl $-1024, %esp
STACK_TOUCHING
movl OLD_M, %ebx
movl OLD_N, %eax
movl OLD_K, %ecx
movl OLD_A, %edx
movl %ebx, M
movl %eax, N
movl %ecx, K
movl %edx, A
movl %esi, OLD_STACK
movss STACK_OFFT, %xmm4
movl OLD_B, B
movl OLD_C, %ebx
movl %ebx, C
movl OLD_LDC, LDC
movss %xmm4, OFFSET
movss %xmm4, KK
leal (, LDC, SIZE), LDC
#ifdef LN
movl M, %eax
leal (, %eax, SIZE), %eax
addl %eax, C
imull K, %eax
addl %eax, A
#endif
#ifdef RT
movl N, %eax
leal (, %eax, SIZE), %eax
imull K, %eax
addl %eax, B
movl N, %eax
imull LDC, %eax
addl %eax, C
#endif
#ifdef RN
negl KK
#endif
#ifdef RT
movl N, %eax
subl OFFSET, %eax
movl %eax, KK
#endif
movl N, %eax
sarl $2, %eax
movl %eax, J
jle .L40
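
/* Outer loop over panels of four columns (J = n >> 2). Each pass first
   expands the current B panel into BUFFER with every value replicated
   four times, then walks the rows of A. */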
.L01:
#ifdef LN
movl OFFSET, %eax
addl M, %eax
movl %eax, KK
#endif
leal BUFFER, %ecx
#ifdef RT
movl K, %eax
sall $2 + BASE_SHIFT, %eax
subl %eax, B
#endif
#if defined(LN) || defined(RT)
movl KK, %eax
movl B, BORIG
sall $2 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif
#ifdef LT
movl OFFSET, %eax
movl %eax, KK
#endif
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $1, %eax
jle .L05
ALIGN_4
.L02:
movaps 0 * SIZE(B), %xmm3
movaps 4 * SIZE(B), %xmm7
pshufd $0x00, %xmm3, %xmm0
pshufd $0x55, %xmm3, %xmm1
pshufd $0xaa, %xmm3, %xmm2
pshufd $0xff, %xmm3, %xmm3
pshufd $0x00, %xmm7, %xmm4
pshufd $0x55, %xmm7, %xmm5
pshufd $0xaa, %xmm7, %xmm6
pshufd $0xff, %xmm7, %xmm7
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
movaps %xmm2, 8 * SIZE(BB)
movaps %xmm3, 12 * SIZE(BB)
movaps %xmm4, 16 * SIZE(BB)
movaps %xmm5, 20 * SIZE(BB)
movaps %xmm6, 24 * SIZE(BB)
movaps %xmm7, 28 * SIZE(BB)
addl $ 8 * SIZE, B
addl $32 * SIZE, %ecx
decl %eax
jne .L02
ALIGN_2
.L05:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $1, %eax
BRANCH
jle .L10
movaps 0 * SIZE(B), %xmm3
pshufd $0x00, %xmm3, %xmm0
pshufd $0x55, %xmm3, %xmm1
pshufd $0xaa, %xmm3, %xmm2
pshufd $0xff, %xmm3, %xmm3
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm1, 4 * SIZE(BB)
movaps %xmm2, 8 * SIZE(BB)
movaps %xmm3, 12 * SIZE(BB)
addl $4 * SIZE, B
ALIGN_4
.L10:
#if defined(LT) || defined(RN)
movl A, AA
#else
movl A, %eax
movl %eax, AORIG
#endif
leal (, LDC, 4), %eax
#ifdef RT
subl %eax, C
#endif
movl C, CO1
#ifndef RT
addl %eax, C
#endif
movl M, %ebx
sarl $2, %ebx # i = (m >> 2)
jle .L20
ALIGN_4
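/* Body for a full 4x4 block of C (i = m >> 2 iterations). */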
.L11:
#ifdef LN
movl K, %eax
sall $2 + BASE_SHIFT, %eax
subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
movl KK, %eax
movl AORIG, AA
leal (, %eax, SIZE), %eax
leal (AA, %eax, 4), AA
#endif
leal BUFFER, BB
#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
movaps 16 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movaps 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movaps 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
leal (LDC, LDC, 2), %eax
PREFETCHW 3 * SIZE(CO1)
PREFETCHW 3 * SIZE(CO1, LDC)
PREFETCHW 3 * SIZE(CO1, LDC, 2)
PREFETCHW 3 * SIZE(CO1, %eax)
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
je .L15
ALIGN_4
.L12:
KERNEL1(0 * 16)
KERNEL2(0 * 16)
KERNEL3(0 * 16)
KERNEL4(0 * 16)
KERNEL5(0 * 16)
KERNEL6(0 * 16)
KERNEL7(0 * 16)
KERNEL8(0 * 16)
addl $128 * SIZE, BB
addl $32 * SIZE, AA
decl %eax
jne .L12
ALIGN_4
.L15:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax # if (k & 1)
BRANCH
je .L18
ALIGN_4
.L16:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
mulps 12 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 4 * SIZE(AA), %xmm0
addl $ 4 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L16
ALIGN_4
.L18:
#if defined(LN) || defined(RT)
movl KK, %eax
#ifdef LN
subl $4, %eax
#else
subl $4, %eax
#endif
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
sall $2 + BASE_SHIFT, %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
movaps %xmm4, %xmm0
unpcklps %xmm6, %xmm4
unpckhps %xmm6, %xmm0
movaps %xmm5, %xmm1
unpcklps %xmm7, %xmm5
unpckhps %xmm7, %xmm1
movaps %xmm4, %xmm6
unpcklps %xmm5, %xmm4
unpckhps %xmm5, %xmm6
movaps %xmm0, %xmm2
unpcklps %xmm1, %xmm0
unpckhps %xmm1, %xmm2
movaps 0 * SIZE(B), %xmm1
movaps 4 * SIZE(B), %xmm3
movaps 8 * SIZE(B), %xmm5
movaps 12 * SIZE(B), %xmm7
subps %xmm4, %xmm1
subps %xmm6, %xmm3
subps %xmm0, %xmm5
subps %xmm2, %xmm7
#else
movaps 0 * SIZE(AA), %xmm0
movaps 4 * SIZE(AA), %xmm1
movaps 8 * SIZE(AA), %xmm2
movaps 12 * SIZE(AA), %xmm3
subps %xmm4, %xmm0
subps %xmm5, %xmm1
subps %xmm6, %xmm2
subps %xmm7, %xmm3
#endif
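/* Triangular solve of the 4x4 block: broadcast one diagonal entry at a
   time (the packed diagonal is stored pre-inverted, so the solve uses
   only multiplies), scale the corresponding solution row, and eliminate
   it from the rows not yet solved. LN/LT/RN/RT select the traversal
   order and whether the triangle lives in A or in B. */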
#ifdef LN
movaps 12 * SIZE(AA), %xmm4
pshufd $0xff, %xmm4, %xmm6
mulps %xmm6, %xmm7
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm7, %xmm6
subps %xmm6, %xmm5
pshufd $0x55, %xmm4, %xmm6
mulps %xmm7, %xmm6
subps %xmm6, %xmm3
pshufd $0x00, %xmm4, %xmm6
mulps %xmm7, %xmm6
subps %xmm6, %xmm1
movaps 8 * SIZE(AA), %xmm4
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm6, %xmm5
pshufd $0x55, %xmm4, %xmm6
mulps %xmm5, %xmm6
subps %xmm6, %xmm3
pshufd $0x00, %xmm4, %xmm6
mulps %xmm5, %xmm6
subps %xmm6, %xmm1
movaps 4 * SIZE(AA), %xmm4
pshufd $0x55, %xmm4, %xmm6
mulps %xmm6, %xmm3
pshufd $0x00, %xmm4, %xmm6
mulps %xmm3, %xmm6
subps %xmm6, %xmm1
movaps 0 * SIZE(AA), %xmm4
pshufd $0x00, %xmm4, %xmm6
mulps %xmm6, %xmm1
#endif
#ifdef LT
movaps 0 * SIZE(AA), %xmm4
pshufd $0x00, %xmm4, %xmm6
mulps %xmm6, %xmm1
pshufd $0x55, %xmm4, %xmm6
mulps %xmm1, %xmm6
subps %xmm6, %xmm3
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm1, %xmm6
subps %xmm6, %xmm5
pshufd $0xff, %xmm4, %xmm6
mulps %xmm1, %xmm6
subps %xmm6, %xmm7
movaps 4 * SIZE(AA), %xmm4
pshufd $0x55, %xmm4, %xmm6
mulps %xmm6, %xmm3
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm3, %xmm6
subps %xmm6, %xmm5
pshufd $0xff, %xmm4, %xmm6
mulps %xmm3, %xmm6
subps %xmm6, %xmm7
movaps 8 * SIZE(AA), %xmm4
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm6, %xmm5
pshufd $0xff, %xmm4, %xmm6
mulps %xmm5, %xmm6
subps %xmm6, %xmm7
movaps 12 * SIZE(AA), %xmm4
pshufd $0xff, %xmm4, %xmm6
mulps %xmm6, %xmm7
#endif
#ifdef RN
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulps %xmm7, %xmm0
pshufd $0x55, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm3
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulps %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm3
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm3
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulps %xmm7, %xmm3
#endif
#ifdef RT
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulps %xmm7, %xmm3
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm0
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm0
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm0
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
movaps %xmm1, 0 * SIZE(B)
movaps %xmm3, 4 * SIZE(B)
movaps %xmm5, 8 * SIZE(B)
movaps %xmm7, 12 * SIZE(B)
pshufd $0x00, %xmm1, %xmm0
pshufd $0x55, %xmm1, %xmm2
pshufd $0xaa, %xmm1, %xmm4
pshufd $0xff, %xmm1, %xmm6
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm2, 4 * SIZE(BB)
movaps %xmm4, 8 * SIZE(BB)
movaps %xmm6, 12 * SIZE(BB)
pshufd $0x00, %xmm3, %xmm0
pshufd $0x55, %xmm3, %xmm2
pshufd $0xaa, %xmm3, %xmm4
pshufd $0xff, %xmm3, %xmm6
movaps %xmm0, 16 * SIZE(BB)
movaps %xmm2, 20 * SIZE(BB)
movaps %xmm4, 24 * SIZE(BB)
movaps %xmm6, 28 * SIZE(BB)
pshufd $0x00, %xmm5, %xmm0
pshufd $0x55, %xmm5, %xmm2
pshufd $0xaa, %xmm5, %xmm4
pshufd $0xff, %xmm5, %xmm6
movaps %xmm0, 32 * SIZE(BB)
movaps %xmm2, 36 * SIZE(BB)
movaps %xmm4, 40 * SIZE(BB)
movaps %xmm6, 44 * SIZE(BB)
pshufd $0x00, %xmm7, %xmm0
pshufd $0x55, %xmm7, %xmm2
pshufd $0xaa, %xmm7, %xmm4
pshufd $0xff, %xmm7, %xmm6
movaps %xmm0, 48 * SIZE(BB)
movaps %xmm2, 52 * SIZE(BB)
movaps %xmm4, 56 * SIZE(BB)
movaps %xmm6, 60 * SIZE(BB)
#else
movaps %xmm0, 0 * SIZE(AA)
movaps %xmm1, 4 * SIZE(AA)
movaps %xmm2, 8 * SIZE(AA)
movaps %xmm3, 12 * SIZE(AA)
#endif
#ifdef LN
subl $4 * SIZE, CO1
#endif
leal (LDC, LDC, 2), %eax
#if defined(LN) || defined(LT)
movaps %xmm1, %xmm0
unpcklps %xmm5, %xmm1
unpckhps %xmm5, %xmm0
movaps %xmm3, %xmm4
unpcklps %xmm7, %xmm3
unpckhps %xmm7, %xmm4
movaps %xmm1, %xmm2
unpcklps %xmm3, %xmm1
unpckhps %xmm3, %xmm2
movaps %xmm0, %xmm6
unpcklps %xmm4, %xmm0
unpckhps %xmm4, %xmm6
movlps %xmm1, 0 * SIZE(CO1)
movhps %xmm1, 2 * SIZE(CO1)
movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
#else
movlps %xmm0, 0 * SIZE(CO1)
movhps %xmm0, 2 * SIZE(CO1)
movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
#endif
#ifndef LN
addl $4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
leal (,%eax, SIZE), %eax
leal (AA, %eax, 4), AA
#ifdef LT
addl $16 * SIZE, B
#endif
#endif
#ifdef LN
subl $4, KK
movl BORIG, B
#endif
#ifdef LT
addl $4, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $2 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
decl %ebx # i --
jg .L11
ALIGN_4
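/* Remainder: a 2x4 strip when (m & 2) != 0. */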
.L20:
testl $2, M
je .L30
#ifdef LN
movl K, %eax
sall $1 + BASE_SHIFT, %eax
subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
movl KK, %eax
movl AORIG, AA
leal (, %eax, SIZE), %eax
leal (AA, %eax, 2), AA
#endif
leal BUFFER, BB
#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
#endif
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 8 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movaps 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movaps 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
je .L25
ALIGN_4
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 12 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 32 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 20 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm5
movaps 24 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 28 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 4 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 48 * SIZE(BB), %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 36 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 40 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 44 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 6 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 64 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 52 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm5
movaps 56 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 60 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 16 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 80 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 68 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm5
movaps 72 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm6
movaps 76 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 10 * SIZE(AA), %xmm1
addps %xmm2, %xmm7
movaps 96 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 84 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm5
movaps 88 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 92 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 12 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 112 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 100 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm5
movaps 104 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm6
movaps 108 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 14 * SIZE(AA), %xmm1
addps %xmm2, %xmm7
movaps 128 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 116 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm5
movaps 120 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 124 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 24 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 144 * SIZE(BB), %xmm3
addl $ 16 * SIZE, AA
addl $128 * SIZE, BB
decl %eax
jne .L22
ALIGN_4
.L25:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax # if (k & 1)
BRANCH
je .L28
ALIGN_4
.L26:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 12 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 16 * SIZE(BB), %xmm2
addl $ 2 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L26
ALIGN_4
.L28:
#if defined(LN) || defined(RT)
movl KK, %eax
#ifdef LN
subl $2, %eax
#else
subl $4, %eax
#endif
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
sall $1 + BASE_SHIFT, %eax
leal (AA, %eax, 1), AA
leal (B, %eax, 2), B
leal (BB, %eax, 8), BB
#endif
#if defined(LN) || defined(LT)
unpcklps %xmm6, %xmm4
unpcklps %xmm7, %xmm5
movaps %xmm4, %xmm6
unpcklps %xmm5, %xmm4
unpckhps %xmm5, %xmm6
movaps 0 * SIZE(B), %xmm1
movaps 4 * SIZE(B), %xmm3
subps %xmm4, %xmm1
subps %xmm6, %xmm3
#else
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 2 * SIZE(AA), %xmm1
#ifdef movsd
xorps %xmm2, %xmm2
#endif
movsd 4 * SIZE(AA), %xmm2
#ifdef movsd
xorps %xmm3, %xmm3
#endif
movsd 6 * SIZE(AA), %xmm3
subps %xmm4, %xmm0
subps %xmm5, %xmm1
subps %xmm6, %xmm2
subps %xmm7, %xmm3
#endif
#ifdef LN
movaps 0 * SIZE(AA), %xmm4
pshufd $0xff, %xmm4, %xmm6
mulps %xmm6, %xmm3
pshufd $0xaa, %xmm4, %xmm6
mulps %xmm3, %xmm6
subps %xmm6, %xmm1
pshufd $0x00, %xmm4, %xmm6
mulps %xmm6, %xmm1
#endif
#ifdef LT
movaps 0 * SIZE(AA), %xmm4
pshufd $0x00, %xmm4, %xmm6
mulps %xmm6, %xmm1
pshufd $0x55, %xmm4, %xmm6
mulps %xmm1, %xmm6
subps %xmm6, %xmm3
pshufd $0xff, %xmm4, %xmm6
mulps %xmm6, %xmm3
#endif
#ifdef RN
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulps %xmm7, %xmm0
pshufd $0x55, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm0, %xmm7
subps %xmm7, %xmm3
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulps %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm3
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm3
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulps %xmm7, %xmm3
#endif
#ifdef RT
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulps %xmm7, %xmm3
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm3, %xmm7
subps %xmm7, %xmm0
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulps %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm2, %xmm7
subps %xmm7, %xmm0
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulps %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulps %xmm1, %xmm7
subps %xmm7, %xmm0
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
movaps %xmm1, 0 * SIZE(B)
movaps %xmm3, 4 * SIZE(B)
pshufd $0x00, %xmm1, %xmm0
pshufd $0x55, %xmm1, %xmm2
pshufd $0xaa, %xmm1, %xmm4
pshufd $0xff, %xmm1, %xmm6
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm2, 4 * SIZE(BB)
movaps %xmm4, 8 * SIZE(BB)
movaps %xmm6, 12 * SIZE(BB)
pshufd $0x00, %xmm3, %xmm0
pshufd $0x55, %xmm3, %xmm2
pshufd $0xaa, %xmm3, %xmm4
pshufd $0xff, %xmm3, %xmm6
movaps %xmm0, 16 * SIZE(BB)
movaps %xmm2, 20 * SIZE(BB)
movaps %xmm4, 24 * SIZE(BB)
movaps %xmm6, 28 * SIZE(BB)
#else
movlps %xmm0, 0 * SIZE(AA)
movlps %xmm1, 2 * SIZE(AA)
movlps %xmm2, 4 * SIZE(AA)
movlps %xmm3, 6 * SIZE(AA)
#endif
#ifdef LN
subl $2 * SIZE, CO1
#endif
leal (LDC, LDC, 2), %eax
#if defined(LN) || defined(LT)
movaps %xmm1, %xmm0
unpcklps %xmm5, %xmm1
unpckhps %xmm5, %xmm0
movaps %xmm3, %xmm4
unpcklps %xmm7, %xmm3
unpckhps %xmm7, %xmm4
movaps %xmm1, %xmm2
unpcklps %xmm3, %xmm1
unpckhps %xmm3, %xmm2
movaps %xmm0, %xmm6
unpcklps %xmm4, %xmm0
unpckhps %xmm4, %xmm6
movlps %xmm1, 0 * SIZE(CO1)
movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
#else
movlps %xmm0, 0 * SIZE(CO1)
movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif
#ifndef LN
addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
leal (,%eax, SIZE), %eax
leal (AA, %eax, 2), AA
#ifdef LT
addl $8 * SIZE, B
#endif
#endif
#ifdef LN
subl $2, KK
movl BORIG, B
#endif
#ifdef LT
addl $2, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $1 + BASE_SHIFT, %eax
addl %eax, AORIG
#endif
ALIGN_4
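/* Remainder: a 1x4 strip when (m & 1) != 0. */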
.L30:
testl $1, M
je .L39
#ifdef LN
movl K, %eax
sall $BASE_SHIFT, %eax
subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
movl KK, %eax
movl AORIG, AA
leal (AA, %eax, SIZE), AA
#endif
leal BUFFER, BB
#if defined(LN) || defined(RT)
movl KK, %eax
sall $2 + BASE_SHIFT, %eax
leal (BB, %eax, 4), BB
#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
movss 4 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movss 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movss 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
sarl $3, %eax
je .L35
ALIGN_4
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 8 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 12 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 32 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 1 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
addss %xmm3, %xmm4
movss 20 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
addss %xmm3, %xmm5
movss 24 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
mulss 28 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 48 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 2 * SIZE(AA), %xmm0
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
movss 36 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 40 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 44 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 64 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 3 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
addss %xmm3, %xmm4
movss 52 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
addss %xmm3, %xmm5
movss 56 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
mulss 60 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 80 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 8 * SIZE(AA), %xmm0
mulss %xmm1, %xmm2
addss %xmm2, %xmm4
movss 68 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
addss %xmm2, %xmm5
movss 72 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
mulss 76 * SIZE(BB), %xmm1
addss %xmm2, %xmm6
movss 96 * SIZE(BB), %xmm2
addss %xmm1, %xmm7
movss 5 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
addss %xmm3, %xmm4
movss 84 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
addss %xmm3, %xmm5
movss 88 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
mulss 92 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 112 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 6 * SIZE(AA), %xmm1
mulss %xmm1, %xmm2
addss %xmm2, %xmm4
movss 100 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
addss %xmm2, %xmm5
movss 104 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
mulss 108 * SIZE(BB), %xmm1
addss %xmm2, %xmm6
movss 128 * SIZE(BB), %xmm2
addss %xmm1, %xmm7
movss 7 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
addss %xmm3, %xmm4
movss 116 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
addss %xmm3, %xmm5
movss 120 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
mulss 124 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 144 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 12 * SIZE(AA), %xmm1
addl $ 8 * SIZE, AA
addl $128 * SIZE, BB
decl %eax
jne .L32
ALIGN_4
.L35:
#if defined(LT) || defined(RN)
movl KK, %eax
#else
movl K, %eax
subl KK, %eax
#endif
andl $7, %eax # if (k & 1)
BRANCH
je .L38
ALIGN_4
.L36:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
movss 4 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 8 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 12 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 16 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 1 * SIZE(AA), %xmm0
addl $ 1 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L36
ALIGN_4
.L38:
#if defined(LN) || defined(RT)
movl KK, %eax
#ifdef LN
subl $1, %eax
#else
subl $4, %eax
#endif
movl AORIG, AA
movl BORIG, B
leal BUFFER, BB
leal (AA, %eax, SIZE), AA
sall $2 + BASE_SHIFT, %eax
leal (B, %eax, 1), B
leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
unpcklps %xmm6, %xmm4
unpcklps %xmm7, %xmm5
unpcklps %xmm5, %xmm4
movaps 0 * SIZE(B), %xmm1
subps %xmm4, %xmm1
#else
movss 0 * SIZE(AA), %xmm0
movss 1 * SIZE(AA), %xmm1
movss 2 * SIZE(AA), %xmm2
movss 3 * SIZE(AA), %xmm3
subss %xmm4, %xmm0
subss %xmm5, %xmm1
subss %xmm6, %xmm2
subss %xmm7, %xmm3
#endif
#if defined(LN) || defined(LT)
movss 0 * SIZE(AA), %xmm4
pshufd $0x00, %xmm4, %xmm6
mulps %xmm6, %xmm1
#endif
#ifdef RN
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulss %xmm7, %xmm0
pshufd $0x55, %xmm6, %xmm7
mulss %xmm0, %xmm7
subss %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulss %xmm0, %xmm7
subss %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulss %xmm0, %xmm7
subss %xmm7, %xmm3
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulss %xmm7, %xmm1
pshufd $0xaa, %xmm6, %xmm7
mulss %xmm1, %xmm7
subss %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulss %xmm1, %xmm7
subss %xmm7, %xmm3
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulss %xmm7, %xmm2
pshufd $0xff, %xmm6, %xmm7
mulss %xmm2, %xmm7
subss %xmm7, %xmm3
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulss %xmm7, %xmm3
#endif
#ifdef RT
movaps 12 * SIZE(B), %xmm6
pshufd $0xff, %xmm6, %xmm7
mulss %xmm7, %xmm3
pshufd $0xaa, %xmm6, %xmm7
mulss %xmm3, %xmm7
subss %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulss %xmm3, %xmm7
subss %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulss %xmm3, %xmm7
subss %xmm7, %xmm0
movaps 8 * SIZE(B), %xmm6
pshufd $0xaa, %xmm6, %xmm7
mulss %xmm7, %xmm2
pshufd $0x55, %xmm6, %xmm7
mulss %xmm2, %xmm7
subss %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulss %xmm2, %xmm7
subss %xmm7, %xmm0
movaps 4 * SIZE(B), %xmm6
pshufd $0x55, %xmm6, %xmm7
mulss %xmm7, %xmm1
pshufd $0x00, %xmm6, %xmm7
mulss %xmm1, %xmm7
subss %xmm7, %xmm0
movaps 0 * SIZE(B), %xmm6
pshufd $0x00, %xmm6, %xmm7
mulss %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
movaps %xmm1, 0 * SIZE(B)
pshufd $0x00, %xmm1, %xmm0
pshufd $0x55, %xmm1, %xmm2
pshufd $0xaa, %xmm1, %xmm4
pshufd $0xff, %xmm1, %xmm6
movaps %xmm0, 0 * SIZE(BB)
movaps %xmm2, 4 * SIZE(BB)
movaps %xmm4, 8 * SIZE(BB)
movaps %xmm6, 12 * SIZE(BB)
#else
movss %xmm0, 0 * SIZE(AA)
movss %xmm1, 1 * SIZE(AA)
movss %xmm2, 2 * SIZE(AA)
movss %xmm3, 3 * SIZE(AA)
#endif
#ifdef LN
subl $1 * SIZE, CO1
#endif
leal (LDC, LDC, 2), %eax
#if defined(LN) || defined(LT)
movaps %xmm1, %xmm0
unpcklps %xmm5, %xmm1
unpckhps %xmm5, %xmm0
movaps %xmm3, %xmm4
unpcklps %xmm7, %xmm3
unpckhps %xmm7, %xmm4
movaps %xmm1, %xmm2
unpcklps %xmm3, %xmm1
unpckhps %xmm3, %xmm2
movaps %xmm0, %xmm6
unpcklps %xmm4, %xmm0
unpckhps %xmm4, %xmm6
movss %xmm1, 0 * SIZE(CO1)
movss %xmm2, 0 * SIZE(CO1, LDC, 1)
movss %xmm0, 0 * SIZE(CO1, LDC, 2)
movss %xmm6, 0 * SIZE(CO1, %eax, 1)
#else
movss %xmm0, 0 * SIZE(CO1)
movss %xmm1, 0 * SIZE(CO1, LDC, 1)
movss %xmm2, 0 * SIZE(CO1, LDC, 2)
movss %xmm3, 0 * SIZE(CO1, %eax, 1)
#endif
#ifndef LN
addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
leal (AA, %eax, SIZE), AA
#ifdef LT
addl $4 * SIZE, B
#endif
#endif
#ifdef LN
subl $1, KK
movl BORIG, B
#endif
#ifdef LT
addl $1, KK
#endif
#ifdef RT
movl K, %eax
movl BORIG, B
sall $BASE_SHIFT, %eax
addl %eax, AORIG
#endif
ALIGN_4
.L39:
#ifdef LN
movl K, %eax
leal (, %eax, SIZE), %eax
leal (B, %eax, 4), B
#endif
#if defined(LT) || defined(RN)
movl K, %eax
subl KK, %eax
leal (,%eax, SIZE), %eax
leal (B, %eax, 4), B
#endif
#ifdef RN
addl $4, KK
#endif
#ifdef RT
subl $4, KK
#endif
decl J # j --
jg .L01
ALIGN_4
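/* Column remainder: panels of two columns when (n & 2) != 0. */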
.L40:
	testl $2, N
	je .L80
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif
	leal BUFFER, %ecx
#ifdef RT
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, B
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	sall $1 + BASE_SHIFT, %eax
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $2, %eax
	jle .L45
	ALIGN_4
.L42:
	movaps 0 * SIZE(B), %xmm3
	movaps 4 * SIZE(B), %xmm7
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm2, 8 * SIZE(BB)
	movaps %xmm3, 12 * SIZE(BB)
	movaps %xmm4, 16 * SIZE(BB)
	movaps %xmm5, 20 * SIZE(BB)
	movaps %xmm6, 24 * SIZE(BB)
	movaps %xmm7, 28 * SIZE(BB)
	addl $ 8 * SIZE, B
	addl $32 * SIZE, %ecx
	decl %eax
	jne .L42
	ALIGN_4
.L45:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $3, %eax
	BRANCH
	jle .L50
	ALIGN_4
.L46:
#ifdef movsd
	xorps %xmm3, %xmm3
#endif
	movsd 0 * SIZE(B), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	addl $2 * SIZE, B
	addl $8 * SIZE, %ecx
	decl %eax
	jne .L46
	ALIGN_4
.L50:
#if defined(LT) || defined(RN)
	movl A, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif
	leal (, LDC, 2), %eax
#ifdef RT
	subl %eax, C
#endif
	movl C, CO1
#ifndef RT
	addl %eax, C
#endif
	movl M, %ebx
	sarl $2, %ebx # i = (m >> 2)
	jle .L60
	ALIGN_4
.L51:
#ifdef LN
	movl K, %eax
	sall $2 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
	PREFETCHW 3 * SIZE(CO1)
	PREFETCHW 3 * SIZE(CO1, LDC)
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L55
	ALIGN_4
.L52:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 4 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm2
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 32 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 8 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	mulps 20 * SIZE(BB), %xmm0
	addps %xmm3, %xmm4
	movaps 24 * SIZE(BB), %xmm3
	addps %xmm0, %xmm5
	movaps 12 * SIZE(AA), %xmm0
	mulps %xmm0, %xmm3
	mulps 28 * SIZE(BB), %xmm0
	addps %xmm3, %xmm4
	movaps 48 * SIZE(BB), %xmm3
	addps %xmm0, %xmm5
	movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps %xmm1, %xmm2
	mulps 36 * SIZE(BB), %xmm1
	addps %xmm2, %xmm4
	movaps 40 * SIZE(BB), %xmm2
	addps %xmm1, %xmm5
	movaps 20 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm2
	mulps 44 * SIZE(BB), %xmm1
	addps %xmm2, %xmm4
	movaps 64 * SIZE(BB), %xmm2
	addps %xmm1, %xmm5
	movaps 24 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	mulps 52 * SIZE(BB), %xmm1
	addps %xmm3, %xmm4
	movaps 56 * SIZE(BB), %xmm3
	addps %xmm1, %xmm5
	movaps 28 * SIZE(AA), %xmm1
	mulps %xmm1, %xmm3
	mulps 60 * SIZE(BB), %xmm1
	addps %xmm3, %xmm4
	movaps 80 * SIZE(BB), %xmm3
	addps %xmm1, %xmm5
	movaps 48 * SIZE(AA), %xmm1
	addl $32 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L52
	ALIGN_4
.L55:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L58
	ALIGN_4
.L56:
	mulps %xmm0, %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm2, %xmm4
	movaps 8 * SIZE(BB), %xmm2
	addps %xmm0, %xmm5
	movaps 4 * SIZE(AA), %xmm0
	addl $4 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L56
	ALIGN_4
.L58:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $4, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $1 + BASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0
	movaps %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1
	movaps %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6
	movaps %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 0 * SIZE(B), %xmm1
#ifdef movsd
	xorps %xmm3, %xmm3
#endif
	movsd 2 * SIZE(B), %xmm3
#ifdef movsd
	xorps %xmm5, %xmm5
#endif
	movsd 4 * SIZE(B), %xmm5
#ifdef movsd
	xorps %xmm7, %xmm7
#endif
	movsd 6 * SIZE(B), %xmm7
	subps %xmm4, %xmm1
	subps %xmm6, %xmm3
	subps %xmm0, %xmm5
	subps %xmm2, %xmm7
#else
	movaps 0 * SIZE(AA), %xmm0
	movaps 4 * SIZE(AA), %xmm1
	subps %xmm4, %xmm0
	subps %xmm5, %xmm1
#endif
#ifdef LN
	movaps 12 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm6, %xmm7
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm7, %xmm6
	subps %xmm6, %xmm5
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm7, %xmm6
	subps %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm7, %xmm6
	subps %xmm6, %xmm1
	movaps 8 * SIZE(AA), %xmm4
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm6, %xmm5
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm5, %xmm6
	subps %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm5, %xmm6
	subps %xmm6, %xmm1
	movaps 4 * SIZE(AA), %xmm4
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm3, %xmm6
	subps %xmm6, %xmm1
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm6, %xmm1
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm6, %xmm1
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm1, %xmm6
	subps %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm1, %xmm6
	subps %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm1, %xmm6
	subps %xmm6, %xmm7
	movaps 4 * SIZE(AA), %xmm4
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm3, %xmm6
	subps %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm3, %xmm6
	subps %xmm6, %xmm7
	movaps 8 * SIZE(AA), %xmm4
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm5, %xmm6
	subps %xmm6, %xmm7
	movaps 12 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm6, %xmm7
#endif
#ifdef RN
	movaps 0 * SIZE(B), %xmm6
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
	pshufd $0x55, %xmm6, %xmm7
	mulps %xmm0, %xmm7
	subps %xmm7, %xmm1
	pshufd $0xff, %xmm6, %xmm7
	mulps %xmm7, %xmm1
#endif
#ifdef RT
	movaps 0 * SIZE(B), %xmm6
	pshufd $0xff, %xmm6, %xmm7
	mulps %xmm7, %xmm1
	pshufd $0xaa, %xmm6, %xmm7
	mulps %xmm1, %xmm7
	subps %xmm7, %xmm0
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movlps %xmm1, 0 * SIZE(B)
	movlps %xmm3, 2 * SIZE(B)
	movlps %xmm5, 4 * SIZE(B)
	movlps %xmm7, 6 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	pshufd $0x55, %xmm1, %xmm2
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm2, 4 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm2
	movaps %xmm0, 8 * SIZE(BB)
	movaps %xmm2, 12 * SIZE(BB)
	pshufd $0x00, %xmm5, %xmm0
	pshufd $0x55, %xmm5, %xmm2
	movaps %xmm0, 16 * SIZE(BB)
	movaps %xmm2, 20 * SIZE(BB)
	pshufd $0x00, %xmm7, %xmm0
	pshufd $0x55, %xmm7, %xmm2
	movaps %xmm0, 24 * SIZE(BB)
	movaps %xmm2, 28 * SIZE(BB)
#else
	movaps %xmm0, 0 * SIZE(AA)
	movaps %xmm1, 4 * SIZE(AA)
#endif
#ifdef LN
	subl $4 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3
	movaps %xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2
	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
	movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
	movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
#else
	movlps %xmm0, 0 * SIZE(CO1)
	movhps %xmm0, 2 * SIZE(CO1)
	movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
	movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
#endif
#ifndef LN
	addl $4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#ifdef LT
	addl $8 * SIZE, B
#endif
#endif
#ifdef LN
	subl $4, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $4, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $2 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	decl %ebx # i --
	jg .L51
	ALIGN_4
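/* m & 2 tail rows of the two-column panel. */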
.L60:
	testl $2, M
	je .L70
#ifdef LN
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 8 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L65
	ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm6
	movaps 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movaps 32 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm4
	movaps 20 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm3, %xmm5
	movaps 24 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	addps %xmm3, %xmm6
	movaps 28 * SIZE(BB), %xmm3
	mulps %xmm0, %xmm3
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm3, %xmm7
	movaps 48 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm4
	movaps 36 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm2, %xmm5
	movaps 40 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	addps %xmm2, %xmm6
	movaps 44 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm2
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm2, %xmm7
	movaps 64 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm4
	movaps 52 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movaps 56 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	addps %xmm3, %xmm6
	movaps 60 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movaps 80 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L62
	ALIGN_4
.L65:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L68
	ALIGN_4
.L66:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $8 * SIZE, BB
	decl %eax
	jg .L66
	ALIGN_4
.L68:
	addps %xmm6, %xmm4
	addps %xmm7, %xmm5
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $BASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 8), BB
#endif
#if defined(LN) || defined(LT)
	unpcklps %xmm6, %xmm4
	unpcklps %xmm7, %xmm5
	movaps %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 0 * SIZE(B), %xmm1
#ifdef movsd
	xorps %xmm3, %xmm3
#endif
	movsd 2 * SIZE(B), %xmm3
	subps %xmm4, %xmm1
	subps %xmm6, %xmm3
#else
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 2 * SIZE(AA), %xmm1
	subps %xmm4, %xmm0
	subps %xmm5, %xmm1
#endif
#ifdef LN
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulps %xmm3, %xmm6
	subps %xmm6, %xmm1
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm6, %xmm1
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm6, %xmm1
	pshufd $0x55, %xmm4, %xmm6
	mulps %xmm1, %xmm6
	subps %xmm6, %xmm3
	pshufd $0xff, %xmm4, %xmm6
	mulps %xmm6, %xmm3
#endif
#ifdef RN
	movaps 0 * SIZE(B), %xmm6
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
	pshufd $0x55, %xmm6, %xmm7
	mulps %xmm0, %xmm7
	subps %xmm7, %xmm1
	pshufd $0xff, %xmm6, %xmm7
	mulps %xmm7, %xmm1
#endif
#ifdef RT
	movaps 0 * SIZE(B), %xmm6
	pshufd $0xff, %xmm6, %xmm7
	mulps %xmm7, %xmm1
	pshufd $0xaa, %xmm6, %xmm7
	mulps %xmm1, %xmm7
	subps %xmm7, %xmm0
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movlps %xmm1, 0 * SIZE(B)
	movlps %xmm3, 2 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	pshufd $0x55, %xmm1, %xmm2
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm2, 4 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm2
	movaps %xmm0, 8 * SIZE(BB)
	movaps %xmm2, 12 * SIZE(BB)
#else
	movlps %xmm0, 0 * SIZE(AA)
	movlps %xmm1, 2 * SIZE(AA)
#endif
#ifdef LN
	subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	unpcklps %xmm3, %xmm1
	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 0 * SIZE(CO1, LDC)
#else
	movlps %xmm0, 0 * SIZE(CO1)
	movlps %xmm1, 0 * SIZE(CO1, LDC)
#endif
#ifndef LN
	addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif
#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4
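/* m & 1 tail row of the two-column panel. */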
.L70:
	testl $1, M
	je .L79
#ifdef LN
	movl K, %eax
	sall $BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (AA, %eax, SIZE), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L75
	ALIGN_4
.L72:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm2
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm2, %xmm6
	movss 32 * SIZE(BB), %xmm2
	addss %xmm0, %xmm7
	movss 2 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 20 * SIZE(BB), %xmm0
	addss %xmm3, %xmm4
	movss 24 * SIZE(BB), %xmm3
	addss %xmm0, %xmm5
	movss 3 * SIZE(AA), %xmm0
	mulss %xmm0, %xmm3
	mulss 28 * SIZE(BB), %xmm0
	addss %xmm3, %xmm6
	movss 48 * SIZE(BB), %xmm3
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm2
	mulss 36 * SIZE(BB), %xmm1
	addss %xmm2, %xmm4
	movss 40 * SIZE(BB), %xmm2
	addss %xmm1, %xmm5
	movss 5 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm2
	mulss 44 * SIZE(BB), %xmm1
	addss %xmm2, %xmm6
	movss 64 * SIZE(BB), %xmm2
	addss %xmm1, %xmm7
	movss 6 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 52 * SIZE(BB), %xmm1
	addss %xmm3, %xmm4
	movss 56 * SIZE(BB), %xmm3
	addss %xmm1, %xmm5
	movss 7 * SIZE(AA), %xmm1
	mulss %xmm1, %xmm3
	mulss 60 * SIZE(BB), %xmm1
	addss %xmm3, %xmm6
	movss 80 * SIZE(BB), %xmm3
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $64 * SIZE, BB
	decl %eax
	jne .L72
	ALIGN_4
.L75:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L78
	ALIGN_4
.L76:
	mulss %xmm0, %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm2, %xmm4
	movss 8 * SIZE(BB), %xmm2
	addss %xmm0, %xmm5
	movss 1 * SIZE(AA), %xmm0
	addl $ 1 * SIZE, AA
	addl $ 8 * SIZE, BB
	decl %eax
	jg .L76
	ALIGN_4
.L78:
	addss %xmm6, %xmm4
	addss %xmm7, %xmm5
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $BASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 8), BB
#endif
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 0 * SIZE(B), %xmm1
	subps %xmm4, %xmm1
#else
	movss 0 * SIZE(AA), %xmm0
	movss 1 * SIZE(AA), %xmm1
	subss %xmm4, %xmm0
	subss %xmm5, %xmm1
#endif
#if defined(LN) || defined(LT)
	movss 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulps %xmm6, %xmm1
#endif
#ifdef RN
	movaps 0 * SIZE(B), %xmm6
	pshufd $0x00, %xmm6, %xmm7
	mulss %xmm7, %xmm0
	pshufd $0x55, %xmm6, %xmm7
	mulss %xmm0, %xmm7
	subss %xmm7, %xmm1
	pshufd $0xff, %xmm6, %xmm7
	mulss %xmm7, %xmm1
#endif
#ifdef RT
	movaps 0 * SIZE(B), %xmm6
	pshufd $0xff, %xmm6, %xmm7
	mulss %xmm7, %xmm1
	pshufd $0xaa, %xmm6, %xmm7
	mulss %xmm1, %xmm7
	subss %xmm7, %xmm0
	pshufd $0x00, %xmm6, %xmm7
	mulss %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movlps %xmm1, 0 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	pshufd $0x55, %xmm1, %xmm2
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm2, 4 * SIZE(BB)
#else
	movss %xmm0, 0 * SIZE(AA)
	movss %xmm1, 1 * SIZE(AA)
#endif
#ifdef LN
	subl $1 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	pshufd $1, %xmm1, %xmm3
	movss %xmm1, 0 * SIZE(CO1)
	movss %xmm3, 0 * SIZE(CO1, LDC)
#else
	movss %xmm0, 0 * SIZE(CO1)
	movss %xmm1, 0 * SIZE(CO1, LDC)
#endif
#ifndef LN
	addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (AA, %eax, SIZE), AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif
#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4
.L79:
#ifdef LN
	movl K, %eax
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (B, %eax, 2), B
#endif
#ifdef RN
	addl $2, KK
#endif
#ifdef RT
	subl $2, KK
#endif
	ALIGN_4
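/* n & 1 tail: solve the last column of B (4x1, 2x1 and 1x1 blocks). */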
.L80:
	testl $1, N
	je .L999
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif
	leal BUFFER, %ecx
#ifdef RT
	movl K, %eax
	sall $BASE_SHIFT, %eax
	subl %eax, B
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	sall $BASE_SHIFT, %eax
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	jle .L85
	ALIGN_4
.L82:
	movsd 0 * SIZE(B), %xmm3
	movhps 2 * SIZE(B), %xmm3
	movsd 4 * SIZE(B), %xmm7
	movhps 6 * SIZE(B), %xmm7
	pshufd $0x00, %xmm3, %xmm0
	pshufd $0x55, %xmm3, %xmm1
	pshufd $0xaa, %xmm3, %xmm2
	pshufd $0xff, %xmm3, %xmm3
	pshufd $0x00, %xmm7, %xmm4
	pshufd $0x55, %xmm7, %xmm5
	pshufd $0xaa, %xmm7, %xmm6
	pshufd $0xff, %xmm7, %xmm7
	movaps %xmm0, 0 * SIZE(BB)
	movaps %xmm1, 4 * SIZE(BB)
	movaps %xmm2, 8 * SIZE(BB)
	movaps %xmm3, 12 * SIZE(BB)
	movaps %xmm4, 16 * SIZE(BB)
	movaps %xmm5, 20 * SIZE(BB)
	movaps %xmm6, 24 * SIZE(BB)
	movaps %xmm7, 28 * SIZE(BB)
	addl $ 8 * SIZE, B
	addl $32 * SIZE, BB
	decl %eax
	jne .L82
	ALIGN_4
.L85:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax
	BRANCH
	jle .L90
	ALIGN_4
.L86:
	movss 0 * SIZE(B), %xmm3
	pshufd $0x00, %xmm3, %xmm0
	movaps %xmm0, 0 * SIZE(BB)
	addl $1 * SIZE, B
	addl $4 * SIZE, BB
	decl %eax
	jne .L86
	ALIGN_4
.L90:
#if defined(LT) || defined(RN)
	movl A, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif
#ifdef RT
	subl LDC, C
#endif
	movl C, CO1
#ifndef RT
	addl LDC, C
#endif
	movl M, %ebx
	sarl $2, %ebx # i = (m >> 2)
	jle .L100
	ALIGN_4
.L91:
#ifdef LN
	movl K, %eax
	sall $2 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
	movaps 0 * SIZE(AA), %xmm0
	movaps 16 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
	PREFETCHW 3 * SIZE(CO1)
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L95
	ALIGN_4
.L92:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movaps 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movaps 32 * SIZE(BB), %xmm2
	mulps 4 * SIZE(BB), %xmm0
	addps %xmm0, %xmm5
	movaps 8 * SIZE(AA), %xmm0
	mulps 8 * SIZE(BB), %xmm0
	addps %xmm0, %xmm6
	movaps 12 * SIZE(AA), %xmm0
	mulps 12 * SIZE(BB), %xmm0
	addps %xmm0, %xmm7
	movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps %xmm1, %xmm3
	movaps 20 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movaps 48 * SIZE(BB), %xmm3
	mulps 20 * SIZE(BB), %xmm1
	addps %xmm1, %xmm5
	movaps 24 * SIZE(AA), %xmm1
	mulps 24 * SIZE(BB), %xmm1
	addps %xmm1, %xmm6
	movaps 28 * SIZE(AA), %xmm1
	mulps 28 * SIZE(BB), %xmm1
	addps %xmm1, %xmm7
	movaps 48 * SIZE(AA), %xmm1
	addl $32 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L92
	ALIGN_4
.L95:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L98
	ALIGN_4
.L96:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movaps 4 * SIZE(AA), %xmm0
	movaps 4 * SIZE(BB), %xmm2
	addl $4 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L96
	ALIGN_4
.L98:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $4, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ BASE_SHIFT, %eax
	leal (AA, %eax, 4), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
	movaps %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0
	movaps %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1
	movaps %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6
	movaps %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2
	movss 0 * SIZE(B), %xmm1
	movss 1 * SIZE(B), %xmm3
	movss 2 * SIZE(B), %xmm5
	movss 3 * SIZE(B), %xmm7
	subss %xmm4, %xmm1
	subss %xmm6, %xmm3
	subss %xmm0, %xmm5
	subss %xmm2, %xmm7
#else
	movaps 0 * SIZE(AA), %xmm0
	subps %xmm4, %xmm0
#endif
#ifdef LN
	movaps 12 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm6, %xmm7
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm7, %xmm6
	subss %xmm6, %xmm5
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm7, %xmm6
	subss %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm7, %xmm6
	subss %xmm6, %xmm1
	movaps 8 * SIZE(AA), %xmm4
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm6, %xmm5
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm5, %xmm6
	subss %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm5, %xmm6
	subss %xmm6, %xmm1
	movaps 4 * SIZE(AA), %xmm4
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm6, %xmm3
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm3, %xmm6
	subss %xmm6, %xmm1
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm6, %xmm1
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm6, %xmm1
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm1, %xmm6
	subss %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm1, %xmm6
	subss %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm1, %xmm6
	subss %xmm6, %xmm7
	movaps 4 * SIZE(AA), %xmm4
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm3, %xmm6
	subss %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm3, %xmm6
	subss %xmm6, %xmm7
	movaps 8 * SIZE(AA), %xmm4
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm6, %xmm5
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm5, %xmm6
	subss %xmm6, %xmm7
	movaps 12 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm6, %xmm7
#endif
#if defined(RN) || defined(RT)
	movss 0 * SIZE(B), %xmm6
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movss %xmm1, 0 * SIZE(B)
	movss %xmm3, 1 * SIZE(B)
	movss %xmm5, 2 * SIZE(B)
	movss %xmm7, 3 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	movaps %xmm0, 0 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	movaps %xmm0, 4 * SIZE(BB)
	pshufd $0x00, %xmm5, %xmm0
	movaps %xmm0, 8 * SIZE(BB)
	pshufd $0x00, %xmm7, %xmm0
	movaps %xmm0, 12 * SIZE(BB)
#else
	movss %xmm0, 0 * SIZE(AA)
	movss %xmm1, 1 * SIZE(AA)
	movss %xmm2, 2 * SIZE(AA)
	movss %xmm3, 3 * SIZE(AA)
#endif
#ifdef LN
	subl $4 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3
	unpcklps %xmm3, %xmm1
	movlps %xmm1, 0 * SIZE(CO1)
	movhps %xmm1, 2 * SIZE(CO1)
#else
	movlps %xmm0, 0 * SIZE(CO1)
	movhps %xmm0, 2 * SIZE(CO1)
#endif
#ifndef LN
	addl $4 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif
#ifdef LN
	subl $4, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $4, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $2 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	decl %ebx # i --
	jg .L91
	ALIGN_4
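/* m & 2 tail rows of the single-column panel. */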
.L100:
	testl $2, M
	je .L110
#ifdef LN
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd 8 * SIZE(AA), %xmm1
	movaps 0 * SIZE(BB), %xmm2
	movaps 16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L105
	ALIGN_4
.L102:
	mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd 2 * SIZE(AA), %xmm0
	addps %xmm2, %xmm4
	movaps 4 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 4 * SIZE(AA), %xmm0
	addps %xmm2, %xmm5
	movaps 8 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 6 * SIZE(AA), %xmm0
	addps %xmm2, %xmm6
	movaps 12 * SIZE(BB), %xmm2
	mulps %xmm0, %xmm2
	movsd 16 * SIZE(AA), %xmm0
	addps %xmm2, %xmm7
	movaps 32 * SIZE(BB), %xmm2
	mulps %xmm1, %xmm3
	movsd 10 * SIZE(AA), %xmm1
	addps %xmm3, %xmm4
	movaps 20 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 12 * SIZE(AA), %xmm1
	addps %xmm3, %xmm5
	movaps 24 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 14 * SIZE(AA), %xmm1
	addps %xmm3, %xmm6
	movaps 28 * SIZE(BB), %xmm3
	mulps %xmm1, %xmm3
	movsd 24 * SIZE(AA), %xmm1
	addps %xmm3, %xmm7
	movaps 48 * SIZE(BB), %xmm3
	addl $16 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L102
	ALIGN_4
.L105:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L108
	ALIGN_4
.L106:
	mulps %xmm0, %xmm2
	addps %xmm2, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	movaps 4 * SIZE(BB), %xmm2
	addl $2 * SIZE, AA
	addl $4 * SIZE, BB
	decl %eax
	jg .L106
	ALIGN_4
.L108:
	addps %xmm5, %xmm4
	addps %xmm7, %xmm6
	addps %xmm6, %xmm4
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ BASE_SHIFT, %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
	pshufd $1, %xmm4, %xmm6
	movss 0 * SIZE(B), %xmm1
	movss 1 * SIZE(B), %xmm3
	subss %xmm4, %xmm1
	subss %xmm6, %xmm3
#else
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd 0 * SIZE(AA), %xmm0
	subps %xmm4, %xmm0
#endif
#ifdef LN
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm6, %xmm3
	pshufd $0xaa, %xmm4, %xmm6
	mulss %xmm3, %xmm6
	subss %xmm6, %xmm1
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm6, %xmm1
#endif
#ifdef LT
	movaps 0 * SIZE(AA), %xmm4
	pshufd $0x00, %xmm4, %xmm6
	mulss %xmm6, %xmm1
	pshufd $0x55, %xmm4, %xmm6
	mulss %xmm1, %xmm6
	subss %xmm6, %xmm3
	pshufd $0xff, %xmm4, %xmm6
	mulss %xmm6, %xmm3
#endif
#if defined(RN) || defined(RT)
	movss 0 * SIZE(B), %xmm6
	pshufd $0x00, %xmm6, %xmm7
	mulps %xmm7, %xmm0
#endif
#if defined(LN) || defined(LT)
	movss %xmm1, 0 * SIZE(B)
	movss %xmm3, 1 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	movaps %xmm0, 0 * SIZE(BB)
	pshufd $0x00, %xmm3, %xmm0
	movaps %xmm0, 4 * SIZE(BB)
#else
	movlps %xmm0, 0 * SIZE(AA)
#endif
#ifdef LN
	subl $2 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movss %xmm1, 0 * SIZE(CO1)
	movss %xmm3, 1 * SIZE(CO1)
#else
	movlps %xmm0, 0 * SIZE(CO1)
#endif
#ifndef LN
	addl $2 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif
#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4
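/* m & 1 tail row of the single-column panel. */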
.L110:
	testl $1, M
	je .L119
#ifdef LN
	movl K, %eax
	sall $BASE_SHIFT, %eax
	subl %eax, AORIG
#endif
#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (AA, %eax, SIZE), AA
#endif
	leal BUFFER, BB
#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $BASE_SHIFT, %eax
	leal (BB, %eax, 4), BB
#endif
	xorps %xmm4, %xmm4
	xorps %xmm5, %xmm5
	xorps %xmm6, %xmm6
	xorps %xmm7, %xmm7
	movss 0 * SIZE(AA), %xmm0
	movss 4 * SIZE(AA), %xmm1
	movss 0 * SIZE(BB), %xmm2
	movss 16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L115
	ALIGN_4
.L112:
	mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 32 * SIZE(BB), %xmm2
	mulss 4 * SIZE(BB), %xmm0
	addss %xmm0, %xmm5
	movss 2 * SIZE(AA), %xmm0
	mulss 8 * SIZE(BB), %xmm0
	addss %xmm0, %xmm6
	movss 3 * SIZE(AA), %xmm0
	mulss 12 * SIZE(BB), %xmm0
	addss %xmm0, %xmm7
	movss 8 * SIZE(AA), %xmm0
	mulss %xmm1, %xmm3
	movss 5 * SIZE(AA), %xmm1
	addss %xmm3, %xmm4
	movss 48 * SIZE(BB), %xmm3
	mulss 20 * SIZE(BB), %xmm1
	addss %xmm1, %xmm5
	movss 6 * SIZE(AA), %xmm1
	mulss 24 * SIZE(BB), %xmm1
	addss %xmm1, %xmm6
	movss 7 * SIZE(AA), %xmm1
	mulss 28 * SIZE(BB), %xmm1
	addss %xmm1, %xmm7
	movss 12 * SIZE(AA), %xmm1
	addl $ 8 * SIZE, AA
	addl $32 * SIZE, BB
	decl %eax
	jne .L112
	ALIGN_4
.L115:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax # k & 7
	BRANCH
	je .L118
	ALIGN_4
.L116:
	mulss %xmm0, %xmm2
	movss 1 * SIZE(AA), %xmm0
	addss %xmm2, %xmm4
	movss 4 * SIZE(BB), %xmm2
	addl $ 1 * SIZE, AA
	addl $ 4 * SIZE, BB
	decl %eax
	jg .L116
	ALIGN_4
.L118:
	addss %xmm5, %xmm4
	addss %xmm7, %xmm6
	addss %xmm6, %xmm4
#if defined(LN) || defined(RT)
	movl KK, %eax
	subl $1, %eax
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	sall $ BASE_SHIFT, %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 4), BB
#endif
#if defined(LN) || defined(LT)
	movss 0 * SIZE(B), %xmm1
	subss %xmm4, %xmm1
#else
	movss 0 * SIZE(AA), %xmm0
	subss %xmm4, %xmm0
#endif
#if defined(LN) || defined(LT)
	mulss 0 * SIZE(AA), %xmm1
#endif
#if defined(RN) || defined(RT)
	mulss 0 * SIZE(B), %xmm0
#endif
#if defined(LN) || defined(LT)
	movss %xmm1, 0 * SIZE(B)
	pshufd $0x00, %xmm1, %xmm0
	movaps %xmm0, 0 * SIZE(BB)
#else
	movss %xmm0, 0 * SIZE(AA)
#endif
#ifdef LN
	subl $1 * SIZE, CO1
#endif
#if defined(LN) || defined(LT)
	movss %xmm1, 0 * SIZE(CO1)
#else
	movss %xmm0, 0 * SIZE(CO1)
#endif
#ifndef LN
	addl $1 * SIZE, CO1
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (AA, %eax, SIZE), AA
#ifdef LT
	addl $1 * SIZE, B
#endif
#endif
#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_4
.L119:
#ifdef LN
	movl K, %eax
	leal (B, %eax, SIZE), B
#endif
#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (B, %eax, SIZE), B
#endif
#ifdef RN
	addl $1, KK
#endif
#ifdef RT
	subl $1, KK
#endif
	ALIGN_4
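/* Epilogue: restore the caller's stack pointer and the callee-saved registers. */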
.L999:
	movl OLD_STACK, %esp
	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
EPILOGUE