
gemm_kernel_4x4_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define STACK 16
#define OLD_M 4 + STACK(%esi)
#define OLD_N 8 + STACK(%esi)
#define OLD_K 12 + STACK(%esi)
#define OLD_ALPHA 16 + STACK(%esi)
#define OLD_A 20 + STACK(%esi)
#define OLD_B 24 + STACK(%esi)
#define OLD_C 28 + STACK(%esi)
#define OLD_LDC 32 + STACK(%esi)
#define STACK_OFFT 36 + STACK(%esi)
#define ALPHA 0(%esp)
#define K 16(%esp)
#define N 20(%esp)
#define M 24(%esp)
#define A 28(%esp)
#define C 32(%esp)
#define J 36(%esp)
#define BX 40(%esp)
#define OLD_STACK 44(%esp)
#define OFFSET 48(%esp)
#define KK 52(%esp)
#define KKK 56(%esp)
#define BUFFER 128(%esp)
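/* Note: the OLD_* macros read the incoming arguments through %esi, */
/* which holds the caller's stack pointer; ALPHA through KKK are    */
/* scalar slots in the realigned local frame, and BUFFER (at        */
/* 128(%esp)) is the packing area that receives broadcast copies    */
/* of B ahead of the compute loops.                                 */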
#ifdef ATHLON
#define PREFETCH prefetch
#define PREFETCHSIZE 64
#endif
#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH prefetch
#define PREFETCHSIZE (16 * 10 + 8)
#endif
#ifdef PENTIUM4
#define PREFETCH prefetcht0
#define PREFETCHSIZE 96
#endif
#define AA %edx
#define BB %ecx
#define LDC %ebp
#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd movlps
#endif
#ifdef HAVE_SSE2
#define xorps pxor
#endif
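/* Note: movsd (an SSE2 instruction) is aliased to the SSE1 movlps  */
/* where SSE2 is unavailable, and on Opteron where that form is     */
/* used for performance. Unlike movsd, movlps leaves the upper half */
/* of the destination unchanged, which is why later code under      */
/* "#ifdef movsd" zeroes the register with xorps before the load.   */
/* With SSE2, xorps is aliased to pxor (same zeroing effect).       */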
#if defined(OPTERON) || defined(BARCELONA)
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm5; \
movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL2(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL3(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm5; \
movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL4(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL5(address) \
PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL6(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL7(address) \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL8(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
#endif
#ifdef PENTIUM4
#define KERNEL1(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
addps %xmm2, %xmm5; \
movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL2(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL3(address) \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm4; \
movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
addps %xmm2, %xmm5; \
movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm0, %xmm2; \
mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm2, %xmm6; \
movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm0, %xmm7; \
movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL4(address) \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm4; \
movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
addps %xmm3, %xmm5; \
movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm0, %xmm3; \
mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
addps %xmm3, %xmm6; \
movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm0, %xmm7; \
movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL5(address) \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL6(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL7(address) \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm4; \
movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
addps %xmm2, %xmm5; \
movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
mulps %xmm1, %xmm2; \
mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm2, %xmm6; \
movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
addps %xmm1, %xmm7; \
movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#define KERNEL8(address) \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm4; \
movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
addps %xmm3, %xmm5; \
movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
mulps %xmm1, %xmm3; \
mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
addps %xmm3, %xmm6; \
movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
addps %xmm1, %xmm7; \
movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1
#endif
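/* Each KERNELn(address) macro above performs one k-iteration of   */
/* the 4x4 update: a 4-element column slice of packed A (%xmm0 or  */
/* %xmm1) is multiplied by four broadcast B values and accumulated */
/* into the four column accumulators %xmm4..%xmm7. The Opteron and */
/* Pentium 4 variants differ only in where PREFETCH is placed;     */
/* the eight macros together cover one 8x-unrolled pass over k.    */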
PROLOGUE
pushl %ebp
pushl %edi
pushl %esi
pushl %ebx
PROFCODE
EMMS
movl %esp, %esi # save old stack
subl $128 + LOCAL_BUFFER_SIZE, %esp
movl OLD_M, %ebx
andl $-1024, %esp # align stack
STACK_TOUCHING
movl OLD_N, %eax
movl OLD_K, %ecx
movl OLD_A, %edx
movss OLD_ALPHA, %xmm3
#ifdef TRMMKERNEL
movss STACK_OFFT, %xmm4
#endif
movl %ebx, M
movl %eax, N
movl %ecx, K
movl %edx, A
movl %esi, OLD_STACK
shufps $0, %xmm3, %xmm3
movl OLD_B, %edi
movl OLD_C, %ebx
movaps %xmm3, ALPHA
movl %ebx, C
movl OLD_LDC, LDC
#ifdef TRMMKERNEL
movss %xmm4, OFFSET
movss %xmm4, KK
#ifndef LEFT
negl KK
#endif
#endif
leal (, LDC, SIZE), LDC
sarl $2, %eax
movl %eax, J
jle .L40
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif
/* Copying to Sub Buffer */
leal BUFFER, %ecx
movl K, %eax
sarl $1, %eax
jle .L05
ALIGN_4
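/* The loop below packs B into BUFFER, two rows of four values per */
/* pass: with SSE2 each scalar of B is splatted across a 4-float   */
/* vector (shufps $0) and stored with movaps; the MMX fallback     */
/* stores each dword four times. This lets the compute kernels     */
/* read B with aligned vector loads only.                          */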
.L02:
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
movss 2 * SIZE(%edi), %xmm2
movss 3 * SIZE(%edi), %xmm3
movss 4 * SIZE(%edi), %xmm4
movss 5 * SIZE(%edi), %xmm5
movss 6 * SIZE(%edi), %xmm6
movss 7 * SIZE(%edi), %xmm7
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
shufps $0, %xmm4, %xmm4
shufps $0, %xmm5, %xmm5
shufps $0, %xmm6, %xmm6
shufps $0, %xmm7, %xmm7
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
movaps %xmm2, 8 * SIZE(%ecx)
movaps %xmm3, 12 * SIZE(%ecx)
movaps %xmm4, 16 * SIZE(%ecx)
movaps %xmm5, 20 * SIZE(%ecx)
movaps %xmm6, 24 * SIZE(%ecx)
movaps %xmm7, 28 * SIZE(%ecx)
#else
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
movd 2 * SIZE(%edi), %mm2
movd 3 * SIZE(%edi), %mm3
movd 4 * SIZE(%edi), %mm4
movd 5 * SIZE(%edi), %mm5
movd 6 * SIZE(%edi), %mm6
movd 7 * SIZE(%edi), %mm7
movd %mm0, 0 * SIZE(%ecx)
movd %mm0, 1 * SIZE(%ecx)
movd %mm0, 2 * SIZE(%ecx)
movd %mm0, 3 * SIZE(%ecx)
movd %mm1, 4 * SIZE(%ecx)
movd %mm1, 5 * SIZE(%ecx)
movd %mm1, 6 * SIZE(%ecx)
movd %mm1, 7 * SIZE(%ecx)
movd %mm2, 8 * SIZE(%ecx)
movd %mm2, 9 * SIZE(%ecx)
movd %mm2, 10 * SIZE(%ecx)
movd %mm2, 11 * SIZE(%ecx)
movd %mm3, 12 * SIZE(%ecx)
movd %mm3, 13 * SIZE(%ecx)
movd %mm3, 14 * SIZE(%ecx)
movd %mm3, 15 * SIZE(%ecx)
movd %mm4, 16 * SIZE(%ecx)
movd %mm4, 17 * SIZE(%ecx)
movd %mm4, 18 * SIZE(%ecx)
movd %mm4, 19 * SIZE(%ecx)
movd %mm5, 20 * SIZE(%ecx)
movd %mm5, 21 * SIZE(%ecx)
movd %mm5, 22 * SIZE(%ecx)
movd %mm5, 23 * SIZE(%ecx)
movd %mm6, 24 * SIZE(%ecx)
movd %mm6, 25 * SIZE(%ecx)
movd %mm6, 26 * SIZE(%ecx)
movd %mm6, 27 * SIZE(%ecx)
movd %mm7, 28 * SIZE(%ecx)
movd %mm7, 29 * SIZE(%ecx)
movd %mm7, 30 * SIZE(%ecx)
movd %mm7, 31 * SIZE(%ecx)
#endif
#ifdef PENTIUM4
prefetcht2 112 * SIZE(%ecx)
#endif
#if defined(OPTERON) || defined(BARCELONA)
prefetchnta 80 * SIZE(%edi)
prefetchw 112 * SIZE(%ecx)
prefetchw 120 * SIZE(%ecx)
#endif
addl $ 8 * SIZE, %edi
addl $32 * SIZE, %ecx
decl %eax
jne .L02
ALIGN_2
.L05:
movl K, %eax
andl $1, %eax
BRANCH
jle .L10
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
movss 2 * SIZE(%edi), %xmm2
movss 3 * SIZE(%edi), %xmm3
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
movaps %xmm2, 8 * SIZE(%ecx)
movaps %xmm3, 12 * SIZE(%ecx)
#else
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
movd 2 * SIZE(%edi), %mm2
movd 3 * SIZE(%edi), %mm3
movd %mm0, 0 * SIZE(%ecx)
movd %mm0, 1 * SIZE(%ecx)
movd %mm0, 2 * SIZE(%ecx)
movd %mm0, 3 * SIZE(%ecx)
movd %mm1, 4 * SIZE(%ecx)
movd %mm1, 5 * SIZE(%ecx)
movd %mm1, 6 * SIZE(%ecx)
movd %mm1, 7 * SIZE(%ecx)
movd %mm2, 8 * SIZE(%ecx)
movd %mm2, 9 * SIZE(%ecx)
movd %mm2, 10 * SIZE(%ecx)
movd %mm2, 11 * SIZE(%ecx)
movd %mm3, 12 * SIZE(%ecx)
movd %mm3, 13 * SIZE(%ecx)
movd %mm3, 14 * SIZE(%ecx)
movd %mm3, 15 * SIZE(%ecx)
#endif
addl $4 * SIZE, %edi
ALIGN_4
.L10:
movl %edi, BX
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
movl M, %ebx
sarl $2, %ebx # i = (m >> 2)
jle .L20
ALIGN_4
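/* .L11: body of the loop over 4x4 tiles of C. For TRMM, AA and BB */
/* are first advanced past the KK offset to skip the triangular    */
/* part; BX walks the unpacked B panel and is prefetched ahead     */
/* with a stride chosen according to L2_SIZE.                      */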
.L11:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB # boffset1 = boffset
#else
leal BUFFER, BB # boffset1 = boffset
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
#endif
movl BX, %eax
#ifdef HAVE_SSE
prefetcht2 0 * SIZE(%eax)
prefetcht2 4 * SIZE(%eax)
#if L2_SIZE > 262144
subl $-8 * SIZE, BX
#elif L2_SIZE > 131072
prefetcht2 8 * SIZE(%eax)
prefetcht2 12 * SIZE(%eax)
subl $-16 * SIZE, BX
#else
prefetcht2 16 * SIZE(%eax)
prefetcht2 20 * SIZE(%eax)
prefetcht2 24 * SIZE(%eax)
prefetcht2 28 * SIZE(%eax)
subl $-32 * SIZE, BX
#endif
#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
movaps 16 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movaps 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movaps 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
leal (LDC, LDC, 2), %eax
#if defined(OPTERON) || defined(BARCELONA)
prefetchw 4 * SIZE(%esi)
prefetchw 4 * SIZE(%esi, LDC)
prefetchw 4 * SIZE(%esi, LDC, 2)
prefetchw 4 * SIZE(%esi, %eax)
#endif
#ifdef PENTIUM4
prefetchnta 4 * SIZE(%esi)
prefetchnta 4 * SIZE(%esi, LDC)
prefetchnta 4 * SIZE(%esi, LDC, 2)
prefetchnta 4 * SIZE(%esi, %eax)
#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
#if 1
andl $-8, %eax
sall $4, %eax
je .L15
.L1X:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
KERNEL4(32 * 0)
KERNEL5(32 * 0)
KERNEL6(32 * 0)
KERNEL7(32 * 0)
KERNEL8(32 * 0)
cmpl $128 * 1, %eax
jle .L12
KERNEL1(32 * 1)
KERNEL2(32 * 1)
KERNEL3(32 * 1)
KERNEL4(32 * 1)
KERNEL5(32 * 1)
KERNEL6(32 * 1)
KERNEL7(32 * 1)
KERNEL8(32 * 1)
cmpl $128 * 2, %eax
jle .L12
KERNEL1(32 * 2)
KERNEL2(32 * 2)
KERNEL3(32 * 2)
KERNEL4(32 * 2)
KERNEL5(32 * 2)
KERNEL6(32 * 2)
KERNEL7(32 * 2)
KERNEL8(32 * 2)
cmpl $128 * 3, %eax
jle .L12
KERNEL1(32 * 3)
KERNEL2(32 * 3)
KERNEL3(32 * 3)
KERNEL4(32 * 3)
KERNEL5(32 * 3)
KERNEL6(32 * 3)
KERNEL7(32 * 3)
KERNEL8(32 * 3)
cmpl $128 * 4, %eax
jle .L12
KERNEL1(32 * 4)
KERNEL2(32 * 4)
KERNEL3(32 * 4)
KERNEL4(32 * 4)
KERNEL5(32 * 4)
KERNEL6(32 * 4)
KERNEL7(32 * 4)
KERNEL8(32 * 4)
cmpl $128 * 5, %eax
jle .L12
KERNEL1(32 * 5)
KERNEL2(32 * 5)
KERNEL3(32 * 5)
KERNEL4(32 * 5)
KERNEL5(32 * 5)
KERNEL6(32 * 5)
KERNEL7(32 * 5)
KERNEL8(32 * 5)
cmpl $128 * 6, %eax
jle .L12
KERNEL1(32 * 6)
KERNEL2(32 * 6)
KERNEL3(32 * 6)
KERNEL4(32 * 6)
KERNEL5(32 * 6)
KERNEL6(32 * 6)
KERNEL7(32 * 6)
KERNEL8(32 * 6)
cmpl $128 * 7, %eax
jle .L12
KERNEL1(32 * 7)
KERNEL2(32 * 7)
KERNEL3(32 * 7)
KERNEL4(32 * 7)
KERNEL5(32 * 7)
KERNEL6(32 * 7)
KERNEL7(32 * 7)
KERNEL8(32 * 7)
addl $128 * 8 * SIZE, BB
addl $128 * 2 * SIZE, AA
subl $128 * 8, %eax
jg .L1X
jmp .L15
.L12:
leal (AA, %eax, 1), AA
leal (BB, %eax, 4), BB
ALIGN_4
#else
sarl $3, %eax
je .L15
ALIGN_4
.L12:
KERNEL1(32 * 0)
KERNEL2(32 * 0)
KERNEL3(32 * 0)
KERNEL4(32 * 0)
KERNEL5(32 * 0)
KERNEL6(32 * 0)
KERNEL7(32 * 0)
KERNEL8(32 * 0)
addl $128 * SIZE, BB
addl $32 * SIZE, AA
decl %eax
jne .L12
ALIGN_4
#endif
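/* .L15: remainder loop handling the k % 8 iterations left over    */
/* from the unrolled pass, one k-step (four mulps/addps pairs)     */
/* at a time.                                                      */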
.L15:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # (k & 7) remainder
BRANCH
je .L18
ALIGN_4
.L16:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
mulps 12 * SIZE(BB), %xmm0
addps %xmm2, %xmm6
movaps 16 * SIZE(BB), %xmm2
addps %xmm0, %xmm7
movaps 4 * SIZE(AA), %xmm0
addl $ 4 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L16
ALIGN_4
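/* .L18: write-back of the 4x4 tile. The accumulators are scaled   */
/* by alpha; in the GEMM case the existing C values are read with  */
/* movsd/movhps pairs (C is not guaranteed 16-byte aligned) and    */
/* added, while the TRMM case stores the scaled result directly.   */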
.L18:
leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
shufps $0xe4, %xmm0, %xmm0
shufps $0xe4, %xmm1, %xmm1
shufps $0xe4, %xmm2, %xmm2
shufps $0xe4, %xmm3, %xmm3
mulps %xmm3, %xmm4
movsd 0 * SIZE(%esi), %xmm0
movhps 2 * SIZE(%esi), %xmm0
mulps %xmm3, %xmm5
movsd 0 * SIZE(%esi, LDC, 1), %xmm1
movhps 2 * SIZE(%esi, LDC, 1), %xmm1
mulps %xmm3, %xmm6
movsd 0 * SIZE(%esi, LDC, 2), %xmm2
movhps 2 * SIZE(%esi, LDC, 2), %xmm2
mulps %xmm3, %xmm7
movsd 0 * SIZE(%esi, %eax, 1), %xmm3
movhps 2 * SIZE(%esi, %eax, 1), %xmm3
addps %xmm0, %xmm4
addps %xmm1, %xmm5
addps %xmm2, %xmm6
addps %xmm3, %xmm7
#else
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5
mulps %xmm3, %xmm6
mulps %xmm3, %xmm7
#endif
movlps %xmm4, 0 * SIZE(%esi)
movhps %xmm4, 2 * SIZE(%esi)
movlps %xmm5, 0 * SIZE(%esi, LDC, 1)
movhps %xmm5, 2 * SIZE(%esi, LDC, 1)
movlps %xmm6, 0 * SIZE(%esi, LDC, 2)
movhps %xmm6, 2 * SIZE(%esi, LDC, 2)
movlps %xmm7, 0 * SIZE(%esi, %eax, 1)
movhps %xmm7, 2 * SIZE(%esi, %eax, 1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 8), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addl $4, KK
#endif
addl $4 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L11
ALIGN_4
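/* .L20: m % 4 >= 2 tail, computing a 2x4 block of C with the same */
/* packed-B layout; A is now read two floats at a time via movsd.  */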
.L20:
testl $2, M
je .L30
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB # boffset1 = boffset
#else
leal BUFFER, BB # boffset1 = boffset
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
#endif
movaps 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
movaps 8 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movaps 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movaps 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $2, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L25
ALIGN_4
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 12 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 32 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 20 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm5
movaps 24 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 28 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 4 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 48 * SIZE(BB), %xmm3
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 36 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 40 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 44 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 6 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 64 * SIZE(BB), %xmm2
mulps %xmm0, %xmm3
addps %xmm3, %xmm4
movaps 52 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm5
movaps 56 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
addps %xmm3, %xmm6
movaps 60 * SIZE(BB), %xmm3
mulps %xmm0, %xmm3
movsd 16 * SIZE(AA), %xmm0
addps %xmm3, %xmm7
movaps 80 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 68 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm5
movaps 72 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm6
movaps 76 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 10 * SIZE(AA), %xmm1
addps %xmm2, %xmm7
movaps 96 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 84 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm5
movaps 88 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 92 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 12 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 112 * SIZE(BB), %xmm3
mulps %xmm1, %xmm2
addps %xmm2, %xmm4
movaps 100 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm5
movaps 104 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
addps %xmm2, %xmm6
movaps 108 * SIZE(BB), %xmm2
mulps %xmm1, %xmm2
movsd 14 * SIZE(AA), %xmm1
addps %xmm2, %xmm7
movaps 128 * SIZE(BB), %xmm2
mulps %xmm1, %xmm3
addps %xmm3, %xmm4
movaps 116 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm5
movaps 120 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
addps %xmm3, %xmm6
movaps 124 * SIZE(BB), %xmm3
mulps %xmm1, %xmm3
movsd 24 * SIZE(AA), %xmm1
addps %xmm3, %xmm7
movaps 144 * SIZE(BB), %xmm3
addl $ 16 * SIZE, AA
addl $128 * SIZE, BB
decl %eax
jne .L22
ALIGN_4
.L25:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # (k & 7) remainder
BRANCH
je .L28
ALIGN_4
.L26:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
movaps 4 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm5
movaps 8 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
addps %xmm2, %xmm6
movaps 12 * SIZE(BB), %xmm2
mulps %xmm0, %xmm2
movsd 2 * SIZE(AA), %xmm0
addps %xmm2, %xmm7
movaps 16 * SIZE(BB), %xmm2
addl $ 2 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L26
ALIGN_4
.L28:
leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
mulps %xmm3, %xmm4
#ifdef movsd
xorps %xmm0, %xmm0
#endif
movsd 0 * SIZE(%esi), %xmm0
mulps %xmm3, %xmm5
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd 0 * SIZE(%esi, LDC, 1), %xmm1
mulps %xmm3, %xmm6
#ifdef movsd
xorps %xmm2, %xmm2
#endif
movsd 0 * SIZE(%esi, LDC, 2), %xmm2
mulps %xmm3, %xmm7
#ifdef movsd
xorps %xmm3, %xmm3
#endif
movsd 0 * SIZE(%esi, %eax, 1), %xmm3
addps %xmm0, %xmm4
addps %xmm1, %xmm5
addps %xmm2, %xmm6
addps %xmm3, %xmm7
#else
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5
mulps %xmm3, %xmm6
mulps %xmm3, %xmm7
#endif
movlps %xmm4, 0 * SIZE(%esi)
movlps %xmm5, 0 * SIZE(%esi, LDC, 1)
movlps %xmm6, 0 * SIZE(%esi, LDC, 2)
movlps %xmm7, 0 * SIZE(%esi, %eax, 1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addl $2, KK
#endif
addl $2 * SIZE, %esi # coffset += 2
ALIGN_4
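/* .L30: m % 4 == 1 tail, computing a 1x4 row of C entirely with   */
/* scalar (ss) arithmetic.                                         */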
.L30:
testl $1, M
je .L39
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB # boffset1 = boffset
#else
leal BUFFER, BB # boffset1 = boffset
movl KK, %eax
leal (, %eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
#endif
movss 0 * SIZE(AA), %xmm0
xorps %xmm4, %xmm4
movss 4 * SIZE(AA), %xmm1
xorps %xmm5, %xmm5
movss 0 * SIZE(BB), %xmm2
xorps %xmm6, %xmm6
movss 16 * SIZE(BB), %xmm3
xorps %xmm7, %xmm7
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $1, %eax
#else
addl $4, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L35
ALIGN_4
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 8 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 12 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 32 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 1 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
addss %xmm3, %xmm4
movss 20 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
addss %xmm3, %xmm5
movss 24 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
mulss 28 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 48 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 2 * SIZE(AA), %xmm0
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
movss 36 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 40 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 44 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 64 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 3 * SIZE(AA), %xmm0
mulss %xmm0, %xmm3
addss %xmm3, %xmm4
movss 52 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
addss %xmm3, %xmm5
movss 56 * SIZE(BB), %xmm3
mulss %xmm0, %xmm3
mulss 60 * SIZE(BB), %xmm0
addss %xmm3, %xmm6
movss 80 * SIZE(BB), %xmm3
addss %xmm0, %xmm7
movss 8 * SIZE(AA), %xmm0
mulss %xmm1, %xmm2
addss %xmm2, %xmm4
movss 68 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
addss %xmm2, %xmm5
movss 72 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
mulss 76 * SIZE(BB), %xmm1
addss %xmm2, %xmm6
movss 96 * SIZE(BB), %xmm2
addss %xmm1, %xmm7
movss 5 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
addss %xmm3, %xmm4
movss 84 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
addss %xmm3, %xmm5
movss 88 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
mulss 92 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 112 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 6 * SIZE(AA), %xmm1
mulss %xmm1, %xmm2
addss %xmm2, %xmm4
movss 100 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
addss %xmm2, %xmm5
movss 104 * SIZE(BB), %xmm2
mulss %xmm1, %xmm2
mulss 108 * SIZE(BB), %xmm1
addss %xmm2, %xmm6
movss 128 * SIZE(BB), %xmm2
addss %xmm1, %xmm7
movss 7 * SIZE(AA), %xmm1
mulss %xmm1, %xmm3
addss %xmm3, %xmm4
movss 116 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
addss %xmm3, %xmm5
movss 120 * SIZE(BB), %xmm3
mulss %xmm1, %xmm3
mulss 124 * SIZE(BB), %xmm1
addss %xmm3, %xmm6
movss 144 * SIZE(BB), %xmm3
addss %xmm1, %xmm7
movss 12 * SIZE(AA), %xmm1
addl $ 8 * SIZE, AA
addl $128 * SIZE, BB
decl %eax
jne .L32
ALIGN_4
.L35:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # (k & 7) remainder
BRANCH
je .L38
ALIGN_4
.L36:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
movss 4 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
addss %xmm2, %xmm5
movss 8 * SIZE(BB), %xmm2
mulss %xmm0, %xmm2
mulss 12 * SIZE(BB), %xmm0
addss %xmm2, %xmm6
movss 16 * SIZE(BB), %xmm2
addss %xmm0, %xmm7
movss 1 * SIZE(AA), %xmm0
addl $ 1 * SIZE, AA
addl $16 * SIZE, BB
decl %eax
jg .L36
ALIGN_4
.L38:
leal (LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
mulss %xmm3, %xmm4
movss 0 * SIZE(%esi), %xmm0
mulss %xmm3, %xmm5
movss 0 * SIZE(%esi, LDC, 1), %xmm1
mulss %xmm3, %xmm6
movss 0 * SIZE(%esi, LDC, 2), %xmm2
mulss %xmm3, %xmm7
movss 0 * SIZE(%esi, %eax, 1), %xmm3
addss %xmm0, %xmm4
addss %xmm1, %xmm5
addss %xmm2, %xmm6
addss %xmm3, %xmm7
#else
mulss %xmm3, %xmm4
mulss %xmm3, %xmm5
mulss %xmm3, %xmm6
mulss %xmm3, %xmm7
#endif
movss %xmm4, 0 * SIZE(%esi)
movss %xmm5, 0 * SIZE(%esi, LDC, 1)
movss %xmm6, 0 * SIZE(%esi, LDC, 2)
movss %xmm7, 0 * SIZE(%esi, %eax, 1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 4), %eax
leal (AA, %eax, 1), AA
leal (BB, %eax, 8), BB
leal (BB, %eax, 8), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addl $1, KK
#endif
ALIGN_4
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
addl $4, KK
#endif
leal (, LDC, 4), %eax
addl %eax, C # c += 4 * ldc
decl J # j --
jg .L01
ALIGN_4
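/* .L40: n % 4 == 2 case. B is repacked two columns at a time and  */
/* the m-direction is tiled the same way (4, then 2, then 1 rows), */
/* with two accumulators per tile instead of four.                 */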
.L40:
testl $2, N
je .L80
#if defined(TRMMKERNEL) && defined(LEFT)
movl OFFSET, %eax
movl %eax, KK
#endif
movl K, %eax
leal BUFFER, %ecx
sarl $2, %eax
jle .L45
ALIGN_4
.L42:
prefetchnta 80 * SIZE(%edi)
#if defined(OPTERON) || defined(BARCELONA)
prefetchw 112 * SIZE(%ecx)
prefetchw 120 * SIZE(%ecx)
#endif
#ifdef PENTIUM4
prefetcht1 112 * SIZE(%ecx)
#endif
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
movss 2 * SIZE(%edi), %xmm2
movss 3 * SIZE(%edi), %xmm3
movss 4 * SIZE(%edi), %xmm4
movss 5 * SIZE(%edi), %xmm5
movss 6 * SIZE(%edi), %xmm6
movss 7 * SIZE(%edi), %xmm7
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
shufps $0, %xmm2, %xmm2
shufps $0, %xmm3, %xmm3
shufps $0, %xmm4, %xmm4
shufps $0, %xmm5, %xmm5
shufps $0, %xmm6, %xmm6
shufps $0, %xmm7, %xmm7
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
movaps %xmm2, 8 * SIZE(%ecx)
movaps %xmm3, 12 * SIZE(%ecx)
movaps %xmm4, 16 * SIZE(%ecx)
movaps %xmm5, 20 * SIZE(%ecx)
movaps %xmm6, 24 * SIZE(%ecx)
movaps %xmm7, 28 * SIZE(%ecx)
#else
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
movd 2 * SIZE(%edi), %mm2
movd 3 * SIZE(%edi), %mm3
movd 4 * SIZE(%edi), %mm4
movd 5 * SIZE(%edi), %mm5
movd 6 * SIZE(%edi), %mm6
movd 7 * SIZE(%edi), %mm7
movd %mm0, 0 * SIZE(%ecx)
movd %mm0, 1 * SIZE(%ecx)
movd %mm0, 2 * SIZE(%ecx)
movd %mm0, 3 * SIZE(%ecx)
movd %mm1, 4 * SIZE(%ecx)
movd %mm1, 5 * SIZE(%ecx)
movd %mm1, 6 * SIZE(%ecx)
movd %mm1, 7 * SIZE(%ecx)
movd %mm2, 8 * SIZE(%ecx)
movd %mm2, 9 * SIZE(%ecx)
movd %mm2, 10 * SIZE(%ecx)
movd %mm2, 11 * SIZE(%ecx)
movd %mm3, 12 * SIZE(%ecx)
movd %mm3, 13 * SIZE(%ecx)
movd %mm3, 14 * SIZE(%ecx)
movd %mm3, 15 * SIZE(%ecx)
movd %mm4, 16 * SIZE(%ecx)
movd %mm4, 17 * SIZE(%ecx)
movd %mm4, 18 * SIZE(%ecx)
movd %mm4, 19 * SIZE(%ecx)
movd %mm5, 20 * SIZE(%ecx)
movd %mm5, 21 * SIZE(%ecx)
movd %mm5, 22 * SIZE(%ecx)
movd %mm5, 23 * SIZE(%ecx)
movd %mm6, 24 * SIZE(%ecx)
movd %mm6, 25 * SIZE(%ecx)
movd %mm6, 26 * SIZE(%ecx)
movd %mm6, 27 * SIZE(%ecx)
movd %mm7, 28 * SIZE(%ecx)
movd %mm7, 29 * SIZE(%ecx)
movd %mm7, 30 * SIZE(%ecx)
movd %mm7, 31 * SIZE(%ecx)
#endif
addl $ 8 * SIZE, %edi
addl $32 * SIZE, %ecx
decl %eax
jne .L42
ALIGN_4
.L45:
movl K, %eax
andl $3, %eax
BRANCH
jle .L50
ALIGN_4
.L46:
#ifdef HAVE_SSE2
movss 0 * SIZE(%edi), %xmm0
movss 1 * SIZE(%edi), %xmm1
shufps $0, %xmm0, %xmm0
shufps $0, %xmm1, %xmm1
movaps %xmm0, 0 * SIZE(%ecx)
movaps %xmm1, 4 * SIZE(%ecx)
#else
movd 0 * SIZE(%edi), %mm0
movd 1 * SIZE(%edi), %mm1
movd %mm0, 0 * SIZE(%ecx)
movd %mm0, 1 * SIZE(%ecx)
movd %mm0, 2 * SIZE(%ecx)
movd %mm0, 3 * SIZE(%ecx)
movd %mm1, 4 * SIZE(%ecx)
movd %mm1, 5 * SIZE(%ecx)
movd %mm1, 6 * SIZE(%ecx)
movd %mm1, 7 * SIZE(%ecx)
#endif
addl $2 * SIZE, %edi
addl $8 * SIZE, %ecx
decl %eax
jne .L46
ALIGN_4
.L50:
movl C, %esi # coffset = c
movl A, %edx # aoffset = a
movl M, %ebx
sarl $2, %ebx # i = (m >> 2)
jle .L60
ALIGN_4
.L51:
#if !defined(TRMMKERNEL) || \
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
leal BUFFER, BB # boffset1 = boffset
#else
leal BUFFER, BB # boffset1 = boffset
movl KK, %eax
leal (, %eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
#endif
xorps %xmm4, %xmm4
xorps %xmm5, %xmm5
xorps %xmm6, %xmm6
xorps %xmm7, %xmm7
movaps 0 * SIZE(AA), %xmm0
movaps 16 * SIZE(AA), %xmm1
movaps 0 * SIZE(BB), %xmm2
movaps 16 * SIZE(BB), %xmm3
#ifdef HAVE_3DNOW
prefetchw 4 * SIZE(%esi)
prefetchw 4 * SIZE(%esi, LDC)
#elif defined(HAVE_SSE) || defined(HAVE_SSE2)
prefetcht2 4 * SIZE(%esi)
prefetcht2 4 * SIZE(%esi, LDC)
#endif
#ifndef TRMMKERNEL
movl K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
movl K, %eax
subl KK, %eax
movl %eax, KKK
#else
movl KK, %eax
#ifdef LEFT
addl $4, %eax
#else
addl $2, %eax
#endif
movl %eax, KKK
#endif
sarl $3, %eax
je .L55
ALIGN_4
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
mulps %xmm0, %xmm2
mulps 12 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 32 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 8 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 20 * SIZE(BB), %xmm0
addps %xmm3, %xmm4
movaps 24 * SIZE(BB), %xmm3
addps %xmm0, %xmm5
movaps 12 * SIZE(AA), %xmm0
mulps %xmm0, %xmm3
mulps 28 * SIZE(BB), %xmm0
addps %xmm3, %xmm4
movaps 48 * SIZE(BB), %xmm3
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
mulps 36 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 40 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 20 * SIZE(AA), %xmm1
mulps %xmm1, %xmm2
mulps 44 * SIZE(BB), %xmm1
addps %xmm2, %xmm4
movaps 64 * SIZE(BB), %xmm2
addps %xmm1, %xmm5
movaps 24 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 52 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 56 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 28 * SIZE(AA), %xmm1
mulps %xmm1, %xmm3
mulps 60 * SIZE(BB), %xmm1
addps %xmm3, %xmm4
movaps 80 * SIZE(BB), %xmm3
addps %xmm1, %xmm5
movaps 48 * SIZE(AA), %xmm1
addl $32 * SIZE, AA
addl $64 * SIZE, BB
decl %eax
jne .L52
ALIGN_4
.L55:
#ifndef TRMMKERNEL
movl K, %eax
#else
movl KKK, %eax
#endif
movaps ALPHA, %xmm3
andl $7, %eax # (k & 7) remainder
BRANCH
je .L58
ALIGN_4
.L56:
mulps %xmm0, %xmm2
mulps 4 * SIZE(BB), %xmm0
addps %xmm2, %xmm4
movaps 8 * SIZE(BB), %xmm2
addps %xmm0, %xmm5
movaps 4 * SIZE(AA), %xmm0
addl $4 * SIZE, AA
addl $8 * SIZE, BB
decl %eax
jg .L56
ALIGN_4
.L58:
mulps %xmm3, %xmm4
mulps %xmm3, %xmm5
#ifndef TRMMKERNEL
movsd 0 * SIZE(%esi), %xmm0
movhps 2 * SIZE(%esi), %xmm0
movsd 0 * SIZE(%esi, LDC, 1), %xmm1
movhps 2 * SIZE(%esi, LDC, 1), %xmm1
addps %xmm0, %xmm4
addps %xmm1, %xmm5
#endif
movlps %xmm4, 0 * SIZE(%esi)
movhps %xmm4, 2 * SIZE(%esi)
movlps %xmm5, 0 * SIZE(%esi, LDC, 1)
movhps %xmm5, 2 * SIZE(%esi, LDC, 1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
movl K, %eax
subl KKK, %eax
leal (,%eax, 8), %eax
leal (AA, %eax, 2), AA
leal (BB, %eax, 4), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addl $4, KK
#endif
addl $4 * SIZE, %esi # coffset += 4
decl %ebx # i --
jg .L51
ALIGN_4
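/* .L60: m % 4 >= 2 tail of the two-column case (2x2 block); the   */
/* duplicated %xmm4/%xmm6 and %xmm5/%xmm7 accumulators are merged  */
/* at .L68 before scaling.                                         */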
  1484. .L60:
  1485. testl $2, M
  1486. je .L70
  1487. #if !defined(TRMMKERNEL) || \
  1488. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1489. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1490. leal BUFFER, BB # boffset1 = boffset
  1491. #else
  1492. leal BUFFER, BB # boffset1 = boffset
  1493. movl KK, %eax
  1494. leal (, %eax, 8), %eax
  1495. leal (AA, %eax, 1), AA
  1496. leal (BB, %eax, 4), BB
  1497. #endif
  1498. xorps %xmm4, %xmm4
  1499. xorps %xmm5, %xmm5
  1500. xorps %xmm6, %xmm6
  1501. xorps %xmm7, %xmm7
  1502. movaps 0 * SIZE(AA), %xmm0
  1503. movaps 8 * SIZE(AA), %xmm1
  1504. movaps 0 * SIZE(BB), %xmm2
  1505. movaps 16 * SIZE(BB), %xmm3
  1506. leal (LDC, LDC, 2), %eax
  1507. #ifndef TRMMKERNEL
  1508. movl K, %eax
  1509. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1510. movl K, %eax
  1511. subl KK, %eax
  1512. movl %eax, KKK
  1513. #else
  1514. movl KK, %eax
  1515. #ifdef LEFT
  1516. addl $2, %eax
  1517. #else
  1518. addl $2, %eax
  1519. #endif
  1520. movl %eax, KKK
  1521. #endif
  1522. sarl $3, %eax
  1523. je .L65
  1524. ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3
	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L62
	ALIGN_4
.L65:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L68
	ALIGN_4
.L66:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	8 * SIZE(BB), %xmm2
	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4
.L68:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5
	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm5
#ifndef TRMMKERNEL
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(%esi), %xmm0
#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	0 * SIZE(%esi, LDC, 1), %xmm1
	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm5
#endif
	movlps	%xmm4, 0 * SIZE(%esi)
	movlps	%xmm5, 0 * SIZE(%esi, LDC, 1)
	addl	$2 * SIZE, %esi	# coffset += 2
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	ALIGN_4
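# .L70: handle an M % 2 remainder of a single row against the two
# columns of B (a 1x2 tile), using scalar movss/mulss arithmetic.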
.L70:
	testl	$1, M
	je	.L79
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 8), BB
#endif
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7
	movss	0 * SIZE(AA), %xmm0
	movss	4 * SIZE(AA), %xmm1
	movss	0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3
	leal	(LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4
.L72:
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	mulss	4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1
	addl	$ 8 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L72
	ALIGN_4
.L75:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L78
	ALIGN_4
.L76:
	mulss	%xmm0, %xmm2
	mulss	4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	1 * SIZE(AA), %xmm0
	addl	$ 1 * SIZE, AA
	addl	$ 8 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4
.L78:
	addss	%xmm6, %xmm4
	addss	%xmm7, %xmm5
	mulss	%xmm3, %xmm4
	mulss	%xmm3, %xmm5
#ifndef TRMMKERNEL
	movss	0 * SIZE(%esi), %xmm0
	movss	0 * SIZE(%esi, LDC, 1), %xmm1
	addss	%xmm0, %xmm4
	addss	%xmm1, %xmm5
#endif
	movss	%xmm4, 0 * SIZE(%esi)
	movss	%xmm5, 0 * SIZE(%esi, LDC, 1)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 8), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif
	leal	(, LDC, 2), %eax
	addl	%eax, C
	ALIGN_4
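# .L80: handle the final N % 2 column. B is first repacked into BUFFER
# with every element broadcast to four consecutive slots, so the compute
# loops below can use aligned movaps loads.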
.L80:
	testl	$1, N
	je	.L999
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif
	movl	K, %eax
	leal	BUFFER, %ecx
	sarl	$3, %eax
	jle	.L85
	ALIGN_4
.L82:
	prefetchnta	80 * SIZE(%edi)
#if defined(OPTERON) || defined(BARCELONA)
	prefetchw	112 * SIZE(%ecx)
	prefetchw	120 * SIZE(%ecx)
#endif
#ifdef PENTIUM4
	prefetcht1	112 * SIZE(%ecx)
#endif
#ifdef HAVE_SSE2
	movss	0 * SIZE(%edi), %xmm0
	movss	1 * SIZE(%edi), %xmm1
	movss	2 * SIZE(%edi), %xmm2
	movss	3 * SIZE(%edi), %xmm3
	movss	4 * SIZE(%edi), %xmm4
	movss	5 * SIZE(%edi), %xmm5
	movss	6 * SIZE(%edi), %xmm6
	movss	7 * SIZE(%edi), %xmm7
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
	shufps	$0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7
	movaps	%xmm0,  0 * SIZE(%ecx)
	movaps	%xmm1,  4 * SIZE(%ecx)
	movaps	%xmm2,  8 * SIZE(%ecx)
	movaps	%xmm3, 12 * SIZE(%ecx)
	movaps	%xmm4, 16 * SIZE(%ecx)
	movaps	%xmm5, 20 * SIZE(%ecx)
	movaps	%xmm6, 24 * SIZE(%ecx)
	movaps	%xmm7, 28 * SIZE(%ecx)
#else
	movd	0 * SIZE(%edi), %mm0
	movd	1 * SIZE(%edi), %mm1
	movd	2 * SIZE(%edi), %mm2
	movd	3 * SIZE(%edi), %mm3
	movd	4 * SIZE(%edi), %mm4
	movd	5 * SIZE(%edi), %mm5
	movd	6 * SIZE(%edi), %mm6
	movd	7 * SIZE(%edi), %mm7
	movd	%mm0,  0 * SIZE(%ecx)
	movd	%mm0,  1 * SIZE(%ecx)
	movd	%mm0,  2 * SIZE(%ecx)
	movd	%mm0,  3 * SIZE(%ecx)
	movd	%mm1,  4 * SIZE(%ecx)
	movd	%mm1,  5 * SIZE(%ecx)
	movd	%mm1,  6 * SIZE(%ecx)
	movd	%mm1,  7 * SIZE(%ecx)
	movd	%mm2,  8 * SIZE(%ecx)
	movd	%mm2,  9 * SIZE(%ecx)
	movd	%mm2, 10 * SIZE(%ecx)
	movd	%mm2, 11 * SIZE(%ecx)
	movd	%mm3, 12 * SIZE(%ecx)
	movd	%mm3, 13 * SIZE(%ecx)
	movd	%mm3, 14 * SIZE(%ecx)
	movd	%mm3, 15 * SIZE(%ecx)
	movd	%mm4, 16 * SIZE(%ecx)
	movd	%mm4, 17 * SIZE(%ecx)
	movd	%mm4, 18 * SIZE(%ecx)
	movd	%mm4, 19 * SIZE(%ecx)
	movd	%mm5, 20 * SIZE(%ecx)
	movd	%mm5, 21 * SIZE(%ecx)
	movd	%mm5, 22 * SIZE(%ecx)
	movd	%mm5, 23 * SIZE(%ecx)
	movd	%mm6, 24 * SIZE(%ecx)
	movd	%mm6, 25 * SIZE(%ecx)
	movd	%mm6, 26 * SIZE(%ecx)
	movd	%mm6, 27 * SIZE(%ecx)
	movd	%mm7, 28 * SIZE(%ecx)
	movd	%mm7, 29 * SIZE(%ecx)
	movd	%mm7, 30 * SIZE(%ecx)
	movd	%mm7, 31 * SIZE(%ecx)
#endif
	addl	$ 8 * SIZE, %edi
	addl	$32 * SIZE, %ecx
	decl	%eax
	jne	.L82
	ALIGN_4
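# .L85: broadcast the remaining k % 8 elements of B one at a time.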
.L85:
	movl	K, %eax
	andl	$7, %eax
	BRANCH
	jle	.L90
	ALIGN_4
.L86:
#ifdef HAVE_SSE2
	movss	0 * SIZE(%edi), %xmm0
	shufps	$0, %xmm0, %xmm0
	movaps	%xmm0, 0 * SIZE(%ecx)
#else
	movd	0 * SIZE(%edi), %mm0
	movd	%mm0, 0 * SIZE(%ecx)
	movd	%mm0, 1 * SIZE(%ecx)
	movd	%mm0, 2 * SIZE(%ecx)
	movd	%mm0, 3 * SIZE(%ecx)
#endif
	addl	$1 * SIZE, %edi
	addl	$4 * SIZE, %ecx
	decl	%eax
	jne	.L86
	ALIGN_4
.L90:
	movl	C, %esi		# coffset = c
	movl	A, %edx		# aoffset = a
	movl	M, %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L100
	ALIGN_4
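# .L91: 4x1 tile loop; four rows of A against the single remaining
# column of B.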
.L91:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7
	movaps	0 * SIZE(AA), %xmm0
	movaps	16 * SIZE(AA), %xmm1
	movaps	0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3
#ifdef HAVE_3DNOW
	prefetchw	4 * SIZE(%esi)
#elif defined(HAVE_SSE) || defined(HAVE_SSE2)
	prefetcht2	4 * SIZE(%esi)
#endif
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L95
	ALIGN_4
.L92:
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movaps	4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2
	mulps	4 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm5
	movaps	8 * SIZE(AA), %xmm0
	mulps	8 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm6
	movaps	12 * SIZE(AA), %xmm0
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
	mulps	%xmm1, %xmm3
	movaps	20 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	48 * SIZE(BB), %xmm3
	mulps	20 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1
	mulps	24 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm6
	movaps	28 * SIZE(AA), %xmm1
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1
	addl	$32 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L92
	ALIGN_4
.L95:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L98
	ALIGN_4
.L96:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(AA), %xmm0
	movaps	4 * SIZE(BB), %xmm2
	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4
.L98:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm4
#endif
	movlps	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif
	addl	$4 * SIZE, %esi	# coffset += 4
	decl	%ebx	# i --
	jg	.L91
	ALIGN_4
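# .L100: handle an M % 4 remainder of two rows for the last column
# (a 2x1 tile).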
.L100:
	testl	$2, M
	je	.L110
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7
	movaps	0 * SIZE(AA), %xmm0
	movaps	8 * SIZE(AA), %xmm1
	movaps	0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3
	leal	(LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L105
	ALIGN_4
.L102:
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movsd	2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L102
	ALIGN_4
.L105:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L108
	ALIGN_4
.L106:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movsd	2 * SIZE(AA), %xmm0
	movaps	4 * SIZE(BB), %xmm2
	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L106
	ALIGN_4
.L108:
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4
	mulps	%xmm3, %xmm4
#ifndef TRMMKERNEL
#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(%esi), %xmm0
	addps	%xmm0, %xmm4
#endif
	movlps	%xmm4, 0 * SIZE(%esi)
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, 8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	addl	$2 * SIZE, %esi	# coffset += 2
	ALIGN_4
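# .L110: final 1x1 tile; a single row of A against the last column.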
.L110:
	testl	$1, M
	je	.L999
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax, 4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif
	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7
	movss	0 * SIZE(AA), %xmm0
	movss	4 * SIZE(AA), %xmm1
	movss	0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3
	leal	(LDC, LDC, 2), %eax
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L115
	ALIGN_4
.L112:
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
	movss	1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	32 * SIZE(BB), %xmm2
	mulss	4 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm5
	movss	2 * SIZE(AA), %xmm0
	mulss	8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm3
	movss	5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss	48 * SIZE(BB), %xmm3
	mulss	20 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm5
	movss	6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1
	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L112
	ALIGN_4
.L115:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA, %xmm3
	andl	$7, %eax	# if (k & 7)
	BRANCH
	je	.L118
	ALIGN_4
.L116:
	mulss	%xmm0, %xmm2
	movss	1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	4 * SIZE(BB), %xmm2
	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4
.L118:
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4
	mulss	%xmm3, %xmm4
#ifndef TRMMKERNEL
	movss	0 * SIZE(%esi), %xmm0
	addss	%xmm0, %xmm4
#endif
	movss	%xmm4, 0 * SIZE(%esi)
	ALIGN_4
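# .L999: epilogue; clear the MMX state (EMMS), restore the caller's
# stack pointer and callee-saved registers, and return.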
.L999:
	EMMS
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	EPILOGUE