trsm_kernel_LT_2x4_sse2.S (48 kB)

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
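/* This file appears to be a TRSM kernel in the GotoBLAS2 / OpenBLAS 32-bit   */
/* x86 style: double precision with SSE2, a 2 (rows of M) by 4 (columns of N) */
/* register tile, and LN/LT/RN/RT preprocessor blocks selecting the           */
/* side/transpose variant at build time. common.h is assumed to provide       */
/* SIZE, BASE_SHIFT, PROLOGUE, ALIGN_* and the other helper macros used here. */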
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define OLD_M 4 + STACK + ARGS(%esi)
  43. #define OLD_N 8 + STACK + ARGS(%esi)
  44. #define OLD_K 12 + STACK + ARGS(%esi)
  45. #define OLD_ALPHA 16 + STACK + ARGS(%esi)
  46. #define OLD_A 24 + STACK + ARGS(%esi)
  47. #define OLD_B 28 + STACK + ARGS(%esi)
  48. #define OLD_C 32 + STACK + ARGS(%esi)
  49. #define OLD_LDC 36 + STACK + ARGS(%esi)
  50. #define OLD_OFFT 40 + STACK + ARGS(%esi)
  51. #define K 16(%esp)
  52. #define N 20(%esp)
  53. #define M 24(%esp)
  54. #define A 28(%esp)
  55. #define C 32(%esp)
  56. #define J 36(%esp)
  57. #define OLD_STACK 40(%esp)
  58. #define OFFSET 44(%esp)
  59. #define KK 48(%esp)
  60. #define KKK 52(%esp)
  61. #define AORIG 56(%esp)
  62. #define BORIG 60(%esp)
  63. #define BUFFER 128(%esp)
  64. #define STACK_ALIGN 4096
  65. #define STACK_OFFSET 1024
  66. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  67. #define PREFETCH prefetch
  68. #define PREFETCHSIZE (8 * 10 + 4)
  69. #endif
  70. #define B %edi
  71. #define AA %edx
  72. #define BB %ecx
  73. #define LDC %ebp
  74. #define CO1 %esi
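/* Register roles, per the defines above: B (%edi) walks the packed B panel, */
/* BB (%ecx) walks its expanded copy in BUFFER, AA (%edx) walks the current  */
/* A panel, LDC (%ebp) holds the leading dimension of C scaled to bytes, and */
/* CO1 (%esi) points at the current output block of C. %esi first holds the  */
/* caller's stack pointer and is reused as CO1 only after the arguments have */
/* been copied into local storage.                                           */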
  75. #define KERNEL1(address) \
  76. mulpd %xmm0, %xmm2; \
  77. addpd %xmm2, %xmm4; \
  78. PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
  79. movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  80. mulpd %xmm0, %xmm2; \
  81. addpd %xmm2, %xmm5; \
  82. movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  83. mulpd %xmm0, %xmm2; \
  84. mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  85. addpd %xmm2, %xmm6; \
  86. movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  87. addpd %xmm0, %xmm7; \
  88. movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  89. #define KERNEL2(address) \
  90. mulpd %xmm0, %xmm3; \
  91. addpd %xmm3, %xmm4; \
  92. movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  93. mulpd %xmm0, %xmm3; \
  94. addpd %xmm3, %xmm5; \
  95. movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  96. mulpd %xmm0, %xmm3; \
  97. mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  98. addpd %xmm3, %xmm6; \
  99. movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  100. addpd %xmm0, %xmm7; \
  101. movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  102. #define KERNEL3(address) \
  103. mulpd %xmm0, %xmm2; \
  104. addpd %xmm2, %xmm4; \
  105. movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  106. mulpd %xmm0, %xmm2; \
  107. addpd %xmm2, %xmm5; \
  108. movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  109. mulpd %xmm0, %xmm2; \
  110. mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  111. addpd %xmm2, %xmm6; \
  112. movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  113. addpd %xmm0, %xmm7; \
  114. movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  115. #define KERNEL4(address) \
  116. mulpd %xmm0, %xmm3; \
  117. addpd %xmm3, %xmm4; \
  118. movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  119. mulpd %xmm0, %xmm3; \
  120. addpd %xmm3, %xmm5; \
  121. movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  122. mulpd %xmm0, %xmm3; \
  123. mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
  124. addpd %xmm3, %xmm6; \
  125. movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  126. addpd %xmm0, %xmm7; \
  127. movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
  128. #define KERNEL5(address) \
  129. PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
  130. mulpd %xmm1, %xmm2; \
  131. addpd %xmm2, %xmm4; \
  132. movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  133. mulpd %xmm1, %xmm2; \
  134. addpd %xmm2, %xmm5; \
  135. movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  136. mulpd %xmm1, %xmm2; \
  137. mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  138. addpd %xmm2, %xmm6; \
  139. movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  140. addpd %xmm1, %xmm7; \
  141. movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  142. #define KERNEL6(address) \
  143. mulpd %xmm1, %xmm3; \
  144. addpd %xmm3, %xmm4; \
  145. movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  146. mulpd %xmm1, %xmm3; \
  147. addpd %xmm3, %xmm5; \
  148. movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  149. mulpd %xmm1, %xmm3; \
  150. mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  151. addpd %xmm3, %xmm6; \
  152. movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  153. addpd %xmm1, %xmm7; \
  154. movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  155. #define KERNEL7(address) \
  156. mulpd %xmm1, %xmm2; \
  157. addpd %xmm2, %xmm4; \
  158. movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  159. mulpd %xmm1, %xmm2; \
  160. addpd %xmm2, %xmm5; \
  161. movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  162. mulpd %xmm1, %xmm2; \
  163. mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  164. addpd %xmm2, %xmm6; \
  165. movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
  166. addpd %xmm1, %xmm7; \
  167. movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
  168. #define KERNEL8(address) \
  169. mulpd %xmm1, %xmm3; \
  170. addpd %xmm3, %xmm4; \
  171. movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  172. mulpd %xmm1, %xmm3; \
  173. addpd %xmm3, %xmm5; \
  174. movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  175. mulpd %xmm1, %xmm3; \
  176. mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
  177. addpd %xmm3, %xmm6; \
  178. movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
  179. addpd %xmm1, %xmm7; \
  180. movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
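/* KERNEL1..KERNEL8 each perform one k-step of the 2x4 update: a pair of A   */
/* elements in %xmm0 or %xmm1 is multiplied by four duplicated B values      */
/* streamed from BUFFER, accumulating into %xmm4..%xmm7 (one accumulator per */
/* column of the tile). The (address) argument offsets into the software-    */
/* unrolled A/B streams, and the PREFETCH in KERNEL1/KERNEL5 stays            */
/* PREFETCHSIZE elements ahead of the A stream.                              */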
  181. PROLOGUE
  182. pushl %ebp
  183. pushl %edi
  184. pushl %esi
  185. pushl %ebx
  186. PROFCODE
  187. EMMS
  188. movl %esp, %esi # save old stack
  189. subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
  190. andl $-STACK_ALIGN, %esp
  191. addl $STACK_OFFSET, %esp
  192. STACK_TOUCHING
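/* The caller's stack pointer is kept in %esi, so the OLD_* macros read the  */
/* incoming arguments through %esi, while the aligned region just carved out */
/* below %esp holds the locals (K, N, M, ..., BUFFER) addressed through %esp. */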
  193. movl OLD_M, %ebx
  194. movl OLD_N, %eax
  195. movl OLD_K, %ecx
  196. movl OLD_A, %edx
  197. movl %ebx, M
  198. movl %eax, N
  199. movl %ecx, K
  200. movl %edx, A
  201. movl %esi, OLD_STACK
  202. movd OLD_OFFT, %mm4
  203. movl OLD_B, B
  204. movl OLD_C, %ebx
  205. movl %ebx, C
  206. movl OLD_LDC, LDC
  207. movd %mm4, OFFSET
  208. movd %mm4, KK
  209. leal (, LDC, SIZE), LDC
  210. #ifdef LN
  211. movl M, %eax
  212. leal (, %eax, SIZE), %eax
  213. addl %eax, C
  214. imull K, %eax
  215. addl %eax, A
  216. #endif
  217. #ifdef RT
  218. movl N, %eax
  219. leal (, %eax, SIZE), %eax
  220. imull K, %eax
  221. addl %eax, B
  222. movl N, %eax
  223. imull LDC, %eax
  224. addl %eax, C
  225. #endif
  226. #ifdef RN
  227. negl KK
  228. #endif
  229. #ifdef RT
  230. movl N, %eax
  231. subl OFFSET, %eax
  232. movl %eax, KK
  233. #endif
  234. movl N, %eax
  235. sarl $2, %eax
  236. movl %eax, J
  237. jle .L30
  238. ALIGN_2
  239. .L01:
  240. #ifdef LN
  241. movl OFFSET, %eax
  242. addl M, %eax
  243. movl %eax, KK
  244. #endif
  245. leal BUFFER, BB
  246. #ifdef RT
  247. movl K, %eax
  248. sall $2 + BASE_SHIFT, %eax
  249. subl %eax, B
  250. #endif
  251. #if defined(LN) || defined(RT)
  252. movl KK, %eax
  253. movl B, BORIG
  254. leal (, %eax, SIZE), %eax
  255. leal (B, %eax, 4), B
  256. leal (BB, %eax, 8), BB
  257. #endif
  258. #ifdef LT
  259. movl OFFSET, %eax
  260. movl %eax, KK
  261. #endif
  262. #if defined(LT) || defined(RN)
  263. movl KK, %eax
  264. #else
  265. movl K, %eax
  266. subl KK, %eax
  267. #endif
  268. sarl $1, %eax
  269. jle .L05
  270. ALIGN_4
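/* Copy the packed B panel into BUFFER, storing every value twice so the     */
/* compute loop can load it with movapd as an already-duplicated pair. The   */
/* .L02 loop moves eight values (two k-iterations of the four columns) per   */
/* pass; the tail after .L05 copies the remaining four when the count is odd. */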
  271. .L02:
  272. #define COPYPREFETCH 40
  273. prefetchnta (COPYPREFETCH) * SIZE(B)
  274. movq 0 * SIZE(B), %mm0
  275. movq 1 * SIZE(B), %mm1
  276. movq 2 * SIZE(B), %mm2
  277. movq 3 * SIZE(B), %mm3
  278. movq 4 * SIZE(B), %mm4
  279. movq 5 * SIZE(B), %mm5
  280. movq 6 * SIZE(B), %mm6
  281. movq 7 * SIZE(B), %mm7
  282. movq %mm0, 0 * SIZE(BB)
  283. movq %mm0, 1 * SIZE(BB)
  284. movq %mm1, 2 * SIZE(BB)
  285. movq %mm1, 3 * SIZE(BB)
  286. movq %mm2, 4 * SIZE(BB)
  287. movq %mm2, 5 * SIZE(BB)
  288. movq %mm3, 6 * SIZE(BB)
  289. movq %mm3, 7 * SIZE(BB)
  290. movq %mm4, 8 * SIZE(BB)
  291. movq %mm4, 9 * SIZE(BB)
  292. movq %mm5, 10 * SIZE(BB)
  293. movq %mm5, 11 * SIZE(BB)
  294. movq %mm6, 12 * SIZE(BB)
  295. movq %mm6, 13 * SIZE(BB)
  296. movq %mm7, 14 * SIZE(BB)
  297. movq %mm7, 15 * SIZE(BB)
  298. addl $ 8 * SIZE, B
  299. addl $16 * SIZE, BB
  300. decl %eax
  301. jne .L02
  302. ALIGN_2
  303. .L05:
  304. #if defined(LT) || defined(RN)
  305. movl KK, %eax
  306. #else
  307. movl K, %eax
  308. subl KK, %eax
  309. #endif
  310. andl $1, %eax
  311. BRANCH
  312. jle .L10
  313. movq 0 * SIZE(B), %mm0
  314. movq 1 * SIZE(B), %mm1
  315. movq 2 * SIZE(B), %mm2
  316. movq 3 * SIZE(B), %mm3
  317. movq %mm0, 0 * SIZE(BB)
  318. movq %mm0, 1 * SIZE(BB)
  319. movq %mm1, 2 * SIZE(BB)
  320. movq %mm1, 3 * SIZE(BB)
  321. movq %mm2, 4 * SIZE(BB)
  322. movq %mm2, 5 * SIZE(BB)
  323. movq %mm3, 6 * SIZE(BB)
  324. movq %mm3, 7 * SIZE(BB)
  325. addl $4 * SIZE, B
  326. ALIGN_4
  327. .L10:
  328. #if defined(LT) || defined(RN)
  329. movl A, AA
  330. #else
  331. movl A, %eax
  332. movl %eax, AORIG
  333. #endif
  334. leal (, LDC, 4), %eax
  335. #ifdef RT
  336. subl %eax, C
  337. #endif
  338. movl C, CO1
  339. #ifndef RT
  340. addl %eax, C
  341. #endif
  342. movl M, %ebx
  343. sarl $1, %ebx # i = (m >> 1)
  344. jle .L20
  345. ALIGN_4
  346. .L11:
  347. #ifdef LN
  348. movl K, %eax
  349. sall $1 + BASE_SHIFT, %eax
  350. subl %eax, AORIG
  351. #endif
  352. #if defined(LN) || defined(RT)
  353. movl KK, %eax
  354. movl AORIG, AA
  355. leal (, %eax, SIZE), %eax
  356. leal (AA, %eax, 2), AA
  357. #endif
  358. leal BUFFER, BB
  359. #if defined(LN) || defined(RT)
  360. movl KK, %eax
  361. sall $3 + BASE_SHIFT, %eax
  362. addl %eax, BB
  363. #endif
  364. pxor %xmm4, %xmm4
  365. pxor %xmm5, %xmm5
  366. pxor %xmm6, %xmm6
  367. pxor %xmm7, %xmm7
  368. movapd 0 * SIZE(AA), %xmm0
  369. movapd 8 * SIZE(AA), %xmm1
  370. movapd 0 * SIZE(BB), %xmm2
  371. movapd 8 * SIZE(BB), %xmm3
  372. leal (LDC, LDC, 2), %eax
  373. #ifdef LN
  374. prefetchw -2 * SIZE(CO1)
  375. prefetchw -2 * SIZE(CO1, LDC)
  376. prefetchw -2 * SIZE(CO1, LDC, 2)
  377. prefetchw -2 * SIZE(CO1, %eax)
  378. #else
  379. prefetchw 1 * SIZE(CO1)
  380. prefetchw 1 * SIZE(CO1, LDC)
  381. prefetchw 1 * SIZE(CO1, LDC, 2)
  382. prefetchw 1 * SIZE(CO1, %eax)
  383. #endif
  384. #if defined(LT) || defined(RN)
  385. movl KK, %eax
  386. #else
  387. movl K, %eax
  388. subl KK, %eax
  389. #endif
  390. #if 1
  391. andl $-8, %eax
  392. sall $4, %eax
  393. je .L15
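/* Software-pipelined main loop for the 2x4 tile. %eax holds the k count     */
/* rounded down to a multiple of 8 and pre-scaled by 16; the KERNEL macros   */
/* address A and B through immediate offsets, the cmpl/jle chain lets        */
/* shorter counts fall out early, and .L12 then advances AA/BB past          */
/* everything the offsets covered. The #else branch below is an equivalent,  */
/* simpler eight-steps-per-iteration version of the same loop.               */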
  394. .L1X:
  395. KERNEL1(16 * 0)
  396. KERNEL2(16 * 0)
  397. KERNEL3(16 * 0)
  398. KERNEL4(16 * 0)
  399. KERNEL5(16 * 0)
  400. KERNEL6(16 * 0)
  401. KERNEL7(16 * 0)
  402. KERNEL8(16 * 0)
  403. cmpl $128 * 1, %eax
  404. jle .L12
  405. KERNEL1(16 * 1)
  406. KERNEL2(16 * 1)
  407. KERNEL3(16 * 1)
  408. KERNEL4(16 * 1)
  409. KERNEL5(16 * 1)
  410. KERNEL6(16 * 1)
  411. KERNEL7(16 * 1)
  412. KERNEL8(16 * 1)
  413. cmpl $128 * 2, %eax
  414. jle .L12
  415. KERNEL1(16 * 2)
  416. KERNEL2(16 * 2)
  417. KERNEL3(16 * 2)
  418. KERNEL4(16 * 2)
  419. KERNEL5(16 * 2)
  420. KERNEL6(16 * 2)
  421. KERNEL7(16 * 2)
  422. KERNEL8(16 * 2)
  423. cmpl $128 * 3, %eax
  424. jle .L12
  425. KERNEL1(16 * 3)
  426. KERNEL2(16 * 3)
  427. KERNEL3(16 * 3)
  428. KERNEL4(16 * 3)
  429. KERNEL5(16 * 3)
  430. KERNEL6(16 * 3)
  431. KERNEL7(16 * 3)
  432. KERNEL8(16 * 3)
  433. cmpl $128 * 4, %eax
  434. jle .L12
  435. KERNEL1(16 * 4)
  436. KERNEL2(16 * 4)
  437. KERNEL3(16 * 4)
  438. KERNEL4(16 * 4)
  439. KERNEL5(16 * 4)
  440. KERNEL6(16 * 4)
  441. KERNEL7(16 * 4)
  442. KERNEL8(16 * 4)
  443. cmpl $128 * 5, %eax
  444. jle .L12
  445. KERNEL1(16 * 5)
  446. KERNEL2(16 * 5)
  447. KERNEL3(16 * 5)
  448. KERNEL4(16 * 5)
  449. KERNEL5(16 * 5)
  450. KERNEL6(16 * 5)
  451. KERNEL7(16 * 5)
  452. KERNEL8(16 * 5)
  453. cmpl $128 * 6, %eax
  454. jle .L12
  455. KERNEL1(16 * 6)
  456. KERNEL2(16 * 6)
  457. KERNEL3(16 * 6)
  458. KERNEL4(16 * 6)
  459. KERNEL5(16 * 6)
  460. KERNEL6(16 * 6)
  461. KERNEL7(16 * 6)
  462. KERNEL8(16 * 6)
  463. cmpl $128 * 7, %eax
  464. jle .L12
  465. KERNEL1(16 * 7)
  466. KERNEL2(16 * 7)
  467. KERNEL3(16 * 7)
  468. KERNEL4(16 * 7)
  469. KERNEL5(16 * 7)
  470. KERNEL6(16 * 7)
  471. KERNEL7(16 * 7)
  472. KERNEL8(16 * 7)
  473. addl $128 * 4 * SIZE, BB
  474. addl $128 * 1 * SIZE, AA
  475. subl $128 * 8, %eax
  476. jg .L1X
  477. jmp .L15
  478. .L12:
  479. leal (AA, %eax, 1), AA
  480. leal (BB, %eax, 4), BB
  481. ALIGN_4
  482. #else
  483. sarl $3, %eax
  484. je .L15
  485. ALIGN_4
  486. .L12:
  487. KERNEL1(16 * 0)
  488. KERNEL2(16 * 0)
  489. KERNEL3(16 * 0)
  490. KERNEL4(16 * 0)
  491. KERNEL5(16 * 0)
  492. KERNEL6(16 * 0)
  493. KERNEL7(16 * 0)
  494. KERNEL8(16 * 0)
  495. addl $64 * SIZE, BB
  496. addl $16 * SIZE, AA
  497. decl %eax
  498. jne .L12
  499. ALIGN_4
  500. #endif
  501. .L15:
  502. #if defined(LT) || defined(RN)
  503. movl KK, %eax
  504. #else
  505. movl K, %eax
  506. subl KK, %eax
  507. #endif
  508. andl $7, %eax # if (k & 7)
  509. BRANCH
  510. je .L18
  511. ALIGN_3
  512. .L16:
  513. mulpd %xmm0, %xmm2
  514. addpd %xmm2, %xmm4
  515. movapd 2 * SIZE(BB), %xmm2
  516. mulpd %xmm0, %xmm2
  517. addpd %xmm2, %xmm5
  518. movapd 4 * SIZE(BB), %xmm2
  519. mulpd %xmm0, %xmm2
  520. mulpd 6 * SIZE(BB), %xmm0
  521. addpd %xmm2, %xmm6
  522. movapd 8 * SIZE(BB), %xmm2
  523. addpd %xmm0, %xmm7
  524. movapd 2 * SIZE(AA), %xmm0
  525. addl $2 * SIZE, AA
  526. addl $8 * SIZE, BB
  527. decl %eax
  528. jg .L16
  529. ALIGN_4
  530. .L18:
  531. #if defined(LN) || defined(RT)
  532. movl KK, %eax
  533. #ifdef LN
  534. subl $2, %eax
  535. #else
  536. subl $4, %eax
  537. #endif
  538. movl AORIG, AA
  539. movl BORIG, B
  540. leal BUFFER, BB
  541. leal (, %eax, SIZE), %eax
  542. leal (AA, %eax, 2), AA
  543. leal (B, %eax, 4), B
  544. leal (BB, %eax, 8), BB
  545. #endif
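/* Triangular solve for this tile: the products accumulated in %xmm4..%xmm7  */
/* are subtracted from the values previously stored in the packed panel      */
/* (loaded from B for LN/LT, from AA for RN/RT), then back-substitution runs */
/* against the diagonal block. The diagonal entries are multiplied rather    */
/* than divided, which presumably means the packed panels carry pre-inverted */
/* diagonal elements.                                                        */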
  546. #if defined(LN) || defined(LT)
  547. movapd %xmm4, %xmm0
  548. unpcklpd %xmm5, %xmm4
  549. unpckhpd %xmm5, %xmm0
  550. movapd %xmm6, %xmm1
  551. unpcklpd %xmm7, %xmm6
  552. unpckhpd %xmm7, %xmm1
  553. movapd 0 * SIZE(B), %xmm2
  554. movapd 2 * SIZE(B), %xmm5
  555. movapd 4 * SIZE(B), %xmm3
  556. movapd 6 * SIZE(B), %xmm7
  557. subpd %xmm4, %xmm2
  558. subpd %xmm6, %xmm5
  559. subpd %xmm0, %xmm3
  560. subpd %xmm1, %xmm7
  561. #else
  562. movapd 0 * SIZE(AA), %xmm0
  563. movapd 2 * SIZE(AA), %xmm1
  564. movapd 4 * SIZE(AA), %xmm2
  565. movapd 6 * SIZE(AA), %xmm3
  566. subpd %xmm4, %xmm0
  567. subpd %xmm5, %xmm1
  568. subpd %xmm6, %xmm2
  569. subpd %xmm7, %xmm3
  570. #endif
  571. #ifdef LN
  572. movlpd 3 * SIZE(AA), %xmm4
  573. movhpd 3 * SIZE(AA), %xmm4
  574. mulpd %xmm4, %xmm3
  575. mulpd %xmm4, %xmm7
  576. movlpd 2 * SIZE(AA), %xmm4
  577. movhpd 2 * SIZE(AA), %xmm4
  578. movapd %xmm4, %xmm6
  579. mulpd %xmm3, %xmm4
  580. subpd %xmm4, %xmm2
  581. mulpd %xmm7, %xmm6
  582. subpd %xmm6, %xmm5
  583. movlpd 0 * SIZE(AA), %xmm4
  584. movhpd 0 * SIZE(AA), %xmm4
  585. mulpd %xmm4, %xmm2
  586. mulpd %xmm4, %xmm5
  587. #endif
  588. #ifdef LT
  589. movlpd 0 * SIZE(AA), %xmm4
  590. movhpd 0 * SIZE(AA), %xmm4
  591. mulpd %xmm4, %xmm2
  592. mulpd %xmm4, %xmm5
  593. movlpd 1 * SIZE(AA), %xmm4
  594. movhpd 1 * SIZE(AA), %xmm4
  595. movapd %xmm4, %xmm6
  596. mulpd %xmm2, %xmm4
  597. subpd %xmm4, %xmm3
  598. mulpd %xmm5, %xmm6
  599. subpd %xmm6, %xmm7
  600. movlpd 3 * SIZE(AA), %xmm4
  601. movhpd 3 * SIZE(AA), %xmm4
  602. mulpd %xmm4, %xmm3
  603. mulpd %xmm4, %xmm7
  604. #endif
  605. #ifdef RN
  606. movlpd 0 * SIZE(B), %xmm4
  607. movhpd 0 * SIZE(B), %xmm4
  608. mulpd %xmm4, %xmm0
  609. movlpd 1 * SIZE(B), %xmm4
  610. movhpd 1 * SIZE(B), %xmm4
  611. mulpd %xmm0, %xmm4
  612. subpd %xmm4, %xmm1
  613. movlpd 2 * SIZE(B), %xmm4
  614. movhpd 2 * SIZE(B), %xmm4
  615. mulpd %xmm0, %xmm4
  616. subpd %xmm4, %xmm2
  617. movlpd 3 * SIZE(B), %xmm4
  618. movhpd 3 * SIZE(B), %xmm4
  619. mulpd %xmm0, %xmm4
  620. subpd %xmm4, %xmm3
  621. movlpd 5 * SIZE(B), %xmm4
  622. movhpd 5 * SIZE(B), %xmm4
  623. mulpd %xmm4, %xmm1
  624. movlpd 6 * SIZE(B), %xmm4
  625. movhpd 6 * SIZE(B), %xmm4
  626. mulpd %xmm1, %xmm4
  627. subpd %xmm4, %xmm2
  628. movlpd 7 * SIZE(B), %xmm4
  629. movhpd 7 * SIZE(B), %xmm4
  630. mulpd %xmm1, %xmm4
  631. subpd %xmm4, %xmm3
  632. movlpd 10 * SIZE(B), %xmm4
  633. movhpd 10 * SIZE(B), %xmm4
  634. mulpd %xmm4, %xmm2
  635. movlpd 11 * SIZE(B), %xmm4
  636. movhpd 11 * SIZE(B), %xmm4
  637. mulpd %xmm2, %xmm4
  638. subpd %xmm4, %xmm3
  639. movlpd 15 * SIZE(B), %xmm4
  640. movhpd 15 * SIZE(B), %xmm4
  641. mulpd %xmm4, %xmm3
  642. #endif
  643. #ifdef RT
  644. movlpd 15 * SIZE(B), %xmm4
  645. movhpd 15 * SIZE(B), %xmm4
  646. mulpd %xmm4, %xmm3
  647. movlpd 14 * SIZE(B), %xmm4
  648. movhpd 14 * SIZE(B), %xmm4
  649. mulpd %xmm3, %xmm4
  650. subpd %xmm4, %xmm2
  651. movlpd 13 * SIZE(B), %xmm4
  652. movhpd 13 * SIZE(B), %xmm4
  653. mulpd %xmm3, %xmm4
  654. subpd %xmm4, %xmm1
  655. movlpd 12 * SIZE(B), %xmm4
  656. movhpd 12 * SIZE(B), %xmm4
  657. mulpd %xmm3, %xmm4
  658. subpd %xmm4, %xmm0
  659. movlpd 10 * SIZE(B), %xmm4
  660. movhpd 10 * SIZE(B), %xmm4
  661. mulpd %xmm4, %xmm2
  662. movlpd 9 * SIZE(B), %xmm4
  663. movhpd 9 * SIZE(B), %xmm4
  664. mulpd %xmm2, %xmm4
  665. subpd %xmm4, %xmm1
  666. movlpd 8 * SIZE(B), %xmm4
  667. movhpd 8 * SIZE(B), %xmm4
  668. mulpd %xmm2, %xmm4
  669. subpd %xmm4, %xmm0
  670. movlpd 5 * SIZE(B), %xmm4
  671. movhpd 5 * SIZE(B), %xmm4
  672. mulpd %xmm4, %xmm1
  673. movlpd 4 * SIZE(B), %xmm4
  674. movhpd 4 * SIZE(B), %xmm4
  675. mulpd %xmm1, %xmm4
  676. subpd %xmm4, %xmm0
  677. movlpd 0 * SIZE(B), %xmm4
  678. movhpd 0 * SIZE(B), %xmm4
  679. mulpd %xmm4, %xmm0
  680. #endif
  681. #if defined(LN) || defined(LT)
  682. movapd %xmm2, 0 * SIZE(B)
  683. movapd %xmm5, 2 * SIZE(B)
  684. movapd %xmm3, 4 * SIZE(B)
  685. movapd %xmm7, 6 * SIZE(B)
  686. movlpd %xmm2, 0 * SIZE(BB)
  687. movlpd %xmm2, 1 * SIZE(BB)
  688. movhpd %xmm2, 2 * SIZE(BB)
  689. movhpd %xmm2, 3 * SIZE(BB)
  690. movlpd %xmm5, 4 * SIZE(BB)
  691. movlpd %xmm5, 5 * SIZE(BB)
  692. movhpd %xmm5, 6 * SIZE(BB)
  693. movhpd %xmm5, 7 * SIZE(BB)
  694. movlpd %xmm3, 8 * SIZE(BB)
  695. movlpd %xmm3, 9 * SIZE(BB)
  696. movhpd %xmm3, 10 * SIZE(BB)
  697. movhpd %xmm3, 11 * SIZE(BB)
  698. movlpd %xmm7, 12 * SIZE(BB)
  699. movlpd %xmm7, 13 * SIZE(BB)
  700. movhpd %xmm7, 14 * SIZE(BB)
  701. movhpd %xmm7, 15 * SIZE(BB)
  702. #else
  703. movapd %xmm0, 0 * SIZE(AA)
  704. movapd %xmm1, 2 * SIZE(AA)
  705. movapd %xmm2, 4 * SIZE(AA)
  706. movapd %xmm3, 6 * SIZE(AA)
  707. #endif
  708. #ifdef LN
  709. subl $2 * SIZE, CO1
  710. #endif
  711. leal (LDC, LDC, 2), %eax
  712. #if defined(LN) || defined(LT)
  713. movlpd %xmm2, 0 * SIZE(CO1)
  714. movlpd %xmm3, 1 * SIZE(CO1)
  715. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  716. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  717. movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
  718. movlpd %xmm7, 1 * SIZE(CO1, LDC, 2)
  719. movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
  720. movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
  721. #else
  722. movlpd %xmm0, 0 * SIZE(CO1)
  723. movhpd %xmm0, 1 * SIZE(CO1)
  724. movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
  725. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  726. movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
  727. movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
  728. movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
  729. movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
  730. #endif
  731. #ifndef LN
  732. addl $2 * SIZE, CO1
  733. #endif
  734. #if defined(LT) || defined(RN)
  735. movl K, %eax
  736. subl KK, %eax
  737. leal (,%eax, SIZE), %eax
  738. leal (AA, %eax, 2), AA
  739. #ifdef LT
  740. addl $8 * SIZE, B
  741. #endif
  742. #endif
  743. #ifdef LN
  744. subl $2, KK
  745. movl BORIG, B
  746. #endif
  747. #ifdef LT
  748. addl $2, KK
  749. #endif
  750. #ifdef RT
  751. movl K, %eax
  752. movl BORIG, B
  753. sall $1 + BASE_SHIFT, %eax
  754. addl %eax, AORIG
  755. #endif
  756. decl %ebx # i --
  757. jg .L11
  758. ALIGN_4
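/* .L20: remainder path for odd M; the same 1x4 computation and solve are    */
/* redone with scalar mulsd/addsd arithmetic for the single leftover row.    */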
  759. .L20:
  760. movl M, %ebx
  761. testl $1, %ebx # if (m & 1)
  762. jle .L29
  763. #ifdef LN
  764. movl K, %eax
  765. sall $BASE_SHIFT, %eax
  766. subl %eax, AORIG
  767. #endif
  768. #if defined(LN) || defined(RT)
  769. movl KK, %eax
  770. movl AORIG, AA
  771. leal (AA, %eax, SIZE), AA
  772. #endif
  773. leal BUFFER, BB
  774. #if defined(LN) || defined(RT)
  775. movl KK, %eax
  776. sall $3 + BASE_SHIFT, %eax
  777. addl %eax, BB
  778. #endif
  779. pxor %xmm4, %xmm4
  780. pxor %xmm5, %xmm5
  781. pxor %xmm6, %xmm6
  782. pxor %xmm7, %xmm7
  783. movlpd 0 * SIZE(AA), %xmm0
  784. movlpd 4 * SIZE(AA), %xmm1
  785. movlpd 0 * SIZE(BB), %xmm2
  786. movlpd 8 * SIZE(BB), %xmm3
  787. #if defined(LT) || defined(RN)
  788. movl KK, %eax
  789. #else
  790. movl K, %eax
  791. subl KK, %eax
  792. #endif
  793. sarl $3, %eax
  794. je .L25
  795. ALIGN_4
  796. .L22:
  797. mulsd %xmm0, %xmm2
  798. addsd %xmm2, %xmm4
  799. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  800. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  801. #endif
  802. movlpd 2 * SIZE(BB), %xmm2
  803. mulsd %xmm0, %xmm2
  804. addsd %xmm2, %xmm5
  805. movlpd 4 * SIZE(BB), %xmm2
  806. mulsd %xmm0, %xmm2
  807. mulsd 6 * SIZE(BB), %xmm0
  808. addsd %xmm2, %xmm6
  809. movlpd 16 * SIZE(BB), %xmm2
  810. addsd %xmm0, %xmm7
  811. movlpd 1 * SIZE(AA), %xmm0
  812. mulsd %xmm0, %xmm3
  813. addsd %xmm3, %xmm4
  814. movlpd 10 * SIZE(BB), %xmm3
  815. mulsd %xmm0, %xmm3
  816. addsd %xmm3, %xmm5
  817. movlpd 12 * SIZE(BB), %xmm3
  818. mulsd %xmm0, %xmm3
  819. mulsd 14 * SIZE(BB), %xmm0
  820. addsd %xmm3, %xmm6
  821. movlpd 24 * SIZE(BB), %xmm3
  822. addsd %xmm0, %xmm7
  823. movlpd 2 * SIZE(AA), %xmm0
  824. mulsd %xmm0, %xmm2
  825. addsd %xmm2, %xmm4
  826. movlpd 18 * SIZE(BB), %xmm2
  827. mulsd %xmm0, %xmm2
  828. addsd %xmm2, %xmm5
  829. movlpd 20 * SIZE(BB), %xmm2
  830. mulsd %xmm0, %xmm2
  831. mulsd 22 * SIZE(BB), %xmm0
  832. addsd %xmm2, %xmm6
  833. movlpd 32 * SIZE(BB), %xmm2
  834. addsd %xmm0, %xmm7
  835. movlpd 3 * SIZE(AA), %xmm0
  836. mulsd %xmm0, %xmm3
  837. addsd %xmm3, %xmm4
  838. movlpd 26 * SIZE(BB), %xmm3
  839. mulsd %xmm0, %xmm3
  840. addsd %xmm3, %xmm5
  841. movlpd 28 * SIZE(BB), %xmm3
  842. mulsd %xmm0, %xmm3
  843. mulsd 30 * SIZE(BB), %xmm0
  844. addsd %xmm3, %xmm6
  845. movlpd 40 * SIZE(BB), %xmm3
  846. addsd %xmm0, %xmm7
  847. movlpd 8 * SIZE(AA), %xmm0
  848. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  849. PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
  850. #endif
  851. mulsd %xmm1, %xmm2
  852. addsd %xmm2, %xmm4
  853. movlpd 34 * SIZE(BB), %xmm2
  854. mulsd %xmm1, %xmm2
  855. addsd %xmm2, %xmm5
  856. movlpd 36 * SIZE(BB), %xmm2
  857. mulsd %xmm1, %xmm2
  858. mulsd 38 * SIZE(BB), %xmm1
  859. addsd %xmm2, %xmm6
  860. movlpd 48 * SIZE(BB), %xmm2
  861. addsd %xmm1, %xmm7
  862. movlpd 5 * SIZE(AA), %xmm1
  863. mulsd %xmm1, %xmm3
  864. addsd %xmm3, %xmm4
  865. movlpd 42 * SIZE(BB), %xmm3
  866. mulsd %xmm1, %xmm3
  867. addsd %xmm3, %xmm5
  868. movlpd 44 * SIZE(BB), %xmm3
  869. mulsd %xmm1, %xmm3
  870. mulsd 46 * SIZE(BB), %xmm1
  871. addsd %xmm3, %xmm6
  872. movlpd 56 * SIZE(BB), %xmm3
  873. addsd %xmm1, %xmm7
  874. movlpd 6 * SIZE(AA), %xmm1
  875. mulsd %xmm1, %xmm2
  876. addsd %xmm2, %xmm4
  877. movlpd 50 * SIZE(BB), %xmm2
  878. mulsd %xmm1, %xmm2
  879. addsd %xmm2, %xmm5
  880. movlpd 52 * SIZE(BB), %xmm2
  881. mulsd %xmm1, %xmm2
  882. mulsd 54 * SIZE(BB), %xmm1
  883. addsd %xmm2, %xmm6
  884. movlpd 64 * SIZE(BB), %xmm2
  885. addsd %xmm1, %xmm7
  886. movlpd 7 * SIZE(AA), %xmm1
  887. mulsd %xmm1, %xmm3
  888. addsd %xmm3, %xmm4
  889. movlpd 58 * SIZE(BB), %xmm3
  890. mulsd %xmm1, %xmm3
  891. addsd %xmm3, %xmm5
  892. movlpd 60 * SIZE(BB), %xmm3
  893. mulsd %xmm1, %xmm3
  894. mulsd 62 * SIZE(BB), %xmm1
  895. addsd %xmm3, %xmm6
  896. movlpd 72 * SIZE(BB), %xmm3
  897. addl $64 * SIZE, BB
  898. addsd %xmm1, %xmm7
  899. movlpd 12 * SIZE(AA), %xmm1
  900. addl $8 * SIZE, AA
  901. decl %eax
  902. jne .L22
  903. ALIGN_4
  904. .L25:
  905. #if defined(LT) || defined(RN)
  906. movl KK, %eax
  907. #else
  908. movl K, %eax
  909. subl KK, %eax
  910. #endif
  911. andl $7, %eax # if (k & 7)
  912. BRANCH
  913. je .L28
  914. .L26:
  915. mulsd %xmm0, %xmm2
  916. addsd %xmm2, %xmm4
  917. movlpd 2 * SIZE(BB), %xmm2
  918. mulsd %xmm0, %xmm2
  919. addsd %xmm2, %xmm5
  920. movlpd 4 * SIZE(BB), %xmm2
  921. mulsd %xmm0, %xmm2
  922. mulsd 6 * SIZE(BB), %xmm0
  923. addsd %xmm2, %xmm6
  924. movlpd 8 * SIZE(BB), %xmm2
  925. addsd %xmm0, %xmm7
  926. movlpd 1 * SIZE(AA), %xmm0
  927. addl $1 * SIZE, AA
  928. addl $8 * SIZE, BB
  929. decl %eax
  930. jg .L26
  931. ALIGN_4
  932. .L28:
  933. #if defined(LN) || defined(RT)
  934. movl KK, %eax
  935. #ifdef LN
  936. subl $1, %eax
  937. #else
  938. subl $4, %eax
  939. #endif
  940. movl AORIG, AA
  941. movl BORIG, B
  942. leal BUFFER, BB
  943. leal (, %eax, SIZE), %eax
  944. addl %eax, AA
  945. leal (B, %eax, 4), B
  946. leal (BB, %eax, 8), BB
  947. #endif
  948. #if defined(LN) || defined(LT)
  949. unpcklpd %xmm5, %xmm4
  950. unpcklpd %xmm7, %xmm6
  951. movapd 0 * SIZE(B), %xmm2
  952. movapd 2 * SIZE(B), %xmm5
  953. subpd %xmm4, %xmm2
  954. subpd %xmm6, %xmm5
  955. #else
  956. movlpd 0 * SIZE(AA), %xmm0
  957. movlpd 1 * SIZE(AA), %xmm1
  958. movlpd 2 * SIZE(AA), %xmm2
  959. movlpd 3 * SIZE(AA), %xmm3
  960. subsd %xmm4, %xmm0
  961. subsd %xmm5, %xmm1
  962. subsd %xmm6, %xmm2
  963. subsd %xmm7, %xmm3
  964. #endif
  965. #ifdef LN
  966. movlpd 0 * SIZE(AA), %xmm4
  967. movhpd 0 * SIZE(AA), %xmm4
  968. mulpd %xmm4, %xmm2
  969. mulpd %xmm4, %xmm5
  970. #endif
  971. #ifdef LT
  972. movlpd 0 * SIZE(AA), %xmm4
  973. movhpd 0 * SIZE(AA), %xmm4
  974. mulpd %xmm4, %xmm2
  975. mulpd %xmm4, %xmm5
  976. #endif
  977. #ifdef RN
  978. movlpd 0 * SIZE(B), %xmm4
  979. mulsd %xmm4, %xmm0
  980. movlpd 1 * SIZE(B), %xmm4
  981. mulsd %xmm0, %xmm4
  982. subsd %xmm4, %xmm1
  983. movlpd 2 * SIZE(B), %xmm4
  984. mulsd %xmm0, %xmm4
  985. subsd %xmm4, %xmm2
  986. movlpd 3 * SIZE(B), %xmm4
  987. mulsd %xmm0, %xmm4
  988. subsd %xmm4, %xmm3
  989. movlpd 5 * SIZE(B), %xmm4
  990. mulsd %xmm4, %xmm1
  991. movlpd 6 * SIZE(B), %xmm4
  992. mulsd %xmm1, %xmm4
  993. subsd %xmm4, %xmm2
  994. movlpd 7 * SIZE(B), %xmm4
  995. mulsd %xmm1, %xmm4
  996. subsd %xmm4, %xmm3
  997. movlpd 10 * SIZE(B), %xmm4
  998. mulsd %xmm4, %xmm2
  999. movlpd 11 * SIZE(B), %xmm4
  1000. mulsd %xmm2, %xmm4
  1001. subsd %xmm4, %xmm3
  1002. movlpd 15 * SIZE(B), %xmm4
  1003. mulsd %xmm4, %xmm3
  1004. #endif
  1005. #ifdef RT
  1006. movlpd 15 * SIZE(B), %xmm4
  1007. mulsd %xmm4, %xmm3
  1008. movlpd 14 * SIZE(B), %xmm4
  1009. mulsd %xmm3, %xmm4
  1010. subsd %xmm4, %xmm2
  1011. movlpd 13 * SIZE(B), %xmm4
  1012. mulsd %xmm3, %xmm4
  1013. subsd %xmm4, %xmm1
  1014. movlpd 12 * SIZE(B), %xmm4
  1015. mulsd %xmm3, %xmm4
  1016. subsd %xmm4, %xmm0
  1017. movlpd 10 * SIZE(B), %xmm4
  1018. mulsd %xmm4, %xmm2
  1019. movlpd 9 * SIZE(B), %xmm4
  1020. mulsd %xmm2, %xmm4
  1021. subsd %xmm4, %xmm1
  1022. movlpd 8 * SIZE(B), %xmm4
  1023. mulsd %xmm2, %xmm4
  1024. subsd %xmm4, %xmm0
  1025. movlpd 5 * SIZE(B), %xmm4
  1026. mulsd %xmm4, %xmm1
  1027. movlpd 4 * SIZE(B), %xmm4
  1028. mulsd %xmm1, %xmm4
  1029. subsd %xmm4, %xmm0
  1030. movlpd 0 * SIZE(B), %xmm4
  1031. mulsd %xmm4, %xmm0
  1032. #endif
  1033. #if defined(LN) || defined(LT)
  1034. movapd %xmm2, 0 * SIZE(B)
  1035. movapd %xmm5, 2 * SIZE(B)
  1036. movlpd %xmm2, 0 * SIZE(BB)
  1037. movlpd %xmm2, 1 * SIZE(BB)
  1038. movhpd %xmm2, 2 * SIZE(BB)
  1039. movhpd %xmm2, 3 * SIZE(BB)
  1040. movlpd %xmm5, 4 * SIZE(BB)
  1041. movlpd %xmm5, 5 * SIZE(BB)
  1042. movhpd %xmm5, 6 * SIZE(BB)
  1043. movhpd %xmm5, 7 * SIZE(BB)
  1044. #else
  1045. movlpd %xmm0, 0 * SIZE(AA)
  1046. movlpd %xmm1, 1 * SIZE(AA)
  1047. movlpd %xmm2, 2 * SIZE(AA)
  1048. movlpd %xmm3, 3 * SIZE(AA)
  1049. #endif
  1050. #ifdef LN
  1051. subl $1 * SIZE, CO1
  1052. #endif
  1053. leal (LDC, LDC, 2), %eax
  1054. #if defined(LN) || defined(LT)
  1055. movlpd %xmm2, 0 * SIZE(CO1)
  1056. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  1057. movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
  1058. movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
  1059. #else
  1060. movlpd %xmm0, 0 * SIZE(CO1)
  1061. movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1062. movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
  1063. movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
  1064. #endif
  1065. #ifndef LN
  1066. addl $1 * SIZE, CO1
  1067. #endif
  1068. #if defined(LT) || defined(RN)
  1069. movl K, %eax
  1070. subl KK, %eax
  1071. leal (AA,%eax, SIZE), AA
  1072. #ifdef LT
  1073. addl $4 * SIZE, B
  1074. #endif
  1075. #endif
  1076. #ifdef LN
  1077. subl $1, KK
  1078. movl BORIG, B
  1079. #endif
  1080. #ifdef LT
  1081. addl $1, KK
  1082. #endif
  1083. #ifdef RT
  1084. movl K, %eax
  1085. movl BORIG, B
  1086. sall $BASE_SHIFT, %eax
  1087. addl %eax, AORIG
  1088. #endif
  1089. ALIGN_4
  1090. .L29:
  1091. #ifdef LN
  1092. movl K, %eax
  1093. leal (, %eax, SIZE), %eax
  1094. leal (B, %eax, 4), B
  1095. #endif
  1096. #if defined(LT) || defined(RN)
  1097. movl K, %eax
  1098. subl KK, %eax
  1099. leal (,%eax, SIZE), %eax
  1100. leal (B, %eax, 4), B
  1101. #endif
  1102. #ifdef RN
  1103. addl $4, KK
  1104. #endif
  1105. #ifdef RT
  1106. subl $4, KK
  1107. #endif
  1108. decl J # j --
  1109. jg .L01
  1110. ALIGN_4
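/* .L30: two remaining columns when N is not a multiple of 4 (N & 2). The    */
/* structure mirrors the 4-column case above, with a 2-wide B buffer.        */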
  1111. .L30:
  1112. testl $2, N
  1113. je .L60
  1114. #ifdef LN
  1115. movl OFFSET, %eax
  1116. addl M, %eax
  1117. movl %eax, KK
  1118. #endif
  1119. leal BUFFER, BB
  1120. #ifdef RT
  1121. movl K, %eax
  1122. sall $1 + BASE_SHIFT, %eax
  1123. subl %eax, B
  1124. #endif
  1125. #if defined(LN) || defined(RT)
  1126. movl KK, %eax
  1127. movl B, BORIG
  1128. leal (, %eax, SIZE), %eax
  1129. leal (B, %eax, 2), B
  1130. leal (BB, %eax, 4), BB
  1131. #endif
  1132. #ifdef LT
  1133. movl OFFSET, %eax
  1134. movl %eax, KK
  1135. #endif
  1136. #if defined(LT) || defined(RN)
  1137. movl KK, %eax
  1138. #else
  1139. movl K, %eax
  1140. subl KK, %eax
  1141. #endif
  1142. sarl $2, %eax
  1143. jle .L35
  1144. ALIGN_4
  1145. .L32:
  1146. #define COPYPREFETCH 40
  1147. prefetchnta (COPYPREFETCH) * SIZE(B)
  1148. movq 0 * SIZE(B), %mm0
  1149. movq 1 * SIZE(B), %mm1
  1150. movq 2 * SIZE(B), %mm2
  1151. movq 3 * SIZE(B), %mm3
  1152. movq 4 * SIZE(B), %mm4
  1153. movq 5 * SIZE(B), %mm5
  1154. movq 6 * SIZE(B), %mm6
  1155. movq 7 * SIZE(B), %mm7
  1156. movq %mm0, 0 * SIZE(BB)
  1157. movq %mm0, 1 * SIZE(BB)
  1158. movq %mm1, 2 * SIZE(BB)
  1159. movq %mm1, 3 * SIZE(BB)
  1160. movq %mm2, 4 * SIZE(BB)
  1161. movq %mm2, 5 * SIZE(BB)
  1162. movq %mm3, 6 * SIZE(BB)
  1163. movq %mm3, 7 * SIZE(BB)
  1164. movq %mm4, 8 * SIZE(BB)
  1165. movq %mm4, 9 * SIZE(BB)
  1166. movq %mm5, 10 * SIZE(BB)
  1167. movq %mm5, 11 * SIZE(BB)
  1168. movq %mm6, 12 * SIZE(BB)
  1169. movq %mm6, 13 * SIZE(BB)
  1170. movq %mm7, 14 * SIZE(BB)
  1171. movq %mm7, 15 * SIZE(BB)
  1172. addl $ 8 * SIZE, B
  1173. addl $16 * SIZE, BB
  1174. decl %eax
  1175. jne .L32
  1176. ALIGN_2
  1177. .L35:
  1178. #if defined(LT) || defined(RN)
  1179. movl KK, %eax
  1180. #else
  1181. movl K, %eax
  1182. subl KK, %eax
  1183. #endif
  1184. andl $3, %eax
  1185. BRANCH
  1186. jle .L40
  1187. ALIGN_2
  1188. .L36:
  1189. movq 0 * SIZE(B), %mm0
  1190. movq 1 * SIZE(B), %mm1
  1191. movq %mm0, 0 * SIZE(BB)
  1192. movq %mm0, 1 * SIZE(BB)
  1193. movq %mm1, 2 * SIZE(BB)
  1194. movq %mm1, 3 * SIZE(BB)
  1195. addl $2 * SIZE, B
  1196. addl $4 * SIZE, BB
  1197. decl %eax
  1198. jne .L36
  1199. ALIGN_4
  1200. .L40:
  1201. #if defined(LT) || defined(RN)
  1202. movl A, AA
  1203. #else
  1204. movl A, %eax
  1205. movl %eax, AORIG
  1206. #endif
  1207. leal (, LDC, 2), %eax
  1208. #ifdef RT
  1209. subl %eax, C
  1210. #endif
  1211. movl C, CO1
  1212. #ifndef RT
  1213. addl %eax, C
  1214. #endif
  1215. movl M, %ebx
  1216. sarl $1, %ebx # i = (m >> 1)
  1217. jle .L50
  1218. ALIGN_4
  1219. .L41:
  1220. #ifdef LN
  1221. movl K, %eax
  1222. sall $1 + BASE_SHIFT, %eax
  1223. subl %eax, AORIG
  1224. #endif
  1225. #if defined(LN) || defined(RT)
  1226. movl KK, %eax
  1227. movl AORIG, AA
  1228. leal (, %eax, SIZE), %eax
  1229. leal (AA, %eax, 2), AA
  1230. #endif
  1231. leal BUFFER, BB
  1232. #if defined(LN) || defined(RT)
  1233. movl KK, %eax
  1234. sall $2 + BASE_SHIFT, %eax
  1235. addl %eax, BB
  1236. #endif
  1237. pxor %xmm4, %xmm4
  1238. pxor %xmm5, %xmm5
  1239. pxor %xmm6, %xmm6
  1240. pxor %xmm7, %xmm7
  1241. movapd 0 * SIZE(AA), %xmm0
  1242. movapd 8 * SIZE(AA), %xmm1
  1243. movapd 0 * SIZE(BB), %xmm2
  1244. movapd 8 * SIZE(BB), %xmm3
  1245. #ifdef LN
  1246. prefetchw -2 * SIZE(CO1)
  1247. prefetchw -2 * SIZE(CO1, LDC)
  1248. #else
  1249. prefetchw 1 * SIZE(CO1)
  1250. prefetchw 1 * SIZE(CO1, LDC)
  1251. #endif
  1252. #if defined(LT) || defined(RN)
  1253. movl KK, %eax
  1254. #else
  1255. movl K, %eax
  1256. subl KK, %eax
  1257. #endif
  1258. sarl $3, %eax
  1259. je .L45
  1260. ALIGN_4
  1261. .L42:
  1262. mulpd %xmm0, %xmm2
  1263. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  1264. prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
  1265. #endif
  1266. mulpd 2 * SIZE(BB), %xmm0
  1267. addpd %xmm2, %xmm4
  1268. movapd 4 * SIZE(BB), %xmm2
  1269. addpd %xmm0, %xmm5
  1270. movapd 2 * SIZE(AA), %xmm0
  1271. mulpd %xmm0, %xmm2
  1272. mulpd 6 * SIZE(BB), %xmm0
  1273. addpd %xmm2, %xmm6
  1274. movapd 16 * SIZE(BB), %xmm2
  1275. addpd %xmm0, %xmm7
  1276. movapd 4 * SIZE(AA), %xmm0
  1277. mulpd %xmm0, %xmm3
  1278. mulpd 10 * SIZE(BB), %xmm0
  1279. addpd %xmm3, %xmm4
  1280. movapd 12 * SIZE(BB), %xmm3
  1281. addpd %xmm0, %xmm5
  1282. movapd 6 * SIZE(AA), %xmm0
  1283. mulpd %xmm0, %xmm3
  1284. mulpd 14 * SIZE(BB), %xmm0
  1285. addpd %xmm3, %xmm6
  1286. movapd 24 * SIZE(BB), %xmm3
  1287. addpd %xmm0, %xmm7
  1288. movapd 16 * SIZE(AA), %xmm0
  1289. #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
  1290. prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
  1291. #endif
  1292. mulpd %xmm1, %xmm2
  1293. mulpd 18 * SIZE(BB), %xmm1
  1294. addpd %xmm2, %xmm4
  1295. movapd 20 * SIZE(BB), %xmm2
  1296. addpd %xmm1, %xmm5
  1297. movapd 10 * SIZE(AA), %xmm1
  1298. mulpd %xmm1, %xmm2
  1299. mulpd 22 * SIZE(BB), %xmm1
  1300. addpd %xmm2, %xmm6
  1301. movapd 32 * SIZE(BB), %xmm2
  1302. addpd %xmm1, %xmm7
  1303. movapd 12 * SIZE(AA), %xmm1
  1304. mulpd %xmm1, %xmm3
  1305. mulpd 26 * SIZE(BB), %xmm1
  1306. addpd %xmm3, %xmm4
  1307. movapd 28 * SIZE(BB), %xmm3
  1308. addpd %xmm1, %xmm5
  1309. movapd 14 * SIZE(AA), %xmm1
  1310. mulpd %xmm1, %xmm3
  1311. mulpd 30 * SIZE(BB), %xmm1
  1312. addpd %xmm3, %xmm6
  1313. movapd 40 * SIZE(BB), %xmm3
  1314. addpd %xmm1, %xmm7
  1315. movapd 24 * SIZE(AA), %xmm1
  1316. addl $16 * SIZE, AA
  1317. addl $32 * SIZE, BB
  1318. decl %eax
  1319. jne .L42
  1320. ALIGN_4
  1321. .L45:
  1322. #if defined(LT) || defined(RN)
  1323. movl KK, %eax
  1324. #else
  1325. movl K, %eax
  1326. subl KK, %eax
  1327. #endif
  1328. andl $7, %eax # if (k & 7)
  1329. BRANCH
  1330. je .L48
  1331. ALIGN_3
  1332. .L46:
  1333. mulpd %xmm0, %xmm2
  1334. mulpd 2 * SIZE(BB), %xmm0
  1335. addpd %xmm2, %xmm4
  1336. movapd 4 * SIZE(BB), %xmm2
  1337. addpd %xmm0, %xmm5
  1338. movapd 2 * SIZE(AA), %xmm0
  1339. addl $2 * SIZE, AA
  1340. addl $4 * SIZE, BB
  1341. decl %eax
  1342. jg .L46
  1343. ALIGN_4
  1344. .L48:
  1345. addpd %xmm6, %xmm4
  1346. addpd %xmm7, %xmm5
  1347. #if defined(LN) || defined(RT)
  1348. movl KK, %eax
  1349. #ifdef LN
  1350. subl $2, %eax
  1351. #else
  1352. subl $2, %eax
  1353. #endif
  1354. movl AORIG, AA
  1355. movl BORIG, B
  1356. leal BUFFER, BB
  1357. leal (, %eax, SIZE), %eax
  1358. leal (AA, %eax, 2), AA
  1359. leal (B, %eax, 2), B
  1360. leal (BB, %eax, 4), BB
  1361. #endif
  1362. #if defined(LN) || defined(LT)
  1363. movapd %xmm4, %xmm0
  1364. unpcklpd %xmm5, %xmm4
  1365. unpckhpd %xmm5, %xmm0
  1366. movapd 0 * SIZE(B), %xmm2
  1367. movapd 2 * SIZE(B), %xmm3
  1368. subpd %xmm4, %xmm2
  1369. subpd %xmm0, %xmm3
  1370. #else
  1371. movapd 0 * SIZE(AA), %xmm0
  1372. movapd 2 * SIZE(AA), %xmm1
  1373. subpd %xmm4, %xmm0
  1374. subpd %xmm5, %xmm1
  1375. #endif
  1376. #ifdef LN
  1377. movlpd 3 * SIZE(AA), %xmm4
  1378. movhpd 3 * SIZE(AA), %xmm4
  1379. mulpd %xmm4, %xmm3
  1380. movlpd 2 * SIZE(AA), %xmm4
  1381. movhpd 2 * SIZE(AA), %xmm4
  1382. mulpd %xmm3, %xmm4
  1383. subpd %xmm4, %xmm2
  1384. movlpd 0 * SIZE(AA), %xmm4
  1385. movhpd 0 * SIZE(AA), %xmm4
  1386. mulpd %xmm4, %xmm2
  1387. #endif
  1388. #ifdef LT
  1389. movlpd 0 * SIZE(AA), %xmm4
  1390. movhpd 0 * SIZE(AA), %xmm4
  1391. mulpd %xmm4, %xmm2
  1392. movlpd 1 * SIZE(AA), %xmm4
  1393. movhpd 1 * SIZE(AA), %xmm4
  1394. mulpd %xmm2, %xmm4
  1395. subpd %xmm4, %xmm3
  1396. movlpd 3 * SIZE(AA), %xmm4
  1397. movhpd 3 * SIZE(AA), %xmm4
  1398. mulpd %xmm4, %xmm3
  1399. #endif
  1400. #ifdef RN
  1401. movlpd 0 * SIZE(B), %xmm4
  1402. movhpd 0 * SIZE(B), %xmm4
  1403. mulpd %xmm4, %xmm0
  1404. movlpd 1 * SIZE(B), %xmm4
  1405. movhpd 1 * SIZE(B), %xmm4
  1406. mulpd %xmm0, %xmm4
  1407. subpd %xmm4, %xmm1
  1408. movlpd 3 * SIZE(B), %xmm4
  1409. movhpd 3 * SIZE(B), %xmm4
  1410. mulpd %xmm4, %xmm1
  1411. #endif
  1412. #ifdef RT
  1413. movlpd 3 * SIZE(B), %xmm4
  1414. movhpd 3 * SIZE(B), %xmm4
  1415. mulpd %xmm4, %xmm1
  1416. movlpd 2 * SIZE(B), %xmm4
  1417. movhpd 2 * SIZE(B), %xmm4
  1418. mulpd %xmm1, %xmm4
  1419. subpd %xmm4, %xmm0
  1420. movlpd 0 * SIZE(B), %xmm4
  1421. movhpd 0 * SIZE(B), %xmm4
  1422. mulpd %xmm4, %xmm0
  1423. #endif
  1424. #if defined(LN) || defined(LT)
  1425. movapd %xmm2, 0 * SIZE(B)
  1426. movapd %xmm3, 2 * SIZE(B)
  1427. movlpd %xmm2, 0 * SIZE(BB)
  1428. movlpd %xmm2, 1 * SIZE(BB)
  1429. movhpd %xmm2, 2 * SIZE(BB)
  1430. movhpd %xmm2, 3 * SIZE(BB)
  1431. movlpd %xmm3, 4 * SIZE(BB)
  1432. movlpd %xmm3, 5 * SIZE(BB)
  1433. movhpd %xmm3, 6 * SIZE(BB)
  1434. movhpd %xmm3, 7 * SIZE(BB)
  1435. #else
  1436. movapd %xmm0, 0 * SIZE(AA)
  1437. movapd %xmm1, 2 * SIZE(AA)
  1438. #endif
  1439. #ifdef LN
  1440. subl $2 * SIZE, CO1
  1441. #endif
  1442. #if defined(LN) || defined(LT)
  1443. movlpd %xmm2, 0 * SIZE(CO1)
  1444. movlpd %xmm3, 1 * SIZE(CO1)
  1445. movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
  1446. movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
  1447. #else
  1448. movlpd %xmm0, 0 * SIZE(CO1)
  1449. movhpd %xmm0, 1 * SIZE(CO1)
  1450. movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
  1451. movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
  1452. #endif
  1453. #ifndef LN
  1454. addl $2 * SIZE, CO1
  1455. #endif
  1456. #if defined(LT) || defined(RN)
  1457. movl K, %eax
  1458. subl KK, %eax
  1459. leal (,%eax, SIZE), %eax
  1460. leal (AA, %eax, 2), AA
  1461. #ifdef LT
  1462. addl $4 * SIZE, B
  1463. #endif
  1464. #endif
  1465. #ifdef LN
  1466. subl $2, KK
  1467. movl BORIG, B
  1468. #endif
  1469. #ifdef LT
  1470. addl $2, KK
  1471. #endif
  1472. #ifdef RT
  1473. movl K, %eax
  1474. movl BORIG, B
  1475. sall $1 + BASE_SHIFT, %eax
  1476. addl %eax, AORIG
  1477. #endif
  1478. decl %ebx # i --
  1479. jg .L41
  1480. ALIGN_4
  1481. .L50:
  1482. movl M, %ebx
  1483. testl $1, %ebx # if (m & 1)
  1484. jle .L59
  1485. #ifdef LN
  1486. movl K, %eax
  1487. sall $BASE_SHIFT, %eax
  1488. subl %eax, AORIG
  1489. #endif
  1490. #if defined(LN) || defined(RT)
  1491. movl KK, %eax
  1492. movl AORIG, AA
  1493. leal (AA, %eax, SIZE), AA
  1494. #endif
  1495. leal BUFFER, BB
  1496. #if defined(LN) || defined(RT)
  1497. movl KK, %eax
  1498. sall $2 + BASE_SHIFT, %eax
  1499. addl %eax, BB
  1500. #endif
  1501. pxor %xmm4, %xmm4
  1502. pxor %xmm5, %xmm5
  1503. pxor %xmm6, %xmm6
  1504. pxor %xmm7, %xmm7
  1505. movlpd 0 * SIZE(AA), %xmm0
  1506. movlpd 4 * SIZE(AA), %xmm1
  1507. movlpd 0 * SIZE(BB), %xmm2
  1508. movlpd 8 * SIZE(BB), %xmm3
  1509. #if defined(LT) || defined(RN)
  1510. movl KK, %eax
  1511. #else
  1512. movl K, %eax
  1513. subl KK, %eax
  1514. #endif
  1515. sarl $3, %eax
  1516. je .L55
  1517. ALIGN_4
  1518. .L52:
  1519. mulsd %xmm0, %xmm2
  1520. PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
  1521. mulsd 2 * SIZE(BB), %xmm0
  1522. addsd %xmm2, %xmm4
  1523. movlpd 4 * SIZE(BB), %xmm2
  1524. addsd %xmm0, %xmm5
  1525. movlpd 1 * SIZE(AA), %xmm0
  1526. mulsd %xmm0, %xmm2
  1527. mulsd 6 * SIZE(BB), %xmm0
  1528. addsd %xmm2, %xmm6
  1529. movlpd 16 * SIZE(BB), %xmm2
  1530. addsd %xmm0, %xmm7
  1531. movlpd 2 * SIZE(AA), %xmm0
  1532. mulsd %xmm0, %xmm3
  1533. mulsd 10 * SIZE(BB), %xmm0
  1534. addsd %xmm3, %xmm4
  1535. movlpd 12 * SIZE(BB), %xmm3
  1536. addsd %xmm0, %xmm5
  1537. movlpd 3 * SIZE(AA), %xmm0
  1538. mulsd %xmm0, %xmm3
  1539. mulsd 14 * SIZE(BB), %xmm0
  1540. addsd %xmm3, %xmm6
  1541. movlpd 24 * SIZE(BB), %xmm3
  1542. addsd %xmm0, %xmm7
  1543. movlpd 8 * SIZE(AA), %xmm0
  1544. mulsd %xmm1, %xmm2
  1545. mulsd 18 * SIZE(BB), %xmm1
  1546. addsd %xmm2, %xmm4
  1547. movlpd 20 * SIZE(BB), %xmm2
  1548. addsd %xmm1, %xmm5
  1549. movlpd 5 * SIZE(AA), %xmm1
  1550. mulsd %xmm1, %xmm2
  1551. mulsd 22 * SIZE(BB), %xmm1
  1552. addsd %xmm2, %xmm6
  1553. movlpd 32 * SIZE(BB), %xmm2
  1554. addsd %xmm1, %xmm7
  1555. movlpd 6 * SIZE(AA), %xmm1
  1556. mulsd %xmm1, %xmm3
  1557. mulsd 26 * SIZE(BB), %xmm1
  1558. addsd %xmm3, %xmm4
  1559. movlpd 28 * SIZE(BB), %xmm3
  1560. addsd %xmm1, %xmm5
  1561. movlpd 7 * SIZE(AA), %xmm1
  1562. mulsd %xmm1, %xmm3
  1563. mulsd 30 * SIZE(BB), %xmm1
  1564. addsd %xmm3, %xmm6
  1565. movlpd 40 * SIZE(BB), %xmm3
  1566. addsd %xmm1, %xmm7
  1567. movlpd 12 * SIZE(AA), %xmm1
  1568. addl $ 8 * SIZE, AA
  1569. addl $32 * SIZE, BB
  1570. decl %eax
  1571. jne .L52
  1572. ALIGN_4
  1573. .L55:
  1574. #if defined(LT) || defined(RN)
  1575. movl KK, %eax
  1576. #else
  1577. movl K, %eax
  1578. subl KK, %eax
  1579. #endif
  1580. andl $7, %eax # if (k & 7)
  1581. BRANCH
  1582. je .L58
  1583. .L56:
  1584. mulsd %xmm0, %xmm2
  1585. mulsd 2 * SIZE(BB), %xmm0
  1586. addsd %xmm2, %xmm4
  1587. movlpd 4 * SIZE(BB), %xmm2
  1588. addsd %xmm0, %xmm5
  1589. movlpd 1 * SIZE(AA), %xmm0
  1590. addl $1 * SIZE, AA
  1591. addl $4 * SIZE, BB
  1592. decl %eax
  1593. jg .L56
  1594. ALIGN_4
.L58:
	addsd	%xmm6, %xmm4
	addsd	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif
	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB
	leal	(, %eax, SIZE), %eax
	addl	%eax, AA
	leal	(B, %eax, 2), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	unpcklpd %xmm5, %xmm4
	movapd	0 * SIZE(B), %xmm2
	subpd	%xmm4, %xmm2
#else
	movlpd	0 * SIZE(AA), %xmm0
	movlpd	1 * SIZE(AA), %xmm1
	subsd	%xmm4, %xmm0
	subsd	%xmm5, %xmm1
#endif

#ifdef LN
	movlpd	0 * SIZE(AA), %xmm4
	movhpd	0 * SIZE(AA), %xmm4
	mulpd	%xmm4, %xmm2
#endif

#ifdef LT
	movlpd	0 * SIZE(AA), %xmm4
	movhpd	0 * SIZE(AA), %xmm4
	mulpd	%xmm4, %xmm2
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm0
	movlpd	1 * SIZE(B), %xmm4
	mulsd	%xmm0, %xmm4
	subsd	%xmm4, %xmm1
	movlpd	3 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm1
#endif

#ifdef RT
	movlpd	3 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm1
	movlpd	2 * SIZE(B), %xmm4
	mulsd	%xmm1, %xmm4
	subsd	%xmm4, %xmm0
	movlpd	0 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm2, 0 * SIZE(B)
	movlpd	%xmm2, 0 * SIZE(BB)
	movlpd	%xmm2, 1 * SIZE(BB)
	movhpd	%xmm2, 2 * SIZE(BB)
	movhpd	%xmm2, 3 * SIZE(BB)
#else
	movlpd	%xmm0, 0 * SIZE(AA)
	movlpd	%xmm1, 1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm2, 0 * SIZE(CO1)
	movhpd	%xmm2, 0 * SIZE(CO1, LDC, 1)
#else
	movlpd	%xmm0, 0 * SIZE(CO1)
	movlpd	%xmm1, 0 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(AA,%eax, SIZE), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
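
/* .L59: advance B past the two-column panel and update KK for the next pass. */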
.L59:
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B, %eax, 2), B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4
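
/*
 * .L60: handle the last single column when N is odd.  The column of B
 * is first expanded into BUFFER (each value duplicated) so the packed
 * loops below can use aligned loads with the same value in both lanes.
 */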
.L60:
	testl	$1, N
	je	.L999

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
	leal	(, %eax, SIZE), %eax
	leal	(B, %eax, 1), B
	leal	(BB, %eax, 2), BB
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	jle	.L65
	ALIGN_4
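
/*
 * .L62: unrolled copy, eight elements of B per iteration, each stored
 * twice into BUFFER (BB) so that one packed load later yields the same
 * value in both lanes.
 */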
.L62:
#define COPYPREFETCH 40

	prefetchnta	(COPYPREFETCH) * SIZE(B)

	movq	0 * SIZE(B), %mm0
	movq	1 * SIZE(B), %mm1
	movq	2 * SIZE(B), %mm2
	movq	3 * SIZE(B), %mm3
	movq	4 * SIZE(B), %mm4
	movq	5 * SIZE(B), %mm5
	movq	6 * SIZE(B), %mm6
	movq	7 * SIZE(B), %mm7

	movq	%mm0, 0 * SIZE(BB)
	movq	%mm0, 1 * SIZE(BB)
	movq	%mm1, 2 * SIZE(BB)
	movq	%mm1, 3 * SIZE(BB)
	movq	%mm2, 4 * SIZE(BB)
	movq	%mm2, 5 * SIZE(BB)
	movq	%mm3, 6 * SIZE(BB)
	movq	%mm3, 7 * SIZE(BB)
	movq	%mm4, 8 * SIZE(BB)
	movq	%mm4, 9 * SIZE(BB)
	movq	%mm5, 10 * SIZE(BB)
	movq	%mm5, 11 * SIZE(BB)
	movq	%mm6, 12 * SIZE(BB)
	movq	%mm6, 13 * SIZE(BB)
	movq	%mm7, 14 * SIZE(BB)
	movq	%mm7, 15 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L62
	ALIGN_2

.L65:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax
	BRANCH
	jle	.L70
	ALIGN_2
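
/* .L66: copy the remaining k % 8 elements of B into BUFFER, duplicated. */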
.L66:
	movq	0 * SIZE(B), %mm0
	movq	%mm0, 0 * SIZE(BB)
	movq	%mm0, 1 * SIZE(BB)

	addl	$1 * SIZE, B
	addl	$2 * SIZE, BB
	decl	%eax
	jne	.L66
	ALIGN_4

.L70:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

	movl	M, %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L80
	ALIGN_4
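
/*
 * .L71: loop over pairs of rows (m >> 1) for the single column.  The
 * unrolled body accumulates two rows of A times the duplicated column
 * of B into xmm4 using packed doubles.
 */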
.L71:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movapd	0 * SIZE(AA), %xmm0
	movapd	8 * SIZE(AA), %xmm1
	movapd	0 * SIZE(BB), %xmm2
	movapd	8 * SIZE(BB), %xmm3

#ifdef LN
	prefetchw	-2 * SIZE(CO1)
#else
	prefetchw	1 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

.L72:
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	prefetcht0	(PREFETCHSIZE + 0) * SIZE(AA)
	movapd	16 * SIZE(BB), %xmm2
	movapd	2 * SIZE(AA), %xmm0
	mulpd	2 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm4
	movapd	4 * SIZE(AA), %xmm0
	mulpd	4 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm4
	movapd	6 * SIZE(AA), %xmm0
	mulpd	6 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm4
	movapd	16 * SIZE(AA), %xmm0
	prefetcht0	(PREFETCHSIZE + 8) * SIZE(AA)
	mulpd	%xmm1, %xmm3
	addpd	%xmm3, %xmm4
	movapd	24 * SIZE(BB), %xmm3
	movapd	10 * SIZE(AA), %xmm1
	mulpd	10 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm4
	movapd	12 * SIZE(AA), %xmm1
	mulpd	12 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm4
	movapd	14 * SIZE(AA), %xmm1
	mulpd	14 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm4
	movapd	24 * SIZE(AA), %xmm1

	addl	$16 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L72
	ALIGN_4

.L75:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L78
	ALIGN_3

.L76:
	mulpd	%xmm0, %xmm2
	addpd	%xmm2, %xmm4
	movapd	2 * SIZE(AA), %xmm0
	movapd	2 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4
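
/*
 * .L78: triangular solve for the 2x1 result.  LN/LT apply the 2x2
 * triangle of A scalar-by-scalar (diagonals presumably pre-inverted);
 * RN/RT only scale by the single diagonal element of B.
 */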
.L78:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 1), B
	leal	(BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(B), %xmm2
	subpd	%xmm4, %xmm2
#else
	movapd	0 * SIZE(AA), %xmm0
	subpd	%xmm4, %xmm0
#endif

#ifdef LN
	movapd	%xmm2, %xmm3
	unpckhpd %xmm3, %xmm3
	movlpd	3 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm3
	movlpd	2 * SIZE(AA), %xmm4
	mulsd	%xmm3, %xmm4
	subsd	%xmm4, %xmm2
	movlpd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm2
	unpcklpd %xmm3, %xmm2
#endif

#ifdef LT
	movapd	%xmm2, %xmm3
	unpckhpd %xmm3, %xmm3
	movlpd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm2
	movlpd	1 * SIZE(AA), %xmm4
	mulsd	%xmm2, %xmm4
	subsd	%xmm4, %xmm3
	movlpd	3 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm3
	unpcklpd %xmm3, %xmm2
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm4
	movhpd	0 * SIZE(B), %xmm4
	mulpd	%xmm4, %xmm0
#endif

#ifdef RT
	movlpd	0 * SIZE(B), %xmm4
	movhpd	0 * SIZE(B), %xmm4
	mulpd	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm2, 0 * SIZE(B)
	movlpd	%xmm2, 0 * SIZE(BB)
	movlpd	%xmm2, 1 * SIZE(BB)
	movhpd	%xmm2, 2 * SIZE(BB)
	movhpd	%xmm2, 3 * SIZE(BB)
#else
	movapd	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm2, 0 * SIZE(CO1)
	movhpd	%xmm2, 1 * SIZE(CO1)
#else
	movlpd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L71
	ALIGN_4
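
/* .L80: handle the last row when M is odd (1x1 tile for this column). */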
.L80:
	movl	M, %ebx
	testl	$1, %ebx	# i = (m & 1)
	jle	.L99

#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movlpd	0 * SIZE(AA), %xmm0
	movlpd	4 * SIZE(AA), %xmm1
	movlpd	0 * SIZE(BB), %xmm2
	movlpd	8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L85
	ALIGN_4

.L82:
	mulsd	%xmm0, %xmm2
	prefetcht0	(PREFETCHSIZE + 0) * SIZE(AA)
	movlpd	1 * SIZE(AA), %xmm0
	mulsd	2 * SIZE(BB), %xmm0
	addsd	%xmm2, %xmm4
	movlpd	16 * SIZE(BB), %xmm2
	addsd	%xmm0, %xmm5
	movlpd	2 * SIZE(AA), %xmm0
	mulsd	4 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm6
	movlpd	3 * SIZE(AA), %xmm0
	mulsd	6 * SIZE(BB), %xmm0
	addsd	%xmm0, %xmm7
	movlpd	8 * SIZE(AA), %xmm0
	mulsd	%xmm1, %xmm3
	movlpd	5 * SIZE(AA), %xmm1
	mulsd	10 * SIZE(BB), %xmm1
	addsd	%xmm3, %xmm4
	movlpd	24 * SIZE(BB), %xmm3
	addsd	%xmm1, %xmm5
	movlpd	6 * SIZE(AA), %xmm1
	mulsd	12 * SIZE(BB), %xmm1
	addsd	%xmm1, %xmm6
	movlpd	7 * SIZE(AA), %xmm1
	mulsd	14 * SIZE(BB), %xmm1
	addsd	%xmm1, %xmm7
	movlpd	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je	.L88

.L86:
	mulsd	%xmm0, %xmm2
	addsd	%xmm2, %xmm4
	movlpd	2 * SIZE(BB), %xmm2
	movlpd	1 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4
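
/*
 * .L88: reduce the four accumulators to a single dot product and solve
 * the trailing 1x1 system: subtract from the stored right-hand side and
 * scale by the (presumably pre-inverted) diagonal of A or B.
 */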
.L88:
	addsd	%xmm5, %xmm4
	addsd	%xmm7, %xmm6
	addsd	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif
	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB
	leal	(, %eax, SIZE), %eax
	addl	%eax, AA
	addl	%eax, B
	leal	(BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movlpd	0 * SIZE(B), %xmm2
	subsd	%xmm4, %xmm2
#else
	movlpd	0 * SIZE(AA), %xmm0
	subsd	%xmm4, %xmm0
#endif

#ifdef LN
	movlpd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm2
#endif

#ifdef LT
	movlpd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm2
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#ifdef RT
	movlpd	0 * SIZE(B), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm2, 0 * SIZE(B)
	movlpd	%xmm2, 0 * SIZE(BB)
	movlpd	%xmm2, 1 * SIZE(BB)
#else
	movlpd	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movlpd	%xmm2, 0 * SIZE(CO1)
#else
	movlpd	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(AA,%eax, SIZE), AA
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
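
/* .L99: advance B past the single column and update KK. */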
.L99:
#ifdef LN
	movl	K, %eax
	leal	(B, %eax, SIZE), B
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(B,%eax, SIZE), B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4
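
/*
 * .L999: epilogue, restore the caller's stack pointer, clear the MMX
 * state left by the copy loops (EMMS), pop the callee-saved registers
 * and return.
 */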
.L999:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE