
trsm_kernel_LT_4x2_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE2) || !defined(HAVE_MMX)
#error You have to check your configuration.
#endif

#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_ALPHA	16 + STACK + ARGS(%esi)
#define STACK_A		24 + STACK + ARGS(%esi)
#define STACK_B		28 + STACK + ARGS(%esi)
#define STACK_C		32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)

#define ALPHA		 0(%esp)
#define K		16(%esp)
#define N		20(%esp)
#define M		24(%esp)
#define A		28(%esp)
#define C		32(%esp)
#define J		36(%esp)
#define OLD_STACK	40(%esp)
#define OFFSET		44(%esp)
#define KK		48(%esp)
#define KKK		52(%esp)
#define AORIG		56(%esp)
#define BORIG		60(%esp)
#define BUFFER		128(%esp)

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#define B	%edi
#define AA	%edx
#define BB	%ecx
#define LDC	%ebp

#define PREFETCHSIZE (8 * 4)
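
/*
 * Stack layout: the STACK_* macros read the caller's arguments relative to
 * the saved %esi; the %esp-relative names above are locals in the realigned
 * frame, with the packed copy of B living at BUFFER.  B, AA, BB and LDC are
 * register aliases used throughout the kernel.
 */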
#define KERNEL1(address) \
	movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm4; \
	movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm5; \
	movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \
	mulpd %xmm0, %xmm2; \
	mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 4 * SIZE + (address) * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulpd %xmm0, %xmm2; \
	mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm4; \
	movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm5; \
	movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \
	mulpd %xmm0, %xmm2; \
	mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 16 * SIZE + (address) * SIZE(AA), %xmm0

#define KERNEL3(address) \
	movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \
	mulpd %xmm1, %xmm3; \
	mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm4; \
	movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm5; \
	movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \
	mulpd %xmm1, %xmm3; \
	mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 12 * SIZE + (address) * SIZE(AA), %xmm1

#define KERNEL4(address) \
	mulpd %xmm1, %xmm3; \
	mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm4; \
	movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm5; \
	movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \
	mulpd %xmm1, %xmm3; \
	mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 24 * SIZE + (address) * SIZE(AA), %xmm1

#define KERNEL5(address) \
	movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm4; \
	movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm5; \
	movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \
	mulpd %xmm0, %xmm2; \
	mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 20 * SIZE + (address) * SIZE(AA), %xmm0

#define KERNEL6(address) \
	mulpd %xmm0, %xmm2; \
	mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm4; \
	movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm5; \
	movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \
	mulpd %xmm0, %xmm2; \
	mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 32 * SIZE + (address) * SIZE(AA), %xmm0

#define KERNEL7(address) \
	movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \
	mulpd %xmm1, %xmm3; \
	mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm4; \
	movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm5; \
	movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \
	mulpd %xmm1, %xmm3; \
	mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 28 * SIZE + (address) * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulpd %xmm1, %xmm3; \
	mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm4; \
	movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm5; \
	movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \
	mulpd %xmm1, %xmm3; \
	mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 40 * SIZE + (address) * SIZE(AA), %xmm1
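
/*
 * Each KERNEL<n>(address) macro is one unrolled k-step of the 4x2 block:
 * two A pairs times two duplicated B values, accumulated into %xmm4-%xmm7.
 * The movq into %mm2 in KERNEL1/3/5/7 appears to serve only as a software
 * prefetch of A, PREFETCHSIZE elements ahead of the current position.
 */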
	PROLOGUE

	pushl %ebp
	pushl %edi
	pushl %esi
	pushl %ebx

	PROFCODE

	EMMS

	movl %esp, %esi	# save old stack
	subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl $-STACK_ALIGN, %esp
	addl $STACK_OFFSET, %esp

	STACK_TOUCHING

	movd STACK_M, %mm0
	movl STACK_N, %eax
	movd STACK_K, %mm1
	movd STACK_A, %mm2
	movl STACK_B, B
	movd STACK_C, %mm3
	movl STACK_LDC, LDC
	movd STACK_OFFT, %mm4

	movd %mm1, K
	movl %eax, N
	movd %mm0, M
	movd %mm2, A
	movd %mm3, C
	movl %esi, OLD_STACK
	movd %mm4, OFFSET
	movd %mm4, KK

	sall $BASE_SHIFT, LDC

#ifdef LN
	movl M, %eax
	leal (, %eax, SIZE), %eax
	addl %eax, C
	imull K, %eax
	addl %eax, A
#endif

#ifdef RT
	movl N, %eax
	leal (, %eax, SIZE), %eax
	imull K, %eax
	addl %eax, B

	movl N, %eax
	imull LDC, %eax
	addl %eax, C
#endif

#ifdef RN
	negl KK
#endif

#ifdef RT
	movl N, %eax
	subl OFFSET, %eax
	movl %eax, KK
#endif

	movl N, %eax
	sarl $1, %eax
	movl %eax, J
	jle .L100
	ALIGN_2
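
/*
 * Outer loop: J counts pairs of columns (N >> 1).  Each pass packs a
 * two-column slab of B into BUFFER, then sweeps the rows of A in strips
 * of 4, 2 and 1.
 */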
.L01:
/* Copying to Sub Buffer */
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

	leal BUFFER, %ecx

#ifdef RT
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, B
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $2, %eax
	jle .L03
	ALIGN_2
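
/*
 * Pack loop: every scalar of B is duplicated into both lanes of an xmm
 * register (unpcklpd x, x) so the kernel's mulpd sees a broadcast value;
 * four k-steps (8 doubles in, 16 doubles out) per iteration.
 */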
  274. .L02:
  275. movsd 0 * SIZE(B), %xmm0
  276. movsd 1 * SIZE(B), %xmm1
  277. movsd 2 * SIZE(B), %xmm2
  278. movsd 3 * SIZE(B), %xmm3
  279. movsd 4 * SIZE(B), %xmm4
  280. movsd 5 * SIZE(B), %xmm5
  281. movsd 6 * SIZE(B), %xmm6
  282. movsd 7 * SIZE(B), %xmm7
  283. unpcklpd %xmm0, %xmm0
  284. unpcklpd %xmm1, %xmm1
  285. unpcklpd %xmm2, %xmm2
  286. unpcklpd %xmm3, %xmm3
  287. unpcklpd %xmm4, %xmm4
  288. unpcklpd %xmm5, %xmm5
  289. unpcklpd %xmm6, %xmm6
  290. unpcklpd %xmm7, %xmm7
  291. movapd %xmm0, 0 * SIZE(%ecx)
  292. movapd %xmm1, 2 * SIZE(%ecx)
  293. movapd %xmm2, 4 * SIZE(%ecx)
  294. movapd %xmm3, 6 * SIZE(%ecx)
  295. movapd %xmm4, 8 * SIZE(%ecx)
  296. movapd %xmm5, 10 * SIZE(%ecx)
  297. movapd %xmm6, 12 * SIZE(%ecx)
  298. movapd %xmm7, 14 * SIZE(%ecx)
  299. prefetcht0 104 * SIZE(B)
  300. addl $ 8 * SIZE, B
  301. addl $16 * SIZE, %ecx
  302. decl %eax
  303. jne .L02
  304. ALIGN_2
  305. .L03:
  306. #if defined(LT) || defined(RN)
  307. movl KK, %eax
  308. #else
  309. movl K, %eax
  310. subl KK, %eax
  311. #endif
  312. andl $3, %eax
  313. BRANCH
  314. jle .L05
  315. ALIGN_4
  316. .L04:
  317. movsd 0 * SIZE(B), %xmm0
  318. movsd 1 * SIZE(B), %xmm1
  319. unpcklpd %xmm0, %xmm0
  320. unpcklpd %xmm1, %xmm1
  321. movapd %xmm0, 0 * SIZE(%ecx)
  322. movapd %xmm1, 2 * SIZE(%ecx)
  323. addl $2 * SIZE, B
  324. addl $4 * SIZE, %ecx
  325. decl %eax
  326. jne .L04
  327. ALIGN_4
  328. .L05:
  329. #if defined(LT) || defined(RN)
  330. movl A, AA
  331. #else
  332. movl A, %eax
  333. movl %eax, AORIG
  334. #endif
  335. leal (, LDC, 2), %eax
  336. #ifdef RT
  337. subl %eax, C
  338. #endif
  339. movl C, %esi # coffset = c
  340. #ifndef RT
  341. addl %eax, C
  342. #endif
  343. movl M, %ebx
  344. sarl $2, %ebx # i = (m >> 2)
  345. jle .L30
  346. ALIGN_4
  347. .L10:
  348. #ifdef LN
  349. movl K, %eax
  350. sall $2 + BASE_SHIFT, %eax
  351. subl %eax, AORIG
  352. #endif
  353. #if defined(LN) || defined(RT)
  354. movl KK, %eax
  355. movl AORIG, AA
  356. leal (, %eax, SIZE), %eax
  357. leal (AA, %eax, 4), AA
  358. #endif
  359. leal BUFFER, BB
  360. #if defined(LN) || defined(RT)
  361. movl KK, %eax
  362. sall $1 + BASE_SHIFT, %eax
  363. leal (BB, %eax, 2), BB
  364. #endif
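
/*
 * Clear the accumulators (%xmm4/%xmm5: rows 0-1 times cols 0-1,
 * %xmm6/%xmm7: rows 2-3 times cols 0-1) and prefetch the two destination
 * columns of C before entering the k-loop.
 */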
	movapd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movapd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movapd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movapd 8 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

	prefetcht2 4 * SIZE(%esi)
	prefetcht2 4 * SIZE(%esi, LDC)

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
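
/*
 * Pentium 4 path: the trip count is kept scaled by 8 and the eight KERNEL
 * macros are replayed at fixed offsets, so up to 64 k-steps run without a
 * backward branch; the generic path below simply loops over KERNEL1-8.
 */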
#ifdef PENTIUM4
	andl $-8, %eax
	NOBRANCH
	je .L12
	sall $3, %eax

.L1X:
	KERNEL1(32 * 0)
	KERNEL2(32 * 0)
	KERNEL3(32 * 0)
	KERNEL4(32 * 0)
	KERNEL5(32 * 0)
	KERNEL6(32 * 0)
	KERNEL7(32 * 0)
	KERNEL8(32 * 0)
	cmpl $64 * 1, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 1)
	KERNEL2(32 * 1)
	KERNEL3(32 * 1)
	KERNEL4(32 * 1)
	KERNEL5(32 * 1)
	KERNEL6(32 * 1)
	KERNEL7(32 * 1)
	KERNEL8(32 * 1)
	cmpl $64 * 2, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 2)
	KERNEL2(32 * 2)
	KERNEL3(32 * 2)
	KERNEL4(32 * 2)
	KERNEL5(32 * 2)
	KERNEL6(32 * 2)
	KERNEL7(32 * 2)
	KERNEL8(32 * 2)
	cmpl $64 * 3, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 3)
	KERNEL2(32 * 3)
	KERNEL3(32 * 3)
	KERNEL4(32 * 3)
	KERNEL5(32 * 3)
	KERNEL6(32 * 3)
	KERNEL7(32 * 3)
	KERNEL8(32 * 3)
	cmpl $64 * 4, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 4)
	KERNEL2(32 * 4)
	KERNEL3(32 * 4)
	KERNEL4(32 * 4)
	KERNEL5(32 * 4)
	KERNEL6(32 * 4)
	KERNEL7(32 * 4)
	KERNEL8(32 * 4)
	cmpl $64 * 5, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 5)
	KERNEL2(32 * 5)
	KERNEL3(32 * 5)
	KERNEL4(32 * 5)
	KERNEL5(32 * 5)
	KERNEL6(32 * 5)
	KERNEL7(32 * 5)
	KERNEL8(32 * 5)
	cmpl $64 * 6, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 6)
	KERNEL2(32 * 6)
	KERNEL3(32 * 6)
	KERNEL4(32 * 6)
	KERNEL5(32 * 6)
	KERNEL6(32 * 6)
	KERNEL7(32 * 6)
	KERNEL8(32 * 6)
	cmpl $64 * 7, %eax
	NOBRANCH
	jle .L11
	KERNEL1(32 * 7)
	KERNEL2(32 * 7)
	KERNEL3(32 * 7)
	KERNEL4(32 * 7)
	KERNEL5(32 * 7)
	KERNEL6(32 * 7)
	KERNEL7(32 * 7)
	KERNEL8(32 * 7)

	addl $64 * 4 * SIZE, AA
	addl $64 * 4 * SIZE, BB
	subl $64 * 8, %eax
	BRANCH
	jg .L1X

.L11:
	leal (AA, %eax, 4), AA
	leal (BB, %eax, 4), BB
#else
	sarl $3, %eax
	je .L12

.L11:
	KERNEL1(32 * 0)
	KERNEL2(32 * 0)
	KERNEL3(32 * 0)
	KERNEL4(32 * 0)
	KERNEL5(32 * 0)
	KERNEL6(32 * 0)
	KERNEL7(32 * 0)
	KERNEL8(32 * 0)

	addl $32 * SIZE, %ecx
	addl $32 * SIZE, %edx
	decl %eax
	jne .L11
#endif
.L12:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L14

.L13:
	mulpd %xmm0, %xmm2
	mulpd 2 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm4
	movapd 0 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm5
	movapd 2 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
	mulpd 2 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm6
	movapd 4 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm7
	movapd 4 * SIZE(AA), %xmm0

	addl $4 * SIZE, AA	# aoffset += 4
	addl $4 * SIZE, BB	# boffset1 += 4
	subl $1, %eax
	jg .L13
	ALIGN_4
.L14:
#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $4, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 4), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif
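
/*
 * Transpose the accumulated 4x2 product into column order
 * (unpcklpd/unpckhpd) and subtract it from the packed right-hand side,
 * leaving the block to be solved in place.
 */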
#if defined(LN) || defined(LT)
	movapd %xmm4, %xmm0
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	movapd %xmm6, %xmm1
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm1

	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm3
	movapd 4 * SIZE(B), %xmm5
	movapd 6 * SIZE(B), %xmm7

	subpd %xmm4, %xmm2
	subpd %xmm0, %xmm3
	subpd %xmm6, %xmm5
	subpd %xmm1, %xmm7
#else
	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1
	movapd 4 * SIZE(AA), %xmm2
	movapd 6 * SIZE(AA), %xmm3

	subpd %xmm4, %xmm0
	subpd %xmm6, %xmm1
	subpd %xmm5, %xmm2
	subpd %xmm7, %xmm3
#endif
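
/*
 * Triangular solve of the 4x2 block.  Note the diagonal entries are
 * multiplied rather than divided, which suggests the packed triangular
 * panel stores reciprocals of the diagonal, as other trsm kernels in this
 * family do.
 */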
#ifdef LN
	movsd 15 * SIZE(AA), %xmm0
	movhpd 15 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm7
	movsd 14 * SIZE(AA), %xmm0
	movhpd 14 * SIZE(AA), %xmm0
	mulpd %xmm7, %xmm0
	subpd %xmm0, %xmm5
	movsd 13 * SIZE(AA), %xmm0
	movhpd 13 * SIZE(AA), %xmm0
	mulpd %xmm7, %xmm0
	subpd %xmm0, %xmm3
	movsd 12 * SIZE(AA), %xmm0
	movhpd 12 * SIZE(AA), %xmm0
	mulpd %xmm7, %xmm0
	subpd %xmm0, %xmm2

	movsd 10 * SIZE(AA), %xmm0
	movhpd 10 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm5
	movsd 9 * SIZE(AA), %xmm0
	movhpd 9 * SIZE(AA), %xmm0
	mulpd %xmm5, %xmm0
	subpd %xmm0, %xmm3
	movsd 8 * SIZE(AA), %xmm0
	movhpd 8 * SIZE(AA), %xmm0
	mulpd %xmm5, %xmm0
	subpd %xmm0, %xmm2

	movsd 5 * SIZE(AA), %xmm0
	movhpd 5 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
	movsd 4 * SIZE(AA), %xmm0
	movhpd 4 * SIZE(AA), %xmm0
	mulpd %xmm3, %xmm0
	subpd %xmm0, %xmm2

	movsd 0 * SIZE(AA), %xmm0
	movhpd 0 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
#endif

#ifdef LT
	movsd 0 * SIZE(AA), %xmm0
	movhpd 0 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
	movsd 1 * SIZE(AA), %xmm0
	movhpd 1 * SIZE(AA), %xmm0
	mulpd %xmm2, %xmm0
	subpd %xmm0, %xmm3
	movsd 2 * SIZE(AA), %xmm0
	movhpd 2 * SIZE(AA), %xmm0
	mulpd %xmm2, %xmm0
	subpd %xmm0, %xmm5
	movsd 3 * SIZE(AA), %xmm0
	movhpd 3 * SIZE(AA), %xmm0
	mulpd %xmm2, %xmm0
	subpd %xmm0, %xmm7

	movsd 5 * SIZE(AA), %xmm0
	movhpd 5 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
	movsd 6 * SIZE(AA), %xmm0
	movhpd 6 * SIZE(AA), %xmm0
	mulpd %xmm3, %xmm0
	subpd %xmm0, %xmm5
	movsd 7 * SIZE(AA), %xmm0
	movhpd 7 * SIZE(AA), %xmm0
	mulpd %xmm3, %xmm0
	subpd %xmm0, %xmm7

	movsd 10 * SIZE(AA), %xmm0
	movhpd 10 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm5
	movsd 11 * SIZE(AA), %xmm0
	movhpd 11 * SIZE(AA), %xmm0
	mulpd %xmm5, %xmm0
	subpd %xmm0, %xmm7

	movsd 15 * SIZE(AA), %xmm0
	movhpd 15 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm7
#endif

#ifdef RN
	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
	mulpd %xmm4, %xmm1

	movsd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4
	mulpd %xmm0, %xmm4
	subpd %xmm4, %xmm2
	movsd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4
	mulpd %xmm1, %xmm4
	subpd %xmm4, %xmm3

	movsd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm3
#endif

#ifdef RT
	movsd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm2
	mulpd %xmm4, %xmm3

	movsd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4
	mulpd %xmm2, %xmm4
	subpd %xmm4, %xmm0
	movsd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4
	mulpd %xmm3, %xmm4
	subpd %xmm4, %xmm1

	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
	mulpd %xmm4, %xmm1
#endif
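
/*
 * Write the solved block back to the packed panel (B plus its duplicated
 * image in BB, or AA) and then to the two columns of C.
 */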
#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm3, 2 * SIZE(B)
	movapd %xmm5, 4 * SIZE(B)
	movapd %xmm7, 6 * SIZE(B)

	movsd %xmm2,  0 * SIZE(BB)
	movsd %xmm2,  1 * SIZE(BB)
	movhpd %xmm2,  2 * SIZE(BB)
	movhpd %xmm2,  3 * SIZE(BB)
	movsd %xmm3,  4 * SIZE(BB)
	movsd %xmm3,  5 * SIZE(BB)
	movhpd %xmm3,  6 * SIZE(BB)
	movhpd %xmm3,  7 * SIZE(BB)
	movsd %xmm5,  8 * SIZE(BB)
	movsd %xmm5,  9 * SIZE(BB)
	movhpd %xmm5, 10 * SIZE(BB)
	movhpd %xmm5, 11 * SIZE(BB)
	movsd %xmm7, 12 * SIZE(BB)
	movsd %xmm7, 13 * SIZE(BB)
	movhpd %xmm7, 14 * SIZE(BB)
	movhpd %xmm7, 15 * SIZE(BB)
#else
	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)
	movapd %xmm2, 4 * SIZE(AA)
	movapd %xmm3, 6 * SIZE(AA)
#endif

#ifdef LN
	subl $4 * SIZE, %esi
#endif

#if defined(LN) || defined(LT)
	movsd %xmm2, 0 * SIZE(%esi)
	movsd %xmm3, 1 * SIZE(%esi)
	movsd %xmm5, 2 * SIZE(%esi)
	movsd %xmm7, 3 * SIZE(%esi)

	movhpd %xmm2, 0 * SIZE(%esi, LDC)
	movhpd %xmm3, 1 * SIZE(%esi, LDC)
	movhpd %xmm5, 2 * SIZE(%esi, LDC)
	movhpd %xmm7, 3 * SIZE(%esi, LDC)
#else
	movsd %xmm0, 0 * SIZE(%esi)
	movhpd %xmm0, 1 * SIZE(%esi)
	movsd %xmm1, 2 * SIZE(%esi)
	movhpd %xmm1, 3 * SIZE(%esi)

	movsd %xmm2, 0 * SIZE(%esi, LDC)
	movhpd %xmm2, 1 * SIZE(%esi, LDC)
	movsd %xmm3, 2 * SIZE(%esi, LDC)
	movhpd %xmm3, 3 * SIZE(%esi, LDC)
#endif

#ifndef LN
	addl $4 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#ifdef LT
	addl $8 * SIZE, B
#endif
#endif

#ifdef LN
	subl $4, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $4, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $2 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif

	decl %ebx	# i --
	jg .L10
	ALIGN_2
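
/*
 * M remainder: repeat the multiply-and-solve sequence for a 2-row strip,
 * then for a final single row, of the current two-column slab.
 */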
.L30:
	movl M, %ebx
	testl $2, %ebx
	jle .L50

#ifdef LN
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#endif

	leal BUFFER, BB

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + BASE_SHIFT, %eax
	leal (BB, %eax, 2), BB
#endif

	movapd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movapd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movapd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movapd 8 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L32

.L31:
	mulpd %xmm0, %xmm2
	mulpd 2 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm4
	movapd 4 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm5
	movapd 2 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
	mulpd 6 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm6
	movapd 16 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm7
	movapd 4 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
	mulpd 10 * SIZE(BB), %xmm0
	addpd %xmm3, %xmm4
	movapd 12 * SIZE(BB), %xmm3
	addpd %xmm0, %xmm5
	movapd 6 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
	mulpd 14 * SIZE(BB), %xmm0
	addpd %xmm3, %xmm6
	movapd 24 * SIZE(BB), %xmm3
	addpd %xmm0, %xmm7
	movapd 16 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm2
	mulpd 18 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm4
	movapd 20 * SIZE(BB), %xmm2
	addpd %xmm1, %xmm5
	movapd 10 * SIZE(AA), %xmm1
	mulpd %xmm1, %xmm2
	mulpd 22 * SIZE(BB), %xmm1
	addpd %xmm2, %xmm6
	movapd 32 * SIZE(BB), %xmm2
	addpd %xmm1, %xmm7
	movapd 12 * SIZE(AA), %xmm1
	mulpd %xmm1, %xmm3
	mulpd 26 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm4
	movapd 28 * SIZE(BB), %xmm3
	addpd %xmm1, %xmm5
	movapd 14 * SIZE(AA), %xmm1
	mulpd %xmm1, %xmm3
	mulpd 30 * SIZE(BB), %xmm1
	addpd %xmm3, %xmm6
	movapd 40 * SIZE(BB), %xmm3
	addpd %xmm1, %xmm7
	movapd 24 * SIZE(AA), %xmm1

	addl $16 * SIZE, AA
	addl $32 * SIZE, BB
	BRANCH
	decl %eax
	jne .L31
.L32:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L34

.L33:
	mulpd %xmm0, %xmm2
	mulpd 2 * SIZE(BB), %xmm0
	addpd %xmm2, %xmm4
	movapd 4 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm5
	movapd 2 * SIZE(AA), %xmm0

	addl $2 * SIZE, AA	# aoffset += 2
	addl $4 * SIZE, BB	# boffset1 += 4
	decl %eax
	BRANCH
	jg .L33
	ALIGN_4
.L34:
	addpd %xmm6, %xmm4
	addpd %xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	movapd %xmm4, %xmm0
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm3

	subpd %xmm4, %xmm2
	subpd %xmm0, %xmm3
#else
	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1

	subpd %xmm4, %xmm0
	subpd %xmm5, %xmm1
#endif

#ifdef LN
	movsd 3 * SIZE(AA), %xmm0
	movhpd 3 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
	movsd 2 * SIZE(AA), %xmm0
	movhpd 2 * SIZE(AA), %xmm0
	mulpd %xmm3, %xmm0
	subpd %xmm0, %xmm2
	movsd 0 * SIZE(AA), %xmm0
	movhpd 0 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
#endif

#ifdef LT
	movsd 0 * SIZE(AA), %xmm0
	movhpd 0 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm2
	movsd 1 * SIZE(AA), %xmm0
	movhpd 1 * SIZE(AA), %xmm0
	mulpd %xmm2, %xmm0
	subpd %xmm0, %xmm3
	movsd 3 * SIZE(AA), %xmm0
	movhpd 3 * SIZE(AA), %xmm0
	mulpd %xmm0, %xmm3
#endif

#ifdef RN
	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
	movsd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4
	mulpd %xmm0, %xmm4
	subpd %xmm4, %xmm1
	movsd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm1
#endif

#ifdef RT
	movsd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm1
	movsd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4
	mulpd %xmm1, %xmm4
	subpd %xmm4, %xmm0
	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm3, 2 * SIZE(B)

	movsd %xmm2, 0 * SIZE(BB)
	movsd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movsd %xmm3, 4 * SIZE(BB)
	movsd %xmm3, 5 * SIZE(BB)
	movhpd %xmm3, 6 * SIZE(BB)
	movhpd %xmm3, 7 * SIZE(BB)
#else
	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)
#endif

#ifdef LN
	subl $2 * SIZE, %esi
#endif

#if defined(LN) || defined(LT)
	movsd %xmm2, 0 * SIZE(%esi)
	movsd %xmm3, 1 * SIZE(%esi)

	movhpd %xmm2, 0 * SIZE(%esi, LDC)
	movhpd %xmm3, 1 * SIZE(%esi, LDC)
#else
	movsd %xmm0, 0 * SIZE(%esi)
	movhpd %xmm0, 1 * SIZE(%esi)

	movsd %xmm1, 0 * SIZE(%esi, LDC)
	movhpd %xmm1, 1 * SIZE(%esi, LDC)
#endif

#ifndef LN
	addl $2 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif

#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_2
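
/*
 * Single-row tail of the two-column slab: a scalar (movsd/mulsd) version
 * of the same accumulate-and-solve sequence.
 */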
.L50:
	movl M, %ebx
	testl $1, %ebx
	jle .L99

#ifdef LN
	movl K, %eax
	sall $0 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
#endif

	leal BUFFER, %ecx

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $1 + BASE_SHIFT, %eax
	leal (BB, %eax, 2), BB
#endif

	movsd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movsd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movsd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movsd 4 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L52

.L51:
	mulsd %xmm0, %xmm2
	mulsd 2 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm5
	movsd 1 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm4
	movsd 16 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm5
	movsd 2 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	mulsd 10 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm4
	movsd 12 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm5
	movsd 3 * SIZE(AA), %xmm0
	mulsd %xmm0, %xmm3
	mulsd 14 * SIZE(BB), %xmm0
	addsd %xmm3, %xmm4
	movsd 24 * SIZE(BB), %xmm3
	addsd %xmm0, %xmm5
	movsd 8 * SIZE(AA), %xmm0
	mulsd %xmm1, %xmm2
	mulsd 18 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm4
	movsd 20 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	movsd 5 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm2
	mulsd 22 * SIZE(BB), %xmm1
	addsd %xmm2, %xmm4
	movsd 32 * SIZE(BB), %xmm2
	addsd %xmm1, %xmm5
	movsd 6 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	mulsd 26 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm4
	movsd 28 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm5
	movsd 7 * SIZE(AA), %xmm1
	mulsd %xmm1, %xmm3
	mulsd 30 * SIZE(BB), %xmm1
	addsd %xmm3, %xmm4
	movsd 40 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm5
	movsd 12 * SIZE(AA), %xmm1

	addl $ 8 * SIZE, AA
	addl $32 * SIZE, BB
	BRANCH
	decl %eax
	jne .L51

.L52:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L54

.L53:
	mulsd %xmm0, %xmm2
	mulsd 2 * SIZE(BB), %xmm0
	addsd %xmm2, %xmm4
	movsd 4 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm5
	movsd 1 * SIZE(AA), %xmm0

	addl $1 * SIZE, AA	# aoffset += 1
	addl $4 * SIZE, BB	# boffset1 += 4
	decl %eax
	BRANCH
	jg .L53
	ALIGN_4
.L54:
	addsd %xmm6, %xmm4
	addsd %xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $1, %eax
#else
	subl $2, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(B), %xmm0
	movsd 1 * SIZE(B), %xmm1
#else
	movsd 0 * SIZE(AA), %xmm0
	movsd 1 * SIZE(AA), %xmm1
#endif

	subsd %xmm4, %xmm0
	subsd %xmm5, %xmm1

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(AA), %xmm2
	mulsd %xmm2, %xmm0
	mulsd %xmm2, %xmm1
#endif

#ifdef RN
	mulsd 0 * SIZE(B), %xmm0
	movsd 1 * SIZE(B), %xmm4
	mulsd %xmm0, %xmm4
	subsd %xmm4, %xmm1
	mulsd 3 * SIZE(B), %xmm1
#endif

#ifdef RT
	mulsd 3 * SIZE(B), %xmm1
	movsd 2 * SIZE(B), %xmm4
	mulsd %xmm1, %xmm4
	subsd %xmm4, %xmm0
	mulsd 0 * SIZE(B), %xmm0
#endif

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(B)
	movsd %xmm1, 1 * SIZE(B)

	movsd %xmm0, 0 * SIZE(BB)
	movsd %xmm0, 1 * SIZE(BB)
	movsd %xmm1, 2 * SIZE(BB)
	movsd %xmm1, 3 * SIZE(BB)
#else
	movsd %xmm0, 0 * SIZE(AA)
	movsd %xmm1, 1 * SIZE(AA)
#endif

#ifdef LN
	subl $1 * SIZE, %esi
#endif

	movsd %xmm0, 0 * SIZE(%esi)
	movsd %xmm1, 0 * SIZE(%esi, LDC)

#ifndef LN
	addl $1 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 1), AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif

#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $0 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_2

.L99:
#ifdef LN
	movl K, %eax
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (B, %eax, 2), B
#endif

#ifdef RN
	addl $2, KK
#endif
#ifdef RT
	subl $2, KK
#endif

	decl J	# j --
	jg .L01
	ALIGN_2

.L100:
	movl N, %eax
	testl $1, %eax
	jle .L999
	ALIGN_2
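
/*
 * N remainder (N odd): pack the last column of B, then repeat the 4-, 2-
 * and 1-row passes against that single column.
 */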
.L101:
/* Copying to Sub Buffer */
#ifdef LN
	movl OFFSET, %eax
	addl M, %eax
	movl %eax, KK
#endif

	leal BUFFER, %ecx

#ifdef RT
	movl K, %eax
	sall $0 + BASE_SHIFT, %eax
	subl %eax, B
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl B, BORIG
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB
#endif

#ifdef LT
	movl OFFSET, %eax
	movl %eax, KK
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	jle .L103
	ALIGN_4

.L102:
	movsd 0 * SIZE(B), %xmm0
	movsd 1 * SIZE(B), %xmm1
	movsd 2 * SIZE(B), %xmm2
	movsd 3 * SIZE(B), %xmm3
	movsd 4 * SIZE(B), %xmm4
	movsd 5 * SIZE(B), %xmm5
	movsd 6 * SIZE(B), %xmm6
	movsd 7 * SIZE(B), %xmm7

	unpcklpd %xmm0, %xmm0
	unpcklpd %xmm1, %xmm1
	unpcklpd %xmm2, %xmm2
	unpcklpd %xmm3, %xmm3
	unpcklpd %xmm4, %xmm4
	unpcklpd %xmm5, %xmm5
	unpcklpd %xmm6, %xmm6
	unpcklpd %xmm7, %xmm7

	movapd %xmm0,  0 * SIZE(%ecx)
	movapd %xmm1,  2 * SIZE(%ecx)
	movapd %xmm2,  4 * SIZE(%ecx)
	movapd %xmm3,  6 * SIZE(%ecx)
	movapd %xmm4,  8 * SIZE(%ecx)
	movapd %xmm5, 10 * SIZE(%ecx)
	movapd %xmm6, 12 * SIZE(%ecx)
	movapd %xmm7, 14 * SIZE(%ecx)

	prefetcht0 104 * SIZE(B)

	addl $ 8 * SIZE, B
	addl $16 * SIZE, %ecx
	decl %eax
	BRANCH
	jne .L102
	ALIGN_2

.L103:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax
	BRANCH
	jle .L105
	ALIGN_2

.L104:
	movsd 0 * SIZE(B), %xmm0
	unpcklpd %xmm0, %xmm0
	movapd %xmm0, 0 * SIZE(%ecx)
	addl $1 * SIZE, B
	addl $2 * SIZE, %ecx
	decl %eax
	jne .L104
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl A, AA
#else
	movl A, %eax
	movl %eax, AORIG
#endif

#ifdef RT
	subl LDC, C
#endif
	movl C, %esi	# coffset = c
#ifndef RT
	addl LDC, C
#endif

	movl M, %ebx
	sarl $2, %ebx	# i = (m >> 2)
	jle .L130
	ALIGN_4

.L110:
#ifdef LN
	movl K, %eax
	sall $2 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#endif

	leal BUFFER, BB

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $0 + BASE_SHIFT, %eax
	leal (BB, %eax, 2), BB
#endif

	movapd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movapd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movapd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movapd 8 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L112

.L111:
	mulpd %xmm2, %xmm0
	mulpd 2 * SIZE(AA), %xmm2
	addpd %xmm0, %xmm4
	movapd 4 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm6
	movapd 2 * SIZE(BB), %xmm2
	mulpd %xmm2, %xmm0
	mulpd 6 * SIZE(AA), %xmm2
	addpd %xmm0, %xmm5
	movapd 16 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm7
	movapd 4 * SIZE(BB), %xmm2
	mulpd %xmm2, %xmm1
	mulpd 10 * SIZE(AA), %xmm2
	addpd %xmm1, %xmm4
	movapd 12 * SIZE(AA), %xmm1
	addpd %xmm2, %xmm6
	movapd 6 * SIZE(BB), %xmm2
	mulpd %xmm2, %xmm1
	mulpd 14 * SIZE(AA), %xmm2
	addpd %xmm1, %xmm5
	movapd 24 * SIZE(AA), %xmm1
	addpd %xmm2, %xmm7
	movapd 16 * SIZE(BB), %xmm2
	mulpd %xmm3, %xmm0
	mulpd 18 * SIZE(AA), %xmm3
	addpd %xmm0, %xmm4
	movapd 20 * SIZE(AA), %xmm0
	addpd %xmm3, %xmm6
	movapd 10 * SIZE(BB), %xmm3
	mulpd %xmm3, %xmm0
	mulpd 22 * SIZE(AA), %xmm3
	addpd %xmm0, %xmm5
	movapd 32 * SIZE(AA), %xmm0
	addpd %xmm3, %xmm7
	movapd 12 * SIZE(BB), %xmm3
	mulpd %xmm3, %xmm1
	mulpd 26 * SIZE(AA), %xmm3
	addpd %xmm1, %xmm4
	movapd 28 * SIZE(AA), %xmm1
	addpd %xmm3, %xmm6
	movapd 14 * SIZE(BB), %xmm3
	mulpd %xmm3, %xmm1
	mulpd 30 * SIZE(AA), %xmm3
	addpd %xmm1, %xmm5
	movapd 40 * SIZE(AA), %xmm1
	addpd %xmm3, %xmm7
	movapd 24 * SIZE(BB), %xmm3

	addl $32 * SIZE, AA
	addl $16 * SIZE, BB
	decl %eax
	jne .L111

.L112:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L114

.L113:
	mulpd %xmm2, %xmm0
	mulpd 2 * SIZE(AA), %xmm2
	addpd %xmm0, %xmm4
	movapd 4 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm6
	movapd 2 * SIZE(BB), %xmm2

	addl $4 * SIZE, AA	# aoffset += 4
	addl $2 * SIZE, BB	# boffset1 += 2
	subl $1, %eax
	jg .L113
	ALIGN_4
.L114:
	addpd %xmm5, %xmm4
	addpd %xmm7, %xmm6

#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $4, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 4), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movapd 0 * SIZE(B), %xmm0
	movapd 2 * SIZE(B), %xmm1
#else
	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1
#endif

	subpd %xmm4, %xmm0
	subpd %xmm6, %xmm1

#ifdef LN
	movapd %xmm0, %xmm2
	unpckhpd %xmm2, %xmm2
	movapd %xmm1, %xmm3
	unpckhpd %xmm3, %xmm3

	movsd 15 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm3
	movsd 14 * SIZE(AA), %xmm5
	mulsd %xmm3, %xmm5
	subsd %xmm5, %xmm1
	movsd 13 * SIZE(AA), %xmm6
	mulsd %xmm3, %xmm6
	subsd %xmm6, %xmm2
	movsd 12 * SIZE(AA), %xmm7
	mulsd %xmm3, %xmm7
	subsd %xmm7, %xmm0
	movsd 10 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm1
	movsd 9 * SIZE(AA), %xmm5
	mulsd %xmm1, %xmm5
	subsd %xmm5, %xmm2
	movsd 8 * SIZE(AA), %xmm6
	mulsd %xmm1, %xmm6
	subsd %xmm6, %xmm0
	movsd 5 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm2
	movsd 4 * SIZE(AA), %xmm5
	mulsd %xmm2, %xmm5
	subsd %xmm5, %xmm0
	movsd 0 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm0

	unpcklpd %xmm2, %xmm0
	unpcklpd %xmm3, %xmm1
#endif

#ifdef LT
	movapd %xmm0, %xmm2
	unpckhpd %xmm2, %xmm2
	movapd %xmm1, %xmm3
	unpckhpd %xmm3, %xmm3

	movsd 0 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm0
	movsd 1 * SIZE(AA), %xmm5
	mulsd %xmm0, %xmm5
	subsd %xmm5, %xmm2
	movsd 2 * SIZE(AA), %xmm6
	mulsd %xmm0, %xmm6
	subsd %xmm6, %xmm1
	movsd 3 * SIZE(AA), %xmm7
	mulsd %xmm0, %xmm7
	subsd %xmm7, %xmm3
	movsd 5 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm2
	movsd 6 * SIZE(AA), %xmm5
	mulsd %xmm2, %xmm5
	subsd %xmm5, %xmm1
	movsd 7 * SIZE(AA), %xmm6
	mulsd %xmm2, %xmm6
	subsd %xmm6, %xmm3
	movsd 10 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm1
	movsd 11 * SIZE(AA), %xmm5
	mulsd %xmm1, %xmm5
	subsd %xmm5, %xmm3
	movsd 15 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm3

	unpcklpd %xmm2, %xmm0
	unpcklpd %xmm3, %xmm1
#endif

#if defined(RN) || defined(RT)
	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
	mulpd %xmm4, %xmm1
#endif

#if defined(LN) || defined(LT)
	movapd %xmm0, 0 * SIZE(B)
	movapd %xmm1, 2 * SIZE(B)

	movsd %xmm0, 0 * SIZE(BB)
	movsd %xmm0, 1 * SIZE(BB)
	movhpd %xmm0, 2 * SIZE(BB)
	movhpd %xmm0, 3 * SIZE(BB)
	movsd %xmm1, 4 * SIZE(BB)
	movsd %xmm1, 5 * SIZE(BB)
	movhpd %xmm1, 6 * SIZE(BB)
	movhpd %xmm1, 7 * SIZE(BB)
#else
	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)
#endif

#ifdef LN
	subl $4 * SIZE, %esi
#endif

	movsd %xmm0, 0 * SIZE(%esi)
	movhpd %xmm0, 1 * SIZE(%esi)
	movsd %xmm1, 2 * SIZE(%esi)
	movhpd %xmm1, 3 * SIZE(%esi)

#ifndef LN
	addl $4 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 4), AA
#ifdef LT
	addl $4 * SIZE, B
#endif
#endif

#ifdef LN
	subl $4, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $4, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $2 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif

	BRANCH
	decl %ebx	# i --
	jg .L110
	ALIGN_2

.L130:
	movl M, %ebx
	testl $2, %ebx
	jle .L150

#ifdef LN
	movl K, %eax
	sall $1 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#endif

	leal BUFFER, BB

	movapd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movapd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movapd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movapd 8 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $0 + BASE_SHIFT, %eax
	leal (BB, %eax, 2), BB
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L132

.L131:
	mulpd %xmm0, %xmm2
	movapd 2 * SIZE(AA), %xmm0
	addpd %xmm2, %xmm4
	mulpd 2 * SIZE(BB), %xmm0
	movapd 16 * SIZE(BB), %xmm2
	addpd %xmm0, %xmm5
	movapd 4 * SIZE(AA), %xmm0
	mulpd 4 * SIZE(BB), %xmm0
	addpd %xmm0, %xmm6
	movapd 6 * SIZE(AA), %xmm0
	mulpd 6 * SIZE(BB), %xmm0
	addpd %xmm0, %xmm7
	movapd 16 * SIZE(AA), %xmm0
	mulpd %xmm1, %xmm3
	movapd 10 * SIZE(AA), %xmm1
	addpd %xmm3, %xmm4
	mulpd 10 * SIZE(BB), %xmm1
	movapd 24 * SIZE(BB), %xmm3
	addpd %xmm1, %xmm5
	movapd 12 * SIZE(AA), %xmm1
	mulpd 12 * SIZE(BB), %xmm1
	addpd %xmm1, %xmm6
	movapd 14 * SIZE(AA), %xmm1
	mulpd 14 * SIZE(BB), %xmm1
	addpd %xmm1, %xmm7
	movapd 24 * SIZE(AA), %xmm1

	addl $16 * SIZE, AA
	addl $16 * SIZE, BB
	BRANCH
	decl %eax
	jne .L131

.L132:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L134

.L133:
	movapd 0 * SIZE(AA), %xmm0
	mulpd 0 * SIZE(BB), %xmm0
	addpd %xmm0, %xmm4

	addl $2 * SIZE, AA	# aoffset += 2
	addl $2 * SIZE, BB	# boffset1 += 2
	decl %eax
	BRANCH
	jg .L133
	ALIGN_4
.L134:
	addpd %xmm5, %xmm4
	addpd %xmm7, %xmm6
	addpd %xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl KK, %eax
#ifdef LN
	subl $2, %eax
#else
	subl $1, %eax
#endif
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movapd 0 * SIZE(B), %xmm0
#else
	movapd 0 * SIZE(AA), %xmm0
#endif

	subpd %xmm4, %xmm0

#ifdef LN
	movapd %xmm0, %xmm2
	unpckhpd %xmm2, %xmm2

	movsd 3 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm2
	movsd 2 * SIZE(AA), %xmm5
	mulsd %xmm2, %xmm5
	subsd %xmm5, %xmm0
	movsd 0 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm0

	unpcklpd %xmm2, %xmm0
#endif

#ifdef LT
	movapd %xmm0, %xmm2
	unpckhpd %xmm2, %xmm2

	movsd 0 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm0
	movsd 1 * SIZE(AA), %xmm5
	mulsd %xmm0, %xmm5
	subsd %xmm5, %xmm2
	movsd 3 * SIZE(AA), %xmm4
	mulsd %xmm4, %xmm2

	unpcklpd %xmm2, %xmm0
#endif

#if defined(RN) || defined(RT)
	movsd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
	mulpd %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd %xmm0, 0 * SIZE(B)

	movsd %xmm0, 0 * SIZE(BB)
	movsd %xmm0, 1 * SIZE(BB)
	movhpd %xmm0, 2 * SIZE(BB)
	movhpd %xmm0, 3 * SIZE(BB)
#else
	movapd %xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl $2 * SIZE, %esi
#endif

	movsd %xmm0, 0 * SIZE(%esi)
	movhpd %xmm0, 1 * SIZE(%esi)

#ifndef LN
	addl $2 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 2), AA
#ifdef LT
	addl $2 * SIZE, B
#endif
#endif

#ifdef LN
	subl $2, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $2, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $1 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_2

.L150:
	movl M, %ebx
	testl $1, %ebx
	jle .L159

#ifdef LN
	movl K, %eax
	sall $0 + BASE_SHIFT, %eax
	subl %eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl KK, %eax
	movl AORIG, AA
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
#endif

	leal BUFFER, BB

	movsd 0 * SIZE(BB), %xmm2
	pxor %xmm4, %xmm4
	movsd 0 * SIZE(AA), %xmm0
	pxor %xmm5, %xmm5
	movsd 8 * SIZE(BB), %xmm3
	pxor %xmm6, %xmm6
	movsd 4 * SIZE(AA), %xmm1
	pxor %xmm7, %xmm7

#if defined(LN) || defined(RT)
	movl KK, %eax
	sall $0 + BASE_SHIFT, %eax
	leal (BB, %eax, 2), BB
#endif

#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	sarl $3, %eax
	je .L152

.L151:
	mulsd %xmm0, %xmm2
	movsd 1 * SIZE(AA), %xmm0
	addsd %xmm2, %xmm4
	mulsd 2 * SIZE(BB), %xmm0
	movsd 16 * SIZE(BB), %xmm2
	addsd %xmm0, %xmm4
	movsd 2 * SIZE(AA), %xmm0
	mulsd 4 * SIZE(BB), %xmm0
	addsd %xmm0, %xmm4
	movsd 3 * SIZE(AA), %xmm0
	mulsd 6 * SIZE(BB), %xmm0
	addsd %xmm0, %xmm4
	movsd 8 * SIZE(AA), %xmm0
	mulsd %xmm1, %xmm3
	movsd 5 * SIZE(AA), %xmm1
	addsd %xmm3, %xmm4
	mulsd 10 * SIZE(BB), %xmm1
	movsd 24 * SIZE(BB), %xmm3
	addsd %xmm1, %xmm4
	movsd 6 * SIZE(AA), %xmm1
	mulsd 12 * SIZE(BB), %xmm1
	addsd %xmm1, %xmm4
	movsd 7 * SIZE(AA), %xmm1
	mulsd 14 * SIZE(BB), %xmm1
	addsd %xmm1, %xmm4
	movsd 12 * SIZE(AA), %xmm1

	addl $ 8 * SIZE, AA
	addl $16 * SIZE, BB
	BRANCH
	decl %eax
	jne .L151

.L152:
#if defined(LT) || defined(RN)
	movl KK, %eax
#else
	movl K, %eax
	subl KK, %eax
#endif
	andl $7, %eax	# remainder (k & 7)
	BRANCH
	je .L154

.L153:
	movsd 0 * SIZE(AA), %xmm0
	mulsd 0 * SIZE(BB), %xmm0
	addsd %xmm0, %xmm4

	addl $1 * SIZE, AA	# aoffset += 1
	addl $2 * SIZE, BB	# boffset1 += 2
	decl %eax
	BRANCH
	jg .L153
	ALIGN_4
.L154:
	addsd %xmm6, %xmm4
	addsd %xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl KK, %eax
	subl $1, %eax
	movl AORIG, AA
	movl BORIG, B
	leal BUFFER, BB
	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 1), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movsd 0 * SIZE(B), %xmm0
#else
	movsd 0 * SIZE(AA), %xmm0
#endif

	subsd %xmm4, %xmm0

#if defined(LN) || defined(LT)
	mulsd 0 * SIZE(AA), %xmm0
#endif

#if defined(RN) || defined(RT)
	mulsd 0 * SIZE(B), %xmm0
#endif

#if defined(LN) || defined(LT)
	movsd %xmm0, 0 * SIZE(B)

	movsd %xmm0, 0 * SIZE(BB)
	movsd %xmm0, 1 * SIZE(BB)
#else
	movsd %xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl $1 * SIZE, %esi
#endif

	movsd %xmm0, 0 * SIZE(%esi)

#ifndef LN
	addl $1 * SIZE, %esi
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (AA, %eax, 1), AA
#ifdef LT
	addl $1 * SIZE, B
#endif
#endif

#ifdef LN
	subl $1, KK
	movl BORIG, B
#endif
#ifdef LT
	addl $1, KK
#endif
#ifdef RT
	movl K, %eax
	movl BORIG, B
	sall $0 + BASE_SHIFT, %eax
	addl %eax, AORIG
#endif
	ALIGN_2

.L159:
#ifdef LN
	movl K, %eax
	leal (, %eax, SIZE), %eax
	leal (B, %eax, 1), B
#endif

#if defined(LT) || defined(RN)
	movl K, %eax
	subl KK, %eax
	leal (,%eax, SIZE), %eax
	leal (B, %eax, 1), B
#endif

#ifdef RN
	addl $1, KK
#endif
#ifdef RT
	subl $1, KK
#endif
	ALIGN_2

.L999:
	movl OLD_STACK, %esp

	EMMS

	popl %ebx
	popl %esi
	popl %edi
	popl %ebp
	ret
	ALIGN_2

	EPILOGUE