
zscal_sse2.S 33 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
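
/*
 * ZSCAL kernel for 32-bit x86 with SSE2: computes x(i) = alpha * x(i)
 * for a double-precision complex vector x of length m and a complex
 * scalar alpha, with a separate fast path that simply zero-fills x
 * when alpha == 0.
 */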
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M        4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 24 + STACK + ARGS(%esp)
#define STACK_X       32 + STACK + ARGS(%esp)
#define STACK_INCX    36 + STACK + ARGS(%esp)

#define M    %ebx
#define X    %ecx
#define INCX %edx
#define I    %esi
#define XX   %edi
#define FLAG %ebp

#include "l1param.h"

#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD
#else
#define USE_PSHUFD_HALF
#endif
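
/*
 * On cores where PSHUFD is cheap (NEHALEM, PENRYN, DUNNINGTON,
 * SANDYBRIDGE) every real/imaginary swap below uses PSHUFD; on other
 * targets only every other swap does, and the rest are assembled from
 * MOVSD/MOVHPS pairs (the USE_PSHUFD_HALF blocks in the loops below).
 */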
#define xmm8  xmm0
#define xmm9  xmm1
#define xmm10 xmm2
#define xmm11 xmm3
#define xmm12 xmm4
#define xmm13 xmm5
#define xmm14 xmm6
#define xmm15 xmm7

        PROLOGUE
        PROFCODE

        pushl %edi
        pushl %esi
        pushl %ebx
        pushl %ebp

        movl STACK_M, M
        movl STACK_X, X
        movl STACK_INCX, INCX
        movsd STACK_ALPHA_R, %xmm0
        movsd STACK_ALPHA_I, %xmm1

        sall $ZBASE_SHIFT, INCX
        xor FLAG, FLAG

        testl M, M
        jle .L999

        xorps %xmm7, %xmm7
        comisd %xmm0, %xmm7
        jne .L100
        jp .L100
        comisd %xmm1, %xmm7
        jne .L100
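
/*
 * COMISD raises the parity flag on an unordered compare, so the JP
 * above catches alpha_r == NaN and routes it to the general path.
 * Note that a NaN in alpha_i alone is not caught here and falls
 * through to the alpha == 0 path.
 */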
/* Alpha == ZERO */
        cmpl $2 * SIZE, INCX
        jne .L20

/* INCX == 1 */
        testl $SIZE, X
        je .L05

        movsd %xmm7, 0 * SIZE(X)
        addl $SIZE, X
        movl $1, FLAG
        decl M
        jle .L19
        ALIGN_3
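
/*
 * X was 8-byte but not 16-byte aligned: the MOVSD above cleared the
 * leading double (the real part of x[0]), and FLAG records that one
 * trailing double still needs clearing at .L19. From here on all
 * stores are 16-byte aligned; .L11 zeroes 8 complex elements
 * (16 doubles) per iteration.
 */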
.L05:
        movl M, I       # I = m
        sarl $3, I
        jle .L12
        ALIGN_4

.L11:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movaps %xmm7,  0 * SIZE(X)
        movaps %xmm7,  2 * SIZE(X)
        movaps %xmm7,  4 * SIZE(X)
        movaps %xmm7,  6 * SIZE(X)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        movaps %xmm7,  8 * SIZE(X)
        movaps %xmm7, 10 * SIZE(X)
        movaps %xmm7, 12 * SIZE(X)
        movaps %xmm7, 14 * SIZE(X)

        addl $16 * SIZE, X
        decl I
        jg .L11
        ALIGN_4

.L12:
        testl $4, M
        je .L13

        movaps %xmm7, 0 * SIZE(X)
        movaps %xmm7, 2 * SIZE(X)
        movaps %xmm7, 4 * SIZE(X)
        movaps %xmm7, 6 * SIZE(X)
        addl $8 * SIZE, X
        ALIGN_3

.L13:
        testl $2, M
        je .L14

        movaps %xmm7, 0 * SIZE(X)
        movaps %xmm7, 2 * SIZE(X)
        addl $4 * SIZE, X
        ALIGN_3

.L14:
        testl $1, M
        je .L19

        movaps %xmm7, 0 * SIZE(X)
        addl $2 * SIZE, X
        ALIGN_3

.L19:
        testl $1, FLAG
        je .L999

        movsd %xmm7, 0 * SIZE(X)
        jmp .L999
        ALIGN_4
/* incx != 1 */
.L20:
        testl $SIZE, X
        jne .L30

/* Aligned Mode */
        movl M, I       # I = m
        sarl $2, I
        jle .L22
        ALIGN_4

.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movaps %xmm7, (X)
        addl INCX, X
        movaps %xmm7, (X)
        addl INCX, X
        movaps %xmm7, (X)
        addl INCX, X
        movaps %xmm7, (X)
        addl INCX, X
        decl I
        jg .L21
        ALIGN_4

.L22:
        testl $3, M
        je .L999

        testl $2, M
        je .L23

        movaps %xmm7, (X)
        addl INCX, X
        movaps %xmm7, (X)
        addl INCX, X
        ALIGN_3

.L23:
        testl $1, M
        je .L999

        movaps %xmm7, (X)
        jmp .L999
        ALIGN_4
/* Unaligned Mode */
.L30:
        movl M, I       # I = m
        sarl $2, I
        jle .L32
        ALIGN_4

.L31:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        decl I
        jg .L31
        ALIGN_4

.L32:
        testl $3, M
        je .L999

        testl $2, M
        je .L33

        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        addl INCX, X
        ALIGN_3

.L33:
        testl $1, M
        je .L999

        movlps %xmm7, 0 * SIZE(X)
        movlps %xmm7, 1 * SIZE(X)
        jmp .L999
        ALIGN_4
/* Alpha != ZERO */
.L100:
        testl $SIZE, X
        jne .L200

#ifdef HAVE_SSE3
        movddup %xmm0, %xmm6
#else
        pshufd $0x44, %xmm0, %xmm6
#endif

        xorps %xmm7, %xmm7
        subsd %xmm1, %xmm7
        movlhps %xmm1, %xmm7
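
/*
 * Constants for the complex multiply:
 *   xmm6 = ( alpha_r,  alpha_r )
 *   xmm7 = (-alpha_i,  alpha_i )          (low, high)
 * For an element x = (x_r, x_i), PSHUFD $0x4e swaps the halves to
 * give (x_i, x_r), so that
 *   xmm6 * x + xmm7 * swap(x)
 *     = (alpha_r*x_r - alpha_i*x_i, alpha_r*x_i + alpha_i*x_r),
 * which is the complex product alpha * x.
 */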
        cmpl $2 * SIZE, INCX
        jne .L120

        subl $-16 * SIZE, X

        movl M, I
        sarl $3, I
        jle .L115

        movaps -16 * SIZE(X), %xmm0
        movaps -14 * SIZE(X), %xmm1
        movaps -12 * SIZE(X), %xmm2
        movaps -10 * SIZE(X), %xmm3
        decl I
        jle .L112
        ALIGN_4
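
/*
 * Contiguous, 16-byte-aligned main loop: eight complex elements per
 * iteration, software pipelined. Four vectors are preloaded above,
 * and each block stores one result while loading the operand for the
 * next round. X was biased by 16*SIZE so that every displacement
 * below (-128..48 bytes) fits in a short one-byte encoding.
 */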
.L111:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm0, %xmm5
#else
        movsd -15 * SIZE(X), %xmm5
        movhps -16 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -16 * SIZE(X)
        movaps -8 * SIZE(X), %xmm0

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm1, %xmm5
#else
        movsd -13 * SIZE(X), %xmm5
        movhps -14 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -14 * SIZE(X)
        movaps -6 * SIZE(X), %xmm1

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm2, %xmm5
#else
        movsd -11 * SIZE(X), %xmm5
        movhps -12 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, -12 * SIZE(X)
        movaps -4 * SIZE(X), %xmm2

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm3, %xmm5
#else
        movsd -9 * SIZE(X), %xmm5
        movhps -10 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, -10 * SIZE(X)
        movaps -2 * SIZE(X), %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm0, %xmm5
#else
        movsd -7 * SIZE(X), %xmm5
        movhps -8 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -8 * SIZE(X)
        movaps 0 * SIZE(X), %xmm0

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm1, %xmm5
#else
        movsd -5 * SIZE(X), %xmm5
        movhps -6 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -6 * SIZE(X)
        movaps 2 * SIZE(X), %xmm1

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm2, %xmm5
#else
        movsd -3 * SIZE(X), %xmm5
        movhps -4 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, -4 * SIZE(X)
        movaps 4 * SIZE(X), %xmm2

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm3, %xmm5
#else
        movsd -1 * SIZE(X), %xmm5
        movhps -2 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, -2 * SIZE(X)
        movaps 6 * SIZE(X), %xmm3

        subl $-16 * SIZE, X
        decl I
        jg .L111
        ALIGN_4
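
/*
 * .L112 drains the software pipeline: it handles the final chunk of
 * eight elements, four already in registers plus four more loads,
 * without fetching anything beyond the end of the vector.
 */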
.L112:
#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm0, %xmm5
#else
        movsd -15 * SIZE(X), %xmm5
        movhps -16 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -16 * SIZE(X)
        movaps -8 * SIZE(X), %xmm0

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm1, %xmm5
#else
        movsd -13 * SIZE(X), %xmm5
        movhps -14 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -14 * SIZE(X)
        movaps -6 * SIZE(X), %xmm1

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm2, %xmm5
#else
        movsd -11 * SIZE(X), %xmm5
        movhps -12 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, -12 * SIZE(X)
        movaps -4 * SIZE(X), %xmm2

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm3, %xmm5
#else
        movsd -9 * SIZE(X), %xmm5
        movhps -10 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, -10 * SIZE(X)
        movaps -2 * SIZE(X), %xmm3

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm0, %xmm5
#else
        movsd -7 * SIZE(X), %xmm5
        movhps -8 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -8 * SIZE(X)

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm1, %xmm5
#else
        movsd -5 * SIZE(X), %xmm5
        movhps -6 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -6 * SIZE(X)

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
        pshufd $0x4e, %xmm2, %xmm5
#else
        movsd -3 * SIZE(X), %xmm5
        movhps -4 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, -4 * SIZE(X)

#ifdef USE_PSHUFD
        pshufd $0x4e, %xmm3, %xmm5
#else
        movsd -1 * SIZE(X), %xmm5
        movhps -2 * SIZE(X), %xmm5
#endif
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, -2 * SIZE(X)

        subl $-16 * SIZE, X
        ALIGN_3
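
/*
 * Tail of the contiguous path: the remaining m % 8 elements are
 * handled in blocks of 4, 2 and 1.
 */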
.L115:
        testl $7, M
        je .L999

        testl $4, M
        je .L116

        movaps -16 * SIZE(X), %xmm0
        movaps -14 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -16 * SIZE(X)

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -14 * SIZE(X)

        movaps -12 * SIZE(X), %xmm2
        movaps -10 * SIZE(X), %xmm3

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, -12 * SIZE(X)

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, -10 * SIZE(X)

        addl $8 * SIZE, X
        ALIGN_3

.L116:
        testl $2, M
        je .L117

        movaps -16 * SIZE(X), %xmm0
        movaps -14 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -16 * SIZE(X)

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, -14 * SIZE(X)

        addl $4 * SIZE, X
        ALIGN_3

.L117:
        testl $1, M
        je .L999

        movaps -16 * SIZE(X), %xmm0
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, -16 * SIZE(X)
        jmp .L999
        ALIGN_3
.L120:
        movl X, XX

        movl M, I
        sarl $3, I
        jle .L125

        movaps (X), %xmm0
        addl INCX, X
        movaps (X), %xmm1
        addl INCX, X
        movaps (X), %xmm2
        addl INCX, X
        movaps (X), %xmm3
        addl INCX, X
        decl I
        jle .L122
        ALIGN_4
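
/*
 * Strided (INCX != 1) but 16-byte-aligned elements: X runs ahead
 * doing the loads while XX trails behind doing the stores, with four
 * elements in flight.
 */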
.L121:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX
        movaps (X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX
        movaps (X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, (XX)
        addl INCX, XX
        movaps (X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, (XX)
        addl INCX, XX
        movaps (X), %xmm3
        addl INCX, X

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX
        movaps (X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX
        movaps (X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, (XX)
        addl INCX, XX
        movaps (X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, (XX)
        addl INCX, XX
        movaps (X), %xmm3
        addl INCX, X

        decl I
        jg .L121
        ALIGN_4

.L122:
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX
        movaps (X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX
        movaps (X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, (XX)
        addl INCX, XX
        movaps (X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, (XX)
        addl INCX, XX
        movaps (X), %xmm3
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, (XX)
        addl INCX, XX
        ALIGN_3

.L125:
        testl $7, M
        je .L999

        testl $4, M
        je .L126

        movaps (X), %xmm0
        addl INCX, X
        movaps (X), %xmm1
        addl INCX, X
        movaps (X), %xmm2
        addl INCX, X
        movaps (X), %xmm3
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movaps %xmm2, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movaps %xmm3, (XX)
        addl INCX, XX
        ALIGN_3

.L126:
        testl $2, M
        je .L127

        movaps (X), %xmm0
        addl INCX, X
        movaps (X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        addl INCX, XX

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movaps %xmm1, (XX)
        addl INCX, XX
        ALIGN_3

.L127:
        testl $1, M
        je .L999

        movaps (X), %xmm0
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movaps %xmm0, (XX)
        jmp .L999
        ALIGN_3

.L200:
        cmpl $2 * SIZE, INCX
        jne .L220
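
/*
 * X is only 8-byte aligned (it points into the middle of a 16-byte
 * vector). With ALIGNED_ACCESS the code below peels off the first
 * real part with scalar math, then works on aligned vectors that pair
 * the imaginary part of element k with the real part of element k+1,
 * using SHUFPD_1 to recombine neighbours; the last imaginary part is
 * finished at .L208. Otherwise (including on NEHALEM and SANDYBRIDGE,
 * where unaligned access is cheap) it falls through to plain
 * MOVSD/MOVHPS unaligned loads and stores.
 */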
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)

#ifdef HAVE_SSE3
        movddup %xmm0, %xmm6
#else
        pshufd $0x44, %xmm0, %xmm6
#endif

        pxor %xmm7, %xmm7
        subsd %xmm1, %xmm7
        movlhps %xmm1, %xmm7
        shufpd $1, %xmm7, %xmm7

        movhps 0 * SIZE(X), %xmm0
        movaps 1 * SIZE(X), %xmm1
        subl $-16 * SIZE, X

        unpckhpd %xmm0, %xmm0
        mulsd %xmm6, %xmm0
        movaps %xmm1, %xmm5
        mulsd %xmm7, %xmm5
        subsd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)

        decl M
        movl M, I
        sarl $3, I
        jle .L205

        movaps -13 * SIZE(X), %xmm2
        movaps -11 * SIZE(X), %xmm3
        decl I
        jle .L202
        ALIGN_4

.L201:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -15 * SIZE(X)
        movaps -9 * SIZE(X), %xmm0

        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -13 * SIZE(X)
        movaps -7 * SIZE(X), %xmm1

        movaps %xmm3, %xmm5
        SHUFPD_1 %xmm0, %xmm2
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm2
        addpd %xmm5, %xmm2
        movaps %xmm2, -11 * SIZE(X)
        movaps -5 * SIZE(X), %xmm2

        movaps %xmm0, %xmm5
        SHUFPD_1 %xmm1, %xmm3
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm3
        addpd %xmm5, %xmm3
        movaps %xmm3, -9 * SIZE(X)
        movaps -3 * SIZE(X), %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -7 * SIZE(X)
        movaps -1 * SIZE(X), %xmm0

        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -5 * SIZE(X)
        movaps 1 * SIZE(X), %xmm1

        movaps %xmm3, %xmm5
        SHUFPD_1 %xmm0, %xmm2
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm2
        addpd %xmm5, %xmm2
        movaps %xmm2, -3 * SIZE(X)
        movaps 3 * SIZE(X), %xmm2

        movaps %xmm0, %xmm5
        SHUFPD_1 %xmm1, %xmm3
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm3
        addpd %xmm5, %xmm3
        movaps %xmm3, -1 * SIZE(X)
        movaps 5 * SIZE(X), %xmm3

        subl $-16 * SIZE, X
        decl I
        jg .L201
        ALIGN_4

.L202:
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -15 * SIZE(X)
        movaps -9 * SIZE(X), %xmm0

        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -13 * SIZE(X)
        movaps -7 * SIZE(X), %xmm1

        movaps %xmm3, %xmm5
        SHUFPD_1 %xmm0, %xmm2
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm2
        addpd %xmm5, %xmm2
        movaps %xmm2, -11 * SIZE(X)
        movaps -5 * SIZE(X), %xmm2

        movaps %xmm0, %xmm5
        SHUFPD_1 %xmm1, %xmm3
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm3
        addpd %xmm5, %xmm3
        movaps %xmm3, -9 * SIZE(X)
        movaps -3 * SIZE(X), %xmm3

        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -7 * SIZE(X)
        movaps -1 * SIZE(X), %xmm0

        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -5 * SIZE(X)
        movaps 1 * SIZE(X), %xmm1

        movaps %xmm3, %xmm5
        SHUFPD_1 %xmm0, %xmm2
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm2
        addpd %xmm5, %xmm2
        movaps %xmm2, -3 * SIZE(X)

        movaps %xmm0, %xmm5
        SHUFPD_1 %xmm1, %xmm3
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm3
        addpd %xmm5, %xmm3
        movaps %xmm3, -1 * SIZE(X)

        subl $-16 * SIZE, X
        ALIGN_3

.L205:
        testl $4, M
        je .L206

        movaps -13 * SIZE(X), %xmm2
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -15 * SIZE(X)

        movaps -11 * SIZE(X), %xmm3
        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -13 * SIZE(X)

        movaps -9 * SIZE(X), %xmm0
        movaps %xmm3, %xmm5
        SHUFPD_1 %xmm0, %xmm2
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm2
        addpd %xmm5, %xmm2
        movaps %xmm2, -11 * SIZE(X)

        movaps -7 * SIZE(X), %xmm1
        movaps %xmm0, %xmm5
        SHUFPD_1 %xmm1, %xmm3
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm3
        addpd %xmm5, %xmm3
        movaps %xmm3, -9 * SIZE(X)

        addl $8 * SIZE, X
        ALIGN_3

.L206:
        testl $2, M
        je .L207

        movaps -13 * SIZE(X), %xmm2
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -15 * SIZE(X)

        movaps -11 * SIZE(X), %xmm3
        movaps %xmm2, %xmm5
        SHUFPD_1 %xmm3, %xmm1
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm1
        addpd %xmm5, %xmm1
        movaps %xmm1, -13 * SIZE(X)

        movaps %xmm2, %xmm0
        movaps %xmm3, %xmm1
        addl $4 * SIZE, X
        ALIGN_3

.L207:
        testl $1, M
        je .L208

        movaps -13 * SIZE(X), %xmm2
        movaps %xmm1, %xmm5
        SHUFPD_1 %xmm2, %xmm0
        mulpd %xmm6, %xmm5
        mulpd %xmm7, %xmm0
        addpd %xmm5, %xmm0
        movaps %xmm0, -15 * SIZE(X)

        movaps %xmm1, %xmm0
        movaps %xmm2, %xmm1
        addl $2 * SIZE, X
        ALIGN_3

.L208:
        unpckhpd %xmm0, %xmm0
        mulsd %xmm6, %xmm1
        mulsd %xmm7, %xmm0
        addsd %xmm1, %xmm0
        movlps %xmm0, -15 * SIZE(X)
        jmp .L999
        ALIGN_3

#else
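
/*
 * Unaligned fallback: the same arithmetic as the aligned path, with
 * each element loaded and stored as MOVSD/MOVHPS halves.
 */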
#ifdef HAVE_SSE3
        movddup %xmm0, %xmm6
#else
        pshufd $0x44, %xmm0, %xmm6
#endif

        pxor %xmm7, %xmm7
        subsd %xmm1, %xmm7
        movlhps %xmm1, %xmm7

        subl $-16 * SIZE, X

        movl M, I
        sarl $3, I
        jle .L205

        movsd -16 * SIZE(X), %xmm0
        movhps -15 * SIZE(X), %xmm0
        movsd -14 * SIZE(X), %xmm1
        movhps -13 * SIZE(X), %xmm1
        movsd -12 * SIZE(X), %xmm2
        movhps -11 * SIZE(X), %xmm2
        movsd -10 * SIZE(X), %xmm3
        movhps -9 * SIZE(X), %xmm3
        decl I
        jle .L202
        ALIGN_4

.L201:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)
        movhps %xmm0, -15 * SIZE(X)
        movsd -8 * SIZE(X), %xmm0
        movhps -7 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -14 * SIZE(X)
        movhps %xmm1, -13 * SIZE(X)
        movsd -6 * SIZE(X), %xmm1
        movhps -5 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, -12 * SIZE(X)
        movhps %xmm2, -11 * SIZE(X)
        movsd -4 * SIZE(X), %xmm2
        movhps -3 * SIZE(X), %xmm2

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, -10 * SIZE(X)
        movhps %xmm3, -9 * SIZE(X)
        movsd -2 * SIZE(X), %xmm3
        movhps -1 * SIZE(X), %xmm3

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -8 * SIZE(X)
        movhps %xmm0, -7 * SIZE(X)
        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -6 * SIZE(X)
        movhps %xmm1, -5 * SIZE(X)
        movsd 2 * SIZE(X), %xmm1
        movhps 3 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, -4 * SIZE(X)
        movhps %xmm2, -3 * SIZE(X)
        movsd 4 * SIZE(X), %xmm2
        movhps 5 * SIZE(X), %xmm2

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, -2 * SIZE(X)
        movhps %xmm3, -1 * SIZE(X)
        movsd 6 * SIZE(X), %xmm3
        movhps 7 * SIZE(X), %xmm3

        subl $-16 * SIZE, X
        decl I
        jg .L201
        ALIGN_4

.L202:
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)
        movhps %xmm0, -15 * SIZE(X)
        movsd -8 * SIZE(X), %xmm0
        movhps -7 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -14 * SIZE(X)
        movhps %xmm1, -13 * SIZE(X)
        movsd -6 * SIZE(X), %xmm1
        movhps -5 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, -12 * SIZE(X)
        movhps %xmm2, -11 * SIZE(X)
        movsd -4 * SIZE(X), %xmm2
        movhps -3 * SIZE(X), %xmm2

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, -10 * SIZE(X)
        movhps %xmm3, -9 * SIZE(X)
        movsd -2 * SIZE(X), %xmm3
        movhps -1 * SIZE(X), %xmm3

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -8 * SIZE(X)
        movhps %xmm0, -7 * SIZE(X)

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -6 * SIZE(X)
        movhps %xmm1, -5 * SIZE(X)

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, -4 * SIZE(X)
        movhps %xmm2, -3 * SIZE(X)

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, -2 * SIZE(X)
        movhps %xmm3, -1 * SIZE(X)

        subl $-16 * SIZE, X
        ALIGN_3

.L205:
        testl $7, M
        je .L999

        testl $4, M
        je .L206

        movsd -16 * SIZE(X), %xmm0
        movhps -15 * SIZE(X), %xmm0
        movsd -14 * SIZE(X), %xmm1
        movhps -13 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)
        movhps %xmm0, -15 * SIZE(X)

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -14 * SIZE(X)
        movhps %xmm1, -13 * SIZE(X)

        movsd -12 * SIZE(X), %xmm2
        movhps -11 * SIZE(X), %xmm2
        movsd -10 * SIZE(X), %xmm3
        movhps -9 * SIZE(X), %xmm3

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, -12 * SIZE(X)
        movhps %xmm2, -11 * SIZE(X)

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, -10 * SIZE(X)
        movhps %xmm3, -9 * SIZE(X)

        addl $8 * SIZE, X
        ALIGN_3

.L206:
        testl $2, M
        je .L207

        movsd -16 * SIZE(X), %xmm0
        movhps -15 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)
        movhps %xmm0, -15 * SIZE(X)

        movsd -14 * SIZE(X), %xmm1
        movhps -13 * SIZE(X), %xmm1

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, -14 * SIZE(X)
        movhps %xmm1, -13 * SIZE(X)

        addl $4 * SIZE, X
        ALIGN_3

.L207:
        testl $1, M
        je .L999

        movsd -16 * SIZE(X), %xmm0
        movhps -15 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, -16 * SIZE(X)
        movhps %xmm0, -15 * SIZE(X)
        jmp .L999
        ALIGN_3
#endif
.L220:
#ifdef HAVE_SSE3
        movddup %xmm0, %xmm6
#else
        pshufd $0x44, %xmm0, %xmm6
#endif

        pxor %xmm7, %xmm7
        subsd %xmm1, %xmm7
        movlhps %xmm1, %xmm7

        movl X, XX

        movl M, I
        sarl $3, I
        jle .L225

        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X
        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X
        movsd 0 * SIZE(X), %xmm2
        movhps 1 * SIZE(X), %xmm2
        addl INCX, X
        movsd 0 * SIZE(X), %xmm3
        movhps 1 * SIZE(X), %xmm3
        addl INCX, X
        decl I
        jle .L222
        ALIGN_4
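
/*
 * Strided and unaligned case: each element moves through
 * MOVSD/MOVHPS halves, X leading the loads and XX trailing behind
 * for the stores.
 */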
.L221:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, 0 * SIZE(XX)
        movhps %xmm2, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm2
        movhps 1 * SIZE(X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, 0 * SIZE(XX)
        movhps %xmm3, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm3
        movhps 1 * SIZE(X), %xmm3
        addl INCX, X

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, 0 * SIZE(XX)
        movhps %xmm2, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm2
        movhps 1 * SIZE(X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, 0 * SIZE(XX)
        movhps %xmm3, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm3
        movhps 1 * SIZE(X), %xmm3
        addl INCX, X

        decl I
        jg .L221
        ALIGN_4

.L222:
        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, 0 * SIZE(XX)
        movhps %xmm2, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm2
        movhps 1 * SIZE(X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, 0 * SIZE(XX)
        movhps %xmm3, 1 * SIZE(XX)
        addl INCX, XX
        movsd 0 * SIZE(X), %xmm3
        movhps 1 * SIZE(X), %xmm3
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, 0 * SIZE(XX)
        movhps %xmm2, 1 * SIZE(XX)
        addl INCX, XX

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, 0 * SIZE(XX)
        movhps %xmm3, 1 * SIZE(XX)
        addl INCX, XX
        ALIGN_3

.L225:
        testl $7, M
        je .L999

        testl $4, M
        je .L226

        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX

        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX

        movsd 0 * SIZE(X), %xmm2
        movhps 1 * SIZE(X), %xmm2
        addl INCX, X

        pshufd $0x4e, %xmm2, %xmm5
        mulpd %xmm6, %xmm2
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm2
        movlps %xmm2, 0 * SIZE(XX)
        movhps %xmm2, 1 * SIZE(XX)
        addl INCX, XX

        movsd 0 * SIZE(X), %xmm3
        movhps 1 * SIZE(X), %xmm3
        addl INCX, X

        pshufd $0x4e, %xmm3, %xmm5
        mulpd %xmm6, %xmm3
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm3
        movlps %xmm3, 0 * SIZE(XX)
        movhps %xmm3, 1 * SIZE(XX)
        addl INCX, XX
        ALIGN_3

.L226:
        testl $2, M
        je .L227

        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0
        addl INCX, X

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        addl INCX, XX

        movsd 0 * SIZE(X), %xmm1
        movhps 1 * SIZE(X), %xmm1
        addl INCX, X

        pshufd $0x4e, %xmm1, %xmm5
        mulpd %xmm6, %xmm1
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm1
        movlps %xmm1, 0 * SIZE(XX)
        movhps %xmm1, 1 * SIZE(XX)
        addl INCX, XX
        ALIGN_3

.L227:
        testl $1, M
        je .L999

        movsd 0 * SIZE(X), %xmm0
        movhps 1 * SIZE(X), %xmm0

        pshufd $0x4e, %xmm0, %xmm5
        mulpd %xmm6, %xmm0
        mulpd %xmm7, %xmm5
        addpd %xmm5, %xmm0
        movlps %xmm0, 0 * SIZE(XX)
        movhps %xmm0, 1 * SIZE(XX)
        ALIGN_3

.L999:
        xorl %eax, %eax

        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
        EPILOGUE