You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

File: zcopy_sse2.S (13 kB)

(Line-number gutter from the code viewer, lines 1–668 — not part of the source.)
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 12
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  47. #define M %ebx
  48. #define X %esi
  49. #define INCX %ecx
  50. #define Y %edi
  51. #define INCY %edx
  52. #define xmm8 xmm0
  53. #define xmm9 xmm1
  54. #define xmm10 xmm2
  55. #define xmm11 xmm3
  56. #define xmm12 xmm4
  57. #define xmm13 xmm5
  58. #define xmm14 xmm6
  59. #define xmm15 xmm7
  60. #include "l1param.h"
  61. #ifdef OPTERON
  62. #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG
  63. #else
  64. #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG
  65. #endif
  66. PROLOGUE
  67. PROFCODE
  68. pushl %edi
  69. pushl %esi
  70. pushl %ebx
  71. movl STACK_M, M
  72. movl STACK_X, X
  73. movl STACK_INCX, INCX
  74. movl STACK_Y, Y
  75. movl STACK_INCY, INCY
  76. sall $ZBASE_SHIFT, INCX
  77. sall $ZBASE_SHIFT, INCY
  78. cmpl $2 * SIZE, INCX
  79. jne .L50
  80. cmpl $2 * SIZE, INCY
  81. jne .L50
  82. addl M, M
  83. #ifdef ALIGNED_ACCESS
  84. testl $SIZE, Y
  85. #else
  86. testl $SIZE, X
  87. #endif
  88. je .L10
  89. movsd (X), %xmm0
  90. movsd %xmm0, (Y)
  91. addl $1 * SIZE, X
  92. addl $1 * SIZE, Y
  93. decl M
  94. jle .L19
  95. ALIGN_4
  96. .L10:
  97. subl $-16 * SIZE, X
  98. subl $-16 * SIZE, Y
  99. #ifdef ALIGNED_ACCESS
  100. testl $SIZE, X
  101. #else
  102. testl $SIZE, Y
  103. #endif
  104. jne .L20
  105. movl M, %eax
  106. sarl $4, %eax
  107. jle .L13
  108. movaps -16 * SIZE(X), %xmm0
  109. movaps -14 * SIZE(X), %xmm1
  110. movaps -12 * SIZE(X), %xmm2
  111. movaps -10 * SIZE(X), %xmm3
  112. movaps -8 * SIZE(X), %xmm4
  113. movaps -6 * SIZE(X), %xmm5
  114. movaps -4 * SIZE(X), %xmm6
  115. movaps -2 * SIZE(X), %xmm7
  116. decl %eax
  117. jle .L12
  118. ALIGN_3
  119. .L11:
  120. #ifdef PREFETCHW
  121. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  122. #endif
  123. movaps %xmm0, -16 * SIZE(Y)
  124. LOAD( 0 * SIZE, X, %xmm0)
  125. movaps %xmm1, -14 * SIZE(Y)
  126. LOAD( 2 * SIZE, X, %xmm1)
  127. #ifdef PREFETCH
  128. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  129. #endif
  130. movaps %xmm2, -12 * SIZE(Y)
  131. LOAD( 4 * SIZE, X, %xmm2)
  132. movaps %xmm3, -10 * SIZE(Y)
  133. LOAD( 6 * SIZE, X, %xmm3)
  134. #if defined(PREFETCHW) && !defined(FETCH128)
  135. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  136. #endif
  137. movaps %xmm4, -8 * SIZE(Y)
  138. LOAD( 8 * SIZE, X, %xmm4)
  139. movaps %xmm5, -6 * SIZE(Y)
  140. LOAD(10 * SIZE, X, %xmm5)
  141. #if defined(PREFETCH) && !defined(FETCH128)
  142. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  143. #endif
  144. movaps %xmm6, -4 * SIZE(Y)
  145. LOAD(12 * SIZE, X, %xmm6)
  146. movaps %xmm7, -2 * SIZE(Y)
  147. LOAD(14 * SIZE, X, %xmm7)
  148. subl $-16 * SIZE, Y
  149. subl $-16 * SIZE, X
  150. decl %eax
  151. jg .L11
  152. ALIGN_3
  153. .L12:
  154. movaps %xmm0, -16 * SIZE(Y)
  155. movaps %xmm1, -14 * SIZE(Y)
  156. movaps %xmm2, -12 * SIZE(Y)
  157. movaps %xmm3, -10 * SIZE(Y)
  158. movaps %xmm4, -8 * SIZE(Y)
  159. movaps %xmm5, -6 * SIZE(Y)
  160. movaps %xmm6, -4 * SIZE(Y)
  161. movaps %xmm7, -2 * SIZE(Y)
  162. subl $-16 * SIZE, Y
  163. subl $-16 * SIZE, X
  164. ALIGN_3
  165. .L13:
  166. testl $8, M
  167. jle .L14
  168. ALIGN_3
  169. movaps -16 * SIZE(X), %xmm0
  170. movaps -14 * SIZE(X), %xmm1
  171. movaps -12 * SIZE(X), %xmm2
  172. movaps -10 * SIZE(X), %xmm3
  173. movaps %xmm0, -16 * SIZE(Y)
  174. movaps %xmm1, -14 * SIZE(Y)
  175. movaps %xmm2, -12 * SIZE(Y)
  176. movaps %xmm3, -10 * SIZE(Y)
  177. addl $8 * SIZE, X
  178. addl $8 * SIZE, Y
  179. ALIGN_3
  180. .L14:
  181. testl $4, M
  182. jle .L15
  183. ALIGN_3
  184. movaps -16 * SIZE(X), %xmm0
  185. movaps -14 * SIZE(X), %xmm1
  186. movaps %xmm0, -16 * SIZE(Y)
  187. movaps %xmm1, -14 * SIZE(Y)
  188. addl $4 * SIZE, X
  189. addl $4 * SIZE, Y
  190. ALIGN_3
  191. .L15:
  192. testl $2, M
  193. jle .L16
  194. ALIGN_3
  195. movaps -16 * SIZE(X), %xmm0
  196. movaps %xmm0, -16 * SIZE(Y)
  197. addl $2 * SIZE, X
  198. addl $2 * SIZE, Y
  199. ALIGN_3
  200. .L16:
  201. testl $1, M
  202. jle .L19
  203. ALIGN_3
  204. movsd -16 * SIZE(X), %xmm0
  205. movsd %xmm0, -16 * SIZE(Y)
  206. ALIGN_3
  207. .L19:
  208. popl %ebx
  209. popl %esi
  210. popl %edi
  211. ret
  212. ALIGN_3
  213. .L20:
  214. #ifdef ALIGNED_ACCESS
  215. movhps -16 * SIZE(X), %xmm0
  216. movl M, %eax
  217. sarl $4, %eax
  218. jle .L23
  219. movaps -15 * SIZE(X), %xmm1
  220. movaps -13 * SIZE(X), %xmm2
  221. movaps -11 * SIZE(X), %xmm3
  222. movaps -9 * SIZE(X), %xmm4
  223. movaps -7 * SIZE(X), %xmm5
  224. movaps -5 * SIZE(X), %xmm6
  225. movaps -3 * SIZE(X), %xmm7
  226. decl %eax
  227. jle .L22
  228. ALIGN_4
  229. .L21:
  230. #ifdef PREFETCHW
  231. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  232. #endif
  233. SHUFPD_1 %xmm1, %xmm0
  234. movaps %xmm0, -16 * SIZE(Y)
  235. LOAD(-1 * SIZE, X, %xmm0)
  236. SHUFPD_1 %xmm2, %xmm1
  237. movaps %xmm1, -14 * SIZE(Y)
  238. LOAD( 1 * SIZE, X, %xmm1)
  239. #ifdef PREFETCH
  240. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  241. #endif
  242. SHUFPD_1 %xmm3, %xmm2
  243. movaps %xmm2, -12 * SIZE(Y)
  244. LOAD( 3 * SIZE, X, %xmm2)
  245. SHUFPD_1 %xmm4, %xmm3
  246. movaps %xmm3, -10 * SIZE(Y)
  247. LOAD( 5 * SIZE, X, %xmm3)
  248. #if defined(PREFETCHW) && !defined(FETCH128)
  249. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  250. #endif
  251. SHUFPD_1 %xmm5, %xmm4
  252. movaps %xmm4, -8 * SIZE(Y)
  253. LOAD( 7 * SIZE, X, %xmm4)
  254. SHUFPD_1 %xmm6, %xmm5
  255. movaps %xmm5, -6 * SIZE(Y)
  256. LOAD( 9 * SIZE, X, %xmm5)
  257. #if defined(PREFETCH) && !defined(FETCH128)
  258. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  259. #endif
  260. SHUFPD_1 %xmm7, %xmm6
  261. movaps %xmm6, -4 * SIZE(Y)
  262. LOAD(11 * SIZE, X, %xmm6)
  263. SHUFPD_1 %xmm0, %xmm7
  264. movaps %xmm7, -2 * SIZE(Y)
  265. LOAD(13 * SIZE, X, %xmm7)
  266. subl $-16 * SIZE, X
  267. subl $-16 * SIZE, Y
  268. decl %eax
  269. jg .L21
  270. ALIGN_3
  271. .L22:
  272. SHUFPD_1 %xmm1, %xmm0
  273. movaps %xmm0, -16 * SIZE(Y)
  274. LOAD(-1 * SIZE, X, %xmm0)
  275. SHUFPD_1 %xmm2, %xmm1
  276. movaps %xmm1, -14 * SIZE(Y)
  277. SHUFPD_1 %xmm3, %xmm2
  278. movaps %xmm2, -12 * SIZE(Y)
  279. SHUFPD_1 %xmm4, %xmm3
  280. movaps %xmm3, -10 * SIZE(Y)
  281. SHUFPD_1 %xmm5, %xmm4
  282. movaps %xmm4, -8 * SIZE(Y)
  283. SHUFPD_1 %xmm6, %xmm5
  284. movaps %xmm5, -6 * SIZE(Y)
  285. SHUFPD_1 %xmm7, %xmm6
  286. movaps %xmm6, -4 * SIZE(Y)
  287. SHUFPD_1 %xmm0, %xmm7
  288. movaps %xmm7, -2 * SIZE(Y)
  289. subl $-16 * SIZE, X
  290. subl $-16 * SIZE, Y
  291. ALIGN_3
  292. .L23:
  293. testl $8, M
  294. jle .L24
  295. ALIGN_3
  296. movaps -15 * SIZE(X), %xmm1
  297. movaps -13 * SIZE(X), %xmm2
  298. movaps -11 * SIZE(X), %xmm3
  299. movaps -9 * SIZE(X), %xmm4
  300. SHUFPD_1 %xmm1, %xmm0
  301. movaps %xmm0, -16 * SIZE(Y)
  302. SHUFPD_1 %xmm2, %xmm1
  303. movaps %xmm1, -14 * SIZE(Y)
  304. SHUFPD_1 %xmm3, %xmm2
  305. movaps %xmm2, -12 * SIZE(Y)
  306. SHUFPD_1 %xmm4, %xmm3
  307. movaps %xmm3, -10 * SIZE(Y)
  308. movaps %xmm4, %xmm0
  309. addl $8 * SIZE, X
  310. addl $8 * SIZE, Y
  311. ALIGN_3
  312. .L24:
  313. testl $4, M
  314. jle .L25
  315. ALIGN_3
  316. movaps -15 * SIZE(X), %xmm1
  317. movaps -13 * SIZE(X), %xmm2
  318. SHUFPD_1 %xmm1, %xmm0
  319. SHUFPD_1 %xmm2, %xmm1
  320. movaps %xmm0, -16 * SIZE(Y)
  321. movaps %xmm1, -14 * SIZE(Y)
  322. movaps %xmm2, %xmm0
  323. addl $4 * SIZE, X
  324. addl $4 * SIZE, Y
  325. ALIGN_3
  326. .L25:
  327. testl $2, M
  328. jle .L26
  329. ALIGN_3
  330. movaps -15 * SIZE(X), %xmm1
  331. SHUFPD_1 %xmm1, %xmm0
  332. movaps %xmm0, -16 * SIZE(Y)
  333. addl $2 * SIZE, X
  334. addl $2 * SIZE, Y
  335. ALIGN_3
  336. .L26:
  337. testl $1, M
  338. jle .L29
  339. ALIGN_3
  340. movsd -16 * SIZE(X), %xmm0
  341. movsd %xmm0, -16 * SIZE(Y)
  342. ALIGN_3
  343. .L29:
  344. popl %ebx
  345. popl %esi
  346. popl %edi
  347. ret
  348. ALIGN_3
  349. #else
  350. movl M, %eax
  351. sarl $4, %eax
  352. jle .L23
  353. movaps -16 * SIZE(X), %xmm0
  354. movaps -14 * SIZE(X), %xmm1
  355. movaps -12 * SIZE(X), %xmm2
  356. movaps -10 * SIZE(X), %xmm3
  357. movaps -8 * SIZE(X), %xmm4
  358. movaps -6 * SIZE(X), %xmm5
  359. movaps -4 * SIZE(X), %xmm6
  360. movaps -2 * SIZE(X), %xmm7
  361. decl %eax
  362. jle .L22
  363. ALIGN_3
  364. .L21:
  365. #ifdef PREFETCHW
  366. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  367. #endif
  368. movlps %xmm0, -16 * SIZE(Y)
  369. movhps %xmm0, -15 * SIZE(Y)
  370. LOAD( 0 * SIZE, X, %xmm0)
  371. movlps %xmm1, -14 * SIZE(Y)
  372. movhps %xmm1, -13 * SIZE(Y)
  373. LOAD( 2 * SIZE, X, %xmm1)
  374. #ifdef PREFETCH
  375. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  376. #endif
  377. movlps %xmm2, -12 * SIZE(Y)
  378. movhps %xmm2, -11 * SIZE(Y)
  379. LOAD( 4 * SIZE, X, %xmm2)
  380. movlps %xmm3, -10 * SIZE(Y)
  381. movhps %xmm3, -9 * SIZE(Y)
  382. LOAD( 6 * SIZE, X, %xmm3)
  383. #if defined(PREFETCHW) && !defined(FETCH128)
  384. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  385. #endif
  386. movlps %xmm4, -8 * SIZE(Y)
  387. movhps %xmm4, -7 * SIZE(Y)
  388. LOAD( 8 * SIZE, X, %xmm4)
  389. movlps %xmm5, -6 * SIZE(Y)
  390. movhps %xmm5, -5 * SIZE(Y)
  391. LOAD(10 * SIZE, X, %xmm5)
  392. #if defined(PREFETCH) && !defined(FETCH128)
  393. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  394. #endif
  395. movlps %xmm6, -4 * SIZE(Y)
  396. movhps %xmm6, -3 * SIZE(Y)
  397. LOAD(12 * SIZE, X, %xmm6)
  398. movlps %xmm7, -2 * SIZE(Y)
  399. movhps %xmm7, -1 * SIZE(Y)
  400. LOAD(14 * SIZE, X, %xmm7)
  401. subl $-16 * SIZE, Y
  402. subl $-16 * SIZE, X
  403. decl %eax
  404. jg .L21
  405. ALIGN_3
  406. .L22:
  407. movlps %xmm0, -16 * SIZE(Y)
  408. movhps %xmm0, -15 * SIZE(Y)
  409. movlps %xmm1, -14 * SIZE(Y)
  410. movhps %xmm1, -13 * SIZE(Y)
  411. movlps %xmm2, -12 * SIZE(Y)
  412. movhps %xmm2, -11 * SIZE(Y)
  413. movlps %xmm3, -10 * SIZE(Y)
  414. movhps %xmm3, -9 * SIZE(Y)
  415. movlps %xmm4, -8 * SIZE(Y)
  416. movhps %xmm4, -7 * SIZE(Y)
  417. movlps %xmm5, -6 * SIZE(Y)
  418. movhps %xmm5, -5 * SIZE(Y)
  419. movlps %xmm6, -4 * SIZE(Y)
  420. movhps %xmm6, -3 * SIZE(Y)
  421. movlps %xmm7, -2 * SIZE(Y)
  422. movhps %xmm7, -1 * SIZE(Y)
  423. subl $-16 * SIZE, Y
  424. subl $-16 * SIZE, X
  425. ALIGN_3
  426. .L23:
  427. testl $8, M
  428. jle .L24
  429. ALIGN_3
  430. movaps -16 * SIZE(X), %xmm0
  431. movlps %xmm0, -16 * SIZE(Y)
  432. movhps %xmm0, -15 * SIZE(Y)
  433. movaps -14 * SIZE(X), %xmm1
  434. movlps %xmm1, -14 * SIZE(Y)
  435. movhps %xmm1, -13 * SIZE(Y)
  436. movaps -12 * SIZE(X), %xmm2
  437. movlps %xmm2, -12 * SIZE(Y)
  438. movhps %xmm2, -11 * SIZE(Y)
  439. movaps -10 * SIZE(X), %xmm3
  440. movlps %xmm3, -10 * SIZE(Y)
  441. movhps %xmm3, -9 * SIZE(Y)
  442. addl $8 * SIZE, X
  443. addl $8 * SIZE, Y
  444. ALIGN_3
  445. .L24:
  446. testl $4, M
  447. jle .L25
  448. ALIGN_3
  449. movaps -16 * SIZE(X), %xmm0
  450. movlps %xmm0, -16 * SIZE(Y)
  451. movhps %xmm0, -15 * SIZE(Y)
  452. movaps -14 * SIZE(X), %xmm1
  453. movlps %xmm1, -14 * SIZE(Y)
  454. movhps %xmm1, -13 * SIZE(Y)
  455. addl $4 * SIZE, X
  456. addl $4 * SIZE, Y
  457. ALIGN_3
  458. .L25:
  459. testl $2, M
  460. jle .L26
  461. ALIGN_3
  462. movaps -16 * SIZE(X), %xmm0
  463. movlps %xmm0, -16 * SIZE(Y)
  464. movhps %xmm0, -15 * SIZE(Y)
  465. addl $2 * SIZE, X
  466. addl $2 * SIZE, Y
  467. ALIGN_3
  468. .L26:
  469. testl $1, M
  470. jle .L29
  471. ALIGN_3
  472. movsd -16 * SIZE(X), %xmm0
  473. movsd %xmm0, -16 * SIZE(Y)
  474. ALIGN_3
  475. .L29:
  476. popl %ebx
  477. popl %esi
  478. popl %edi
  479. ret
  480. ALIGN_3
  481. #endif
  482. .L50:
  483. movl M, %eax
  484. sarl $2, %eax
  485. jle .L55
  486. ALIGN_3
  487. .L51:
  488. movsd 0 * SIZE(X), %xmm0
  489. movhps 1 * SIZE(X), %xmm0
  490. addl INCX, X
  491. movsd 0 * SIZE(X), %xmm1
  492. movhps 1 * SIZE(X), %xmm1
  493. addl INCX, X
  494. movsd 0 * SIZE(X), %xmm2
  495. movhps 1 * SIZE(X), %xmm2
  496. addl INCX, X
  497. movsd 0 * SIZE(X), %xmm3
  498. movhps 1 * SIZE(X), %xmm3
  499. addl INCX, X
  500. movlps %xmm0, 0 * SIZE(Y)
  501. movhps %xmm0, 1 * SIZE(Y)
  502. addl INCY, Y
  503. movlps %xmm1, 0 * SIZE(Y)
  504. movhps %xmm1, 1 * SIZE(Y)
  505. addl INCY, Y
  506. movlps %xmm2, 0 * SIZE(Y)
  507. movhps %xmm2, 1 * SIZE(Y)
  508. addl INCY, Y
  509. movlps %xmm3, 0 * SIZE(Y)
  510. movhps %xmm3, 1 * SIZE(Y)
  511. addl INCY, Y
  512. decl %eax
  513. jg .L51
  514. ALIGN_3
  515. .L55:
  516. movl M, %eax
  517. andl $3, %eax
  518. jle .L57
  519. ALIGN_3
  520. .L56:
  521. movsd 0 * SIZE(X), %xmm0
  522. movhps 1 * SIZE(X), %xmm0
  523. addl INCX, X
  524. movlps %xmm0, 0 * SIZE(Y)
  525. movhps %xmm0, 1 * SIZE(Y)
  526. addl INCY, Y
  527. decl %eax
  528. jg .L56
  529. ALIGN_3
  530. .L57:
  531. popl %ebx
  532. popl %esi
  533. popl %edi
  534. ret
  535. EPILOGUE