
copy_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
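
/* SSE2 copy kernel for 32-bit x86: copies M elements of size SIZE */
/* from the vector X (stride INCX) into the vector Y (stride INCY). */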
#define ASSEMBLER
#include "common.h"

#define STACK	12
#define ARGS	 0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)

#define M	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx

#include "l1param.h"
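
/* On Opteron, loads are issued as xorps + addpd into the zeroed register */
/* (presumably a scheduling workaround); other targets use a plain movaps. */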
#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)	xorps REG, REG; addpd OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)	movaps OFFSET(ADDR), REG
#endif
	PROLOGUE
	PROFCODE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX
	movl	STACK_Y, Y
	movl	STACK_INCY, INCY

	leal	(, INCX, SIZE), INCX
	leal	(, INCY, SIZE), INCY

	cmpl	$SIZE, INCX
	jne	.L40
	cmpl	$SIZE, INCY
	jne	.L40
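
/* Both strides are one element: take the contiguous path. If Y (or X, */
/* without ALIGNED_ACCESS) is not 16-byte aligned, copy one element    */
/* first to align it before entering the vector loops.                 */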
#ifdef ALIGNED_ACCESS
	testl	$SIZE, Y
#else
	testl	$SIZE, X
#endif
	je	.L10

	movsd	(X), %xmm0
	movsd	%xmm0, (Y)
	addl	$1 * SIZE, X
	addl	$1 * SIZE, Y
	decl	M
	jle	.L19
	ALIGN_4

.L10:
	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

#ifdef ALIGNED_ACCESS
	testl	$SIZE, X
#else
	testl	$SIZE, Y
#endif
	jne	.L20
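
/* X and Y are both 16-byte aligned here: copy 16 elements per iteration */
/* with aligned loads and stores, keeping eight xmm registers in flight. */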
	movl	M, %eax
	sarl	$4, %eax
	jle	.L13

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3
	movaps	 -8 * SIZE(X), %xmm4
	movaps	 -6 * SIZE(X), %xmm5
	movaps	 -4 * SIZE(X), %xmm6
	movaps	 -2 * SIZE(X), %xmm7

	decl	%eax
	jle	.L12
	ALIGN_3

.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -16 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -14 * SIZE(Y)
	LOAD( 2 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -12 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm2)
	movaps	%xmm3, -10 * SIZE(Y)
	LOAD( 6 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	%xmm4,  -8 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm4)
	movaps	%xmm5,  -6 * SIZE(Y)
	LOAD(10 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -4 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm6)
	movaps	%xmm7,  -2 * SIZE(Y)
	LOAD(14 * SIZE, X, %xmm7)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X
	decl	%eax
	jg	.L11
	ALIGN_3

.L12:
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	%xmm4,  -8 * SIZE(Y)
	movaps	%xmm5,  -6 * SIZE(Y)
	movaps	%xmm6,  -4 * SIZE(Y)
	movaps	%xmm7,  -2 * SIZE(Y)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X
	ALIGN_3
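
/* Copy the remaining M % 16 elements in blocks of 8, 4, 2 and finally 1. */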
.L13:
	testl	$8, M
	jle	.L14
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L14:
	testl	$4, M
	jle	.L15
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L15:
	testl	$2, M
	jle	.L16
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L16:
	testl	$1, M
	jle	.L19
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
	ALIGN_3
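
/* Unit stride, but X and Y are misaligned relative to each other.      */
/* In the ALIGNED_ACCESS build, loads from X stay 16-byte aligned and   */
/* each output pair is re-formed with SHUFPD before an aligned store to */
/* Y.                                                                   */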
.L20:
#ifdef ALIGNED_ACCESS

	movhps	-16 * SIZE(X), %xmm0

	movl	M, %eax
	sarl	$4, %eax
	jle	.L23

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3
	movaps	 -9 * SIZE(X), %xmm4
	movaps	 -7 * SIZE(X), %xmm5
	movaps	 -5 * SIZE(X), %xmm6
	movaps	 -3 * SIZE(X), %xmm7

	decl	%eax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	LOAD(-1 * SIZE, X, %xmm0)

	SHUFPD_1 %xmm2, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	LOAD( 1 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	LOAD( 3 * SIZE, X, %xmm2)

	SHUFPD_1 %xmm4, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	LOAD( 5 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)
	LOAD( 7 * SIZE, X, %xmm4)

	SHUFPD_1 %xmm6, %xmm5
	movaps	%xmm5,  -6 * SIZE(Y)
	LOAD( 9 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	movaps	%xmm6,  -4 * SIZE(Y)
	LOAD(11 * SIZE, X, %xmm6)

	SHUFPD_1 %xmm0, %xmm7
	movaps	%xmm7,  -2 * SIZE(Y)
	LOAD(13 * SIZE, X, %xmm7)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	decl	%eax
	jg	.L21
	ALIGN_3

.L22:
	SHUFPD_1 %xmm1, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	LOAD(-1 * SIZE, X, %xmm0)

	SHUFPD_1 %xmm2, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	SHUFPD_1 %xmm4, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	SHUFPD_1 %xmm5, %xmm4
	movaps	%xmm4,  -8 * SIZE(Y)
	SHUFPD_1 %xmm6, %xmm5
	movaps	%xmm5,  -6 * SIZE(Y)
	SHUFPD_1 %xmm7, %xmm6
	movaps	%xmm6,  -4 * SIZE(Y)
	SHUFPD_1 %xmm0, %xmm7
	movaps	%xmm7,  -2 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L23:
	testl	$8, M
	jle	.L24
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3
	movaps	 -9 * SIZE(X), %xmm4

	SHUFPD_1 %xmm1, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	SHUFPD_1 %xmm4, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L24:
	testl	$4, M
	jle	.L25
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L25:
	testl	$2, M
	jle	.L26
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	SHUFPD_1 %xmm1, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L26:
	testl	$1, M
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
	ALIGN_3
#else
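
/* Without ALIGNED_ACCESS: X is aligned, Y is not; use aligned loads  */
/* from X and split each store into movlps/movhps halves.             */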
	movl	M, %eax
	sarl	$4, %eax
	jle	.L23

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3
	movaps	 -8 * SIZE(X), %xmm4
	movaps	 -6 * SIZE(X), %xmm5
	movaps	 -4 * SIZE(X), %xmm6
	movaps	 -2 * SIZE(X), %xmm7

	decl	%eax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)
	LOAD( 2 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movlps	%xmm2, -12 * SIZE(Y)
	movhps	%xmm2, -11 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm2)
	movlps	%xmm3, -10 * SIZE(Y)
	movhps	%xmm3,  -9 * SIZE(Y)
	LOAD( 6 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movlps	%xmm4,  -8 * SIZE(Y)
	movhps	%xmm4,  -7 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm4)
	movlps	%xmm5,  -6 * SIZE(Y)
	movhps	%xmm5,  -5 * SIZE(Y)
	LOAD(10 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movlps	%xmm6,  -4 * SIZE(Y)
	movhps	%xmm6,  -3 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm6)
	movlps	%xmm7,  -2 * SIZE(Y)
	movhps	%xmm7,  -1 * SIZE(Y)
	LOAD(14 * SIZE, X, %xmm7)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X
	decl	%eax
	jg	.L21
	ALIGN_3

.L22:
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)
	movlps	%xmm2, -12 * SIZE(Y)
	movhps	%xmm2, -11 * SIZE(Y)
	movlps	%xmm3, -10 * SIZE(Y)
	movhps	%xmm3,  -9 * SIZE(Y)
	movlps	%xmm4,  -8 * SIZE(Y)
	movhps	%xmm4,  -7 * SIZE(Y)
	movlps	%xmm5,  -6 * SIZE(Y)
	movhps	%xmm5,  -5 * SIZE(Y)
	movlps	%xmm6,  -4 * SIZE(Y)
	movhps	%xmm6,  -3 * SIZE(Y)
	movlps	%xmm7,  -2 * SIZE(Y)
	movhps	%xmm7,  -1 * SIZE(Y)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X
	ALIGN_3

.L23:
	testl	$8, M
	jle	.L24
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
	movaps	-14 * SIZE(X), %xmm1
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)
	movaps	-12 * SIZE(X), %xmm2
	movlps	%xmm2, -12 * SIZE(Y)
	movhps	%xmm2, -11 * SIZE(Y)
	movaps	-10 * SIZE(X), %xmm3
	movlps	%xmm3, -10 * SIZE(Y)
	movhps	%xmm3,  -9 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L24:
	testl	$4, M
	jle	.L25
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)
	movaps	-14 * SIZE(X), %xmm1
	movlps	%xmm1, -14 * SIZE(Y)
	movhps	%xmm1, -13 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L25:
	testl	$2, M
	jle	.L26
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	movhps	%xmm0, -15 * SIZE(Y)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L26:
	testl	$1, M
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
	ALIGN_3
#endif
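
/* General strided copy (INCX or INCY is not one element): move eight */
/* elements per iteration, packing two per xmm register.              */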
.L40:
	movl	M, %eax
	sarl	$3, %eax
	jle	.L45
	ALIGN_3

.L41:
	movsd	(X), %xmm0
	addl	INCX, X
	movhps	(X), %xmm0
	addl	INCX, X
	movsd	(X), %xmm1
	addl	INCX, X
	movhps	(X), %xmm1
	addl	INCX, X
	movsd	(X), %xmm2
	addl	INCX, X
	movhps	(X), %xmm2
	addl	INCX, X
	movsd	(X), %xmm3
	addl	INCX, X
	movhps	(X), %xmm3
	addl	INCX, X

	movlps	%xmm0, (Y)
	addl	INCY, Y
	movhps	%xmm0, (Y)
	addl	INCY, Y
	movlps	%xmm1, (Y)
	addl	INCY, Y
	movhps	%xmm1, (Y)
	addl	INCY, Y
	movlps	%xmm2, (Y)
	addl	INCY, Y
	movhps	%xmm2, (Y)
	addl	INCY, Y
	movlps	%xmm3, (Y)
	addl	INCY, Y
	movhps	%xmm3, (Y)
	addl	INCY, Y

	decl	%eax
	jg	.L41
	ALIGN_3
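
/* Copy the remaining M % 8 elements one at a time. */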
.L45:
	movl	M, %eax
	andl	$7, %eax
	jle	.L47
	ALIGN_3

.L46:
	movsd	(X), %xmm0
	addl	INCX, X
	movlps	%xmm0, (Y)
	addl	INCY, Y
	decl	%eax
	jg	.L46
	ALIGN_3

.L47:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE