You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

copy_sse.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 12
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  47. #define M %ebx
  48. #define X %esi
  49. #define INCX %ecx
  50. #define Y %edi
  51. #define INCY %edx
  52. #include "l1param.h"
  53. #ifdef OPTERON
  54. #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG
  55. #else
  56. #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG
  57. #endif
  58. PROLOGUE
  59. PROFCODE
  60. pushl %edi
  61. pushl %esi
  62. pushl %ebx
  63. movl STACK_M, M
  64. movl STACK_X, X
  65. movl STACK_INCX, INCX
  66. movl STACK_Y, Y
  67. movl STACK_INCY, INCY
  68. leal (, INCX, SIZE), INCX
  69. leal (, INCY, SIZE), INCY
  70. cmpl $SIZE, INCX
  71. jne .L50
  72. cmpl $SIZE, INCY
  73. jne .L50
  74. cmpl $3, M
  75. jle .L55
  76. subl $-32 * SIZE, X
  77. subl $-32 * SIZE, Y
  78. testl $SIZE, Y
  79. je .L05
  80. movss -32 * SIZE(X), %xmm0
  81. movss %xmm0, -32 * SIZE(Y)
  82. addl $1 * SIZE, X
  83. addl $1 * SIZE, Y
  84. decl M
  85. ALIGN_4
  86. .L05:
  87. testl $2 * SIZE, Y
  88. je .L10
  89. movsd -32 * SIZE(X), %xmm0
  90. movlps %xmm0, -32 * SIZE(Y)
  91. addl $2 * SIZE, X
  92. addl $2 * SIZE, Y
  93. subl $2, M
  94. jle .L19
  95. ALIGN_4
  96. .L10:
  97. testl $3 * SIZE, X
  98. jne .L20
  99. movl M, %eax
  100. sarl $5, %eax
  101. jle .L13
  102. movaps -32 * SIZE(X), %xmm0
  103. movaps -28 * SIZE(X), %xmm1
  104. movaps -24 * SIZE(X), %xmm2
  105. movaps -20 * SIZE(X), %xmm3
  106. movaps -16 * SIZE(X), %xmm4
  107. movaps -12 * SIZE(X), %xmm5
  108. movaps -8 * SIZE(X), %xmm6
  109. movaps -4 * SIZE(X), %xmm7
  110. decl %eax
  111. jle .L12
  112. ALIGN_3
  113. .L11:
  114. #ifdef PREFETCHW
  115. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  116. #endif
  117. movaps %xmm0, -32 * SIZE(Y)
  118. LOAD( 0 * SIZE, X, %xmm0)
  119. movaps %xmm1, -28 * SIZE(Y)
  120. LOAD( 4 * SIZE, X, %xmm1)
  121. #ifdef PREFETCH
  122. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  123. #endif
  124. movaps %xmm2, -24 * SIZE(Y)
  125. LOAD( 8 * SIZE, X, %xmm2)
  126. movaps %xmm3, -20 * SIZE(Y)
  127. LOAD(12 * SIZE, X, %xmm3)
  128. #if defined(PREFETCHW) && !defined(FETCH128)
  129. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  130. #endif
  131. movaps %xmm4,-16 * SIZE(Y)
  132. LOAD(16 * SIZE, X, %xmm4)
  133. movaps %xmm5,-12 * SIZE(Y)
  134. LOAD(20 * SIZE, X, %xmm5)
  135. #if defined(PREFETCH) && !defined(FETCH128)
  136. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  137. #endif
  138. movaps %xmm6, -8 * SIZE(Y)
  139. LOAD(24 * SIZE, X, %xmm6)
  140. movaps %xmm7, -4 * SIZE(Y)
  141. LOAD(28 * SIZE, X, %xmm7)
  142. subl $-32 * SIZE, Y
  143. subl $-32 * SIZE, X
  144. decl %eax
  145. jg .L11
  146. ALIGN_3
  147. .L12:
  148. movaps %xmm0, -32 * SIZE(Y)
  149. movaps %xmm1, -28 * SIZE(Y)
  150. movaps %xmm2, -24 * SIZE(Y)
  151. movaps %xmm3, -20 * SIZE(Y)
  152. movaps %xmm4, -16 * SIZE(Y)
  153. movaps %xmm5, -12 * SIZE(Y)
  154. movaps %xmm6, -8 * SIZE(Y)
  155. movaps %xmm7, -4 * SIZE(Y)
  156. subl $-32 * SIZE, Y
  157. subl $-32 * SIZE, X
  158. ALIGN_3
  159. .L13:
  160. testl $16, M
  161. jle .L14
  162. movaps -32 * SIZE(X), %xmm0
  163. movaps -28 * SIZE(X), %xmm1
  164. movaps -24 * SIZE(X), %xmm2
  165. movaps -20 * SIZE(X), %xmm3
  166. movaps %xmm0, -32 * SIZE(Y)
  167. movaps %xmm1, -28 * SIZE(Y)
  168. movaps %xmm2, -24 * SIZE(Y)
  169. movaps %xmm3, -20 * SIZE(Y)
  170. addl $16 * SIZE, X
  171. addl $16 * SIZE, Y
  172. ALIGN_3
  173. .L14:
  174. testl $8, M
  175. jle .L15
  176. movaps -32 * SIZE(X), %xmm0
  177. movaps -28 * SIZE(X), %xmm1
  178. movaps %xmm0, -32 * SIZE(Y)
  179. movaps %xmm1, -28 * SIZE(Y)
  180. addl $8 * SIZE, X
  181. addl $8 * SIZE, Y
  182. ALIGN_3
  183. .L15:
  184. testl $4, M
  185. jle .L16
  186. movaps -32 * SIZE(X), %xmm0
  187. movaps %xmm0, -32 * SIZE(Y)
  188. addl $4 * SIZE, X
  189. addl $4 * SIZE, Y
  190. ALIGN_3
  191. .L16:
  192. testl $2, M
  193. jle .L17
  194. movsd -32 * SIZE(X), %xmm0
  195. movlps %xmm0, -32 * SIZE(Y)
  196. addl $2 * SIZE, X
  197. addl $2 * SIZE, Y
  198. ALIGN_3
  199. .L17:
  200. testl $1, M
  201. jle .L19
  202. movss -32 * SIZE(X), %xmm0
  203. movss %xmm0, -32 * SIZE(Y)
  204. ALIGN_3
  205. .L19:
  206. popl %ebx
  207. popl %esi
  208. popl %edi
  209. ret
  210. ALIGN_3
  211. .L20:
  212. testl $SIZE, X
  213. jne .L30
  214. movhps -32 * SIZE(X), %xmm0
  215. movl M, %eax
  216. sarl $5, %eax
  217. jle .L23
  218. movaps -30 * SIZE(X), %xmm1
  219. movaps -26 * SIZE(X), %xmm2
  220. movaps -22 * SIZE(X), %xmm3
  221. movaps -18 * SIZE(X), %xmm4
  222. movaps -14 * SIZE(X), %xmm5
  223. movaps -10 * SIZE(X), %xmm6
  224. movaps -6 * SIZE(X), %xmm7
  225. decl %eax
  226. jle .L22
  227. ALIGN_4
  228. .L21:
  229. #ifdef PREFETCHW
  230. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  231. #endif
  232. shufps $0x4e, %xmm1, %xmm0
  233. movaps %xmm0, -32 * SIZE(Y)
  234. movaps -2 * SIZE(X), %xmm0
  235. shufps $0x4e, %xmm2, %xmm1
  236. movaps %xmm1, -28 * SIZE(Y)
  237. movaps 2 * SIZE(X), %xmm1
  238. #ifdef PREFETCH
  239. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  240. #endif
  241. shufps $0x4e, %xmm3, %xmm2
  242. movaps %xmm2, -24 * SIZE(Y)
  243. movaps 6 * SIZE(X), %xmm2
  244. shufps $0x4e, %xmm4, %xmm3
  245. movaps %xmm3, -20 * SIZE(Y)
  246. movaps 10 * SIZE(X), %xmm3
  247. #if defined(PREFETCHW) && !defined(FETCH128)
  248. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  249. #endif
  250. shufps $0x4e, %xmm5, %xmm4
  251. movaps %xmm4, -16 * SIZE(Y)
  252. movaps 14 * SIZE(X), %xmm4
  253. shufps $0x4e, %xmm6, %xmm5
  254. movaps %xmm5, -12 * SIZE(Y)
  255. movaps 18 * SIZE(X), %xmm5
  256. #if defined(PREFETCH) && !defined(FETCH128)
  257. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  258. #endif
  259. shufps $0x4e, %xmm7, %xmm6
  260. movaps %xmm6, -8 * SIZE(Y)
  261. movaps 22 * SIZE(X), %xmm6
  262. shufps $0x4e, %xmm0, %xmm7
  263. movaps %xmm7, -4 * SIZE(Y)
  264. movaps 26 * SIZE(X), %xmm7
  265. subl $-32 * SIZE, X
  266. subl $-32 * SIZE, Y
  267. decl %eax
  268. jg .L21
  269. ALIGN_3
  270. .L22:
  271. shufps $0x4e, %xmm1, %xmm0
  272. movaps %xmm0, -32 * SIZE(Y)
  273. movaps -2 * SIZE(X), %xmm0
  274. shufps $0x4e, %xmm2, %xmm1
  275. movaps %xmm1, -28 * SIZE(Y)
  276. shufps $0x4e, %xmm3, %xmm2
  277. movaps %xmm2, -24 * SIZE(Y)
  278. shufps $0x4e, %xmm4, %xmm3
  279. movaps %xmm3, -20 * SIZE(Y)
  280. shufps $0x4e, %xmm5, %xmm4
  281. movaps %xmm4, -16 * SIZE(Y)
  282. shufps $0x4e, %xmm6, %xmm5
  283. movaps %xmm5, -12 * SIZE(Y)
  284. shufps $0x4e, %xmm7, %xmm6
  285. movaps %xmm6, -8 * SIZE(Y)
  286. shufps $0x4e, %xmm0, %xmm7
  287. movaps %xmm7, -4 * SIZE(Y)
  288. subl $-32 * SIZE, X
  289. subl $-32 * SIZE, Y
  290. ALIGN_3
  291. .L23:
  292. testl $16, M
  293. jle .L24
  294. ALIGN_3
  295. movaps -30 * SIZE(X), %xmm1
  296. movaps -26 * SIZE(X), %xmm2
  297. movaps -22 * SIZE(X), %xmm3
  298. movaps -18 * SIZE(X), %xmm4
  299. shufps $0x4e, %xmm1, %xmm0
  300. movaps %xmm0, -32 * SIZE(Y)
  301. shufps $0x4e, %xmm2, %xmm1
  302. movaps %xmm1, -28 * SIZE(Y)
  303. shufps $0x4e, %xmm3, %xmm2
  304. movaps %xmm2, -24 * SIZE(Y)
  305. shufps $0x4e, %xmm4, %xmm3
  306. movaps %xmm3, -20 * SIZE(Y)
  307. movaps %xmm4, %xmm0
  308. addl $16 * SIZE, X
  309. addl $16 * SIZE, Y
  310. ALIGN_3
  311. .L24:
  312. testl $8, M
  313. jle .L25
  314. ALIGN_3
  315. movaps -30 * SIZE(X), %xmm1
  316. movaps -26 * SIZE(X), %xmm2
  317. shufps $0x4e, %xmm1, %xmm0
  318. shufps $0x4e, %xmm2, %xmm1
  319. movaps %xmm0, -32 * SIZE(Y)
  320. movaps %xmm1, -28 * SIZE(Y)
  321. movaps %xmm2, %xmm0
  322. addl $8 * SIZE, X
  323. addl $8 * SIZE, Y
  324. ALIGN_3
  325. .L25:
  326. testl $4, M
  327. jle .L26
  328. ALIGN_3
  329. movaps -30 * SIZE(X), %xmm1
  330. shufps $0x4e, %xmm1, %xmm0
  331. movaps %xmm0, -32 * SIZE(Y)
  332. addl $4 * SIZE, X
  333. addl $4 * SIZE, Y
  334. ALIGN_3
  335. .L26:
  336. testl $2, M
  337. jle .L27
  338. ALIGN_3
  339. movsd -32 * SIZE(X), %xmm0
  340. movsd %xmm0, -32 * SIZE(Y)
  341. addl $2 * SIZE, X
  342. addl $2 * SIZE, Y
  343. ALIGN_3
  344. .L27:
  345. testl $1, M
  346. jle .L29
  347. ALIGN_3
  348. movss -32 * SIZE(X), %xmm0
  349. movss %xmm0, -32 * SIZE(Y)
  350. addl $SIZE, Y
  351. ALIGN_3
  352. .L29:
  353. popl %ebx
  354. popl %esi
  355. popl %edi
  356. ret
  357. ALIGN_3
  358. .L30:
  359. testl $2 * SIZE, X
  360. jne .L40
  361. movaps -33 * SIZE(X), %xmm0
  362. movl M, %eax
  363. sarl $5, %eax
  364. jle .L33
  365. movaps -29 * SIZE(X), %xmm1
  366. movaps -25 * SIZE(X), %xmm2
  367. movaps -21 * SIZE(X), %xmm3
  368. movaps -17 * SIZE(X), %xmm4
  369. movaps -13 * SIZE(X), %xmm5
  370. movaps -9 * SIZE(X), %xmm6
  371. movaps -5 * SIZE(X), %xmm7
  372. decl %eax
  373. jle .L32
  374. ALIGN_4
  375. .L31:
  376. #ifdef PREFETCHW
  377. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  378. #endif
  379. movss %xmm1, %xmm0
  380. shufps $0x39, %xmm0, %xmm0
  381. movaps %xmm0, -32 * SIZE(Y)
  382. movaps -1 * SIZE(X), %xmm0
  383. movss %xmm2, %xmm1
  384. shufps $0x39, %xmm1, %xmm1
  385. movaps %xmm1, -28 * SIZE(Y)
  386. movaps 3 * SIZE(X), %xmm1
  387. #ifdef PREFETCH
  388. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  389. #endif
  390. movss %xmm3, %xmm2
  391. shufps $0x39, %xmm2, %xmm2
  392. movaps %xmm2, -24 * SIZE(Y)
  393. movaps 7 * SIZE(X), %xmm2
  394. movss %xmm4, %xmm3
  395. shufps $0x39, %xmm3, %xmm3
  396. movaps %xmm3, -20 * SIZE(Y)
  397. movaps 11 * SIZE(X), %xmm3
  398. #if defined(PREFETCHW) && !defined(FETCH128)
  399. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  400. #endif
  401. movss %xmm5, %xmm4
  402. shufps $0x39, %xmm4, %xmm4
  403. movaps %xmm4, -16 * SIZE(Y)
  404. movaps 15 * SIZE(X), %xmm4
  405. movss %xmm6, %xmm5
  406. shufps $0x39, %xmm5, %xmm5
  407. movaps %xmm5, -12 * SIZE(Y)
  408. movaps 19 * SIZE(X), %xmm5
  409. #if defined(PREFETCH) && !defined(FETCH128)
  410. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  411. #endif
  412. movss %xmm7, %xmm6
  413. shufps $0x39, %xmm6, %xmm6
  414. movaps %xmm6, -8 * SIZE(Y)
  415. movaps 23 * SIZE(X), %xmm6
  416. movss %xmm0, %xmm7
  417. shufps $0x39, %xmm7, %xmm7
  418. movaps %xmm7, -4 * SIZE(Y)
  419. movaps 27 * SIZE(X), %xmm7
  420. subl $-32 * SIZE, X
  421. subl $-32 * SIZE, Y
  422. decl %eax
  423. jg .L31
  424. ALIGN_3
  425. .L32:
  426. movss %xmm1, %xmm0
  427. shufps $0x39, %xmm0, %xmm0
  428. movaps %xmm0, -32 * SIZE(Y)
  429. movaps -1 * SIZE(X), %xmm0
  430. movss %xmm2, %xmm1
  431. shufps $0x39, %xmm1, %xmm1
  432. movaps %xmm1, -28 * SIZE(Y)
  433. movss %xmm3, %xmm2
  434. shufps $0x39, %xmm2, %xmm2
  435. movaps %xmm2, -24 * SIZE(Y)
  436. movss %xmm4, %xmm3
  437. shufps $0x39, %xmm3, %xmm3
  438. movaps %xmm3, -20 * SIZE(Y)
  439. movss %xmm5, %xmm4
  440. shufps $0x39, %xmm4, %xmm4
  441. movaps %xmm4, -16 * SIZE(Y)
  442. movss %xmm6, %xmm5
  443. shufps $0x39, %xmm5, %xmm5
  444. movaps %xmm5, -12 * SIZE(Y)
  445. movss %xmm7, %xmm6
  446. shufps $0x39, %xmm6, %xmm6
  447. movaps %xmm6, -8 * SIZE(Y)
  448. movss %xmm0, %xmm7
  449. shufps $0x39, %xmm7, %xmm7
  450. movaps %xmm7, -4 * SIZE(Y)
  451. subl $-32 * SIZE, X
  452. subl $-32 * SIZE, Y
  453. ALIGN_3
  454. .L33:
  455. testl $16, M
  456. jle .L34
  457. ALIGN_3
  458. movaps -29 * SIZE(X), %xmm1
  459. movaps -25 * SIZE(X), %xmm2
  460. movaps -21 * SIZE(X), %xmm3
  461. movaps -17 * SIZE(X), %xmm4
  462. movss %xmm1, %xmm0
  463. shufps $0x39, %xmm0, %xmm0
  464. movaps %xmm0, -32 * SIZE(Y)
  465. movss %xmm2, %xmm1
  466. shufps $0x39, %xmm1, %xmm1
  467. movaps %xmm1, -28 * SIZE(Y)
  468. movss %xmm3, %xmm2
  469. shufps $0x39, %xmm2, %xmm2
  470. movaps %xmm2, -24 * SIZE(Y)
  471. movss %xmm4, %xmm3
  472. shufps $0x39, %xmm3, %xmm3
  473. movaps %xmm3, -20 * SIZE(Y)
  474. movaps %xmm4, %xmm0
  475. addl $16 * SIZE, X
  476. addl $16 * SIZE, Y
  477. ALIGN_3
  478. .L34:
  479. testl $8, M
  480. jle .L35
  481. ALIGN_3
  482. movaps -29 * SIZE(X), %xmm1
  483. movaps -25 * SIZE(X), %xmm2
  484. movss %xmm1, %xmm0
  485. shufps $0x39, %xmm0, %xmm0
  486. movaps %xmm0, -32 * SIZE(Y)
  487. movss %xmm2, %xmm1
  488. shufps $0x39, %xmm1, %xmm1
  489. movaps %xmm1, -28 * SIZE(Y)
  490. movaps %xmm2, %xmm0
  491. addl $8 * SIZE, X
  492. addl $8 * SIZE, Y
  493. ALIGN_3
  494. .L35:
  495. testl $4, M
  496. jle .L36
  497. ALIGN_3
  498. movaps -29 * SIZE(X), %xmm1
  499. movss %xmm1, %xmm0
  500. shufps $0x39, %xmm0, %xmm0
  501. movaps %xmm0, -32 * SIZE(Y)
  502. addl $4 * SIZE, X
  503. addl $4 * SIZE, Y
  504. ALIGN_3
  505. .L36:
  506. testl $2, M
  507. jle .L37
  508. ALIGN_3
  509. movsd -32 * SIZE(X), %xmm0
  510. movsd %xmm0, -32 * SIZE(Y)
  511. addl $2 * SIZE, X
  512. addl $2 * SIZE, Y
  513. ALIGN_3
  514. .L37:
  515. testl $1, M
  516. jle .L39
  517. ALIGN_3
  518. movss -32 * SIZE(X), %xmm0
  519. movss %xmm0, -32 * SIZE(Y)
  520. addl $SIZE, Y
  521. ALIGN_3
  522. .L39:
  523. popl %ebx
  524. popl %esi
  525. popl %edi
  526. ret
  527. ALIGN_3
  528. .L40:
  529. movaps -35 * SIZE(X), %xmm0
  530. movl M, %eax
  531. sarl $5, %eax
  532. jle .L43
  533. movaps -31 * SIZE(X), %xmm1
  534. movaps -27 * SIZE(X), %xmm2
  535. movaps -23 * SIZE(X), %xmm3
  536. movaps -19 * SIZE(X), %xmm4
  537. movaps -15 * SIZE(X), %xmm5
  538. movaps -11 * SIZE(X), %xmm6
  539. movaps -7 * SIZE(X), %xmm7
  540. decl %eax
  541. jle .L42
  542. ALIGN_4
  543. .L41:
  544. #ifdef PREFETCHW
  545. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  546. #endif
  547. movss %xmm1, %xmm0
  548. shufps $0x93, %xmm1, %xmm0
  549. movaps %xmm0, -32 * SIZE(Y)
  550. movaps -3 * SIZE(X), %xmm0
  551. movss %xmm2, %xmm1
  552. shufps $0x93, %xmm2, %xmm1
  553. movaps %xmm1, -28 * SIZE(Y)
  554. movaps 1 * SIZE(X), %xmm1
  555. #ifdef PREFETCH
  556. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  557. #endif
  558. movss %xmm3, %xmm2
  559. shufps $0x93, %xmm3, %xmm2
  560. movaps %xmm2, -24 * SIZE(Y)
  561. movaps 5 * SIZE(X), %xmm2
  562. movss %xmm4, %xmm3
  563. shufps $0x93, %xmm4, %xmm3
  564. movaps %xmm3, -20 * SIZE(Y)
  565. movaps 9 * SIZE(X), %xmm3
  566. #if defined(PREFETCHW) && !defined(FETCH128)
  567. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  568. #endif
  569. movss %xmm5, %xmm4
  570. shufps $0x93, %xmm5, %xmm4
  571. movaps %xmm4, -16 * SIZE(Y)
  572. movaps 13 * SIZE(X), %xmm4
  573. movss %xmm6, %xmm5
  574. shufps $0x93, %xmm6, %xmm5
  575. movaps %xmm5, -12 * SIZE(Y)
  576. movaps 17 * SIZE(X), %xmm5
  577. #if defined(PREFETCH) && !defined(FETCH128)
  578. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  579. #endif
  580. movss %xmm7, %xmm6
  581. shufps $0x93, %xmm7, %xmm6
  582. movaps %xmm6, -8 * SIZE(Y)
  583. movaps 21 * SIZE(X), %xmm6
  584. movss %xmm0, %xmm7
  585. shufps $0x93, %xmm0, %xmm7
  586. movaps %xmm7, -4 * SIZE(Y)
  587. movaps 25 * SIZE(X), %xmm7
  588. subl $-32 * SIZE, X
  589. subl $-32 * SIZE, Y
  590. decl %eax
  591. jg .L41
  592. ALIGN_3
  593. .L42:
  594. movss %xmm1, %xmm0
  595. shufps $0x93, %xmm1, %xmm0
  596. movaps %xmm0, -32 * SIZE(Y)
  597. movaps -3 * SIZE(X), %xmm0
  598. movss %xmm2, %xmm1
  599. shufps $0x93, %xmm2, %xmm1
  600. movaps %xmm1, -28 * SIZE(Y)
  601. movss %xmm3, %xmm2
  602. shufps $0x93, %xmm3, %xmm2
  603. movaps %xmm2, -24 * SIZE(Y)
  604. movss %xmm4, %xmm3
  605. shufps $0x93, %xmm4, %xmm3
  606. movaps %xmm3, -20 * SIZE(Y)
  607. movss %xmm5, %xmm4
  608. shufps $0x93, %xmm5, %xmm4
  609. movaps %xmm4, -16 * SIZE(Y)
  610. movss %xmm6, %xmm5
  611. shufps $0x93, %xmm6, %xmm5
  612. movaps %xmm5, -12 * SIZE(Y)
  613. movss %xmm7, %xmm6
  614. shufps $0x93, %xmm7, %xmm6
  615. movaps %xmm6, -8 * SIZE(Y)
  616. movss %xmm0, %xmm7
  617. shufps $0x93, %xmm0, %xmm7
  618. movaps %xmm7, -4 * SIZE(Y)
  619. subl $-32 * SIZE, X
  620. subl $-32 * SIZE, Y
  621. ALIGN_3
  622. .L43:
  623. testl $16, M
  624. jle .L44
  625. ALIGN_3
  626. movaps -31 * SIZE(X), %xmm1
  627. movaps -27 * SIZE(X), %xmm2
  628. movaps -23 * SIZE(X), %xmm3
  629. movaps -19 * SIZE(X), %xmm4
  630. movss %xmm1, %xmm0
  631. shufps $0x93, %xmm1, %xmm0
  632. movaps %xmm0, -32 * SIZE(Y)
  633. movss %xmm2, %xmm1
  634. shufps $0x93, %xmm2, %xmm1
  635. movaps %xmm1, -28 * SIZE(Y)
  636. movss %xmm3, %xmm2
  637. shufps $0x93, %xmm3, %xmm2
  638. movaps %xmm2, -24 * SIZE(Y)
  639. movss %xmm4, %xmm3
  640. shufps $0x93, %xmm4, %xmm3
  641. movaps %xmm3, -20 * SIZE(Y)
  642. movaps %xmm4, %xmm0
  643. addl $16 * SIZE, X
  644. addl $16 * SIZE, Y
  645. ALIGN_3
  646. .L44:
  647. testl $8, M
  648. jle .L45
  649. ALIGN_3
  650. movaps -31 * SIZE(X), %xmm1
  651. movaps -27 * SIZE(X), %xmm2
  652. movss %xmm1, %xmm0
  653. shufps $0x93, %xmm1, %xmm0
  654. movaps %xmm0, -32 * SIZE(Y)
  655. movss %xmm2, %xmm1
  656. shufps $0x93, %xmm2, %xmm1
  657. movaps %xmm1, -28 * SIZE(Y)
  658. movaps %xmm2, %xmm0
  659. addl $8 * SIZE, X
  660. addl $8 * SIZE, Y
  661. ALIGN_3
  662. .L45:
  663. testl $4, M
  664. jle .L46
  665. ALIGN_3
  666. movaps -31 * SIZE(X), %xmm1
  667. movss %xmm1, %xmm0
  668. shufps $0x93, %xmm1, %xmm0
  669. movaps %xmm0, -32 * SIZE(Y)
  670. addl $4 * SIZE, X
  671. addl $4 * SIZE, Y
  672. ALIGN_3
  673. .L46:
  674. testl $2, M
  675. jle .L47
  676. ALIGN_3
  677. movsd -32 * SIZE(X), %xmm0
  678. movsd %xmm0, -32 * SIZE(Y)
  679. addl $2 * SIZE, X
  680. addl $2 * SIZE, Y
  681. ALIGN_3
  682. .L47:
  683. testl $1, M
  684. jle .L49
  685. ALIGN_3
  686. movss -32 * SIZE(X), %xmm0
  687. movss %xmm0, -32 * SIZE(Y)
  688. addl $SIZE, Y
  689. ALIGN_3
  690. .L49:
  691. popl %ebx
  692. popl %esi
  693. popl %edi
  694. ret
  695. ALIGN_4
  696. .L50:
  697. movl M, %eax
  698. sarl $3, %eax
  699. jle .L55
  700. ALIGN_3
  701. .L51:
  702. movss (X), %xmm0
  703. addl INCX, X
  704. movss (X), %xmm1
  705. addl INCX, X
  706. movss (X), %xmm2
  707. addl INCX, X
  708. movss (X), %xmm3
  709. addl INCX, X
  710. movss (X), %xmm4
  711. addl INCX, X
  712. movss (X), %xmm5
  713. addl INCX, X
  714. movss (X), %xmm6
  715. addl INCX, X
  716. movss (X), %xmm7
  717. addl INCX, X
  718. movss %xmm0, (Y)
  719. addl INCY, Y
  720. movss %xmm1, (Y)
  721. addl INCY, Y
  722. movss %xmm2, (Y)
  723. addl INCY, Y
  724. movss %xmm3, (Y)
  725. addl INCY, Y
  726. movss %xmm4, (Y)
  727. addl INCY, Y
  728. movss %xmm5, (Y)
  729. addl INCY, Y
  730. movss %xmm6, (Y)
  731. addl INCY, Y
  732. movss %xmm7, (Y)
  733. addl INCY, Y
  734. decl %eax
  735. jg .L51
  736. ALIGN_3
  737. .L55:
  738. movl M, %eax
  739. andl $7, %eax
  740. jle .L57
  741. ALIGN_3
  742. .L56:
  743. movss (X), %xmm0
  744. addl INCX, X
  745. movss %xmm0, (Y)
  746. addl INCY, Y
  747. decl %eax
  748. jg .L56
  749. ALIGN_3
  750. .L57:
  751. popl %ebx
  752. popl %esi
  753. popl %edi
  754. ret
  755. EPILOGUE