You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

swap_sse2.S 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* ASSEMBLER is defined ahead of common.h (presumably to select the
   * assembler-side declarations of that header; confirm in common.h). */
#define ASSEMBLER
#include "common.h"

/* Register roles used throughout the routine. */
#define X	%esi
#define Y	%edi
#define M	%edx
#define INCX	%ebx
#define INCY	%ecx

/* STACK is the 16 bytes taken by the four registers the routine pushes on
 * entry; ARGS is an additional displacement, zero here.  Each STACK_*
 * macro expands to a complete "displacement(%esp)" memory operand. */
#define ARGS	0
#define STACK	16

#define STACK_M		4 + STACK + ARGS(%esp)
#define STACK_X		24 + STACK + ARGS(%esp)
#define STACK_Y		32 + STACK + ARGS(%esp)
#define STACK_INCX	28 + STACK + ARGS(%esp)
#define STACK_INCY	36 + STACK + ARGS(%esp)

/* Presumably supplies the PREFETCHW / PREFETCHSIZE / PREOFFSET settings the
 * routine tests with #ifdef; confirm in l1param.h. */
#include "l1param.h"
  /*
   * Exchange M elements between the vectors X and Y (x86-32, SSE2).
   *
   * Arguments are read from the stack through STACK_M, STACK_X, STACK_INCX,
   * STACK_Y and STACK_INCY.  INCX / INCY arrive as element strides and are
   * scaled to byte strides by SIZE.  SIZE comes from outside this file; the
   * code treats one 16-byte movaps as two elements, so SIZE is taken to be
   * 8 bytes (confirm in common.h).
   *
   * Registers: M = remaining element count, X / Y = current pointers,
   * INCX / INCY = byte strides, %eax = loop counter, %xmm0-%xmm3 = scratch.
   * %ebp, %edi, %esi and %ebx are saved on entry and restored before every
   * ret.
   *
   * Paths:
   *   both strides == 1 element:
   *     - if Y has the SIZE bit set, one element is exchanged with movsd so
   *       that Y becomes 16-byte aligned (assumes Y is SIZE-aligned);
   *     - X aligned as well    -> .L11 : plain movaps exchange;
   *     - X off by one element -> .L20 : aligned loads/stores recombined
   *       with SHUFPD_1, one element carried in registers between steps.
   *   otherwise                -> .L40 : scalar movsd loop, any stride.
   *
   * A count of M <= 0 returns at once without touching X or Y.  Without
   * that check the alignment peel and the .L20 preamble would exchange one
   * element before M is examined, and a negative M could match the
   * "testl $bit, M" tail tests.
   *
   * Note on "testl $bit, M ; jle": testl clears OF, and SF is clear for
   * these positive masks, so jle is taken exactly when the bit is clear.
   */
	PROLOGUE
	PROFCODE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_Y, Y
	movl	STACK_INCX, INCX
	movl	STACK_INCY, INCY

	/* Nothing to exchange for an empty or negative count. */
	testl	M, M
	jle	.L47

	/* Element strides -> byte strides. */
	leal	(, INCX, SIZE), INCX
	leal	(, INCY, SIZE), INCY

	/* Anything other than unit stride on both vectors takes the scalar path. */
	cmpl	$SIZE, INCX
	jne	.L40
	cmpl	$SIZE, INCY
	jne	.L40

	/* Peel one element if Y is not on a 2 * SIZE boundary. */
	testl	$SIZE, Y
	je	.L10

	movsd	0 * SIZE(X), %xmm0
	movsd	0 * SIZE(Y), %xmm1
	movsd	%xmm1, 0 * SIZE(X)
	movsd	%xmm0, 0 * SIZE(Y)
	addl	$1 * SIZE, X
	addl	$1 * SIZE, Y
	decl	M
	jle	.L19
	ALIGN_4

.L10:
	/* Bias both pointers by 16 elements; all accesses below use negative
	 * displacements (presumably so they fit a one-byte displacement). */
	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	/* Y is aligned here; if X is off by one element use the shuffle path. */
	testl	$SIZE, X
	jne	.L20

	/* %eax = number of full 16-element groups. */
	movl	M, %eax
	sarl	$4, %eax
	jle	.L13
	ALIGN_3

.L11:
	/* Aligned main loop: 16 elements per iteration. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movaps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -16 * SIZE(X)

	movaps	-14 * SIZE(X), %xmm0
	movaps	-14 * SIZE(Y), %xmm1
	movaps	%xmm0, -14 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	-12 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1
	movaps	%xmm0, -12 * SIZE(Y)
	movaps	%xmm1, -12 * SIZE(X)

	movaps	-10 * SIZE(X), %xmm0
	movaps	-10 * SIZE(Y), %xmm1
	movaps	%xmm0, -10 * SIZE(Y)
	movaps	%xmm1, -10 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	-8 * SIZE(X), %xmm0
	movaps	-8 * SIZE(Y), %xmm1
	movaps	%xmm0, -8 * SIZE(Y)
	movaps	%xmm1, -8 * SIZE(X)

	movaps	-6 * SIZE(X), %xmm0
	movaps	-6 * SIZE(Y), %xmm1
	movaps	%xmm0, -6 * SIZE(Y)
	movaps	%xmm1, -6 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	-4 * SIZE(X), %xmm0
	movaps	-4 * SIZE(Y), %xmm1
	movaps	%xmm0, -4 * SIZE(Y)
	movaps	%xmm1, -4 * SIZE(X)

	movaps	-2 * SIZE(X), %xmm0
	movaps	-2 * SIZE(Y), %xmm1
	movaps	%xmm0, -2 * SIZE(Y)
	movaps	%xmm1, -2 * SIZE(X)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X

	decl	%eax
	jg	.L11
	ALIGN_3

.L13:
	/* Tail of the aligned path: 8 remaining elements. */
	testl	$8, M
	jle	.L14

	movaps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -16 * SIZE(X)

	movaps	-14 * SIZE(X), %xmm0
	movaps	-14 * SIZE(Y), %xmm1
	movaps	%xmm0, -14 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(X)

	movaps	-12 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1
	movaps	%xmm0, -12 * SIZE(Y)
	movaps	%xmm1, -12 * SIZE(X)

	movaps	-10 * SIZE(X), %xmm0
	movaps	-10 * SIZE(Y), %xmm1
	movaps	%xmm0, -10 * SIZE(Y)
	movaps	%xmm1, -10 * SIZE(X)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L14:
	/* 4 remaining elements. */
	testl	$4, M
	jle	.L15

	movaps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -16 * SIZE(X)

	movaps	-14 * SIZE(X), %xmm0
	movaps	-14 * SIZE(Y), %xmm1
	movaps	%xmm0, -14 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(X)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L15:
	/* 2 remaining elements. */
	testl	$2, M
	jle	.L16

	movaps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -16 * SIZE(X)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L16:
	/* Last single element, exchanged through the low halves. */
	testl	$1, M
	jle	.L19

	movsd	-16 * SIZE(X), %xmm0
	movsd	-16 * SIZE(Y), %xmm1
	movlps	%xmm1, -16 * SIZE(X)
	movlps	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_3

.L20:
	/*
	 * X is one element off the 16-byte boundary while Y is aligned.
	 * The first X element is parked in the high half of %xmm0 and the
	 * first Y pair in %xmm1; the low half of %xmm1 is stored to X at once.
	 * From here on the high half of %xmm0 always holds the X element still
	 * owed to Y; it is written by the movhps at .L29.
	 * SIZE-offsets -15, -13, ... on X are the 16-byte aligned ones here.
	 * SIZE-offsets -16, -14, ... are the aligned ones on Y.
	 * SHUFPD_1 is defined outside this file; presumably "shufpd $1",
	 * i.e. dst = { high(dst), low(src) } (confirm in common.h).
	 */
	movhps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1

	movlps	%xmm1, -16 * SIZE(X)
	decl	M
	jle	.L29

	/* One element is already in flight, so count groups over M - 1. */
	movl	M, %eax
	sarl	$4, %eax
	jle	.L23
	ALIGN_4

.L21:
	/* Shuffle main loop: 16 elements per iteration. */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	-11 * SIZE(X), %xmm2
	movaps	-10 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(X)

	movaps	-9 * SIZE(X), %xmm0
	movaps	-8 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -9 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	-7 * SIZE(X), %xmm2
	movaps	-6 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -8 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -7 * SIZE(X)

	movaps	-5 * SIZE(X), %xmm0
	movaps	-4 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -6 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -5 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	-3 * SIZE(X), %xmm2
	movaps	-2 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -4 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -3 * SIZE(X)

	movaps	-1 * SIZE(X), %xmm0
	movaps	0 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -2 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -1 * SIZE(X)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L21
	ALIGN_3

.L23:
	/* Tail of the shuffle path: 8 remaining elements. */
	testl	$8, M
	jle	.L24

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

	movaps	-11 * SIZE(X), %xmm2
	movaps	-10 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(X)

	movaps	-9 * SIZE(X), %xmm0
	movaps	-8 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -9 * SIZE(X)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L24:
	/* 4 remaining elements. */
	testl	$4, M
	jle	.L25

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1

	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L25:
	/* 2 remaining elements; the freshly loaded pairs are then copied back
	 * into %xmm0 / %xmm1 so the carried halves stay where .L26 and .L29
	 * expect them. */
	testl	$2, M
	jle	.L26

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3

	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L26:
	/* 1 remaining element: flush the carried X element to Y, pick up the
	 * next X element, and store the high half of %xmm1 into its slot. */
	testl	$1, M
	jle	.L29

	movhps	%xmm0, -16 * SIZE(Y)
	movhps	-15 * SIZE(X), %xmm0
	movhps	%xmm1, -15 * SIZE(X)

	addl	$SIZE, X
	addl	$SIZE, Y
	ALIGN_3

.L29:
	/* Write the X element still held in the high half of %xmm0 to Y. */
	movhps	%xmm0, -16 * SIZE(Y)

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
	ALIGN_3

.L40:
	/* General-stride path: %eax = number of groups of 8 elements. */
	movl	M, %eax
	sarl	$3, %eax
	jle	.L45
	ALIGN_3

.L41:
	/* Eight scalar exchanges per iteration. */
	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	addl	INCX, X
	movsd	%xmm0, (Y)
	addl	INCY, Y

	decl	%eax
	jg	.L41
	ALIGN_3

.L45:
	/* %eax = M mod 8 leftover elements. */
	movl	M, %eax
	andl	$7, %eax
	jle	.L47
	ALIGN_3

.L46:
	movsd	(X), %xmm0
	movsd	(Y), %xmm1
	movsd	%xmm1, (X)
	movsd	%xmm0, (Y)
	addl	INCX, X
	addl	INCY, Y
	decl	%eax
	jg	.L46
	ALIGN_3

.L47:
	/* Common exit, also the target of the M <= 0 check at entry. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE