You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zswap_sse2.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Select the assembler view of common.h; must be defined before the include. */
#define ASSEMBLER
#include "common.h"

/* Register roles used by every code path in this file. */
#define M	%edx	/* element count (decremented on some paths) */
#define X	%esi	/* running pointer into the first vector */
#define Y	%edi	/* running pointer into the second vector */
#define INCX	%ebx	/* stride of X; shifted by ZBASE_SHIFT in the prologue */
#define INCY	%ecx	/* stride of Y; shifted by ZBASE_SHIFT in the prologue */

/*
 * Stack argument slots, addressed from %esp after the prologue.
 * STACK (16) equals the space taken by the four pushl instructions that
 * run before any argument is read; ARGS is an extra displacement and is
 * zero here.  The slots between +4 and +32 are not read by this file.
 */
#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		32 + STACK + ARGS(%esp)
#define STACK_INCX	36 + STACK + ARGS(%esp)
#define STACK_Y		40 + STACK + ARGS(%esp)
#define STACK_INCY	44 + STACK + ARGS(%esp)

#include "l1param.h"
  /*
   * Entry point: save registers, load the arguments, choose a code path.
   *
   *   M <= 0                          -> .L19 (nothing to do, returns 0)
   *   INCX or INCY != 2 * SIZE bytes  -> .L50 (strided paths)
   *   Y has the SIZE bit set          -> .L30
   *   X has the SIZE bit set          -> .L20
   *   otherwise                       -> falls into .L11 / .L13
   *
   * PROLOGUE and PROFCODE are macros supplied by the included headers.
   */
	PROLOGUE
	PROFCODE

	/* Saved here, restored in reverse order at every exit. */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_X,    X
	movl	STACK_Y,    Y
	movl	STACK_M,    M
	movl	STACK_INCX, INCX
	movl	STACK_INCY, INCY

	/* Scale both strides by ZBASE_SHIFT (presumably elements -> bytes). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	testl	M, M
	jle	.L19

	cmpl	$2 * SIZE, INCX
	jne	.L50
	cmpl	$2 * SIZE, INCY
	jne	.L50

	/*
	 * Advance both pointers by 16 * SIZE; every unit-stride path
	 * addresses its data with displacements starting at -16 * SIZE.
	 */
	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X

	testl	$SIZE, Y
	jne	.L30
	testl	$SIZE, X
	jne	.L20

	/* %eax = number of 8-element passes for the loop at .L11. */
	movl	M, %eax
	sarl	$3, %eax
	jle	.L13
  /* Unit-stride path used when neither X nor Y has the SIZE bit set. */

/*
 * Helper local to this path: exchange the 16 bytes found at displacement
 * off * SIZE from X with the 16 bytes at the same displacement from Y.
 * movaps is used, so both addresses have to be 16-byte aligned.
 * Clobbers %xmm0 and %xmm1.
 */
#define SWAP_VEC_L11(off) \
	movaps	off * SIZE(X), %xmm0 ; \
	movaps	off * SIZE(Y), %xmm1 ; \
	movaps	%xmm0, off * SIZE(Y) ; \
	movaps	%xmm1, off * SIZE(X)

	ALIGN_3

.L11:					/* 8 elements (16 * SIZE bytes) per pass */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif
	SWAP_VEC_L11(-16)
	SWAP_VEC_L11(-14)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif
	SWAP_VEC_L11(-12)
	SWAP_VEC_L11(-10)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	SWAP_VEC_L11(-8)
	SWAP_VEC_L11(-6)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	SWAP_VEC_L11(-4)
	SWAP_VEC_L11(-2)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X

	decl	%eax
	jg	.L11
	ALIGN_3

.L13:					/* remainder: 4 elements if bit 2 of M is set */
	testl	$4, M
	je	.L14

	SWAP_VEC_L11(-16)
	SWAP_VEC_L11(-14)
	SWAP_VEC_L11(-12)
	SWAP_VEC_L11(-10)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L14:					/* remainder: 2 elements if bit 1 of M is set */
	testl	$2, M
	je	.L15

	SWAP_VEC_L11(-16)
	SWAP_VEC_L11(-14)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L15:					/* remainder: 1 element if bit 0 of M is set */
	testl	$1, M
	je	.L19

	SWAP_VEC_L11(-16)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L19:					/* exit for this path and for M <= 0 */
	xorl	%eax, %eax		/* return 0 */

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

#undef SWAP_VEC_L11
  /*
   * Unit-stride path for X with the SIZE bit set and Y without it.
   *
   * Every 16-byte access to X is made at an odd multiple of SIZE
   * (-15, -13, ...) and every 16-byte access to Y at an even multiple, so
   * movaps can be used on both sides.  The vectors read from X and Y are
   * therefore offset by one SIZE-wide item from each other, and each store
   * combines halves of two consecutive loads with SHUFPD_1.
   *
   * Carried state between steps (register pairs alternate every step):
   *   %xmm0 or %xmm2 : high half holds the X item still owed to Y
   *   %xmm1 or %xmm3 : high half holds the Y item still owed to X
   *
   * SHUFPD_1 comes from the included headers; the sequence relies on it
   * forming (destination high half, source low half) - TODO confirm.
   *
   * Returns 0 in %eax, matching the exit at .L19.  Without the explicit
   * xorl at .L29 the M == 1 case would jump from the decl M below straight
   * to .L29 and return with %eax never written.
   */
	ALIGN_3

.L20:
	/* Peel the first SIZE-wide item so the X accesses become aligned. */
	movhps	-16 * SIZE(X), %xmm0
	movaps	-16 * SIZE(Y), %xmm1
	movlps	%xmm1, -16 * SIZE(X)

	/* One element is covered by the peeled head plus the tail at .L29. */
	decl	M
	jle	.L29

	movl	M, %eax
	sarl	$3, %eax
	jle	.L23
	ALIGN_4

.L21:					/* 8 elements per pass */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	-11 * SIZE(X), %xmm2
	movaps	-10 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(X)

	movaps	 -9 * SIZE(X), %xmm0
	movaps	 -8 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -9 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	 -7 * SIZE(X), %xmm2
	movaps	 -6 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1,  -7 * SIZE(X)

	movaps	 -5 * SIZE(X), %xmm0
	movaps	 -4 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2,  -6 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -5 * SIZE(X)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	 -3 * SIZE(X), %xmm2
	movaps	 -2 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0,  -4 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1,  -3 * SIZE(X)

	movaps	 -1 * SIZE(X), %xmm0
	movaps	  0 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2,  -2 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -1 * SIZE(X)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L21
	ALIGN_3

.L23:					/* remainder: 4 elements if bit 2 of M is set */
	testl	$4, M
	je	.L24

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

	movaps	-11 * SIZE(X), %xmm2
	movaps	-10 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(X)

	movaps	 -9 * SIZE(X), %xmm0
	movaps	 -8 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -9 * SIZE(X)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L24:					/* remainder: 2 elements if bit 1 of M is set */
	testl	$2, M
	je	.L25

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)

	movaps	-13 * SIZE(X), %xmm0
	movaps	-12 * SIZE(Y), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(Y)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(X)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L25:					/* remainder: 1 element if bit 0 of M is set */
	testl	$1, M
	je	.L29

	movaps	-15 * SIZE(X), %xmm2
	movaps	-14 * SIZE(Y), %xmm3
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(X)
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	/* Move the carried halves back to %xmm0/%xmm1, as .L29 expects. */
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L29:
	/*
	 * Tail: flush the two carried halves.
	 * NOTE(review): this 16-byte load also covers the SIZE-wide item that
	 * follows the last item of X; only its low half appears to be consumed
	 * (see the SHUFPD_1 assumption at the top of this path).
	 */
	movaps	-15 * SIZE(X), %xmm2
	movhps	%xmm1, -15 * SIZE(X)
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	xorl	%eax, %eax		/* return 0 on every way into .L29 */

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
  /*
   * Unit-stride paths for Y with the SIZE bit set.
   *
   * If X has the SIZE bit set as well, control goes to .L40.  Otherwise
   * this is the mirror image of the .L20 path with the roles of X and Y
   * exchanged: 16-byte accesses to Y use odd multiples of SIZE, accesses
   * to X use even multiples, and every store combines halves of two
   * consecutive loads with SHUFPD_1.
   *
   * Carried state between steps (register pairs alternate every step):
   *   %xmm0 or %xmm2 : high half holds the Y item still owed to X
   *   %xmm1 or %xmm3 : high half holds the X item still owed to Y
   *
   * SHUFPD_1 comes from the included headers; the sequence relies on it
   * forming (destination high half, source low half) - TODO confirm.
   *
   * Returns 0 in %eax, matching the exit at .L19.  Without the explicit
   * xorl at .L39 the M == 1 case would jump from the decl M below straight
   * to .L39 and return with %eax never written.
   */
	ALIGN_3

.L30:
	testl	$SIZE, X
	jne	.L40

	/* Peel the first SIZE-wide item so the Y accesses become aligned. */
	movhps	-16 * SIZE(Y), %xmm0
	movaps	-16 * SIZE(X), %xmm1
	movlps	%xmm1, -16 * SIZE(Y)

	/* One element is covered by the peeled head plus the tail at .L39. */
	decl	M
	jle	.L39

	movl	M, %eax
	sarl	$3, %eax
	jle	.L33
	ALIGN_4

.L31:					/* 8 elements per pass */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	movaps	-15 * SIZE(Y), %xmm2
	movaps	-14 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(Y)

	movaps	-13 * SIZE(Y), %xmm0
	movaps	-12 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	movaps	-11 * SIZE(Y), %xmm2
	movaps	-10 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(Y)

	movaps	 -9 * SIZE(Y), %xmm0
	movaps	 -8 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -9 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	 -7 * SIZE(Y), %xmm2
	movaps	 -6 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0,  -8 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1,  -7 * SIZE(Y)

	movaps	 -5 * SIZE(Y), %xmm0
	movaps	 -4 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2,  -6 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -5 * SIZE(Y)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	 -3 * SIZE(Y), %xmm2
	movaps	 -2 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0,  -4 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1,  -3 * SIZE(Y)

	movaps	 -1 * SIZE(Y), %xmm0
	movaps	  0 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2,  -2 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -1 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L31
	ALIGN_3

.L33:					/* remainder: 4 elements if bit 2 of M is set */
	testl	$4, M
	je	.L34

	movaps	-15 * SIZE(Y), %xmm2
	movaps	-14 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(Y)

	movaps	-13 * SIZE(Y), %xmm0
	movaps	-12 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(Y)

	movaps	-11 * SIZE(Y), %xmm2
	movaps	-10 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -12 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -11 * SIZE(Y)

	movaps	 -9 * SIZE(Y), %xmm0
	movaps	 -8 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -10 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3,  -9 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L34:					/* remainder: 2 elements if bit 1 of M is set */
	testl	$2, M
	je	.L35

	movaps	-15 * SIZE(Y), %xmm2
	movaps	-14 * SIZE(X), %xmm3
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(X)
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(Y)

	movaps	-13 * SIZE(Y), %xmm0
	movaps	-12 * SIZE(X), %xmm1
	SHUFPD_1 %xmm0, %xmm2
	movaps	%xmm2, -14 * SIZE(X)
	SHUFPD_1 %xmm1, %xmm3
	movaps	%xmm3, -13 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L35:					/* remainder: 1 element if bit 0 of M is set */
	testl	$1, M
	je	.L39

	movaps	-15 * SIZE(Y), %xmm2
	movaps	-14 * SIZE(X), %xmm3
	SHUFPD_1 %xmm3, %xmm1
	movaps	%xmm1, -15 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(X)

	/* Move the carried halves back to %xmm0/%xmm1, as .L39 expects. */
	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L39:
	/*
	 * Tail: flush the two carried halves.
	 * NOTE(review): this 16-byte load also covers the SIZE-wide item that
	 * follows the last item of Y; only its low half appears to be consumed
	 * (see the SHUFPD_1 assumption at the top of this path).
	 */
	movaps	-15 * SIZE(Y), %xmm2
	movhps	%xmm1, -15 * SIZE(Y)
	SHUFPD_1 %xmm2, %xmm0
	movaps	%xmm0, -16 * SIZE(X)

	xorl	%eax, %eax		/* return 0 on every way into .L39 */

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
  /*
   * Unit-stride path for X and Y that both have the SIZE bit set.
   *
   * One SIZE-wide item is swapped up front and both pointers advance by
   * SIZE, after which the middle part is handled with aligned 16-byte
   * swaps.  The leading item plus the trailing item swapped at .L49
   * account for one element, hence the decl M.
   *
   * Returns 0 in %eax, matching the exit at .L19.  Without the explicit
   * xorl at .L49 the M == 1 case would jump from the decl M below straight
   * to .L49 and return with %eax never written.
   */

/*
 * Helper local to this path: exchange the 16 bytes found at displacement
 * off * SIZE from X with the 16 bytes at the same displacement from Y.
 * movaps is used, so both addresses have to be 16-byte aligned.
 * Clobbers %xmm0 and %xmm1.
 */
#define SWAP_VEC_L41(off) \
	movaps	off * SIZE(X), %xmm0 ; \
	movaps	off * SIZE(Y), %xmm1 ; \
	movaps	%xmm0, off * SIZE(Y) ; \
	movaps	%xmm1, off * SIZE(X)

	ALIGN_3

.L40:
	/* Swap the leading SIZE-wide item. */
	movsd	-16 * SIZE(X), %xmm0
	movsd	-16 * SIZE(Y), %xmm1
	movlps	%xmm0, -16 * SIZE(Y)
	movlps	%xmm1, -16 * SIZE(X)

	addl	$SIZE, X
	addl	$SIZE, Y

	decl	M
	jle	.L49

	movl	M, %eax
	sarl	$3, %eax
	jle	.L43
	ALIGN_3

.L41:					/* 8 elements per pass */
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif
	SWAP_VEC_L41(-16)
	SWAP_VEC_L41(-14)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif
	SWAP_VEC_L41(-12)
	SWAP_VEC_L41(-10)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	SWAP_VEC_L41(-8)
	SWAP_VEC_L41(-6)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	SWAP_VEC_L41(-4)
	SWAP_VEC_L41(-2)

	subl	$-16 * SIZE, Y
	subl	$-16 * SIZE, X

	decl	%eax
	jg	.L41
	ALIGN_3

.L43:					/* remainder: 4 elements if bit 2 of M is set */
	testl	$4, M
	je	.L44

	SWAP_VEC_L41(-16)
	SWAP_VEC_L41(-14)
	SWAP_VEC_L41(-12)
	SWAP_VEC_L41(-10)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L44:					/* remainder: 2 elements if bit 1 of M is set */
	testl	$2, M
	je	.L45

	SWAP_VEC_L41(-16)
	SWAP_VEC_L41(-14)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L45:					/* remainder: 1 element if bit 0 of M is set */
	testl	$1, M
	je	.L49

	SWAP_VEC_L41(-16)

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L49:
	/* Swap the trailing SIZE-wide item. */
	movsd	-16 * SIZE(X), %xmm0
	movsd	-16 * SIZE(Y), %xmm1
	movlps	%xmm0, -16 * SIZE(Y)
	movlps	%xmm1, -16 * SIZE(X)

	xorl	%eax, %eax		/* return 0 on every way into .L49 */

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

#undef SWAP_VEC_L41
  /*
   * Strided path, reached when INCX or INCY differs from 2 * SIZE bytes.
   *
   * If either pointer has the SIZE bit set, control goes to .L60.
   * Otherwise each element is exchanged with one 16-byte movaps per
   * vector, and the pointers step by INCX / INCY after every element.
   * NOTE(review): movaps needs 16-byte alignment at every step; this
   * presumably holds because the strides were scaled by ZBASE_SHIFT -
   * confirm against the definition of ZBASE_SHIFT.
   */

/*
 * Helper local to this path: swap one element and advance both pointers.
 * Clobbers %xmm0 and %xmm1.
 */
#define SWAP_STRIDED_L51 \
	movaps	(X), %xmm0 ; \
	movaps	(Y), %xmm1 ; \
	movaps	%xmm1, (X) ; \
	addl	INCX, X ; \
	movaps	%xmm0, (Y) ; \
	addl	INCY, Y

	ALIGN_3

.L50:
	testl	$SIZE, X
	jne	.L60
	testl	$SIZE, Y
	jne	.L60

	movl	M, %eax
	sarl	$2, %eax		/* %eax = number of 4-element passes */
	jle	.L55
	ALIGN_3

.L51:					/* 4 elements per pass */
	SWAP_STRIDED_L51
	SWAP_STRIDED_L51
	SWAP_STRIDED_L51
	SWAP_STRIDED_L51

	decl	%eax
	jg	.L51
	ALIGN_3

.L55:
	movl	M, %eax
	andl	$3, %eax		/* %eax = leftover elements (0..3) */
	je	.L57
	ALIGN_3

.L56:					/* one element per pass */
	SWAP_STRIDED_L51

	decl	%eax
	jg	.L56
	ALIGN_3

.L57:					/* %eax is 0 on every way in here */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

#undef SWAP_STRIDED_L51
  /*
   * Strided path for the case where X or Y has the SIZE bit set.
   *
   * Each element is read and written as two SIZE-wide halves
   * (movsd/movhps to load, movlps/movhps to store), so no 16-byte
   * alignment is required.  The pointers step by INCX / INCY after every
   * element.
   */

/*
 * Helper local to this path: swap one element and advance both pointers.
 * Clobbers %xmm0 and %xmm1.
 */
#define SWAP_STRIDED_L61 \
	movsd	0 * SIZE(X), %xmm0 ; \
	movhps	1 * SIZE(X), %xmm0 ; \
	movsd	0 * SIZE(Y), %xmm1 ; \
	movhps	1 * SIZE(Y), %xmm1 ; \
	movlps	%xmm1, 0 * SIZE(X) ; \
	movhps	%xmm1, 1 * SIZE(X) ; \
	addl	INCX, X ; \
	movlps	%xmm0, 0 * SIZE(Y) ; \
	movhps	%xmm0, 1 * SIZE(Y) ; \
	addl	INCY, Y

	ALIGN_3

.L60:
	movl	M, %eax
	sarl	$2, %eax		/* %eax = number of 4-element passes */
	jle	.L65
	ALIGN_3

.L61:					/* 4 elements per pass */
	SWAP_STRIDED_L61
	SWAP_STRIDED_L61
	SWAP_STRIDED_L61
	SWAP_STRIDED_L61

	decl	%eax
	jg	.L61
	ALIGN_3

.L65:
	movl	M, %eax
	andl	$3, %eax		/* %eax = leftover elements (0..3) */
	je	.L67
	ALIGN_3

.L66:					/* one element per pass */
	SWAP_STRIDED_L61

	decl	%eax
	jg	.L66
	ALIGN_3

.L67:					/* %eax is 0 on every way in here */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

#undef SWAP_STRIDED_L61

	EPILOGUE