
zcopy_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */

#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8 */
#else
#define INCY	%r10
#endif

#include "l1param.h"

#ifdef OPTERON
#define LOAD(OFFSET, ADDR, REG)	xorps	REG, REG; addps	OFFSET(ADDR), REG
#else
#define LOAD(OFFSET, ADDR, REG)	movaps	OFFSET(ADDR), REG
#endif
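
/* On OPTERON the LOAD macro fetches via xorps + addps rather than a
   plain movaps; presumably a GotoBLAS-era scheduling workaround for
   that core rather than a semantic requirement. */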
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif
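
/* Windows x64 ABI: the fifth integer argument is passed on the stack,
   at 40(%rsp) = return address (8 bytes) + 32-byte shadow space. */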
	SAVEREGISTERS

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY

	cmpq	$2 * SIZE, INCX
	jne	.L100
	cmpq	$2 * SIZE, INCY
	jne	.L100

	cmpq	$3, M
	jle	.L106

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y

	addq	M, M
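
/* Unit stride from here on. M now counts single floats (two per
   complex element), and X/Y were biased by 32 * SIZE above,
   presumably so the copy loops can address with negative
   displacements. The next two blocks peel one or two floats to get Y
   16-byte aligned. */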
	testq	$SIZE, Y
	je	.L05

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	ALIGN_4

.L05:
	testq	$2 * SIZE, Y
	je	.L10

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	subq	$2, M
	jle	.L19
	ALIGN_4

.L10:
	testq	$3 * SIZE, X
	jne	.L20
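
/* X and Y share 16-byte alignment: the unrolled loop below moves
   32 floats (16 complex elements) per iteration through xmm0-xmm7,
   with software prefetch on both streams. .L20/.L30/.L40 handle X
   sitting 8, 4, or 12 bytes off a 16-byte boundary. */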
	movq	M, %rax
	sarq	$5, %rax
	jle	.L13

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3
	movaps	-16 * SIZE(X), %xmm4
	movaps	-12 * SIZE(X), %xmm5
	movaps	 -8 * SIZE(X), %xmm6
	movaps	 -4 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movaps	%xmm0, -32 * SIZE(Y)
	LOAD( 0 * SIZE, X, %xmm0)
	movaps	%xmm1, -28 * SIZE(Y)
	LOAD( 4 * SIZE, X, %xmm1)

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movaps	%xmm2, -24 * SIZE(Y)
	LOAD( 8 * SIZE, X, %xmm2)
	movaps	%xmm3, -20 * SIZE(Y)
	LOAD(12 * SIZE, X, %xmm3)

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movaps	%xmm4, -16 * SIZE(Y)
	LOAD(16 * SIZE, X, %xmm4)
	movaps	%xmm5, -12 * SIZE(Y)
	LOAD(20 * SIZE, X, %xmm5)

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movaps	%xmm6,  -8 * SIZE(Y)
	LOAD(24 * SIZE, X, %xmm6)
	movaps	%xmm7,  -4 * SIZE(Y)
	LOAD(28 * SIZE, X, %xmm7)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, Y
	subq	$-32 * SIZE, X
	ALIGN_3
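
/* Tail of the aligned path: copy any remaining 16/8/4/2/1 floats. */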
.L13:
	testq	$16, M
	jle	.L14

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1
	movaps	-24 * SIZE(X), %xmm2
	movaps	-20 * SIZE(X), %xmm3

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	%xmm3, -20 * SIZE(Y)

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$8, M
	jle	.L15

	movaps	-32 * SIZE(X), %xmm0
	movaps	-28 * SIZE(X), %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, M
	jle	.L16

	movaps	-32 * SIZE(X), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, M
	jle	.L17

	movsd	-32 * SIZE(X), %xmm0
	movlps	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, M
	jle	.L19

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L20:
	testq	$SIZE, X
	jne	.L30
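
/* X is 8 bytes off a 16-byte boundary: shufps $0x4e, src, dst
   concatenates the high half of dst with the low half of src, so two
   neighbouring aligned loads yield one aligned 16-byte store to Y. */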
	movhps	-32 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L23

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4
	movaps	-14 * SIZE(X), %xmm5
	movaps	-10 * SIZE(X), %xmm6
	movaps	 -6 * SIZE(X), %xmm7

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 2 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 6 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	10 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	14 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	18 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	22 * SIZE(X), %xmm6

	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	26 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-2 * SIZE(X), %xmm0

	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	shufps	$0x4e, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	shufps	$0x4e, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	shufps	$0x4e, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	shufps	$0x4e, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L23:
	testq	$16, M
	jle	.L24
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2
	movaps	-22 * SIZE(X), %xmm3
	movaps	-18 * SIZE(X), %xmm4

	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	shufps	$0x4e, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	shufps	$0x4e, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	shufps	$0x4e, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L24:
	testq	$8, M
	jle	.L25
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	movaps	-26 * SIZE(X), %xmm2

	shufps	$0x4e, %xmm1, %xmm0
	shufps	$0x4e, %xmm2, %xmm1

	movaps	%xmm0, -32 * SIZE(Y)
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L25:
	testq	$4, M
	jle	.L26
	ALIGN_3

	movaps	-30 * SIZE(X), %xmm1
	shufps	$0x4e, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L26:
	testq	$2, M
	jle	.L27
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L27:
	testq	$1, M
	jle	.L29
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L29:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L30:
	testq	$2 * SIZE, X
	jne	.L40
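
/* X is 4 bytes off a 16-byte boundary: movss splices the next quad's
   low float into element 0, then shufps $0x39 rotates the register
   right by one float, reconstructing an aligned 16-byte window. */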
	movaps	-33 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L33

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4
	movaps	-13 * SIZE(X), %xmm5
	movaps	 -9 * SIZE(X), %xmm6
	movaps	 -5 * SIZE(X), %xmm7

	decq	%rax
	jle	.L32
	ALIGN_4

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 3 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 7 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	11 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	15 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	19 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	23 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	27 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L31
	ALIGN_3

.L32:
	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-1 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x39, %xmm4, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x39, %xmm5, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x39, %xmm6, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x39, %xmm7, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L33:
	testq	$16, M
	jle	.L34
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2
	movaps	-21 * SIZE(X), %xmm3
	movaps	-17 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x39, %xmm2, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x39, %xmm3, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L34:
	testq	$8, M
	jle	.L35
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1
	movaps	-25 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x39, %xmm1, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L35:
	testq	$4, M
	jle	.L36
	ALIGN_3

	movaps	-29 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x39, %xmm0, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L36:
	testq	$2, M
	jle	.L37
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L37:
	testq	$1, M
	jle	.L39
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L39:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_3

.L40:
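/* X is 12 bytes off a 16-byte boundary: after the movss splice,
   shufps $0x93 builds { prev[3], next[0], next[1], next[2] }, i.e.
   only the lowest float comes from the previous aligned quad. */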
	movaps	-35 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$5, %rax
	jle	.L43

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4
	movaps	-15 * SIZE(X), %xmm5
	movaps	-11 * SIZE(X), %xmm6
	movaps	 -7 * SIZE(X), %xmm7

	decq	%rax
	jle	.L42
	ALIGN_4

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	 1 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	 5 * SIZE(X), %xmm2

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	 9 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)
	movaps	13 * SIZE(X), %xmm4

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)
	movaps	17 * SIZE(X), %xmm5

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)
	movaps	21 * SIZE(X), %xmm6

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)
	movaps	25 * SIZE(X), %xmm7

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	decq	%rax
	jg	.L41
	ALIGN_3

.L42:
	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-3 * SIZE(X), %xmm0

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movss	%xmm5, %xmm4
	shufps	$0x93, %xmm5, %xmm4
	movaps	%xmm4, -16 * SIZE(Y)

	movss	%xmm6, %xmm5
	shufps	$0x93, %xmm6, %xmm5
	movaps	%xmm5, -12 * SIZE(Y)

	movss	%xmm7, %xmm6
	shufps	$0x93, %xmm7, %xmm6
	movaps	%xmm6,  -8 * SIZE(Y)

	movss	%xmm0, %xmm7
	shufps	$0x93, %xmm0, %xmm7
	movaps	%xmm7,  -4 * SIZE(Y)

	subq	$-32 * SIZE, X
	subq	$-32 * SIZE, Y
	ALIGN_3

.L43:
	testq	$16, M
	jle	.L44
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2
	movaps	-23 * SIZE(X), %xmm3
	movaps	-19 * SIZE(X), %xmm4

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm4, %xmm3
	shufps	$0x93, %xmm4, %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	movaps	%xmm4, %xmm0

	addq	$16 * SIZE, X
	addq	$16 * SIZE, Y
	ALIGN_3

.L44:
	testq	$8, M
	jle	.L45
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1
	movaps	-27 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L45:
	testq	$4, M
	jle	.L46
	ALIGN_3

	movaps	-31 * SIZE(X), %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L46:
	testq	$2, M
	jle	.L47
	ALIGN_3

	movsd	-32 * SIZE(X), %xmm0
	movsd	%xmm0, -32 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L47:
	testq	$1, M
	jle	.L49
	ALIGN_3

	movss	-32 * SIZE(X), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	addq	$SIZE, Y
	ALIGN_3

.L49:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret
	ALIGN_4

.L100:
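/* Arbitrary strides: each movsd/movhps pair packs two 8-byte complex
   elements into one register, eight elements per loop iteration, with
   4/2/1-element tails below (also reached directly when M <= 3). */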
	movq	M, %rax
	sarq	$3, %rax
	jle	.L105
	ALIGN_3

.L102:
	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X
	movsd	(X), %xmm2
	addq	INCX, X
	movhps	(X), %xmm2
	addq	INCX, X
	movsd	(X), %xmm3
	addq	INCX, X
	movhps	(X), %xmm3
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	movsd	%xmm2, (Y)
	addq	INCY, Y
	movhps	%xmm2, (Y)
	addq	INCY, Y
	movsd	%xmm3, (Y)
	addq	INCY, Y
	movhps	%xmm3, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L102
	ALIGN_3

.L105:
	testq	$4, M
	jle	.L106

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X
	movsd	(X), %xmm1
	addq	INCX, X
	movhps	(X), %xmm1
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	movsd	%xmm1, (Y)
	addq	INCY, Y
	movhps	%xmm1, (Y)
	addq	INCY, Y
	ALIGN_3

.L106:
	testq	$2, M
	jle	.L107

	movsd	(X), %xmm0
	addq	INCX, X
	movhps	(X), %xmm0
	addq	INCX, X

	movsd	%xmm0, (Y)
	addq	INCY, Y
	movhps	%xmm0, (Y)
	addq	INCY, Y
	ALIGN_3

.L107:
	testq	$1, M
	jle	.L999

	movsd	(X), %xmm0
	movsd	%xmm0, (Y)
	ALIGN_3

.L999:
	xorq	%rax, %rax
	RESTOREREGISTERS
	ret

	EPILOGUE