You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

zcopy_sse.S 18 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 12
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  47. #define M %ebx
  48. #define X %esi
  49. #define INCX %ecx
  50. #define Y %edi
  51. #define INCY %edx
  52. #include "l1param.h"
  53. #ifdef OPTERON
  54. #define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG
  55. #else
  56. #define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG
  57. #endif
  58. PROLOGUE
  59. PROFCODE
  60. pushl %edi
  61. pushl %esi
  62. pushl %ebx
  63. movl STACK_M, M
  64. movl STACK_X, X
  65. movl STACK_INCX, INCX
  66. movl STACK_Y, Y
  67. movl STACK_INCY, INCY
  68. sall $ZBASE_SHIFT, INCX
  69. sall $ZBASE_SHIFT, INCY
  70. cmpl $2 * SIZE, INCX
  71. jne .L100
  72. cmpl $2 * SIZE, INCY
  73. jne .L100
  74. cmpl $3, M
  75. jle .L106
  76. subl $-32 * SIZE, X
  77. subl $-32 * SIZE, Y
  78. addl M, M
  79. testl $SIZE, Y
  80. je .L05
  81. movss -32 * SIZE(X), %xmm0
  82. movss %xmm0, -32 * SIZE(Y)
  83. addl $1 * SIZE, X
  84. addl $1 * SIZE, Y
  85. decl M
  86. ALIGN_4
  87. .L05:
  88. testl $2 * SIZE, Y
  89. je .L10
  90. movsd -32 * SIZE(X), %xmm0
  91. movlps %xmm0, -32 * SIZE(Y)
  92. addl $2 * SIZE, X
  93. addl $2 * SIZE, Y
  94. subl $2, M
  95. jle .L19
  96. ALIGN_4
  97. .L10:
  98. testl $3 * SIZE, X
  99. jne .L20
  100. movl M, %eax
  101. sarl $5, %eax
  102. jle .L13
  103. movaps -32 * SIZE(X), %xmm0
  104. movaps -28 * SIZE(X), %xmm1
  105. movaps -24 * SIZE(X), %xmm2
  106. movaps -20 * SIZE(X), %xmm3
  107. movaps -16 * SIZE(X), %xmm4
  108. movaps -12 * SIZE(X), %xmm5
  109. movaps -8 * SIZE(X), %xmm6
  110. movaps -4 * SIZE(X), %xmm7
  111. decl %eax
  112. jle .L12
  113. ALIGN_3
  114. .L11:
  115. #ifdef PREFETCHW
  116. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  117. #endif
  118. movaps %xmm0, -32 * SIZE(Y)
  119. LOAD( 0 * SIZE, X, %xmm0)
  120. movaps %xmm1, -28 * SIZE(Y)
  121. LOAD( 4 * SIZE, X, %xmm1)
  122. #ifdef PREFETCH
  123. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  124. #endif
  125. movaps %xmm2, -24 * SIZE(Y)
  126. LOAD( 8 * SIZE, X, %xmm2)
  127. movaps %xmm3, -20 * SIZE(Y)
  128. LOAD(12 * SIZE, X, %xmm3)
  129. #if defined(PREFETCHW) && !defined(FETCH128)
  130. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  131. #endif
  132. movaps %xmm4,-16 * SIZE(Y)
  133. LOAD(16 * SIZE, X, %xmm4)
  134. movaps %xmm5,-12 * SIZE(Y)
  135. LOAD(20 * SIZE, X, %xmm5)
  136. #if defined(PREFETCH) && !defined(FETCH128)
  137. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  138. #endif
  139. movaps %xmm6, -8 * SIZE(Y)
  140. LOAD(24 * SIZE, X, %xmm6)
  141. movaps %xmm7, -4 * SIZE(Y)
  142. LOAD(28 * SIZE, X, %xmm7)
  143. subl $-32 * SIZE, Y
  144. subl $-32 * SIZE, X
  145. decl %eax
  146. jg .L11
  147. ALIGN_3
  148. .L12:
  149. movaps %xmm0, -32 * SIZE(Y)
  150. movaps %xmm1, -28 * SIZE(Y)
  151. movaps %xmm2, -24 * SIZE(Y)
  152. movaps %xmm3, -20 * SIZE(Y)
  153. movaps %xmm4, -16 * SIZE(Y)
  154. movaps %xmm5, -12 * SIZE(Y)
  155. movaps %xmm6, -8 * SIZE(Y)
  156. movaps %xmm7, -4 * SIZE(Y)
  157. subl $-32 * SIZE, Y
  158. subl $-32 * SIZE, X
  159. ALIGN_3
  160. .L13:
  161. testl $16, M
  162. jle .L14
  163. movaps -32 * SIZE(X), %xmm0
  164. movaps -28 * SIZE(X), %xmm1
  165. movaps -24 * SIZE(X), %xmm2
  166. movaps -20 * SIZE(X), %xmm3
  167. movaps %xmm0, -32 * SIZE(Y)
  168. movaps %xmm1, -28 * SIZE(Y)
  169. movaps %xmm2, -24 * SIZE(Y)
  170. movaps %xmm3, -20 * SIZE(Y)
  171. addl $16 * SIZE, X
  172. addl $16 * SIZE, Y
  173. ALIGN_3
  174. .L14:
  175. testl $8, M
  176. jle .L15
  177. movaps -32 * SIZE(X), %xmm0
  178. movaps -28 * SIZE(X), %xmm1
  179. movaps %xmm0, -32 * SIZE(Y)
  180. movaps %xmm1, -28 * SIZE(Y)
  181. addl $8 * SIZE, X
  182. addl $8 * SIZE, Y
  183. ALIGN_3
  184. .L15:
  185. testl $4, M
  186. jle .L16
  187. movaps -32 * SIZE(X), %xmm0
  188. movaps %xmm0, -32 * SIZE(Y)
  189. addl $4 * SIZE, X
  190. addl $4 * SIZE, Y
  191. ALIGN_3
  192. .L16:
  193. testl $2, M
  194. jle .L17
  195. movsd -32 * SIZE(X), %xmm0
  196. movlps %xmm0, -32 * SIZE(Y)
  197. addl $2 * SIZE, X
  198. addl $2 * SIZE, Y
  199. ALIGN_3
  200. .L17:
  201. testl $1, M
  202. jle .L19
  203. movss -32 * SIZE(X), %xmm0
  204. movss %xmm0, -32 * SIZE(Y)
  205. ALIGN_3
  206. .L19:
  207. popl %ebx
  208. popl %esi
  209. popl %edi
  210. ret
  211. ALIGN_3
  212. .L20:
  213. testl $SIZE, X
  214. jne .L30
  215. movhps -32 * SIZE(X), %xmm0
  216. movl M, %eax
  217. sarl $5, %eax
  218. jle .L23
  219. movaps -30 * SIZE(X), %xmm1
  220. movaps -26 * SIZE(X), %xmm2
  221. movaps -22 * SIZE(X), %xmm3
  222. movaps -18 * SIZE(X), %xmm4
  223. movaps -14 * SIZE(X), %xmm5
  224. movaps -10 * SIZE(X), %xmm6
  225. movaps -6 * SIZE(X), %xmm7
  226. decl %eax
  227. jle .L22
  228. ALIGN_4
  229. .L21:
  230. #ifdef PREFETCHW
  231. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  232. #endif
  233. shufps $0x4e, %xmm1, %xmm0
  234. movaps %xmm0, -32 * SIZE(Y)
  235. movaps -2 * SIZE(X), %xmm0
  236. shufps $0x4e, %xmm2, %xmm1
  237. movaps %xmm1, -28 * SIZE(Y)
  238. movaps 2 * SIZE(X), %xmm1
  239. #ifdef PREFETCH
  240. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  241. #endif
  242. shufps $0x4e, %xmm3, %xmm2
  243. movaps %xmm2, -24 * SIZE(Y)
  244. movaps 6 * SIZE(X), %xmm2
  245. shufps $0x4e, %xmm4, %xmm3
  246. movaps %xmm3, -20 * SIZE(Y)
  247. movaps 10 * SIZE(X), %xmm3
  248. #if defined(PREFETCHW) && !defined(FETCH128)
  249. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  250. #endif
  251. shufps $0x4e, %xmm5, %xmm4
  252. movaps %xmm4, -16 * SIZE(Y)
  253. movaps 14 * SIZE(X), %xmm4
  254. shufps $0x4e, %xmm6, %xmm5
  255. movaps %xmm5, -12 * SIZE(Y)
  256. movaps 18 * SIZE(X), %xmm5
  257. #if defined(PREFETCH) && !defined(FETCH128)
  258. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  259. #endif
  260. shufps $0x4e, %xmm7, %xmm6
  261. movaps %xmm6, -8 * SIZE(Y)
  262. movaps 22 * SIZE(X), %xmm6
  263. shufps $0x4e, %xmm0, %xmm7
  264. movaps %xmm7, -4 * SIZE(Y)
  265. movaps 26 * SIZE(X), %xmm7
  266. subl $-32 * SIZE, X
  267. subl $-32 * SIZE, Y
  268. decl %eax
  269. jg .L21
  270. ALIGN_3
  271. .L22:
  272. shufps $0x4e, %xmm1, %xmm0
  273. movaps %xmm0, -32 * SIZE(Y)
  274. movaps -2 * SIZE(X), %xmm0
  275. shufps $0x4e, %xmm2, %xmm1
  276. movaps %xmm1, -28 * SIZE(Y)
  277. shufps $0x4e, %xmm3, %xmm2
  278. movaps %xmm2, -24 * SIZE(Y)
  279. shufps $0x4e, %xmm4, %xmm3
  280. movaps %xmm3, -20 * SIZE(Y)
  281. shufps $0x4e, %xmm5, %xmm4
  282. movaps %xmm4, -16 * SIZE(Y)
  283. shufps $0x4e, %xmm6, %xmm5
  284. movaps %xmm5, -12 * SIZE(Y)
  285. shufps $0x4e, %xmm7, %xmm6
  286. movaps %xmm6, -8 * SIZE(Y)
  287. shufps $0x4e, %xmm0, %xmm7
  288. movaps %xmm7, -4 * SIZE(Y)
  289. subl $-32 * SIZE, X
  290. subl $-32 * SIZE, Y
  291. ALIGN_3
  292. .L23:
  293. testl $16, M
  294. jle .L24
  295. ALIGN_3
  296. movaps -30 * SIZE(X), %xmm1
  297. movaps -26 * SIZE(X), %xmm2
  298. movaps -22 * SIZE(X), %xmm3
  299. movaps -18 * SIZE(X), %xmm4
  300. shufps $0x4e, %xmm1, %xmm0
  301. movaps %xmm0, -32 * SIZE(Y)
  302. shufps $0x4e, %xmm2, %xmm1
  303. movaps %xmm1, -28 * SIZE(Y)
  304. shufps $0x4e, %xmm3, %xmm2
  305. movaps %xmm2, -24 * SIZE(Y)
  306. shufps $0x4e, %xmm4, %xmm3
  307. movaps %xmm3, -20 * SIZE(Y)
  308. movaps %xmm4, %xmm0
  309. addl $16 * SIZE, X
  310. addl $16 * SIZE, Y
  311. ALIGN_3
  312. .L24:
  313. testl $8, M
  314. jle .L25
  315. ALIGN_3
  316. movaps -30 * SIZE(X), %xmm1
  317. movaps -26 * SIZE(X), %xmm2
  318. shufps $0x4e, %xmm1, %xmm0
  319. shufps $0x4e, %xmm2, %xmm1
  320. movaps %xmm0, -32 * SIZE(Y)
  321. movaps %xmm1, -28 * SIZE(Y)
  322. movaps %xmm2, %xmm0
  323. addl $8 * SIZE, X
  324. addl $8 * SIZE, Y
  325. ALIGN_3
  326. .L25:
  327. testl $4, M
  328. jle .L26
  329. ALIGN_3
  330. movaps -30 * SIZE(X), %xmm1
  331. shufps $0x4e, %xmm1, %xmm0
  332. movaps %xmm0, -32 * SIZE(Y)
  333. addl $4 * SIZE, X
  334. addl $4 * SIZE, Y
  335. ALIGN_3
  336. .L26:
  337. testl $2, M
  338. jle .L27
  339. ALIGN_3
  340. movsd -32 * SIZE(X), %xmm0
  341. movsd %xmm0, -32 * SIZE(Y)
  342. addl $2 * SIZE, X
  343. addl $2 * SIZE, Y
  344. ALIGN_3
  345. .L27:
  346. testl $1, M
  347. jle .L29
  348. ALIGN_3
  349. movss -32 * SIZE(X), %xmm0
  350. movss %xmm0, -32 * SIZE(Y)
  351. addl $SIZE, Y
  352. ALIGN_3
  353. .L29:
  354. popl %ebx
  355. popl %esi
  356. popl %edi
  357. ret
  358. ALIGN_3
  359. .L30:
  360. testl $2 * SIZE, X
  361. jne .L40
  362. movaps -33 * SIZE(X), %xmm0
  363. movl M, %eax
  364. sarl $5, %eax
  365. jle .L33
  366. movaps -29 * SIZE(X), %xmm1
  367. movaps -25 * SIZE(X), %xmm2
  368. movaps -21 * SIZE(X), %xmm3
  369. movaps -17 * SIZE(X), %xmm4
  370. movaps -13 * SIZE(X), %xmm5
  371. movaps -9 * SIZE(X), %xmm6
  372. movaps -5 * SIZE(X), %xmm7
  373. decl %eax
  374. jle .L32
  375. ALIGN_4
  376. .L31:
  377. #ifdef PREFETCHW
  378. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  379. #endif
  380. movss %xmm1, %xmm0
  381. shufps $0x39, %xmm0, %xmm0
  382. movaps %xmm0, -32 * SIZE(Y)
  383. movaps -1 * SIZE(X), %xmm0
  384. movss %xmm2, %xmm1
  385. shufps $0x39, %xmm1, %xmm1
  386. movaps %xmm1, -28 * SIZE(Y)
  387. movaps 3 * SIZE(X), %xmm1
  388. #ifdef PREFETCH
  389. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  390. #endif
  391. movss %xmm3, %xmm2
  392. shufps $0x39, %xmm2, %xmm2
  393. movaps %xmm2, -24 * SIZE(Y)
  394. movaps 7 * SIZE(X), %xmm2
  395. movss %xmm4, %xmm3
  396. shufps $0x39, %xmm3, %xmm3
  397. movaps %xmm3, -20 * SIZE(Y)
  398. movaps 11 * SIZE(X), %xmm3
  399. #if defined(PREFETCHW) && !defined(FETCH128)
  400. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  401. #endif
  402. movss %xmm5, %xmm4
  403. shufps $0x39, %xmm4, %xmm4
  404. movaps %xmm4, -16 * SIZE(Y)
  405. movaps 15 * SIZE(X), %xmm4
  406. movss %xmm6, %xmm5
  407. shufps $0x39, %xmm5, %xmm5
  408. movaps %xmm5, -12 * SIZE(Y)
  409. movaps 19 * SIZE(X), %xmm5
  410. #if defined(PREFETCH) && !defined(FETCH128)
  411. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  412. #endif
  413. movss %xmm7, %xmm6
  414. shufps $0x39, %xmm6, %xmm6
  415. movaps %xmm6, -8 * SIZE(Y)
  416. movaps 23 * SIZE(X), %xmm6
  417. movss %xmm0, %xmm7
  418. shufps $0x39, %xmm7, %xmm7
  419. movaps %xmm7, -4 * SIZE(Y)
  420. movaps 27 * SIZE(X), %xmm7
  421. subl $-32 * SIZE, X
  422. subl $-32 * SIZE, Y
  423. decl %eax
  424. jg .L31
  425. ALIGN_3
  426. .L32:
  427. movss %xmm1, %xmm0
  428. shufps $0x39, %xmm0, %xmm0
  429. movaps %xmm0, -32 * SIZE(Y)
  430. movaps -1 * SIZE(X), %xmm0
  431. movss %xmm2, %xmm1
  432. shufps $0x39, %xmm1, %xmm1
  433. movaps %xmm1, -28 * SIZE(Y)
  434. movss %xmm3, %xmm2
  435. shufps $0x39, %xmm2, %xmm2
  436. movaps %xmm2, -24 * SIZE(Y)
  437. movss %xmm4, %xmm3
  438. shufps $0x39, %xmm3, %xmm3
  439. movaps %xmm3, -20 * SIZE(Y)
  440. movss %xmm5, %xmm4
  441. shufps $0x39, %xmm4, %xmm4
  442. movaps %xmm4, -16 * SIZE(Y)
  443. movss %xmm6, %xmm5
  444. shufps $0x39, %xmm5, %xmm5
  445. movaps %xmm5, -12 * SIZE(Y)
  446. movss %xmm7, %xmm6
  447. shufps $0x39, %xmm6, %xmm6
  448. movaps %xmm6, -8 * SIZE(Y)
  449. movss %xmm0, %xmm7
  450. shufps $0x39, %xmm7, %xmm7
  451. movaps %xmm7, -4 * SIZE(Y)
  452. subl $-32 * SIZE, X
  453. subl $-32 * SIZE, Y
  454. ALIGN_3
  455. .L33:
  456. testl $16, M
  457. jle .L34
  458. ALIGN_3
  459. movaps -29 * SIZE(X), %xmm1
  460. movaps -25 * SIZE(X), %xmm2
  461. movaps -21 * SIZE(X), %xmm3
  462. movaps -17 * SIZE(X), %xmm4
  463. movss %xmm1, %xmm0
  464. shufps $0x39, %xmm0, %xmm0
  465. movaps %xmm0, -32 * SIZE(Y)
  466. movss %xmm2, %xmm1
  467. shufps $0x39, %xmm1, %xmm1
  468. movaps %xmm1, -28 * SIZE(Y)
  469. movss %xmm3, %xmm2
  470. shufps $0x39, %xmm2, %xmm2
  471. movaps %xmm2, -24 * SIZE(Y)
  472. movss %xmm4, %xmm3
  473. shufps $0x39, %xmm3, %xmm3
  474. movaps %xmm3, -20 * SIZE(Y)
  475. movaps %xmm4, %xmm0
  476. addl $16 * SIZE, X
  477. addl $16 * SIZE, Y
  478. ALIGN_3
  479. .L34:
  480. testl $8, M
  481. jle .L35
  482. ALIGN_3
  483. movaps -29 * SIZE(X), %xmm1
  484. movaps -25 * SIZE(X), %xmm2
  485. movss %xmm1, %xmm0
  486. shufps $0x39, %xmm0, %xmm0
  487. movaps %xmm0, -32 * SIZE(Y)
  488. movss %xmm2, %xmm1
  489. shufps $0x39, %xmm1, %xmm1
  490. movaps %xmm1, -28 * SIZE(Y)
  491. movaps %xmm2, %xmm0
  492. addl $8 * SIZE, X
  493. addl $8 * SIZE, Y
  494. ALIGN_3
  495. .L35:
  496. testl $4, M
  497. jle .L36
  498. ALIGN_3
  499. movaps -29 * SIZE(X), %xmm1
  500. movss %xmm1, %xmm0
  501. shufps $0x39, %xmm0, %xmm0
  502. movaps %xmm0, -32 * SIZE(Y)
  503. addl $4 * SIZE, X
  504. addl $4 * SIZE, Y
  505. ALIGN_3
  506. .L36:
  507. testl $2, M
  508. jle .L37
  509. ALIGN_3
  510. movsd -32 * SIZE(X), %xmm0
  511. movsd %xmm0, -32 * SIZE(Y)
  512. addl $2 * SIZE, X
  513. addl $2 * SIZE, Y
  514. ALIGN_3
  515. .L37:
  516. testl $1, M
  517. jle .L39
  518. ALIGN_3
  519. movss -32 * SIZE(X), %xmm0
  520. movss %xmm0, -32 * SIZE(Y)
  521. addl $SIZE, Y
  522. ALIGN_3
  523. .L39:
  524. popl %ebx
  525. popl %esi
  526. popl %edi
  527. ret
  528. ALIGN_3
  529. .L40:
  530. movaps -35 * SIZE(X), %xmm0
  531. movl M, %eax
  532. sarl $5, %eax
  533. jle .L43
  534. movaps -31 * SIZE(X), %xmm1
  535. movaps -27 * SIZE(X), %xmm2
  536. movaps -23 * SIZE(X), %xmm3
  537. movaps -19 * SIZE(X), %xmm4
  538. movaps -15 * SIZE(X), %xmm5
  539. movaps -11 * SIZE(X), %xmm6
  540. movaps -7 * SIZE(X), %xmm7
  541. decl %eax
  542. jle .L42
  543. ALIGN_4
  544. .L41:
  545. #ifdef PREFETCHW
  546. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  547. #endif
  548. movss %xmm1, %xmm0
  549. shufps $0x93, %xmm1, %xmm0
  550. movaps %xmm0, -32 * SIZE(Y)
  551. movaps -3 * SIZE(X), %xmm0
  552. movss %xmm2, %xmm1
  553. shufps $0x93, %xmm2, %xmm1
  554. movaps %xmm1, -28 * SIZE(Y)
  555. movaps 1 * SIZE(X), %xmm1
  556. #ifdef PREFETCH
  557. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  558. #endif
  559. movss %xmm3, %xmm2
  560. shufps $0x93, %xmm3, %xmm2
  561. movaps %xmm2, -24 * SIZE(Y)
  562. movaps 5 * SIZE(X), %xmm2
  563. movss %xmm4, %xmm3
  564. shufps $0x93, %xmm4, %xmm3
  565. movaps %xmm3, -20 * SIZE(Y)
  566. movaps 9 * SIZE(X), %xmm3
  567. #if defined(PREFETCHW) && !defined(FETCH128)
  568. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  569. #endif
  570. movss %xmm5, %xmm4
  571. shufps $0x93, %xmm5, %xmm4
  572. movaps %xmm4, -16 * SIZE(Y)
  573. movaps 13 * SIZE(X), %xmm4
  574. movss %xmm6, %xmm5
  575. shufps $0x93, %xmm6, %xmm5
  576. movaps %xmm5, -12 * SIZE(Y)
  577. movaps 17 * SIZE(X), %xmm5
  578. #if defined(PREFETCH) && !defined(FETCH128)
  579. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  580. #endif
  581. movss %xmm7, %xmm6
  582. shufps $0x93, %xmm7, %xmm6
  583. movaps %xmm6, -8 * SIZE(Y)
  584. movaps 21 * SIZE(X), %xmm6
  585. movss %xmm0, %xmm7
  586. shufps $0x93, %xmm0, %xmm7
  587. movaps %xmm7, -4 * SIZE(Y)
  588. movaps 25 * SIZE(X), %xmm7
  589. subl $-32 * SIZE, X
  590. subl $-32 * SIZE, Y
  591. decl %eax
  592. jg .L41
  593. ALIGN_3
  594. .L42:
  595. movss %xmm1, %xmm0
  596. shufps $0x93, %xmm1, %xmm0
  597. movaps %xmm0, -32 * SIZE(Y)
  598. movaps -3 * SIZE(X), %xmm0
  599. movss %xmm2, %xmm1
  600. shufps $0x93, %xmm2, %xmm1
  601. movaps %xmm1, -28 * SIZE(Y)
  602. movss %xmm3, %xmm2
  603. shufps $0x93, %xmm3, %xmm2
  604. movaps %xmm2, -24 * SIZE(Y)
  605. movss %xmm4, %xmm3
  606. shufps $0x93, %xmm4, %xmm3
  607. movaps %xmm3, -20 * SIZE(Y)
  608. movss %xmm5, %xmm4
  609. shufps $0x93, %xmm5, %xmm4
  610. movaps %xmm4, -16 * SIZE(Y)
  611. movss %xmm6, %xmm5
  612. shufps $0x93, %xmm6, %xmm5
  613. movaps %xmm5, -12 * SIZE(Y)
  614. movss %xmm7, %xmm6
  615. shufps $0x93, %xmm7, %xmm6
  616. movaps %xmm6, -8 * SIZE(Y)
  617. movss %xmm0, %xmm7
  618. shufps $0x93, %xmm0, %xmm7
  619. movaps %xmm7, -4 * SIZE(Y)
  620. subl $-32 * SIZE, X
  621. subl $-32 * SIZE, Y
  622. ALIGN_3
  623. .L43:
  624. testl $16, M
  625. jle .L44
  626. ALIGN_3
  627. movaps -31 * SIZE(X), %xmm1
  628. movaps -27 * SIZE(X), %xmm2
  629. movaps -23 * SIZE(X), %xmm3
  630. movaps -19 * SIZE(X), %xmm4
  631. movss %xmm1, %xmm0
  632. shufps $0x93, %xmm1, %xmm0
  633. movaps %xmm0, -32 * SIZE(Y)
  634. movss %xmm2, %xmm1
  635. shufps $0x93, %xmm2, %xmm1
  636. movaps %xmm1, -28 * SIZE(Y)
  637. movss %xmm3, %xmm2
  638. shufps $0x93, %xmm3, %xmm2
  639. movaps %xmm2, -24 * SIZE(Y)
  640. movss %xmm4, %xmm3
  641. shufps $0x93, %xmm4, %xmm3
  642. movaps %xmm3, -20 * SIZE(Y)
  643. movaps %xmm4, %xmm0
  644. addl $16 * SIZE, X
  645. addl $16 * SIZE, Y
  646. ALIGN_3
  647. .L44:
  648. testl $8, M
  649. jle .L45
  650. ALIGN_3
  651. movaps -31 * SIZE(X), %xmm1
  652. movaps -27 * SIZE(X), %xmm2
  653. movss %xmm1, %xmm0
  654. shufps $0x93, %xmm1, %xmm0
  655. movaps %xmm0, -32 * SIZE(Y)
  656. movss %xmm2, %xmm1
  657. shufps $0x93, %xmm2, %xmm1
  658. movaps %xmm1, -28 * SIZE(Y)
  659. movaps %xmm2, %xmm0
  660. addl $8 * SIZE, X
  661. addl $8 * SIZE, Y
  662. ALIGN_3
  663. .L45:
  664. testl $4, M
  665. jle .L46
  666. ALIGN_3
  667. movaps -31 * SIZE(X), %xmm1
  668. movss %xmm1, %xmm0
  669. shufps $0x93, %xmm1, %xmm0
  670. movaps %xmm0, -32 * SIZE(Y)
  671. addl $4 * SIZE, X
  672. addl $4 * SIZE, Y
  673. ALIGN_3
  674. .L46:
  675. testl $2, M
  676. jle .L47
  677. ALIGN_3
  678. movsd -32 * SIZE(X), %xmm0
  679. movsd %xmm0, -32 * SIZE(Y)
  680. addl $2 * SIZE, X
  681. addl $2 * SIZE, Y
  682. ALIGN_3
  683. .L47:
  684. testl $1, M
  685. jle .L49
  686. ALIGN_3
  687. movss -32 * SIZE(X), %xmm0
  688. movss %xmm0, -32 * SIZE(Y)
  689. addl $SIZE, Y
  690. ALIGN_3
  691. .L49:
  692. popl %ebx
  693. popl %esi
  694. popl %edi
  695. ret
  696. ALIGN_4
  697. .L100:
  698. movl M, %eax
  699. sarl $3, %eax
  700. jle .L105
  701. ALIGN_3
  702. .L102:
  703. movsd (X), %xmm0
  704. addl INCX, X
  705. movhps (X), %xmm0
  706. addl INCX, X
  707. movsd (X), %xmm1
  708. addl INCX, X
  709. movhps (X), %xmm1
  710. addl INCX, X
  711. movsd (X), %xmm2
  712. addl INCX, X
  713. movhps (X), %xmm2
  714. addl INCX, X
  715. movsd (X), %xmm3
  716. addl INCX, X
  717. movhps (X), %xmm3
  718. addl INCX, X
  719. movsd %xmm0, (Y)
  720. addl INCY, Y
  721. movhps %xmm0, (Y)
  722. addl INCY, Y
  723. movsd %xmm1, (Y)
  724. addl INCY, Y
  725. movhps %xmm1, (Y)
  726. addl INCY, Y
  727. movsd %xmm2, (Y)
  728. addl INCY, Y
  729. movhps %xmm2, (Y)
  730. addl INCY, Y
  731. movsd %xmm3, (Y)
  732. addl INCY, Y
  733. movhps %xmm3, (Y)
  734. addl INCY, Y
  735. decl %eax
  736. jg .L102
  737. ALIGN_3
  738. .L105:
  739. testl $4, M
  740. jle .L106
  741. movsd (X), %xmm0
  742. addl INCX, X
  743. movhps (X), %xmm0
  744. addl INCX, X
  745. movsd (X), %xmm1
  746. addl INCX, X
  747. movhps (X), %xmm1
  748. addl INCX, X
  749. movsd %xmm0, (Y)
  750. addl INCY, Y
  751. movhps %xmm0, (Y)
  752. addl INCY, Y
  753. movsd %xmm1, (Y)
  754. addl INCY, Y
  755. movhps %xmm1, (Y)
  756. addl INCY, Y
  757. ALIGN_3
  758. .L106:
  759. testl $2, M
  760. jle .L107
  761. movsd (X), %xmm0
  762. addl INCX, X
  763. movhps (X), %xmm0
  764. addl INCX, X
  765. movsd %xmm0, (Y)
  766. addl INCY, Y
  767. movhps %xmm0, (Y)
  768. addl INCY, Y
  769. ALIGN_3
  770. .L107:
  771. testl $1, M
  772. jle .L999
  773. movsd (X), %xmm0
  774. movsd %xmm0, (Y)
  775. ALIGN_3
  776. .L999:
  777. popl %ebx
  778. popl %esi
  779. popl %edi
  780. ret
  781. EPILOGUE