You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

rot_sse2.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Configuration and operand naming for this rotation kernel. */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK = 12 matches the three 4-byte pushl instructions executed at */
  /* entry (edi, esi, ebx); it is added to every argument offset below. */
  40. #define STACK 12
  41. #define ARGS 0
  /* Stack-resident arguments, addressed from %esp after the pushes. */
  /* The leading 4 presumably skips the return address (TODO confirm). */
  /* C and S are 8 bytes apart and are read with movsd (64-bit values). */
  42. #define STACK_N 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  47. #define STACK_C 24 + STACK + ARGS(%esp)
  48. #define STACK_S 32 + STACK + ARGS(%esp)
  /* General-purpose register roles: */
  /* N = remaining element count, X / Y = current element pointers, */
  /* INCX / INCY = strides (scaled by SIZE in the entry code), */
  /* I = loop iteration counter. */
  49. #define N %ebx
  50. #define X %esi
  51. #define INCX %ecx
  52. #define Y %edi
  53. #define INCY %edx
  54. #define I %eax
  55. #include "l1param.h"
  /* The two rotation coefficients stay in xmm6 / xmm7 for the whole routine. */
  56. #define C %xmm6
  57. #define S %xmm7
  /* Entry: rotate N element pairs in place, */
  /* x' = C * x + S * y and y' = C * y - S * x. */
  /* PROLOGUE and PROFCODE are macros defined outside this file */
  /* (presumably symbol setup and an optional profiling hook). */
  58. PROLOGUE
  59. PROFCODE
  /* Save the registers that will hold Y, X and N. */
  60. pushl %edi
  61. pushl %esi
  62. pushl %ebx
  /* Load the integer and pointer arguments from the stack. */
  63. movl STACK_N, N
  64. movl STACK_X, X
  65. movl STACK_INCX, INCX
  66. movl STACK_Y, Y
  67. movl STACK_INCY, INCY
  /* Scale both strides by SIZE so they can be added directly to pointers. */
  /* SIZE comes from an included header (presumably 8 here; TODO confirm). */
  68. leal (, INCX, SIZE), INCX
  69. leal (, INCY, SIZE), INCY
  /* Load C and S, then copy the low 64 bits into both halves of each */
  /* register (pshufd $0x44 selects dwords 0,1,0,1) for packed multiplies. */
  70. movsd STACK_C, C
  71. movsd STACK_S, S
  72. pshufd $0x44, C, C
  73. pshufd $0x44, S, S
  /* Nothing to do when N <= 0. */
  74. cmpl $0, N
  75. jle .L999
  /* Any stride other than one element takes the generic path at .L50. */
  76. cmpl $SIZE, INCX
  77. jne .L50
  78. cmpl $SIZE, INCY
  79. jne .L50
  /* Unit stride. If the SIZE bit of X's address is set, rotate one element */
  /* with scalar instructions so that X advances by SIZE and that bit */
  /* becomes clear; the packed code that follows uses movapd on X. */
  /* NOTE(review): this assumes X is at least SIZE-aligned; confirm. */
  80. testl $SIZE, X
  81. je .L10
  82. movsd 0 * SIZE(Y), %xmm1
  83. movsd 0 * SIZE(X), %xmm0
  84. movapd %xmm1, %xmm2
  85. movapd %xmm0, %xmm3
  86. mulsd C, %xmm0
  87. mulsd S, %xmm1
  88. mulsd C, %xmm2
  89. mulsd S, %xmm3
  /* xmm0 = C * x + S * y, xmm2 = C * y - S * x */
  90. addsd %xmm1, %xmm0
  91. subsd %xmm3, %xmm2
  92. movsd %xmm0, 0 * SIZE(X)
  93. movsd %xmm2, 0 * SIZE(Y)
  94. addl $1 * SIZE, X
  95. addl $1 * SIZE, Y
  /* Leave if that single element was the whole vector. */
  96. decl N
  97. jle .L999
  98. ALIGN_2
  /* X has the SIZE address bit clear here. If Y has it set, Y cannot use */
  /* the same aligned 16-byte accesses, so that case is handled at .L20. */
  99. .L10:
  100. testl $SIZE, Y
  101. jne .L20
  /* I = N / 16: each pass of .L11 rotates 16 elements as eight */
  /* two-element vectors, using movapd for every load and store. */
  102. movl N, I
  103. sarl $4, I
  104. jle .L14
  105. ALIGN_3
  106. .L11:
  /* PREFETCHW, PREFETCHSIZE and PREOFFSET are defined outside this file. */
  107. #ifdef PREFETCHW
  108. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  109. #endif
  /* Elements 0-1: xmm0 = C * x + S * y -> X, xmm2 = C * y - S * x -> Y. */
  /* Every following group repeats this pattern at the next offset. */
  110. movapd 0 * SIZE(Y), %xmm1
  111. movapd 0 * SIZE(X), %xmm0
  112. movapd %xmm1, %xmm2
  113. movapd %xmm0, %xmm3
  114. mulpd C, %xmm0
  115. mulpd S, %xmm1
  116. mulpd C, %xmm2
  117. mulpd S, %xmm3
  118. addpd %xmm1, %xmm0
  119. subpd %xmm3, %xmm2
  120. movapd %xmm0, 0 * SIZE(X)
  121. movapd %xmm2, 0 * SIZE(Y)
  /* Elements 2-3. */
  122. movapd 2 * SIZE(Y), %xmm1
  123. movapd 2 * SIZE(X), %xmm0
  124. movapd %xmm1, %xmm2
  125. movapd %xmm0, %xmm3
  126. mulpd C, %xmm0
  127. mulpd S, %xmm1
  128. mulpd C, %xmm2
  129. mulpd S, %xmm3
  130. addpd %xmm1, %xmm0
  131. subpd %xmm3, %xmm2
  132. movapd %xmm0, 2 * SIZE(X)
  133. movapd %xmm2, 2 * SIZE(Y)
  134. #ifdef PREFETCHW
  135. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  136. #endif
  /* Elements 4-5. */
  137. movapd 4 * SIZE(Y), %xmm1
  138. movapd 4 * SIZE(X), %xmm0
  139. movapd %xmm1, %xmm2
  140. movapd %xmm0, %xmm3
  141. mulpd C, %xmm0
  142. mulpd S, %xmm1
  143. mulpd C, %xmm2
  144. mulpd S, %xmm3
  145. addpd %xmm1, %xmm0
  146. subpd %xmm3, %xmm2
  147. movapd %xmm0, 4 * SIZE(X)
  148. movapd %xmm2, 4 * SIZE(Y)
  /* Elements 6-7. */
  149. movapd 6 * SIZE(Y), %xmm1
  150. movapd 6 * SIZE(X), %xmm0
  151. movapd %xmm1, %xmm2
  152. movapd %xmm0, %xmm3
  153. mulpd C, %xmm0
  154. mulpd S, %xmm1
  155. mulpd C, %xmm2
  156. mulpd S, %xmm3
  157. addpd %xmm1, %xmm0
  158. subpd %xmm3, %xmm2
  159. movapd %xmm0, 6 * SIZE(X)
  160. movapd %xmm2, 6 * SIZE(Y)
  161. #ifdef PREFETCHW
  162. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  163. #endif
  /* Elements 8-9. */
  164. movapd 8 * SIZE(Y), %xmm1
  165. movapd 8 * SIZE(X), %xmm0
  166. movapd %xmm1, %xmm2
  167. movapd %xmm0, %xmm3
  168. mulpd C, %xmm0
  169. mulpd S, %xmm1
  170. mulpd C, %xmm2
  171. mulpd S, %xmm3
  172. addpd %xmm1, %xmm0
  173. subpd %xmm3, %xmm2
  174. movapd %xmm0, 8 * SIZE(X)
  175. movapd %xmm2, 8 * SIZE(Y)
  /* Elements 10-11. */
  176. movapd 10 * SIZE(Y), %xmm1
  177. movapd 10 * SIZE(X), %xmm0
  178. movapd %xmm1, %xmm2
  179. movapd %xmm0, %xmm3
  180. mulpd C, %xmm0
  181. mulpd S, %xmm1
  182. mulpd C, %xmm2
  183. mulpd S, %xmm3
  184. addpd %xmm1, %xmm0
  185. subpd %xmm3, %xmm2
  186. movapd %xmm0, 10 * SIZE(X)
  187. movapd %xmm2, 10 * SIZE(Y)
  188. #ifdef PREFETCHW
  189. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  190. #endif
  /* Elements 12-13. */
  191. movapd 12 * SIZE(Y), %xmm1
  192. movapd 12 * SIZE(X), %xmm0
  193. movapd %xmm1, %xmm2
  194. movapd %xmm0, %xmm3
  195. mulpd C, %xmm0
  196. mulpd S, %xmm1
  197. mulpd C, %xmm2
  198. mulpd S, %xmm3
  199. addpd %xmm1, %xmm0
  200. subpd %xmm3, %xmm2
  201. movapd %xmm0, 12 * SIZE(X)
  202. movapd %xmm2, 12 * SIZE(Y)
  /* Elements 14-15. */
  203. movapd 14 * SIZE(Y), %xmm1
  204. movapd 14 * SIZE(X), %xmm0
  205. movapd %xmm1, %xmm2
  206. movapd %xmm0, %xmm3
  207. mulpd C, %xmm0
  208. mulpd S, %xmm1
  209. mulpd C, %xmm2
  210. mulpd S, %xmm3
  211. addpd %xmm1, %xmm0
  212. subpd %xmm3, %xmm2
  213. movapd %xmm0, 14 * SIZE(X)
  214. movapd %xmm2, 14 * SIZE(Y)
  /* Advance both pointers past the 16 elements and loop while I > 0. */
  215. addl $16 * SIZE, X
  216. addl $16 * SIZE, Y
  217. decl I
  218. jg .L11
  219. ALIGN_3
  /* Remainder of the aligned path: the low four bits of N select blocks */
  /* of 8, 4, 2 and 1 elements. testl clears OF, and with these small */
  /* masks the result's sign bit is clear, so each jle below is taken */
  /* exactly when the tested bits are all zero (same effect as je). */
  220. .L14:
  221. testl $15, N
  222. jle .L999
  /* Bit 3 set: rotate 8 elements as four aligned two-element vectors. */
  223. testl $8, N
  224. jle .L15
  /* Elements 0-1: xmm0 = C * x + S * y -> X, xmm2 = C * y - S * x -> Y. */
  225. movapd 0 * SIZE(Y), %xmm1
  226. movapd 0 * SIZE(X), %xmm0
  227. movapd %xmm1, %xmm2
  228. movapd %xmm0, %xmm3
  229. mulpd C, %xmm0
  230. mulpd S, %xmm1
  231. mulpd C, %xmm2
  232. mulpd S, %xmm3
  233. addpd %xmm1, %xmm0
  234. subpd %xmm3, %xmm2
  235. movapd %xmm0, 0 * SIZE(X)
  236. movapd %xmm2, 0 * SIZE(Y)
  /* Elements 2-3. */
  237. movapd 2 * SIZE(Y), %xmm1
  238. movapd 2 * SIZE(X), %xmm0
  239. movapd %xmm1, %xmm2
  240. movapd %xmm0, %xmm3
  241. mulpd C, %xmm0
  242. mulpd S, %xmm1
  243. mulpd C, %xmm2
  244. mulpd S, %xmm3
  245. addpd %xmm1, %xmm0
  246. subpd %xmm3, %xmm2
  247. movapd %xmm0, 2 * SIZE(X)
  248. movapd %xmm2, 2 * SIZE(Y)
  /* Elements 4-5. */
  249. movapd 4 * SIZE(Y), %xmm1
  250. movapd 4 * SIZE(X), %xmm0
  251. movapd %xmm1, %xmm2
  252. movapd %xmm0, %xmm3
  253. mulpd C, %xmm0
  254. mulpd S, %xmm1
  255. mulpd C, %xmm2
  256. mulpd S, %xmm3
  257. addpd %xmm1, %xmm0
  258. subpd %xmm3, %xmm2
  259. movapd %xmm0, 4 * SIZE(X)
  260. movapd %xmm2, 4 * SIZE(Y)
  /* Elements 6-7. */
  261. movapd 6 * SIZE(Y), %xmm1
  262. movapd 6 * SIZE(X), %xmm0
  263. movapd %xmm1, %xmm2
  264. movapd %xmm0, %xmm3
  265. mulpd C, %xmm0
  266. mulpd S, %xmm1
  267. mulpd C, %xmm2
  268. mulpd S, %xmm3
  269. addpd %xmm1, %xmm0
  270. subpd %xmm3, %xmm2
  271. movapd %xmm0, 6 * SIZE(X)
  272. movapd %xmm2, 6 * SIZE(Y)
  273. addl $8 * SIZE, X
  274. addl $8 * SIZE, Y
  275. ALIGN_3
  /* Bit 2 set: rotate 4 elements. */
  276. .L15:
  277. testl $4, N
  278. jle .L16
  /* Elements 0-1. */
  279. movapd 0 * SIZE(Y), %xmm1
  280. movapd 0 * SIZE(X), %xmm0
  281. movapd %xmm1, %xmm2
  282. movapd %xmm0, %xmm3
  283. mulpd C, %xmm0
  284. mulpd S, %xmm1
  285. mulpd C, %xmm2
  286. mulpd S, %xmm3
  287. addpd %xmm1, %xmm0
  288. subpd %xmm3, %xmm2
  289. movapd %xmm0, 0 * SIZE(X)
  290. movapd %xmm2, 0 * SIZE(Y)
  /* Elements 2-3. */
  291. movapd 2 * SIZE(Y), %xmm1
  292. movapd 2 * SIZE(X), %xmm0
  293. movapd %xmm1, %xmm2
  294. movapd %xmm0, %xmm3
  295. mulpd C, %xmm0
  296. mulpd S, %xmm1
  297. mulpd C, %xmm2
  298. mulpd S, %xmm3
  299. addpd %xmm1, %xmm0
  300. subpd %xmm3, %xmm2
  301. movapd %xmm0, 2 * SIZE(X)
  302. movapd %xmm2, 2 * SIZE(Y)
  303. addl $4 * SIZE, X
  304. addl $4 * SIZE, Y
  305. ALIGN_3
  /* Bit 1 set: rotate 2 elements. */
  306. .L16:
  307. testl $2, N
  308. jle .L17
  309. movapd 0 * SIZE(Y), %xmm1
  310. movapd 0 * SIZE(X), %xmm0
  311. movapd %xmm1, %xmm2
  312. movapd %xmm0, %xmm3
  313. mulpd C, %xmm0
  314. mulpd S, %xmm1
  315. mulpd C, %xmm2
  316. mulpd S, %xmm3
  317. addpd %xmm1, %xmm0
  318. subpd %xmm3, %xmm2
  319. movapd %xmm0, 0 * SIZE(X)
  320. movapd %xmm2, 0 * SIZE(Y)
  321. addl $2 * SIZE, X
  322. addl $2 * SIZE, Y
  323. ALIGN_3
  /* Bit 0 set: rotate the final element with scalar instructions. */
  324. .L17:
  325. testl $1, N
  326. jle .L999
  327. movsd 0 * SIZE(Y), %xmm1
  328. movsd 0 * SIZE(X), %xmm0
  329. movapd %xmm1, %xmm2
  330. movapd %xmm0, %xmm3
  331. mulsd C, %xmm0
  332. mulsd S, %xmm1
  333. mulsd C, %xmm2
  334. mulsd S, %xmm3
  335. addsd %xmm1, %xmm0
  336. subsd %xmm3, %xmm2
  337. movsd %xmm0, 0 * SIZE(X)
  338. movsd %xmm2, 0 * SIZE(Y)
  339. jmp .L999
  340. ALIGN_3
  /* Unit-stride path where Y has the SIZE address bit set while X does */
  /* not. Y is read with movapd at odd element offsets (-1, 1, 3, ...), */
  /* and each wanted pair is assembled from two neighbouring loads. */
  /* Results go back to Y in 8-byte halves with movlpd / movhpd. */
  /* SHUFPD_1 is a macro defined outside this file; presumably */
  /* shufpd $1, which yields (high of destination, low of source). */
  /* TODO confirm against its definition. */
  341. .L20:
  /* Prime the carried vector: the load starts one element below Y, so */
  /* the element at 0 * SIZE(Y) sits in the high half of xmm1. */
  342. movapd -1 * SIZE(Y), %xmm1
  /* I = N / 16: each pass of .L21 rotates 16 elements. */
  343. movl N, I
  344. sarl $4, I
  345. jle .L24
  346. ALIGN_3
  347. .L21:
  348. #ifdef PREFETCHW
  349. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  350. #endif
  /* Elements 0-1: xmm4 takes the next load, xmm1 is merged into the y */
  /* pair; xmm0 = C * x + S * y -> X, xmm2 = C * y - S * x -> Y. */
  351. movapd 1 * SIZE(Y), %xmm4
  352. movapd 0 * SIZE(X), %xmm0
  353. SHUFPD_1 %xmm4, %xmm1
  354. movapd %xmm1, %xmm2
  355. movapd %xmm0, %xmm3
  356. mulpd C, %xmm0
  357. mulpd S, %xmm1
  358. mulpd C, %xmm2
  359. mulpd S, %xmm3
  360. addpd %xmm1, %xmm0
  361. subpd %xmm3, %xmm2
  362. movapd %xmm0, 0 * SIZE(X)
  363. movlpd %xmm2, 0 * SIZE(Y)
  364. movhpd %xmm2, 1 * SIZE(Y)
  /* Elements 2-3: roles swap, xmm1 takes the next load and xmm4 is merged. */
  /* The two registers keep alternating like this in every later group. */
  365. movapd 3 * SIZE(Y), %xmm1
  366. movapd 2 * SIZE(X), %xmm0
  367. SHUFPD_1 %xmm1, %xmm4
  368. movapd %xmm4, %xmm2
  369. movapd %xmm0, %xmm3
  370. mulpd C, %xmm0
  371. mulpd S, %xmm4
  372. mulpd C, %xmm2
  373. mulpd S, %xmm3
  374. addpd %xmm4, %xmm0
  375. subpd %xmm3, %xmm2
  376. movapd %xmm0, 2 * SIZE(X)
  377. movlpd %xmm2, 2 * SIZE(Y)
  378. movhpd %xmm2, 3 * SIZE(Y)
  /* Elements 4-5. */
  379. movapd 5 * SIZE(Y), %xmm4
  380. movapd 4 * SIZE(X), %xmm0
  381. SHUFPD_1 %xmm4, %xmm1
  382. movapd %xmm1, %xmm2
  383. movapd %xmm0, %xmm3
  384. mulpd C, %xmm0
  385. mulpd S, %xmm1
  386. mulpd C, %xmm2
  387. mulpd S, %xmm3
  388. addpd %xmm1, %xmm0
  389. subpd %xmm3, %xmm2
  390. movapd %xmm0, 4 * SIZE(X)
  391. movlpd %xmm2, 4 * SIZE(Y)
  392. movhpd %xmm2, 5 * SIZE(Y)
  /* Elements 6-7. */
  393. movapd 7 * SIZE(Y), %xmm1
  394. movapd 6 * SIZE(X), %xmm0
  395. SHUFPD_1 %xmm1, %xmm4
  396. movapd %xmm4, %xmm2
  397. movapd %xmm0, %xmm3
  398. mulpd C, %xmm0
  399. mulpd S, %xmm4
  400. mulpd C, %xmm2
  401. mulpd S, %xmm3
  402. addpd %xmm4, %xmm0
  403. subpd %xmm3, %xmm2
  404. movapd %xmm0, 6 * SIZE(X)
  405. movlpd %xmm2, 6 * SIZE(Y)
  406. movhpd %xmm2, 7 * SIZE(Y)
  407. #ifdef PREFETCHW
  408. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  409. #endif
  /* Elements 8-9. */
  410. movapd 9 * SIZE(Y), %xmm4
  411. movapd 8 * SIZE(X), %xmm0
  412. SHUFPD_1 %xmm4, %xmm1
  413. movapd %xmm1, %xmm2
  414. movapd %xmm0, %xmm3
  415. mulpd C, %xmm0
  416. mulpd S, %xmm1
  417. mulpd C, %xmm2
  418. mulpd S, %xmm3
  419. addpd %xmm1, %xmm0
  420. subpd %xmm3, %xmm2
  421. movapd %xmm0, 8 * SIZE(X)
  422. movlpd %xmm2, 8 * SIZE(Y)
  423. movhpd %xmm2, 9 * SIZE(Y)
  424. #ifdef PREFETCHW
  425. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  426. #endif
  /* Elements 10-11. */
  427. movapd 11 * SIZE(Y), %xmm1
  428. movapd 10 * SIZE(X), %xmm0
  429. SHUFPD_1 %xmm1, %xmm4
  430. movapd %xmm4, %xmm2
  431. movapd %xmm0, %xmm3
  432. mulpd C, %xmm0
  433. mulpd S, %xmm4
  434. mulpd C, %xmm2
  435. mulpd S, %xmm3
  436. addpd %xmm4, %xmm0
  437. subpd %xmm3, %xmm2
  438. movapd %xmm0, 10 * SIZE(X)
  439. movlpd %xmm2, 10 * SIZE(Y)
  440. movhpd %xmm2, 11 * SIZE(Y)
  /* Elements 12-13. */
  441. movapd 13 * SIZE(Y), %xmm4
  442. movapd 12 * SIZE(X), %xmm0
  443. SHUFPD_1 %xmm4, %xmm1
  444. movapd %xmm1, %xmm2
  445. movapd %xmm0, %xmm3
  446. mulpd C, %xmm0
  447. mulpd S, %xmm1
  448. mulpd C, %xmm2
  449. mulpd S, %xmm3
  450. addpd %xmm1, %xmm0
  451. subpd %xmm3, %xmm2
  452. movapd %xmm0, 12 * SIZE(X)
  453. movlpd %xmm2, 12 * SIZE(Y)
  454. movhpd %xmm2, 13 * SIZE(Y)
  455. #ifdef PREFETCHW
  456. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  457. #endif
  /* Elements 14-15: the load into xmm1 here is the carried vector for */
  /* the next pass (or for the remainder code at .L24). */
  458. movapd 15 * SIZE(Y), %xmm1
  459. movapd 14 * SIZE(X), %xmm0
  460. SHUFPD_1 %xmm1, %xmm4
  461. movapd %xmm4, %xmm2
  462. movapd %xmm0, %xmm3
  463. mulpd C, %xmm0
  464. mulpd S, %xmm4
  465. mulpd C, %xmm2
  466. mulpd S, %xmm3
  467. addpd %xmm4, %xmm0
  468. subpd %xmm3, %xmm2
  469. movapd %xmm0, 14 * SIZE(X)
  470. movlpd %xmm2, 14 * SIZE(Y)
  471. movhpd %xmm2, 15 * SIZE(Y)
  /* Advance both pointers past the 16 elements and loop while I > 0. */
  472. addl $16 * SIZE, X
  473. addl $16 * SIZE, Y
  474. decl I
  475. jg .L21
  476. ALIGN_3
  /* Remainder of the misaligned-Y path: the low four bits of N select */
  /* blocks of 8, 4, 2 and 1 elements. On entry to every step xmm1 still */
  /* holds the carried load whose high half is the element at 0 * SIZE(Y). */
  /* testl clears OF and these masks leave the sign bit clear, so each jle */
  /* is taken exactly when the tested bits are zero (same effect as je). */
  477. .L24:
  478. testl $15, N
  479. jle .L999
  /* Bit 3 set: rotate 8 elements. */
  480. testl $8, N
  481. jle .L25
  /* Elements 0-1: xmm0 = C * x + S * y -> X, xmm2 = C * y - S * x -> Y. */
  482. movapd 1 * SIZE(Y), %xmm4
  483. movapd 0 * SIZE(X), %xmm0
  484. SHUFPD_1 %xmm4, %xmm1
  485. movapd %xmm1, %xmm2
  486. movapd %xmm0, %xmm3
  487. mulpd C, %xmm0
  488. mulpd S, %xmm1
  489. mulpd C, %xmm2
  490. mulpd S, %xmm3
  491. addpd %xmm1, %xmm0
  492. subpd %xmm3, %xmm2
  493. movapd %xmm0, 0 * SIZE(X)
  494. movlpd %xmm2, 0 * SIZE(Y)
  495. movhpd %xmm2, 1 * SIZE(Y)
  /* Elements 2-3. */
  496. movapd 3 * SIZE(Y), %xmm1
  497. movapd 2 * SIZE(X), %xmm0
  498. SHUFPD_1 %xmm1, %xmm4
  499. movapd %xmm4, %xmm2
  500. movapd %xmm0, %xmm3
  501. mulpd C, %xmm0
  502. mulpd S, %xmm4
  503. mulpd C, %xmm2
  504. mulpd S, %xmm3
  505. addpd %xmm4, %xmm0
  506. subpd %xmm3, %xmm2
  507. movapd %xmm0, 2 * SIZE(X)
  508. movlpd %xmm2, 2 * SIZE(Y)
  509. movhpd %xmm2, 3 * SIZE(Y)
  /* Elements 4-5. */
  510. movapd 5 * SIZE(Y), %xmm4
  511. movapd 4 * SIZE(X), %xmm0
  512. SHUFPD_1 %xmm4, %xmm1
  513. movapd %xmm1, %xmm2
  514. movapd %xmm0, %xmm3
  515. mulpd C, %xmm0
  516. mulpd S, %xmm1
  517. mulpd C, %xmm2
  518. mulpd S, %xmm3
  519. addpd %xmm1, %xmm0
  520. subpd %xmm3, %xmm2
  521. movapd %xmm0, 4 * SIZE(X)
  522. movlpd %xmm2, 4 * SIZE(Y)
  523. movhpd %xmm2, 5 * SIZE(Y)
  /* Elements 6-7: xmm1 ends up holding the carried load again. */
  524. movapd 7 * SIZE(Y), %xmm1
  525. movapd 6 * SIZE(X), %xmm0
  526. SHUFPD_1 %xmm1, %xmm4
  527. movapd %xmm4, %xmm2
  528. movapd %xmm0, %xmm3
  529. mulpd C, %xmm0
  530. mulpd S, %xmm4
  531. mulpd C, %xmm2
  532. mulpd S, %xmm3
  533. addpd %xmm4, %xmm0
  534. subpd %xmm3, %xmm2
  535. movapd %xmm0, 6 * SIZE(X)
  536. movlpd %xmm2, 6 * SIZE(Y)
  537. movhpd %xmm2, 7 * SIZE(Y)
  538. addl $8 * SIZE, X
  539. addl $8 * SIZE, Y
  540. ALIGN_3
  /* Bit 2 set: rotate 4 elements. */
  541. .L25:
  542. testl $4, N
  543. jle .L26
  /* Elements 0-1. */
  544. movapd 1 * SIZE(Y), %xmm4
  545. movapd 0 * SIZE(X), %xmm0
  546. SHUFPD_1 %xmm4, %xmm1
  547. movapd %xmm1, %xmm2
  548. movapd %xmm0, %xmm3
  549. mulpd C, %xmm0
  550. mulpd S, %xmm1
  551. mulpd C, %xmm2
  552. mulpd S, %xmm3
  553. addpd %xmm1, %xmm0
  554. subpd %xmm3, %xmm2
  555. movapd %xmm0, 0 * SIZE(X)
  556. movlpd %xmm2, 0 * SIZE(Y)
  557. movhpd %xmm2, 1 * SIZE(Y)
  /* Elements 2-3: xmm1 ends up holding the carried load again. */
  558. movapd 3 * SIZE(Y), %xmm1
  559. movapd 2 * SIZE(X), %xmm0
  560. SHUFPD_1 %xmm1, %xmm4
  561. movapd %xmm4, %xmm2
  562. movapd %xmm0, %xmm3
  563. mulpd C, %xmm0
  564. mulpd S, %xmm4
  565. mulpd C, %xmm2
  566. mulpd S, %xmm3
  567. addpd %xmm4, %xmm0
  568. subpd %xmm3, %xmm2
  569. movapd %xmm0, 2 * SIZE(X)
  570. movlpd %xmm2, 2 * SIZE(Y)
  571. movhpd %xmm2, 3 * SIZE(Y)
  572. addl $4 * SIZE, X
  573. addl $4 * SIZE, Y
  574. ALIGN_3
  /* Bit 1 set: rotate 2 elements. */
  575. .L26:
  576. testl $2, N
  577. jle .L27
  578. movapd 1 * SIZE(Y), %xmm4
  579. movapd 0 * SIZE(X), %xmm0
  580. SHUFPD_1 %xmm4, %xmm1
  581. movapd %xmm1, %xmm2
  582. movapd %xmm0, %xmm3
  583. mulpd C, %xmm0
  584. mulpd S, %xmm1
  585. mulpd C, %xmm2
  586. mulpd S, %xmm3
  587. addpd %xmm1, %xmm0
  588. subpd %xmm3, %xmm2
  589. movapd %xmm0, 0 * SIZE(X)
  590. movlpd %xmm2, 0 * SIZE(Y)
  591. movhpd %xmm2, 1 * SIZE(Y)
  /* xmm1 was overwritten by the multiply; copy the untouched load in */
  /* xmm4 back so the carried vector is valid for the final step. */
  592. movapd %xmm4, %xmm1
  593. addl $2 * SIZE, X
  594. addl $2 * SIZE, Y
  595. ALIGN_3
  /* Bit 0 set: rotate the final element with scalar instructions. */
  596. .L27:
  597. testl $1, N
  598. jle .L999
  /* Y is not reloaded: unpckhpd copies the high half of the carried */
  /* vector (the element at 0 * SIZE(Y)) into the low half for mulsd. */
  599. unpckhpd %xmm1, %xmm1
  600. movsd 0 * SIZE(X), %xmm0
  601. movapd %xmm1, %xmm2
  602. movapd %xmm0, %xmm3
  603. mulsd C, %xmm0
  604. mulsd S, %xmm1
  605. mulsd C, %xmm2
  606. mulsd S, %xmm3
  607. addsd %xmm1, %xmm0
  608. subsd %xmm3, %xmm2
  609. movsd %xmm0, 0 * SIZE(X)
  610. movsd %xmm2, 0 * SIZE(Y)
  611. jmp .L999
  612. ALIGN_3
  /* Generic-stride path, reached when either stride is not one element. */
  /* INCX and INCY are byte strides here (scaled by SIZE at entry). */
  613. .L50:
  614. movl N, I
  /* A zero stride on either vector skips the paired loop and runs the */
  /* scalar loop at .L56 for all N iterations (I still equals N), so an */
  /* element that is revisited is re-read after its previous store. */
  615. cmpl $0, INCX
  616. je .L56
  617. cmpl $0, INCY
  618. je .L56
  /* I = N / 4: each pass of .L53 rotates four elements as two pairs. */
  619. sarl $2, I
  620. jle .L55
  621. ALIGN_3
  622. .L53:
  /* First pair: gather two strided elements of Y and of X into the low */
  /* and high halves, rotate them together, then scatter the halves back. */
  623. movsd (Y), %xmm1
  624. movhpd (Y, INCY), %xmm1
  625. movsd (X), %xmm0
  626. movhpd (X, INCX), %xmm0
  627. movapd %xmm1, %xmm2
  628. movapd %xmm0, %xmm3
  629. mulpd C, %xmm0
  630. mulpd S, %xmm1
  631. mulpd C, %xmm2
  632. mulpd S, %xmm3
  /* xmm0 = C * x + S * y, xmm2 = C * y - S * x */
  633. addpd %xmm1, %xmm0
  634. subpd %xmm3, %xmm2
  635. movlpd %xmm0, (X)
  636. movhpd %xmm0, (X, INCX)
  637. movlpd %xmm2, (Y)
  638. movhpd %xmm2, (Y, INCY)
  /* Step both pointers forward by two strides. */
  639. leal (X, INCX, 2), X
  640. leal (Y, INCY, 2), Y
  /* Second pair: same gather, rotate and scatter sequence. */
  641. movsd (Y), %xmm1
  642. movhpd (Y, INCY), %xmm1
  643. movsd (X), %xmm0
  644. movhpd (X, INCX), %xmm0
  645. movapd %xmm1, %xmm2
  646. movapd %xmm0, %xmm3
  647. mulpd C, %xmm0
  648. mulpd S, %xmm1
  649. mulpd C, %xmm2
  650. mulpd S, %xmm3
  651. addpd %xmm1, %xmm0
  652. subpd %xmm3, %xmm2
  653. movlpd %xmm0, (X)
  654. movhpd %xmm0, (X, INCX)
  655. movlpd %xmm2, (Y)
  656. movhpd %xmm2, (Y, INCY)
  657. leal (X, INCX, 2), X
  658. leal (Y, INCY, 2), Y
  659. decl I
  660. jg .L53
  661. ALIGN_3
  /* I = N mod 4 leftover elements. andl clears OF and the result is in */
  /* 0..3, so jle is taken exactly when nothing is left. */
  662. .L55:
  663. movl N, I
  664. andl $3, I
  665. jle .L999
  666. ALIGN_3
  /* Scalar loop: rotate one element per pass, I passes in total. */
  667. .L56:
  668. movsd (Y), %xmm1
  669. movsd (X), %xmm0
  670. movapd %xmm1, %xmm2
  671. movapd %xmm0, %xmm3
  672. mulsd C, %xmm0
  673. mulsd S, %xmm1
  674. mulsd C, %xmm2
  675. mulsd S, %xmm3
  676. addsd %xmm1, %xmm0
  677. subsd %xmm3, %xmm2
  678. movsd %xmm0, (X)
  679. movsd %xmm2, (Y)
  680. addl INCX, X
  681. addl INCY, Y
  682. decl I
  683. jg .L56
  684. ALIGN_3
  /* Common exit for every path: restore the three saved registers in */
  /* the reverse of the order they were pushed at entry, then return. */
  /* EPILOGUE is a macro defined outside this file. */
  685. .L999:
  686. popl %ebx
  687. popl %esi
  688. popl %edi
  689. ret
  690. EPILOGUE