You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot_sse2.S 26 kB


  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* zrot, SSE2, 32-bit x86: build setup, stack-argument layout, register roles. */

/* Defined ahead of common.h; presumably selects its assembler-side definitions. */
#define ASSEMBLER
#include "common.h"

/*
 * STACK is the number of bytes occupied by the three registers the routine
 * pushes on entry (%edi, %esi, %ebx); ARGS is an extra displacement, zero here.
 */
#define STACK	12
#define ARGS	 0

/*
 * Stack operand located "off" bytes above the value %esp had on entry.
 * Expands to the same operand text as writing  off + STACK + ARGS(%esp).
 */
#define ZROT_STACK_ARG(off)	off + STACK + ARGS(%esp)

#define STACK_N		ZROT_STACK_ARG( 4)
#define STACK_X		ZROT_STACK_ARG( 8)
#define STACK_INCX	ZROT_STACK_ARG(12)
#define STACK_Y		ZROT_STACK_ARG(16)
#define STACK_INCY	ZROT_STACK_ARG(20)
/* c and s are each fetched with movsd, i.e. as 8-byte values. */
#define STACK_C		ZROT_STACK_ARG(24)
#define STACK_S		ZROT_STACK_ARG(32)

/* General-purpose register roles. */
#define N	%ebx	/* remaining element count              */
#define X	%esi	/* running pointer into x               */
#define INCX	%ecx	/* x stride (shifted by ZBASE_SHIFT)    */
#define Y	%edi	/* running pointer into y               */
#define INCY	%edx	/* y stride (shifted by ZBASE_SHIFT)    */
#define I	%eax	/* loop trip counter                    */

#include "l1param.h"

/* Rotation coefficients, each duplicated into both 64-bit lanes by the routine. */
#define C	%xmm6
#define S	%xmm7
  /*
 * In-place plane rotation of two vectors whose elements are two doubles
 * each (presumably complex re/im pairs; the code treats both halves alike):
 *
 *	x := c * x + s * y
 *	y := c * y - s * x
 *
 * Stack arguments, in STACK_* order: n, x, incx, y, incy, c, s.
 * c and s are doubles; both are duplicated into the two lanes of C and S.
 *
 * Dispatch:
 *   n <= 0                      -> return at once
 *   a stride other than 2*SIZE  -> .L50  (strided, one element per step)
 *   otherwise unit stride, selected by bit SIZE of the X and Y addresses:
 *     X clear, Y clear -> .L10  aligned 16-byte loads and stores
 *     X clear, Y set   -> .L20  y rebuilt from aligned loads with SHUFPD_1
 *     X set,   Y clear -> .L30  x rebuilt from aligned loads with SHUFPD_1
 *     X set,   Y set   -> .L40  peel one double, run aligned, finish with
 *                               one trailing double
 *
 * NOTE(review): the movapd paths rely on X and Y being at least SIZE-byte
 * aligned, so that bit SIZE alone decides 16-byte alignment -- confirm with
 * the callers.
 *
 * The unrolled bodies are generated by the macros below; each macro expands
 * to one straight-line instruction group.
 */

/*
 * Arithmetic core on two packed doubles.
 *   in : xr = x values, yr = y values
 *   out: xr = c*x + s*y, %xmm2 = c*y - s*x
 *   clobbers yr and %xmm3
 */
#define ZROT_CORE(xr, yr) \
	movapd	yr, %xmm2 ;\
	movapd	xr, %xmm3 ;\
	mulpd	C, xr ;\
	mulpd	S, yr ;\
	mulpd	C, %xmm2 ;\
	mulpd	S, %xmm3 ;\
	addpd	yr, xr ;\
	subpd	%xmm3, %xmm2

/* One element at double offset "off"; aligned access to both x and y. */
#define ZROT_AA(off) \
	movapd	off * SIZE(Y), %xmm1 ;\
	movapd	off * SIZE(X), %xmm0 ;\
	ZROT_CORE(%xmm0, %xmm1) ;\
	movapd	%xmm0, off * SIZE(X) ;\
	movapd	%xmm2, off * SIZE(Y)

/*
 * One element at double offset "off" (off1 must be off + 1); x accessed with
 * movapd, y sitting SIZE bytes past a 16-byte boundary.
 *   cur : register still holding the previous aligned 16-byte load from y
 *   nxt : register receiving the next aligned 16-byte load from y
 * SHUFPD_1 is defined outside this file; presumably "shufpd $1," so that cur
 * ends up as { high(cur), low(nxt) } -- TODO confirm.
 * y is written back one double at a time with movlpd / movhpd.
 */
#define ZROT_YU(off, off1, cur, nxt) \
	movapd	off1 * SIZE(Y), nxt ;\
	movapd	off * SIZE(X), %xmm0 ;\
	SHUFPD_1 nxt, cur ;\
	ZROT_CORE(%xmm0, cur) ;\
	movapd	%xmm0, off * SIZE(X) ;\
	movlpd	%xmm2, off * SIZE(Y) ;\
	movhpd	%xmm2, off1 * SIZE(Y)

/* Mirror image of ZROT_YU: y accessed with movapd, x rebuilt from cur / nxt. */
#define ZROT_XU(off, off1, cur, nxt) \
	movapd	off1 * SIZE(X), nxt ;\
	movapd	off * SIZE(Y), %xmm1 ;\
	SHUFPD_1 nxt, cur ;\
	ZROT_CORE(cur, %xmm1) ;\
	movlpd	cur, off * SIZE(X) ;\
	movhpd	cur, off1 * SIZE(X) ;\
	movapd	%xmm2, off * SIZE(Y)

/* A single double of x and of y at offset 0, using scalar arithmetic. */
#define ZROT_SCALAR \
	movsd	0 * SIZE(Y), %xmm1 ;\
	movsd	0 * SIZE(X), %xmm0 ;\
	movapd	%xmm1, %xmm2 ;\
	movapd	%xmm0, %xmm3 ;\
	mulsd	C, %xmm0 ;\
	mulsd	S, %xmm1 ;\
	mulsd	C, %xmm2 ;\
	mulsd	S, %xmm3 ;\
	addsd	%xmm1, %xmm0 ;\
	subsd	%xmm3, %xmm2 ;\
	movsd	%xmm0, 0 * SIZE(X) ;\
	movsd	%xmm2, 0 * SIZE(Y)

/*
 * One element using 8-byte loads and stores only (no alignment needed),
 * followed by advancing X and Y by their strides.
 */
#define ZROT_STRIDED \
	movsd	0 * SIZE(Y), %xmm1 ;\
	movhpd	1 * SIZE(Y), %xmm1 ;\
	movsd	0 * SIZE(X), %xmm0 ;\
	movhpd	1 * SIZE(X), %xmm0 ;\
	ZROT_CORE(%xmm0, %xmm1) ;\
	movlpd	%xmm0, 0 * SIZE(X) ;\
	movhpd	%xmm0, 1 * SIZE(X) ;\
	movlpd	%xmm2, 0 * SIZE(Y) ;\
	movhpd	%xmm2, 1 * SIZE(Y) ;\
	addl	INCX, X ;\
	addl	INCY, Y

	PROLOGUE

	/* Saved here, restored in reverse order at .L999 (STACK = 12 bytes). */
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_N,    N
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	/* Scale both strides; unit stride then compares equal to 2 * SIZE. */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	movsd	STACK_C, C
	movsd	STACK_S, S

	/* 0x44 selects dwords 0,1,0,1: low double copied into both lanes. */
	pshufd	$0x44, C, C
	pshufd	$0x44, S, S

	cmpl	$0, N
	jle	.L999

	cmpl	$2 * SIZE, INCX
	jne	.L50
	cmpl	$2 * SIZE, INCY
	jne	.L50

/* ---- unit stride: pick a path from bit SIZE of each pointer ---- */
.L10:
	testl	$SIZE, X
	jne	.L30

	testl	$SIZE, Y
	jne	.L20

	/* Both bits clear: 8 elements (16 doubles) per trip. */
	movl	N, I
	sarl	$3, I
	jle	.L14
	ALIGN_3

.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	ZROT_AA( 0)
	ZROT_AA( 2)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	ZROT_AA( 4)
	ZROT_AA( 6)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ZROT_AA( 8)
	ZROT_AA(10)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ZROT_AA(12)
	ZROT_AA(14)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y

	decl	I
	jg	.L11
	ALIGN_3

/*
 * Remainder n & 7, handled as 4 + 2 + 1 elements.  The masked test result
 * is never negative, so jle branches exactly when the tested bits are zero.
 */
.L14:
	testl	$7, N
	jle	.L999

	testl	$4, N
	jle	.L15

	ZROT_AA(0)
	ZROT_AA(2)
	ZROT_AA(4)
	ZROT_AA(6)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L15:
	testl	$2, N
	jle	.L16

	ZROT_AA(0)
	ZROT_AA(2)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L16:
	testl	$1, N
	jle	.L999

	ZROT_AA(0)

	jmp	.L999
	ALIGN_3

/* ---- unit stride, bit SIZE set in Y only ---- */
.L20:
	/* Prime the stitch register; this load begins SIZE bytes before Y. */
	movapd	-1 * SIZE(Y), %xmm1

	movl	N, I
	sarl	$3, I
	jle	.L24
	ALIGN_3

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	ZROT_YU( 0,  1, %xmm1, %xmm4)
	ZROT_YU( 2,  3, %xmm4, %xmm1)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	ZROT_YU( 4,  5, %xmm1, %xmm4)
	ZROT_YU( 6,  7, %xmm4, %xmm1)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ZROT_YU( 8,  9, %xmm1, %xmm4)
	ZROT_YU(10, 11, %xmm4, %xmm1)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ZROT_YU(12, 13, %xmm1, %xmm4)
	ZROT_YU(14, 15, %xmm4, %xmm1)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y

	decl	I
	jg	.L21
	ALIGN_3

/* Remainder; %xmm1 still carries the most recent aligned load from y. */
.L24:
	testl	$7, N
	jle	.L999

	testl	$4, N
	jle	.L25

	ZROT_YU(0, 1, %xmm1, %xmm4)
	ZROT_YU(2, 3, %xmm4, %xmm1)
	ZROT_YU(4, 5, %xmm1, %xmm4)
	ZROT_YU(6, 7, %xmm4, %xmm1)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L25:
	testl	$2, N
	jle	.L26

	ZROT_YU(0, 1, %xmm1, %xmm4)
	ZROT_YU(2, 3, %xmm4, %xmm1)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L26:
	testl	$1, N
	jle	.L999

	/* The 16-byte load at 1 * SIZE(Y) also covers the double after this element. */
	ZROT_YU(0, 1, %xmm1, %xmm4)

	jmp	.L999
	ALIGN_3

/* ---- unit stride, bit SIZE set in X ---- */
.L30:
	testl	$SIZE, Y
	jne	.L40

	/* Y bit clear: prime the stitch register for x (begins SIZE bytes before X). */
	movapd	-1 * SIZE(X), %xmm0

	movl	N, I
	sarl	$3, I
	jle	.L34
	ALIGN_3

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	ZROT_XU( 0,  1, %xmm0, %xmm4)
	ZROT_XU( 2,  3, %xmm4, %xmm0)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	ZROT_XU( 4,  5, %xmm0, %xmm4)
	ZROT_XU( 6,  7, %xmm4, %xmm0)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ZROT_XU( 8,  9, %xmm0, %xmm4)
	ZROT_XU(10, 11, %xmm4, %xmm0)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ZROT_XU(12, 13, %xmm0, %xmm4)
	ZROT_XU(14, 15, %xmm4, %xmm0)

	addl	$16 * SIZE, Y
	addl	$16 * SIZE, X

	decl	I
	jg	.L31
	ALIGN_3

/* Remainder; %xmm0 still carries the most recent aligned load from x. */
.L34:
	testl	$7, N
	jle	.L999

	testl	$4, N
	jle	.L35

	ZROT_XU(0, 1, %xmm0, %xmm4)
	ZROT_XU(2, 3, %xmm4, %xmm0)
	ZROT_XU(4, 5, %xmm0, %xmm4)
	ZROT_XU(6, 7, %xmm4, %xmm0)

	addl	$8 * SIZE, Y
	addl	$8 * SIZE, X
	ALIGN_3

.L35:
	testl	$2, N
	jle	.L36

	ZROT_XU(0, 1, %xmm0, %xmm4)
	ZROT_XU(2, 3, %xmm4, %xmm0)

	addl	$4 * SIZE, Y
	addl	$4 * SIZE, X
	ALIGN_3

.L36:
	testl	$1, N
	jle	.L999

	ZROT_XU(0, 1, %xmm0, %xmm4)

	jmp	.L999
	ALIGN_3

/*
 * ---- unit stride, bit SIZE set in both X and Y ----
 * Rotate the first double alone and step both pointers by SIZE, which
 * clears the bit in each.  The n - 1 aligned 16-byte pairs that follow
 * straddle element boundaries, which is harmless because every double is
 * rotated the same way.  The final double is handled at .L47.
 */
.L40:
	ZROT_SCALAR

	addl	$1 * SIZE, Y
	addl	$1 * SIZE, X

	decl	N
	jle	.L47

	movl	N, I
	sarl	$3, I
	jle	.L44
	ALIGN_3

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	ZROT_AA( 0)
	ZROT_AA( 2)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	ZROT_AA( 4)
	ZROT_AA( 6)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	ZROT_AA( 8)
	ZROT_AA(10)

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	ZROT_AA(12)
	ZROT_AA(14)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y

	decl	I
	jg	.L41
	ALIGN_3

/* No early exit on (n & 7) == 0 here: control must always reach .L47. */
.L44:
	testl	$4, N
	jle	.L45

	ZROT_AA(0)
	ZROT_AA(2)
	ZROT_AA(4)
	ZROT_AA(6)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L45:
	testl	$2, N
	jle	.L46

	ZROT_AA(0)
	ZROT_AA(2)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L46:
	testl	$1, N
	jle	.L47

	ZROT_AA(0)

	addl	$2 * SIZE, Y
	addl	$2 * SIZE, X
	ALIGN_3

/* Trailing double left over after the pairwise pass. */
.L47:
	ZROT_SCALAR

	jmp	.L999
	ALIGN_3

/* ---- general strides: 4 elements per trip, then n & 3 singly ---- */
.L50:
	movl	N, I
	sarl	$2, I
	jle	.L55
	ALIGN_3

.L53:
	ZROT_STRIDED
	ZROT_STRIDED
	ZROT_STRIDED
	ZROT_STRIDED

	decl	I
	jg	.L53
	ALIGN_3

.L55:
	movl	N, I
	andl	$3, I
	jle	.L999
	ALIGN_3

.L56:
	ZROT_STRIDED

	decl	I
	jg	.L56
	ALIGN_3

/* Common exit: restore the saved registers in reverse push order. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE