You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

zrot_sse2.S 26 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  /* Preprocessor setup for the routine below: %esp-relative operands for */
  /* the incoming stack arguments, and symbolic names for the registers   */
  /* the body works with.                                                 */
  /* ASSEMBLER is defined before common.h is included; presumably it      */
  /* selects that header's assembly-only definitions (TODO confirm).      */
  38. #define ASSEMBLER
  39. #include "common.h"
  /* STACK = bytes the prologue pushes before any argument is read:       */
  /* three pushl (%edi, %esi, %ebx) of 4 bytes each.                      */
  40. #define STACK 12
  /* ARGS = additional displacement folded into every argument operand;   */
  /* it is zero in this file.                                             */
  41. #define ARGS 0
  /* Argument operands, usable only after the three pushes above.         */
  /* NOTE(review): the leading 4 presumably skips the return address      */
  /* (i386 cdecl layout) -- verify against the caller.                    */
  42. #define STACK_N 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  /* STACK_C and STACK_S sit 8 bytes apart and are read with movsd, i.e.  */
  /* as 64-bit values.                                                    */
  47. #define STACK_C 24 + STACK + ARGS(%esp)
  48. #define STACK_S 32 + STACK + ARGS(%esp)
  /* General-purpose register aliases. N, X, INCX, Y and INCY are loaded  */
  /* from the matching STACK_* operands. N, X and Y occupy %ebx, %esi and */
  /* %edi, the same three registers the prologue pushes.                  */
  49. #define N %ebx
  50. #define X %esi
  51. #define INCX %ecx
  52. #define Y %edi
  53. #define INCY %edx
  /* I is the trip counter of the unrolled loops (set to N >> 3, then     */
  /* counted down with decl).                                             */
  54. #define I %eax
  /* NOTE(review): l1param.h is likely where PREFETCHSIZE, PREOFFSET and  */
  /* PREFETCHW used in the loops come from -- confirm.                    */
  55. #include "l1param.h"
  /* XMM aliases for the two scalar arguments. After the movsd loads, the */
  /* body applies pshufd $0x44 to each so that both 64-bit lanes hold the */
  /* same value.                                                          */
  56. #define C %xmm6
  57. #define S %xmm7
  58. PROLOGUE
  59. pushl %edi
  60. pushl %esi
  61. pushl %ebx
  62. PROFCODE
  63. movl STACK_N, N
  64. movl STACK_X, X
  65. movl STACK_INCX, INCX
  66. movl STACK_Y, Y
  67. movl STACK_INCY, INCY
  68. sall $ZBASE_SHIFT, INCX
  69. sall $ZBASE_SHIFT, INCY
  70. movsd STACK_C, C
  71. movsd STACK_S, S
  72. pshufd $0x44, C, C
  73. pshufd $0x44, S, S
  74. cmpl $0, N
  75. jle .L999
  76. cmpl $2 * SIZE, INCX
  77. jne .L50
  78. cmpl $2 * SIZE, INCY
  79. jne .L50
  80. .L10:
  81. testl $SIZE, X
  82. jne .L30
  83. testl $SIZE, Y
  84. jne .L20
  85. movl N, I
  86. sarl $3, I
  87. jle .L14
  88. ALIGN_3
  89. .L11:
  90. #ifdef PREFETCHW
  91. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  92. #endif
  93. movapd 0 * SIZE(Y), %xmm1
  94. movapd 0 * SIZE(X), %xmm0
  95. movapd %xmm1, %xmm2
  96. movapd %xmm0, %xmm3
  97. mulpd C, %xmm0
  98. mulpd S, %xmm1
  99. mulpd C, %xmm2
  100. mulpd S, %xmm3
  101. addpd %xmm1, %xmm0
  102. subpd %xmm3, %xmm2
  103. movapd %xmm0, 0 * SIZE(X)
  104. movapd %xmm2, 0 * SIZE(Y)
  105. movapd 2 * SIZE(Y), %xmm1
  106. movapd 2 * SIZE(X), %xmm0
  107. movapd %xmm1, %xmm2
  108. movapd %xmm0, %xmm3
  109. mulpd C, %xmm0
  110. mulpd S, %xmm1
  111. mulpd C, %xmm2
  112. mulpd S, %xmm3
  113. addpd %xmm1, %xmm0
  114. subpd %xmm3, %xmm2
  115. movapd %xmm0, 2 * SIZE(X)
  116. movapd %xmm2, 2 * SIZE(Y)
  117. #ifdef PREFETCHW
  118. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  119. #endif
  120. movapd 4 * SIZE(Y), %xmm1
  121. movapd 4 * SIZE(X), %xmm0
  122. movapd %xmm1, %xmm2
  123. movapd %xmm0, %xmm3
  124. mulpd C, %xmm0
  125. mulpd S, %xmm1
  126. mulpd C, %xmm2
  127. mulpd S, %xmm3
  128. addpd %xmm1, %xmm0
  129. subpd %xmm3, %xmm2
  130. movapd %xmm0, 4 * SIZE(X)
  131. movapd %xmm2, 4 * SIZE(Y)
  132. movapd 6 * SIZE(Y), %xmm1
  133. movapd 6 * SIZE(X), %xmm0
  134. movapd %xmm1, %xmm2
  135. movapd %xmm0, %xmm3
  136. mulpd C, %xmm0
  137. mulpd S, %xmm1
  138. mulpd C, %xmm2
  139. mulpd S, %xmm3
  140. addpd %xmm1, %xmm0
  141. subpd %xmm3, %xmm2
  142. movapd %xmm0, 6 * SIZE(X)
  143. movapd %xmm2, 6 * SIZE(Y)
  144. #ifdef PREFETCHW
  145. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  146. #endif
  147. movapd 8 * SIZE(Y), %xmm1
  148. movapd 8 * SIZE(X), %xmm0
  149. movapd %xmm1, %xmm2
  150. movapd %xmm0, %xmm3
  151. mulpd C, %xmm0
  152. mulpd S, %xmm1
  153. mulpd C, %xmm2
  154. mulpd S, %xmm3
  155. addpd %xmm1, %xmm0
  156. subpd %xmm3, %xmm2
  157. movapd %xmm0, 8 * SIZE(X)
  158. movapd %xmm2, 8 * SIZE(Y)
  159. movapd 10 * SIZE(Y), %xmm1
  160. movapd 10 * SIZE(X), %xmm0
  161. movapd %xmm1, %xmm2
  162. movapd %xmm0, %xmm3
  163. mulpd C, %xmm0
  164. mulpd S, %xmm1
  165. mulpd C, %xmm2
  166. mulpd S, %xmm3
  167. addpd %xmm1, %xmm0
  168. subpd %xmm3, %xmm2
  169. movapd %xmm0, 10 * SIZE(X)
  170. movapd %xmm2, 10 * SIZE(Y)
  171. #ifdef PREFETCHW
  172. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  173. #endif
  174. movapd 12 * SIZE(Y), %xmm1
  175. movapd 12 * SIZE(X), %xmm0
  176. movapd %xmm1, %xmm2
  177. movapd %xmm0, %xmm3
  178. mulpd C, %xmm0
  179. mulpd S, %xmm1
  180. mulpd C, %xmm2
  181. mulpd S, %xmm3
  182. addpd %xmm1, %xmm0
  183. subpd %xmm3, %xmm2
  184. movapd %xmm0, 12 * SIZE(X)
  185. movapd %xmm2, 12 * SIZE(Y)
  186. movapd 14 * SIZE(Y), %xmm1
  187. movapd 14 * SIZE(X), %xmm0
  188. movapd %xmm1, %xmm2
  189. movapd %xmm0, %xmm3
  190. mulpd C, %xmm0
  191. mulpd S, %xmm1
  192. mulpd C, %xmm2
  193. mulpd S, %xmm3
  194. addpd %xmm1, %xmm0
  195. subpd %xmm3, %xmm2
  196. movapd %xmm0, 14 * SIZE(X)
  197. movapd %xmm2, 14 * SIZE(Y)
  198. addl $16 * SIZE, X
  199. addl $16 * SIZE, Y
  200. decl I
  201. jg .L11
  202. ALIGN_3
  203. .L14:
  204. testl $7, N
  205. jle .L999
  206. testl $4, N
  207. jle .L15
  208. movapd 0 * SIZE(Y), %xmm1
  209. movapd 0 * SIZE(X), %xmm0
  210. movapd %xmm1, %xmm2
  211. movapd %xmm0, %xmm3
  212. mulpd C, %xmm0
  213. mulpd S, %xmm1
  214. mulpd C, %xmm2
  215. mulpd S, %xmm3
  216. addpd %xmm1, %xmm0
  217. subpd %xmm3, %xmm2
  218. movapd %xmm0, 0 * SIZE(X)
  219. movapd %xmm2, 0 * SIZE(Y)
  220. movapd 2 * SIZE(Y), %xmm1
  221. movapd 2 * SIZE(X), %xmm0
  222. movapd %xmm1, %xmm2
  223. movapd %xmm0, %xmm3
  224. mulpd C, %xmm0
  225. mulpd S, %xmm1
  226. mulpd C, %xmm2
  227. mulpd S, %xmm3
  228. addpd %xmm1, %xmm0
  229. subpd %xmm3, %xmm2
  230. movapd %xmm0, 2 * SIZE(X)
  231. movapd %xmm2, 2 * SIZE(Y)
  232. movapd 4 * SIZE(Y), %xmm1
  233. movapd 4 * SIZE(X), %xmm0
  234. movapd %xmm1, %xmm2
  235. movapd %xmm0, %xmm3
  236. mulpd C, %xmm0
  237. mulpd S, %xmm1
  238. mulpd C, %xmm2
  239. mulpd S, %xmm3
  240. addpd %xmm1, %xmm0
  241. subpd %xmm3, %xmm2
  242. movapd %xmm0, 4 * SIZE(X)
  243. movapd %xmm2, 4 * SIZE(Y)
  244. movapd 6 * SIZE(Y), %xmm1
  245. movapd 6 * SIZE(X), %xmm0
  246. movapd %xmm1, %xmm2
  247. movapd %xmm0, %xmm3
  248. mulpd C, %xmm0
  249. mulpd S, %xmm1
  250. mulpd C, %xmm2
  251. mulpd S, %xmm3
  252. addpd %xmm1, %xmm0
  253. subpd %xmm3, %xmm2
  254. movapd %xmm0, 6 * SIZE(X)
  255. movapd %xmm2, 6 * SIZE(Y)
  256. addl $8 * SIZE, X
  257. addl $8 * SIZE, Y
  258. ALIGN_3
  259. .L15:
  260. testl $2, N
  261. jle .L16
  262. movapd 0 * SIZE(Y), %xmm1
  263. movapd 0 * SIZE(X), %xmm0
  264. movapd %xmm1, %xmm2
  265. movapd %xmm0, %xmm3
  266. mulpd C, %xmm0
  267. mulpd S, %xmm1
  268. mulpd C, %xmm2
  269. mulpd S, %xmm3
  270. addpd %xmm1, %xmm0
  271. subpd %xmm3, %xmm2
  272. movapd %xmm0, 0 * SIZE(X)
  273. movapd %xmm2, 0 * SIZE(Y)
  274. movapd 2 * SIZE(Y), %xmm1
  275. movapd 2 * SIZE(X), %xmm0
  276. movapd %xmm1, %xmm2
  277. movapd %xmm0, %xmm3
  278. mulpd C, %xmm0
  279. mulpd S, %xmm1
  280. mulpd C, %xmm2
  281. mulpd S, %xmm3
  282. addpd %xmm1, %xmm0
  283. subpd %xmm3, %xmm2
  284. movapd %xmm0, 2 * SIZE(X)
  285. movapd %xmm2, 2 * SIZE(Y)
  286. addl $4 * SIZE, X
  287. addl $4 * SIZE, Y
  288. ALIGN_3
  289. .L16:
  290. testl $1, N
  291. jle .L999
  292. movapd 0 * SIZE(Y), %xmm1
  293. movapd 0 * SIZE(X), %xmm0
  294. movapd %xmm1, %xmm2
  295. movapd %xmm0, %xmm3
  296. mulpd C, %xmm0
  297. mulpd S, %xmm1
  298. mulpd C, %xmm2
  299. mulpd S, %xmm3
  300. addpd %xmm1, %xmm0
  301. subpd %xmm3, %xmm2
  302. movapd %xmm0, 0 * SIZE(X)
  303. movapd %xmm2, 0 * SIZE(Y)
  304. jmp .L999
  305. ALIGN_3
  306. .L20:
  307. movapd -1 * SIZE(Y), %xmm1
  308. movl N, I
  309. sarl $3, I
  310. jle .L24
  311. ALIGN_3
  312. .L21:
  313. #ifdef PREFETCHW
  314. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  315. #endif
  316. movapd 1 * SIZE(Y), %xmm4
  317. movapd 0 * SIZE(X), %xmm0
  318. SHUFPD_1 %xmm4, %xmm1
  319. movapd %xmm1, %xmm2
  320. movapd %xmm0, %xmm3
  321. mulpd C, %xmm0
  322. mulpd S, %xmm1
  323. mulpd C, %xmm2
  324. mulpd S, %xmm3
  325. addpd %xmm1, %xmm0
  326. subpd %xmm3, %xmm2
  327. movapd %xmm0, 0 * SIZE(X)
  328. movlpd %xmm2, 0 * SIZE(Y)
  329. movhpd %xmm2, 1 * SIZE(Y)
  330. movapd 3 * SIZE(Y), %xmm1
  331. movapd 2 * SIZE(X), %xmm0
  332. SHUFPD_1 %xmm1, %xmm4
  333. movapd %xmm4, %xmm2
  334. movapd %xmm0, %xmm3
  335. mulpd C, %xmm0
  336. mulpd S, %xmm4
  337. mulpd C, %xmm2
  338. mulpd S, %xmm3
  339. addpd %xmm4, %xmm0
  340. subpd %xmm3, %xmm2
  341. movapd %xmm0, 2 * SIZE(X)
  342. movlpd %xmm2, 2 * SIZE(Y)
  343. movhpd %xmm2, 3 * SIZE(Y)
  344. #ifdef PREFETCHW
  345. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  346. #endif
  347. movapd 5 * SIZE(Y), %xmm4
  348. movapd 4 * SIZE(X), %xmm0
  349. SHUFPD_1 %xmm4, %xmm1
  350. movapd %xmm1, %xmm2
  351. movapd %xmm0, %xmm3
  352. mulpd C, %xmm0
  353. mulpd S, %xmm1
  354. mulpd C, %xmm2
  355. mulpd S, %xmm3
  356. addpd %xmm1, %xmm0
  357. subpd %xmm3, %xmm2
  358. movapd %xmm0, 4 * SIZE(X)
  359. movlpd %xmm2, 4 * SIZE(Y)
  360. movhpd %xmm2, 5 * SIZE(Y)
  361. movapd 7 * SIZE(Y), %xmm1
  362. movapd 6 * SIZE(X), %xmm0
  363. SHUFPD_1 %xmm1, %xmm4
  364. movapd %xmm4, %xmm2
  365. movapd %xmm0, %xmm3
  366. mulpd C, %xmm0
  367. mulpd S, %xmm4
  368. mulpd C, %xmm2
  369. mulpd S, %xmm3
  370. addpd %xmm4, %xmm0
  371. subpd %xmm3, %xmm2
  372. movapd %xmm0, 6 * SIZE(X)
  373. movlpd %xmm2, 6 * SIZE(Y)
  374. movhpd %xmm2, 7 * SIZE(Y)
  375. #ifdef PREFETCHW
  376. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  377. #endif
  378. movapd 9 * SIZE(Y), %xmm4
  379. movapd 8 * SIZE(X), %xmm0
  380. SHUFPD_1 %xmm4, %xmm1
  381. movapd %xmm1, %xmm2
  382. movapd %xmm0, %xmm3
  383. mulpd C, %xmm0
  384. mulpd S, %xmm1
  385. mulpd C, %xmm2
  386. mulpd S, %xmm3
  387. addpd %xmm1, %xmm0
  388. subpd %xmm3, %xmm2
  389. movapd %xmm0, 8 * SIZE(X)
  390. movlpd %xmm2, 8 * SIZE(Y)
  391. movhpd %xmm2, 9 * SIZE(Y)
  392. movapd 11 * SIZE(Y), %xmm1
  393. movapd 10 * SIZE(X), %xmm0
  394. SHUFPD_1 %xmm1, %xmm4
  395. movapd %xmm4, %xmm2
  396. movapd %xmm0, %xmm3
  397. mulpd C, %xmm0
  398. mulpd S, %xmm4
  399. mulpd C, %xmm2
  400. mulpd S, %xmm3
  401. addpd %xmm4, %xmm0
  402. subpd %xmm3, %xmm2
  403. movapd %xmm0, 10 * SIZE(X)
  404. movlpd %xmm2, 10 * SIZE(Y)
  405. movhpd %xmm2, 11 * SIZE(Y)
  406. #ifdef PREFETCHW
  407. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  408. #endif
  409. movapd 13 * SIZE(Y), %xmm4
  410. movapd 12 * SIZE(X), %xmm0
  411. SHUFPD_1 %xmm4, %xmm1
  412. movapd %xmm1, %xmm2
  413. movapd %xmm0, %xmm3
  414. mulpd C, %xmm0
  415. mulpd S, %xmm1
  416. mulpd C, %xmm2
  417. mulpd S, %xmm3
  418. addpd %xmm1, %xmm0
  419. subpd %xmm3, %xmm2
  420. movapd %xmm0, 12 * SIZE(X)
  421. movlpd %xmm2, 12 * SIZE(Y)
  422. movhpd %xmm2, 13 * SIZE(Y)
  423. movapd 15 * SIZE(Y), %xmm1
  424. movapd 14 * SIZE(X), %xmm0
  425. SHUFPD_1 %xmm1, %xmm4
  426. movapd %xmm4, %xmm2
  427. movapd %xmm0, %xmm3
  428. mulpd C, %xmm0
  429. mulpd S, %xmm4
  430. mulpd C, %xmm2
  431. mulpd S, %xmm3
  432. addpd %xmm4, %xmm0
  433. subpd %xmm3, %xmm2
  434. movapd %xmm0, 14 * SIZE(X)
  435. movlpd %xmm2, 14 * SIZE(Y)
  436. movhpd %xmm2, 15 * SIZE(Y)
  437. addl $16 * SIZE, X
  438. addl $16 * SIZE, Y
  439. decl I
  440. jg .L21
  441. ALIGN_3
  442. .L24:
  443. testl $7, N
  444. jle .L999
  445. testl $4, N
  446. jle .L25
  447. movapd 1 * SIZE(Y), %xmm4
  448. movapd 0 * SIZE(X), %xmm0
  449. SHUFPD_1 %xmm4, %xmm1
  450. movapd %xmm1, %xmm2
  451. movapd %xmm0, %xmm3
  452. mulpd C, %xmm0
  453. mulpd S, %xmm1
  454. mulpd C, %xmm2
  455. mulpd S, %xmm3
  456. addpd %xmm1, %xmm0
  457. subpd %xmm3, %xmm2
  458. movapd %xmm0, 0 * SIZE(X)
  459. movlpd %xmm2, 0 * SIZE(Y)
  460. movhpd %xmm2, 1 * SIZE(Y)
  461. movapd 3 * SIZE(Y), %xmm1
  462. movapd 2 * SIZE(X), %xmm0
  463. SHUFPD_1 %xmm1, %xmm4
  464. movapd %xmm4, %xmm2
  465. movapd %xmm0, %xmm3
  466. mulpd C, %xmm0
  467. mulpd S, %xmm4
  468. mulpd C, %xmm2
  469. mulpd S, %xmm3
  470. addpd %xmm4, %xmm0
  471. subpd %xmm3, %xmm2
  472. movapd %xmm0, 2 * SIZE(X)
  473. movlpd %xmm2, 2 * SIZE(Y)
  474. movhpd %xmm2, 3 * SIZE(Y)
  475. movapd 5 * SIZE(Y), %xmm4
  476. movapd 4 * SIZE(X), %xmm0
  477. SHUFPD_1 %xmm4, %xmm1
  478. movapd %xmm1, %xmm2
  479. movapd %xmm0, %xmm3
  480. mulpd C, %xmm0
  481. mulpd S, %xmm1
  482. mulpd C, %xmm2
  483. mulpd S, %xmm3
  484. addpd %xmm1, %xmm0
  485. subpd %xmm3, %xmm2
  486. movapd %xmm0, 4 * SIZE(X)
  487. movlpd %xmm2, 4 * SIZE(Y)
  488. movhpd %xmm2, 5 * SIZE(Y)
  489. movapd 7 * SIZE(Y), %xmm1
  490. movapd 6 * SIZE(X), %xmm0
  491. SHUFPD_1 %xmm1, %xmm4
  492. movapd %xmm4, %xmm2
  493. movapd %xmm0, %xmm3
  494. mulpd C, %xmm0
  495. mulpd S, %xmm4
  496. mulpd C, %xmm2
  497. mulpd S, %xmm3
  498. addpd %xmm4, %xmm0
  499. subpd %xmm3, %xmm2
  500. movapd %xmm0, 6 * SIZE(X)
  501. movlpd %xmm2, 6 * SIZE(Y)
  502. movhpd %xmm2, 7 * SIZE(Y)
  503. addl $8 * SIZE, X
  504. addl $8 * SIZE, Y
  505. ALIGN_3
  506. .L25:
  507. testl $2, N
  508. jle .L26
  509. movapd 1 * SIZE(Y), %xmm4
  510. movapd 0 * SIZE(X), %xmm0
  511. SHUFPD_1 %xmm4, %xmm1
  512. movapd %xmm1, %xmm2
  513. movapd %xmm0, %xmm3
  514. mulpd C, %xmm0
  515. mulpd S, %xmm1
  516. mulpd C, %xmm2
  517. mulpd S, %xmm3
  518. addpd %xmm1, %xmm0
  519. subpd %xmm3, %xmm2
  520. movapd %xmm0, 0 * SIZE(X)
  521. movlpd %xmm2, 0 * SIZE(Y)
  522. movhpd %xmm2, 1 * SIZE(Y)
  523. movapd 3 * SIZE(Y), %xmm1
  524. movapd 2 * SIZE(X), %xmm0
  525. SHUFPD_1 %xmm1, %xmm4
  526. movapd %xmm4, %xmm2
  527. movapd %xmm0, %xmm3
  528. mulpd C, %xmm0
  529. mulpd S, %xmm4
  530. mulpd C, %xmm2
  531. mulpd S, %xmm3
  532. addpd %xmm4, %xmm0
  533. subpd %xmm3, %xmm2
  534. movapd %xmm0, 2 * SIZE(X)
  535. movlpd %xmm2, 2 * SIZE(Y)
  536. movhpd %xmm2, 3 * SIZE(Y)
  537. addl $4 * SIZE, X
  538. addl $4 * SIZE, Y
  539. ALIGN_3
  540. .L26:
  541. testl $1, N
  542. jle .L999
  543. movapd 1 * SIZE(Y), %xmm4
  544. movapd 0 * SIZE(X), %xmm0
  545. SHUFPD_1 %xmm4, %xmm1
  546. movapd %xmm1, %xmm2
  547. movapd %xmm0, %xmm3
  548. mulpd C, %xmm0
  549. mulpd S, %xmm1
  550. mulpd C, %xmm2
  551. mulpd S, %xmm3
  552. addpd %xmm1, %xmm0
  553. subpd %xmm3, %xmm2
  554. movapd %xmm0, 0 * SIZE(X)
  555. movlpd %xmm2, 0 * SIZE(Y)
  556. movhpd %xmm2, 1 * SIZE(Y)
  557. jmp .L999
  558. ALIGN_3
  559. .L30:
  560. testl $SIZE, Y
  561. jne .L40
  562. movapd -1 * SIZE(X), %xmm0
  563. movl N, I
  564. sarl $3, I
  565. jle .L34
  566. ALIGN_3
  567. .L31:
  568. #ifdef PREFETCHW
  569. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  570. #endif
  571. movapd 1 * SIZE(X), %xmm4
  572. movapd 0 * SIZE(Y), %xmm1
  573. SHUFPD_1 %xmm4, %xmm0
  574. movapd %xmm1, %xmm2
  575. movapd %xmm0, %xmm3
  576. mulpd C, %xmm0
  577. mulpd S, %xmm1
  578. mulpd C, %xmm2
  579. mulpd S, %xmm3
  580. addpd %xmm1, %xmm0
  581. subpd %xmm3, %xmm2
  582. movlpd %xmm0, 0 * SIZE(X)
  583. movhpd %xmm0, 1 * SIZE(X)
  584. movapd %xmm2, 0 * SIZE(Y)
  585. movapd 3 * SIZE(X), %xmm0
  586. movapd 2 * SIZE(Y), %xmm1
  587. SHUFPD_1 %xmm0, %xmm4
  588. movapd %xmm1, %xmm2
  589. movapd %xmm4, %xmm3
  590. mulpd C, %xmm4
  591. mulpd S, %xmm1
  592. mulpd C, %xmm2
  593. mulpd S, %xmm3
  594. addpd %xmm1, %xmm4
  595. subpd %xmm3, %xmm2
  596. movlpd %xmm4, 2 * SIZE(X)
  597. movhpd %xmm4, 3 * SIZE(X)
  598. movapd %xmm2, 2 * SIZE(Y)
  599. #ifdef PREFETCHW
  600. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  601. #endif
  602. movapd 5 * SIZE(X), %xmm4
  603. movapd 4 * SIZE(Y), %xmm1
  604. SHUFPD_1 %xmm4, %xmm0
  605. movapd %xmm1, %xmm2
  606. movapd %xmm0, %xmm3
  607. mulpd C, %xmm0
  608. mulpd S, %xmm1
  609. mulpd C, %xmm2
  610. mulpd S, %xmm3
  611. addpd %xmm1, %xmm0
  612. subpd %xmm3, %xmm2
  613. movlpd %xmm0, 4 * SIZE(X)
  614. movhpd %xmm0, 5 * SIZE(X)
  615. movapd %xmm2, 4 * SIZE(Y)
  616. movapd 7 * SIZE(X), %xmm0
  617. movapd 6 * SIZE(Y), %xmm1
  618. SHUFPD_1 %xmm0, %xmm4
  619. movapd %xmm1, %xmm2
  620. movapd %xmm4, %xmm3
  621. mulpd C, %xmm4
  622. mulpd S, %xmm1
  623. mulpd C, %xmm2
  624. mulpd S, %xmm3
  625. addpd %xmm1, %xmm4
  626. subpd %xmm3, %xmm2
  627. movlpd %xmm4, 6 * SIZE(X)
  628. movhpd %xmm4, 7 * SIZE(X)
  629. movapd %xmm2, 6 * SIZE(Y)
  630. #ifdef PREFETCHW
  631. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  632. #endif
  633. movapd 9 * SIZE(X), %xmm4
  634. movapd 8 * SIZE(Y), %xmm1
  635. SHUFPD_1 %xmm4, %xmm0
  636. movapd %xmm1, %xmm2
  637. movapd %xmm0, %xmm3
  638. mulpd C, %xmm0
  639. mulpd S, %xmm1
  640. mulpd C, %xmm2
  641. mulpd S, %xmm3
  642. addpd %xmm1, %xmm0
  643. subpd %xmm3, %xmm2
  644. movlpd %xmm0, 8 * SIZE(X)
  645. movhpd %xmm0, 9 * SIZE(X)
  646. movapd %xmm2, 8 * SIZE(Y)
  647. movapd 11 * SIZE(X), %xmm0
  648. movapd 10 * SIZE(Y), %xmm1
  649. SHUFPD_1 %xmm0, %xmm4
  650. movapd %xmm1, %xmm2
  651. movapd %xmm4, %xmm3
  652. mulpd C, %xmm4
  653. mulpd S, %xmm1
  654. mulpd C, %xmm2
  655. mulpd S, %xmm3
  656. addpd %xmm1, %xmm4
  657. subpd %xmm3, %xmm2
  658. movlpd %xmm4, 10 * SIZE(X)
  659. movhpd %xmm4, 11 * SIZE(X)
  660. movapd %xmm2, 10 * SIZE(Y)
  661. #ifdef PREFETCHW
  662. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  663. #endif
  664. movapd 13 * SIZE(X), %xmm4
  665. movapd 12 * SIZE(Y), %xmm1
  666. SHUFPD_1 %xmm4, %xmm0
  667. movapd %xmm1, %xmm2
  668. movapd %xmm0, %xmm3
  669. mulpd C, %xmm0
  670. mulpd S, %xmm1
  671. mulpd C, %xmm2
  672. mulpd S, %xmm3
  673. addpd %xmm1, %xmm0
  674. subpd %xmm3, %xmm2
  675. movlpd %xmm0, 12 * SIZE(X)
  676. movhpd %xmm0, 13 * SIZE(X)
  677. movapd %xmm2, 12 * SIZE(Y)
  678. movapd 15 * SIZE(X), %xmm0
  679. movapd 14 * SIZE(Y), %xmm1
  680. SHUFPD_1 %xmm0, %xmm4
  681. movapd %xmm1, %xmm2
  682. movapd %xmm4, %xmm3
  683. mulpd C, %xmm4
  684. mulpd S, %xmm1
  685. mulpd C, %xmm2
  686. mulpd S, %xmm3
  687. addpd %xmm1, %xmm4
  688. subpd %xmm3, %xmm2
  689. movlpd %xmm4, 14 * SIZE(X)
  690. movhpd %xmm4, 15 * SIZE(X)
  691. movapd %xmm2, 14 * SIZE(Y)
  692. addl $16 * SIZE, Y
  693. addl $16 * SIZE, X
  694. decl I
  695. jg .L31
  696. ALIGN_3
  697. .L34:
  698. testl $7, N
  699. jle .L999
  700. testl $4, N
  701. jle .L35
  702. movapd 1 * SIZE(X), %xmm4
  703. movapd 0 * SIZE(Y), %xmm1
  704. SHUFPD_1 %xmm4, %xmm0
  705. movapd %xmm1, %xmm2
  706. movapd %xmm0, %xmm3
  707. mulpd C, %xmm0
  708. mulpd S, %xmm1
  709. mulpd C, %xmm2
  710. mulpd S, %xmm3
  711. addpd %xmm1, %xmm0
  712. subpd %xmm3, %xmm2
  713. movlpd %xmm0, 0 * SIZE(X)
  714. movhpd %xmm0, 1 * SIZE(X)
  715. movapd %xmm2, 0 * SIZE(Y)
  716. movapd 3 * SIZE(X), %xmm0
  717. movapd 2 * SIZE(Y), %xmm1
  718. SHUFPD_1 %xmm0, %xmm4
  719. movapd %xmm1, %xmm2
  720. movapd %xmm4, %xmm3
  721. mulpd C, %xmm4
  722. mulpd S, %xmm1
  723. mulpd C, %xmm2
  724. mulpd S, %xmm3
  725. addpd %xmm1, %xmm4
  726. subpd %xmm3, %xmm2
  727. movlpd %xmm4, 2 * SIZE(X)
  728. movhpd %xmm4, 3 * SIZE(X)
  729. movapd %xmm2, 2 * SIZE(Y)
  730. movapd 5 * SIZE(X), %xmm4
  731. movapd 4 * SIZE(Y), %xmm1
  732. SHUFPD_1 %xmm4, %xmm0
  733. movapd %xmm1, %xmm2
  734. movapd %xmm0, %xmm3
  735. mulpd C, %xmm0
  736. mulpd S, %xmm1
  737. mulpd C, %xmm2
  738. mulpd S, %xmm3
  739. addpd %xmm1, %xmm0
  740. subpd %xmm3, %xmm2
  741. movlpd %xmm0, 4 * SIZE(X)
  742. movhpd %xmm0, 5 * SIZE(X)
  743. movapd %xmm2, 4 * SIZE(Y)
  744. movapd 7 * SIZE(X), %xmm0
  745. movapd 6 * SIZE(Y), %xmm1
  746. SHUFPD_1 %xmm0, %xmm4
  747. movapd %xmm1, %xmm2
  748. movapd %xmm4, %xmm3
  749. mulpd C, %xmm4
  750. mulpd S, %xmm1
  751. mulpd C, %xmm2
  752. mulpd S, %xmm3
  753. addpd %xmm1, %xmm4
  754. subpd %xmm3, %xmm2
  755. movlpd %xmm4, 6 * SIZE(X)
  756. movhpd %xmm4, 7 * SIZE(X)
  757. movapd %xmm2, 6 * SIZE(Y)
  758. addl $8 * SIZE, Y
  759. addl $8 * SIZE, X
  760. ALIGN_3
  761. .L35:
  762. testl $2, N
  763. jle .L36
  764. movapd 1 * SIZE(X), %xmm4
  765. movapd 0 * SIZE(Y), %xmm1
  766. SHUFPD_1 %xmm4, %xmm0
  767. movapd %xmm1, %xmm2
  768. movapd %xmm0, %xmm3
  769. mulpd C, %xmm0
  770. mulpd S, %xmm1
  771. mulpd C, %xmm2
  772. mulpd S, %xmm3
  773. addpd %xmm1, %xmm0
  774. subpd %xmm3, %xmm2
  775. movlpd %xmm0, 0 * SIZE(X)
  776. movhpd %xmm0, 1 * SIZE(X)
  777. movapd %xmm2, 0 * SIZE(Y)
  778. movapd 3 * SIZE(X), %xmm0
  779. movapd 2 * SIZE(Y), %xmm1
  780. SHUFPD_1 %xmm0, %xmm4
  781. movapd %xmm1, %xmm2
  782. movapd %xmm4, %xmm3
  783. mulpd C, %xmm4
  784. mulpd S, %xmm1
  785. mulpd C, %xmm2
  786. mulpd S, %xmm3
  787. addpd %xmm1, %xmm4
  788. subpd %xmm3, %xmm2
  789. movlpd %xmm4, 2 * SIZE(X)
  790. movhpd %xmm4, 3 * SIZE(X)
  791. movapd %xmm2, 2 * SIZE(Y)
  792. addl $4 * SIZE, Y
  793. addl $4 * SIZE, X
  794. ALIGN_3
  795. .L36:
  796. testl $1, N
  797. jle .L999
  798. movapd 1 * SIZE(X), %xmm4
  799. movapd 0 * SIZE(Y), %xmm1
  800. SHUFPD_1 %xmm4, %xmm0
  801. movapd %xmm1, %xmm2
  802. movapd %xmm0, %xmm3
  803. mulpd C, %xmm0
  804. mulpd S, %xmm1
  805. mulpd C, %xmm2
  806. mulpd S, %xmm3
  807. addpd %xmm1, %xmm0
  808. subpd %xmm3, %xmm2
  809. movlpd %xmm0, 0 * SIZE(X)
  810. movhpd %xmm0, 1 * SIZE(X)
  811. movapd %xmm2, 0 * SIZE(Y)
  812. jmp .L999
  813. ALIGN_3
  814. .L40:
  815. movsd 0 * SIZE(Y), %xmm1
  816. movsd 0 * SIZE(X), %xmm0
  817. movapd %xmm1, %xmm2
  818. movapd %xmm0, %xmm3
  819. mulsd C, %xmm0
  820. mulsd S, %xmm1
  821. mulsd C, %xmm2
  822. mulsd S, %xmm3
  823. addsd %xmm1, %xmm0
  824. subsd %xmm3, %xmm2
  825. movsd %xmm0, 0 * SIZE(X)
  826. movsd %xmm2, 0 * SIZE(Y)
  827. addl $1 * SIZE, Y
  828. addl $1 * SIZE, X
  829. decl N
  830. jle .L47
  831. movl N, I
  832. sarl $3, I
  833. jle .L44
  834. ALIGN_3
  835. .L41:
  836. #ifdef PREFETCHW
  837. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  838. #endif
  839. movapd 0 * SIZE(Y), %xmm1
  840. movapd 0 * SIZE(X), %xmm0
  841. movapd %xmm1, %xmm2
  842. movapd %xmm0, %xmm3
  843. mulpd C, %xmm0
  844. mulpd S, %xmm1
  845. mulpd C, %xmm2
  846. mulpd S, %xmm3
  847. addpd %xmm1, %xmm0
  848. subpd %xmm3, %xmm2
  849. movapd %xmm0, 0 * SIZE(X)
  850. movapd %xmm2, 0 * SIZE(Y)
  851. movapd 2 * SIZE(Y), %xmm1
  852. movapd 2 * SIZE(X), %xmm0
  853. movapd %xmm1, %xmm2
  854. movapd %xmm0, %xmm3
  855. mulpd C, %xmm0
  856. mulpd S, %xmm1
  857. mulpd C, %xmm2
  858. mulpd S, %xmm3
  859. addpd %xmm1, %xmm0
  860. subpd %xmm3, %xmm2
  861. movapd %xmm0, 2 * SIZE(X)
  862. movapd %xmm2, 2 * SIZE(Y)
  863. #ifdef PREFETCHW
  864. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  865. #endif
  866. movapd 4 * SIZE(Y), %xmm1
  867. movapd 4 * SIZE(X), %xmm0
  868. movapd %xmm1, %xmm2
  869. movapd %xmm0, %xmm3
  870. mulpd C, %xmm0
  871. mulpd S, %xmm1
  872. mulpd C, %xmm2
  873. mulpd S, %xmm3
  874. addpd %xmm1, %xmm0
  875. subpd %xmm3, %xmm2
  876. movapd %xmm0, 4 * SIZE(X)
  877. movapd %xmm2, 4 * SIZE(Y)
  878. movapd 6 * SIZE(Y), %xmm1
  879. movapd 6 * SIZE(X), %xmm0
  880. movapd %xmm1, %xmm2
  881. movapd %xmm0, %xmm3
  882. mulpd C, %xmm0
  883. mulpd S, %xmm1
  884. mulpd C, %xmm2
  885. mulpd S, %xmm3
  886. addpd %xmm1, %xmm0
  887. subpd %xmm3, %xmm2
  888. movapd %xmm0, 6 * SIZE(X)
  889. movapd %xmm2, 6 * SIZE(Y)
  890. #ifdef PREFETCHW
  891. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  892. #endif
  893. movapd 8 * SIZE(Y), %xmm1
  894. movapd 8 * SIZE(X), %xmm0
  895. movapd %xmm1, %xmm2
  896. movapd %xmm0, %xmm3
  897. mulpd C, %xmm0
  898. mulpd S, %xmm1
  899. mulpd C, %xmm2
  900. mulpd S, %xmm3
  901. addpd %xmm1, %xmm0
  902. subpd %xmm3, %xmm2
  903. movapd %xmm0, 8 * SIZE(X)
  904. movapd %xmm2, 8 * SIZE(Y)
  905. movapd 10 * SIZE(Y), %xmm1
  906. movapd 10 * SIZE(X), %xmm0
  907. movapd %xmm1, %xmm2
  908. movapd %xmm0, %xmm3
  909. mulpd C, %xmm0
  910. mulpd S, %xmm1
  911. mulpd C, %xmm2
  912. mulpd S, %xmm3
  913. addpd %xmm1, %xmm0
  914. subpd %xmm3, %xmm2
  915. movapd %xmm0, 10 * SIZE(X)
  916. movapd %xmm2, 10 * SIZE(Y)
  917. #ifdef PREFETCHW
  918. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  919. #endif
  920. movapd 12 * SIZE(Y), %xmm1
  921. movapd 12 * SIZE(X), %xmm0
  922. movapd %xmm1, %xmm2
  923. movapd %xmm0, %xmm3
  924. mulpd C, %xmm0
  925. mulpd S, %xmm1
  926. mulpd C, %xmm2
  927. mulpd S, %xmm3
  928. addpd %xmm1, %xmm0
  929. subpd %xmm3, %xmm2
  930. movapd %xmm0, 12 * SIZE(X)
  931. movapd %xmm2, 12 * SIZE(Y)
  932. movapd 14 * SIZE(Y), %xmm1
  933. movapd 14 * SIZE(X), %xmm0
  934. movapd %xmm1, %xmm2
  935. movapd %xmm0, %xmm3
  936. mulpd C, %xmm0
  937. mulpd S, %xmm1
  938. mulpd C, %xmm2
  939. mulpd S, %xmm3
  940. addpd %xmm1, %xmm0
  941. subpd %xmm3, %xmm2
  942. movapd %xmm0, 14 * SIZE(X)
  943. movapd %xmm2, 14 * SIZE(Y)
  944. addl $16 * SIZE, X
  945. addl $16 * SIZE, Y
  946. decl I
  947. jg .L41
  /*
   * .L44: tail step executed when bit 2 of N is set (N & 4).
   * Rotates four packed pairs of doubles (offsets 0, 2, 4, 6 * SIZE) in place
   * and then advances X and Y by 8 * SIZE.
   *
   * Every group below follows the same pattern:
   *   xmm1 = y, xmm0 = x, xmm2 = copy of y, xmm3 = copy of x
   *   xmm0 = C * x + S * y   -> stored back to X
   *   xmm2 = C * y - S * x   -> stored back to Y
   *
   * C and S are defined outside this chunk; presumably registers holding the
   * rotation cosine and sine replicated in both lanes -- TODO confirm.
   * movapd faults on addresses that are not 16-byte aligned, so this path
   * depends on X and Y being aligned here (established before this chunk).
   */
  948. ALIGN_3
  949. .L44:
  /* testl clears OF and, with mask 4, SF; jle is therefore taken exactly
     when the tested bit is clear. */
  950. testl $4, N
  951. jle .L45
  /* pair at offset 0 */
  952. movapd 0 * SIZE(Y), %xmm1
  953. movapd 0 * SIZE(X), %xmm0
  954. movapd %xmm1, %xmm2
  955. movapd %xmm0, %xmm3
  956. mulpd C, %xmm0
  957. mulpd S, %xmm1
  958. mulpd C, %xmm2
  959. mulpd S, %xmm3
  960. addpd %xmm1, %xmm0
  961. subpd %xmm3, %xmm2
  962. movapd %xmm0, 0 * SIZE(X)
  963. movapd %xmm2, 0 * SIZE(Y)
  /* pair at offset 2 */
  964. movapd 2 * SIZE(Y), %xmm1
  965. movapd 2 * SIZE(X), %xmm0
  966. movapd %xmm1, %xmm2
  967. movapd %xmm0, %xmm3
  968. mulpd C, %xmm0
  969. mulpd S, %xmm1
  970. mulpd C, %xmm2
  971. mulpd S, %xmm3
  972. addpd %xmm1, %xmm0
  973. subpd %xmm3, %xmm2
  974. movapd %xmm0, 2 * SIZE(X)
  975. movapd %xmm2, 2 * SIZE(Y)
  /* pair at offset 4 */
  976. movapd 4 * SIZE(Y), %xmm1
  977. movapd 4 * SIZE(X), %xmm0
  978. movapd %xmm1, %xmm2
  979. movapd %xmm0, %xmm3
  980. mulpd C, %xmm0
  981. mulpd S, %xmm1
  982. mulpd C, %xmm2
  983. mulpd S, %xmm3
  984. addpd %xmm1, %xmm0
  985. subpd %xmm3, %xmm2
  986. movapd %xmm0, 4 * SIZE(X)
  987. movapd %xmm2, 4 * SIZE(Y)
  /* pair at offset 6 */
  988. movapd 6 * SIZE(Y), %xmm1
  989. movapd 6 * SIZE(X), %xmm0
  990. movapd %xmm1, %xmm2
  991. movapd %xmm0, %xmm3
  992. mulpd C, %xmm0
  993. mulpd S, %xmm1
  994. mulpd C, %xmm2
  995. mulpd S, %xmm3
  996. addpd %xmm1, %xmm0
  997. subpd %xmm3, %xmm2
  998. movapd %xmm0, 6 * SIZE(X)
  999. movapd %xmm2, 6 * SIZE(Y)
  /* step both pointers past the eight doubles just processed */
  1000. addl $8 * SIZE, X
  1001. addl $8 * SIZE, Y
  /*
   * .L45: tail step executed when bit 1 of N is set (N & 2).
   * Rotates two packed pairs of doubles (offsets 0 and 2 * SIZE) in place:
   *   X <- C * x + S * y,   Y <- C * y - S * x
   * then advances X and Y by 4 * SIZE.
   * Uses movapd, so X and Y must be 16-byte aligned at this point.
   */
  1002. ALIGN_3
  1003. .L45:
  /* test result is 0 or 2, so jle behaves as "jump if the bit is clear" */
  1004. testl $2, N
  1005. jle .L46
  /* pair at offset 0 */
  1006. movapd 0 * SIZE(Y), %xmm1
  1007. movapd 0 * SIZE(X), %xmm0
  1008. movapd %xmm1, %xmm2
  1009. movapd %xmm0, %xmm3
  1010. mulpd C, %xmm0
  1011. mulpd S, %xmm1
  1012. mulpd C, %xmm2
  1013. mulpd S, %xmm3
  1014. addpd %xmm1, %xmm0
  1015. subpd %xmm3, %xmm2
  1016. movapd %xmm0, 0 * SIZE(X)
  1017. movapd %xmm2, 0 * SIZE(Y)
  /* pair at offset 2 */
  1018. movapd 2 * SIZE(Y), %xmm1
  1019. movapd 2 * SIZE(X), %xmm0
  1020. movapd %xmm1, %xmm2
  1021. movapd %xmm0, %xmm3
  1022. mulpd C, %xmm0
  1023. mulpd S, %xmm1
  1024. mulpd C, %xmm2
  1025. mulpd S, %xmm3
  1026. addpd %xmm1, %xmm0
  1027. subpd %xmm3, %xmm2
  1028. movapd %xmm0, 2 * SIZE(X)
  1029. movapd %xmm2, 2 * SIZE(Y)
  /* step both pointers past the four doubles just processed */
  1030. addl $4 * SIZE, X
  1031. addl $4 * SIZE, Y
  /*
   * .L46: tail step executed when bit 0 of N is set (N & 1).
   * Rotates one packed pair of doubles at offset 0 in place:
   *   X <- C * x + S * y,   Y <- C * y - S * x
   * then advances X and Y by 2 * SIZE.
   * Whether or not the branch is taken, control continues at .L47.
   */
  1032. ALIGN_3
  1033. .L46:
  /* test result is 0 or 1, so jle behaves as "jump if the bit is clear" */
  1034. testl $1, N
  1035. jle .L47
  1036. movapd 0 * SIZE(Y), %xmm1
  1037. movapd 0 * SIZE(X), %xmm0
  1038. movapd %xmm1, %xmm2
  1039. movapd %xmm0, %xmm3
  1040. mulpd C, %xmm0
  1041. mulpd S, %xmm1
  1042. mulpd C, %xmm2
  1043. mulpd S, %xmm3
  1044. addpd %xmm1, %xmm0
  1045. subpd %xmm3, %xmm2
  1046. movapd %xmm0, 0 * SIZE(X)
  1047. movapd %xmm2, 0 * SIZE(Y)
  /* step both pointers past the two doubles just processed */
  1048. addl $2 * SIZE, Y
  1049. addl $2 * SIZE, X
  /*
   * .L47: rotates a single double from X and Y using scalar SSE2 operations
   * (low lane only), then jumps to the common exit at .L999.
   *   X[0] <- C * x + S * y,   Y[0] <- C * y - S * x
   * Reached both by the branch in .L46 and by falling through from it, so it
   * always runs on this path.
   * NOTE(review): presumably this is the leftover half of the last complex
   * element on a path where the first half was handled separately; that setup
   * is outside this chunk -- confirm.
   */
  1050. ALIGN_3
  1051. .L47:
  /* movsd from memory loads one double into the low lane */
  1052. movsd 0 * SIZE(Y), %xmm1
  1053. movsd 0 * SIZE(X), %xmm0
  1054. movapd %xmm1, %xmm2
  1055. movapd %xmm0, %xmm3
  1056. mulsd C, %xmm0
  1057. mulsd S, %xmm1
  1058. mulsd C, %xmm2
  1059. mulsd S, %xmm3
  1060. addsd %xmm1, %xmm0
  1061. subsd %xmm3, %xmm2
  /* write back only the low lane */
  1062. movsd %xmm0, 0 * SIZE(X)
  1063. movsd %xmm2, 0 * SIZE(Y)
  1064. jmp .L999
  /*
   * .L50 / .L53: strided loop, unrolled four times.
   * The instruction before .L50 is an unconditional jump, so this section is
   * only entered by a branch from code outside this chunk (presumably the
   * non-unit-stride case -- TODO confirm).
   *
   * I = N >> 2 iterations. Each of the four groups per iteration:
   *   - gathers the doubles at 0 * SIZE and 1 * SIZE of Y and of X into the
   *     low/high lanes with movsd + movhpd (no alignment requirement);
   *     presumably the real and imaginary parts of one complex element,
   *     given the file name zrot_sse2.S;
   *   - computes X <- C * x + S * y and Y <- C * y - S * x;
   *   - scatters the results back with movlpd + movhpd;
   *   - adds INCX to X and INCY to Y (added directly to the pointers, so the
   *     increments are already in address units here).
   */
  1065. ALIGN_3
  1066. .L50:
  1067. movl N, I
  1068. sarl $2, I
  /* Skip the unrolled loop when there are fewer than four elements.
     NOTE(review): jle also consults OF, which the x86 manuals leave undefined
     after a shift by more than one bit -- confirm this is acceptable on all
     target CPUs. */
  1069. jle .L55
  1070. ALIGN_3
  1071. .L53:
  /* element 1 of 4 */
  1072. movsd 0 * SIZE(Y), %xmm1
  1073. movhpd 1 * SIZE(Y), %xmm1
  1074. movsd 0 * SIZE(X), %xmm0
  1075. movhpd 1 * SIZE(X), %xmm0
  1076. movapd %xmm1, %xmm2
  1077. movapd %xmm0, %xmm3
  1078. mulpd C, %xmm0
  1079. mulpd S, %xmm1
  1080. mulpd C, %xmm2
  1081. mulpd S, %xmm3
  1082. addpd %xmm1, %xmm0
  1083. subpd %xmm3, %xmm2
  1084. movlpd %xmm0, 0 * SIZE(X)
  1085. movhpd %xmm0, 1 * SIZE(X)
  1086. movlpd %xmm2, 0 * SIZE(Y)
  1087. movhpd %xmm2, 1 * SIZE(Y)
  1088. addl INCX, X
  1089. addl INCY, Y
  /* element 2 of 4 */
  1090. movsd 0 * SIZE(Y), %xmm1
  1091. movhpd 1 * SIZE(Y), %xmm1
  1092. movsd 0 * SIZE(X), %xmm0
  1093. movhpd 1 * SIZE(X), %xmm0
  1094. movapd %xmm1, %xmm2
  1095. movapd %xmm0, %xmm3
  1096. mulpd C, %xmm0
  1097. mulpd S, %xmm1
  1098. mulpd C, %xmm2
  1099. mulpd S, %xmm3
  1100. addpd %xmm1, %xmm0
  1101. subpd %xmm3, %xmm2
  1102. movlpd %xmm0, 0 * SIZE(X)
  1103. movhpd %xmm0, 1 * SIZE(X)
  1104. movlpd %xmm2, 0 * SIZE(Y)
  1105. movhpd %xmm2, 1 * SIZE(Y)
  1106. addl INCX, X
  1107. addl INCY, Y
  /* element 3 of 4 */
  1108. movsd 0 * SIZE(Y), %xmm1
  1109. movhpd 1 * SIZE(Y), %xmm1
  1110. movsd 0 * SIZE(X), %xmm0
  1111. movhpd 1 * SIZE(X), %xmm0
  1112. movapd %xmm1, %xmm2
  1113. movapd %xmm0, %xmm3
  1114. mulpd C, %xmm0
  1115. mulpd S, %xmm1
  1116. mulpd C, %xmm2
  1117. mulpd S, %xmm3
  1118. addpd %xmm1, %xmm0
  1119. subpd %xmm3, %xmm2
  1120. movlpd %xmm0, 0 * SIZE(X)
  1121. movhpd %xmm0, 1 * SIZE(X)
  1122. movlpd %xmm2, 0 * SIZE(Y)
  1123. movhpd %xmm2, 1 * SIZE(Y)
  1124. addl INCX, X
  1125. addl INCY, Y
  /* element 4 of 4 */
  1126. movsd 0 * SIZE(Y), %xmm1
  1127. movhpd 1 * SIZE(Y), %xmm1
  1128. movsd 0 * SIZE(X), %xmm0
  1129. movhpd 1 * SIZE(X), %xmm0
  1130. movapd %xmm1, %xmm2
  1131. movapd %xmm0, %xmm3
  1132. mulpd C, %xmm0
  1133. mulpd S, %xmm1
  1134. mulpd C, %xmm2
  1135. mulpd S, %xmm3
  1136. addpd %xmm1, %xmm0
  1137. subpd %xmm3, %xmm2
  1138. movlpd %xmm0, 0 * SIZE(X)
  1139. movhpd %xmm0, 1 * SIZE(X)
  1140. movlpd %xmm2, 0 * SIZE(Y)
  1141. movhpd %xmm2, 1 * SIZE(Y)
  1142. addl INCX, X
  1143. addl INCY, Y
  /* loop while the iteration counter is still positive */
  1144. decl I
  1145. jg .L53
  /*
   * .L55 / .L56: strided remainder loop.
   * I = N & 3, i.e. the elements left over after the four-way unrolled loop
   * at .L53; exits to .L999 when that count is zero. Each iteration rotates
   * one two-double element, using the same gather / rotate / scatter sequence
   * as .L53, and then adds INCX to X and INCY to Y.
   */
  1146. ALIGN_3
  1147. .L55:
  1148. movl N, I
  /* andl clears OF and the masked result is non-negative, so jle is taken
     exactly when no elements remain */
  1149. andl $3, I
  1150. jle .L999
  1151. ALIGN_3
  1152. .L56:
  /* gather y into xmm1 and x into xmm0 (low lane, then high lane) */
  1153. movsd 0 * SIZE(Y), %xmm1
  1154. movhpd 1 * SIZE(Y), %xmm1
  1155. movsd 0 * SIZE(X), %xmm0
  1156. movhpd 1 * SIZE(X), %xmm0
  1157. movapd %xmm1, %xmm2
  1158. movapd %xmm0, %xmm3
  /* xmm0 = C * x + S * y,  xmm2 = C * y - S * x */
  1159. mulpd C, %xmm0
  1160. mulpd S, %xmm1
  1161. mulpd C, %xmm2
  1162. mulpd S, %xmm3
  1163. addpd %xmm1, %xmm0
  1164. subpd %xmm3, %xmm2
  /* scatter the results back to X and Y */
  1165. movlpd %xmm0, 0 * SIZE(X)
  1166. movhpd %xmm0, 1 * SIZE(X)
  1167. movlpd %xmm2, 0 * SIZE(Y)
  1168. movhpd %xmm2, 1 * SIZE(Y)
  1169. addl INCX, X
  1170. addl INCY, Y
  1171. decl I
  1172. jg .L56
  /*
   * .L999: common exit. Pops %ebx, %esi and %edi and returns to the caller.
   * The matching pushes are outside this chunk; presumably they occur in the
   * reverse order at function entry -- TODO confirm.
   * EPILOGUE is a macro defined elsewhere.
   */
  1173. ALIGN_3
  1174. .L999:
  1175. popl %ebx
  1176. popl %esi
  1177. popl %edi
  1178. ret
  1179. EPILOGUE