You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

zrot_sse.S 30 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define N ARG1 /* rdi; remaining element count: the routine jumps to .L999 when it is <= 0 and decrements it after the single-element alignment step */
  41. #define X ARG2 /* rsi; base address of the first vector, loaded from and stored back to in place */
  42. #define INCX ARG3 /* rdx; X increment, shifted left by ZBASE_SHIFT on entry; must equal 2 * SIZE for the contiguous paths, otherwise .L50 is taken */
  43. #define Y ARG4 /* rcx; base address of the second vector, loaded from and stored back to in place */
  44. #ifndef WINDOWS_ABI
  45. #define INCY ARG5 /* r8; Y increment, shifted left by ZBASE_SHIFT on entry. NOTE(review): the register names in these notes are the original annotations and presumably hold for the non-Windows ABI only -- confirm against the ARGn definitions in common.h */
  46. #else
  47. #define INCY %r10 /* under WINDOWS_ABI the Y increment is loaded from 40(%rsp) at the start of the routine */
  48. #endif
  49. #define C %xmm14 /* first rotation coefficient: dword 0 of %xmm0 copied into all four lanes by pshufd $0x0 */
  50. #define S %xmm15 /* second rotation coefficient: dword 0 of %xmm1 copied into all four lanes by pshufd $0x0 */
  51. #include "l1param.h"
  52. PROLOGUE
  53. PROFCODE
  54. #ifdef WINDOWS_ABI
  55. movq 40(%rsp), INCY
  56. movss 48(%rsp), %xmm0
  57. movss 56(%rsp), %xmm1
  58. #endif
  59. SAVEREGISTERS
  60. salq $ZBASE_SHIFT, INCX
  61. salq $ZBASE_SHIFT, INCY
  62. pshufd $0x0, %xmm0, C
  63. pshufd $0x0, %xmm1, S
  64. cmpq $0, N
  65. jle .L999
  66. cmpq $2 * SIZE, INCX
  67. jne .L50
  68. cmpq $2 * SIZE, INCY
  69. jne .L50
  70. testq $2 * SIZE, X
  71. je .L10
  72. movsd 0 * SIZE(Y), %xmm1
  73. movsd 0 * SIZE(X), %xmm0
  74. movaps %xmm1, %xmm2
  75. movaps %xmm0, %xmm3
  76. mulps C, %xmm0
  77. mulps S, %xmm1
  78. mulps C, %xmm2
  79. mulps S, %xmm3
  80. addps %xmm1, %xmm0
  81. subps %xmm3, %xmm2
  82. movlps %xmm0, 0 * SIZE(X)
  83. movlps %xmm2, 0 * SIZE(Y)
  84. addq $2 * SIZE, X
  85. addq $2 * SIZE, Y
  86. decq N
  87. jle .L999
  88. .L10:
  89. testq $1 * SIZE, X
  90. jne .L30
  91. testq $3 * SIZE, Y
  92. jne .L20
  93. movq N, %rax
  94. sarq $4, %rax
  95. jle .L14
  96. movaps 0 * SIZE(Y), %xmm1
  97. movaps 4 * SIZE(Y), %xmm3
  98. movaps 8 * SIZE(Y), %xmm9
  99. movaps 12 * SIZE(Y), %xmm11
  100. movaps 0 * SIZE(X), %xmm0
  101. movaps 4 * SIZE(X), %xmm2
  102. movaps 8 * SIZE(X), %xmm8
  103. movaps 12 * SIZE(X), %xmm10
  104. decq %rax
  105. jle .L12
  106. ALIGN_3
  107. .L11:
  108. #if defined(PREFETCHW)
  109. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  110. #endif
  111. movaps %xmm1, %xmm4
  112. mulps S, %xmm1
  113. movaps %xmm3, %xmm6
  114. mulps S, %xmm3
  115. movaps %xmm0, %xmm5
  116. mulps C, %xmm0
  117. movaps %xmm2, %xmm7
  118. mulps C, %xmm2
  119. mulps C, %xmm4
  120. mulps S, %xmm5
  121. mulps C, %xmm6
  122. mulps S, %xmm7
  123. addps %xmm1, %xmm0
  124. movaps 16 * SIZE(Y), %xmm1
  125. addps %xmm3, %xmm2
  126. movaps 20 * SIZE(Y), %xmm3
  127. subps %xmm5, %xmm4
  128. subps %xmm7, %xmm6
  129. #if defined(PREFETCHW)
  130. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  131. #endif
  132. movaps %xmm0, 0 * SIZE(X)
  133. movaps 16 * SIZE(X), %xmm0
  134. movaps %xmm2, 4 * SIZE(X)
  135. movaps 20 * SIZE(X), %xmm2
  136. movaps %xmm4, 0 * SIZE(Y)
  137. movaps %xmm6, 4 * SIZE(Y)
  138. movaps %xmm9, %xmm4
  139. mulps S, %xmm9
  140. movaps %xmm8, %xmm5
  141. mulps C, %xmm8
  142. movaps %xmm11, %xmm6
  143. mulps S, %xmm11
  144. movaps %xmm10, %xmm7
  145. mulps C, %xmm10
  146. mulps C, %xmm4
  147. mulps S, %xmm5
  148. mulps C, %xmm6
  149. mulps S, %xmm7
  150. addps %xmm9, %xmm8
  151. movaps 24 * SIZE(Y), %xmm9
  152. addps %xmm11, %xmm10
  153. movaps 28 * SIZE(Y), %xmm11
  154. subps %xmm5, %xmm4
  155. subps %xmm7, %xmm6
  156. movaps %xmm8, 8 * SIZE(X)
  157. movaps 24 * SIZE(X), %xmm8
  158. movaps %xmm10,12 * SIZE(X)
  159. movaps 28 * SIZE(X), %xmm10
  160. movaps %xmm4, 8 * SIZE(Y)
  161. movaps %xmm6, 12 * SIZE(Y)
  162. #if defined(PREFETCHW)
  163. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  164. #endif
  165. movaps %xmm1, %xmm4
  166. mulps S, %xmm1
  167. movaps %xmm3, %xmm6
  168. mulps S, %xmm3
  169. movaps %xmm0, %xmm5
  170. mulps C, %xmm0
  171. movaps %xmm2, %xmm7
  172. mulps C, %xmm2
  173. mulps C, %xmm4
  174. mulps S, %xmm5
  175. mulps C, %xmm6
  176. mulps S, %xmm7
  177. addps %xmm1, %xmm0
  178. movaps 32 * SIZE(Y), %xmm1
  179. addps %xmm3, %xmm2
  180. movaps 36 * SIZE(Y), %xmm3
  181. subps %xmm5, %xmm4
  182. subps %xmm7, %xmm6
  183. movaps %xmm0, 16 * SIZE(X)
  184. movaps 32 * SIZE(X), %xmm0
  185. movaps %xmm2, 20 * SIZE(X)
  186. movaps 36 * SIZE(X), %xmm2
  187. movaps %xmm4, 16 * SIZE(Y)
  188. movaps %xmm6, 20 * SIZE(Y)
  189. #if defined(PREFETCHW) && !defined(FETCH128)
  190. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  191. #endif
  192. movaps %xmm9, %xmm4
  193. mulps S, %xmm9
  194. movaps %xmm8, %xmm5
  195. mulps C, %xmm8
  196. movaps %xmm11, %xmm6
  197. mulps S, %xmm11
  198. movaps %xmm10, %xmm7
  199. mulps C, %xmm10
  200. mulps C, %xmm4
  201. mulps S, %xmm5
  202. mulps C, %xmm6
  203. mulps S, %xmm7
  204. addps %xmm9, %xmm8
  205. movaps 40 * SIZE(Y), %xmm9
  206. addps %xmm11, %xmm10
  207. movaps 44 * SIZE(Y), %xmm11
  208. subps %xmm5, %xmm4
  209. subps %xmm7, %xmm6
  210. movaps %xmm8, 24 * SIZE(X)
  211. movaps 40 * SIZE(X), %xmm8
  212. movaps %xmm10, 28 * SIZE(X)
  213. movaps 44 * SIZE(X), %xmm10
  214. movaps %xmm4, 24 * SIZE(Y)
  215. movaps %xmm6, 28 * SIZE(Y)
  216. addq $32 * SIZE, X
  217. addq $32 * SIZE, Y
  218. decq %rax
  219. jg .L11
  220. ALIGN_3
  221. .L12:
  222. movaps %xmm1, %xmm4
  223. mulps S, %xmm1
  224. movaps %xmm3, %xmm6
  225. mulps S, %xmm3
  226. movaps %xmm0, %xmm5
  227. mulps C, %xmm0
  228. movaps %xmm2, %xmm7
  229. mulps C, %xmm2
  230. mulps C, %xmm4
  231. mulps S, %xmm5
  232. mulps C, %xmm6
  233. mulps S, %xmm7
  234. addps %xmm1, %xmm0
  235. movaps 16 * SIZE(Y), %xmm1
  236. addps %xmm3, %xmm2
  237. movaps 20 * SIZE(Y), %xmm3
  238. subps %xmm5, %xmm4
  239. subps %xmm7, %xmm6
  240. movaps %xmm0, 0 * SIZE(X)
  241. movaps 16 * SIZE(X), %xmm0
  242. movaps %xmm2, 4 * SIZE(X)
  243. movaps 20 * SIZE(X), %xmm2
  244. movaps %xmm4, 0 * SIZE(Y)
  245. movaps %xmm6, 4 * SIZE(Y)
  246. movaps %xmm9, %xmm4
  247. mulps S, %xmm9
  248. movaps %xmm8, %xmm5
  249. mulps C, %xmm8
  250. movaps %xmm11, %xmm6
  251. mulps S, %xmm11
  252. movaps %xmm10, %xmm7
  253. mulps C, %xmm10
  254. mulps C, %xmm4
  255. mulps S, %xmm5
  256. mulps C, %xmm6
  257. mulps S, %xmm7
  258. addps %xmm9, %xmm8
  259. movaps 24 * SIZE(Y), %xmm9
  260. addps %xmm11, %xmm10
  261. movaps 28 * SIZE(Y), %xmm11
  262. subps %xmm5, %xmm4
  263. subps %xmm7, %xmm6
  264. movaps %xmm8, 8 * SIZE(X)
  265. movaps 24 * SIZE(X), %xmm8
  266. movaps %xmm10,12 * SIZE(X)
  267. movaps 28 * SIZE(X), %xmm10
  268. movaps %xmm4, 8 * SIZE(Y)
  269. movaps %xmm6, 12 * SIZE(Y)
  270. movaps %xmm1, %xmm4
  271. mulps S, %xmm1
  272. movaps %xmm3, %xmm6
  273. mulps S, %xmm3
  274. movaps %xmm0, %xmm5
  275. mulps C, %xmm0
  276. movaps %xmm2, %xmm7
  277. mulps C, %xmm2
  278. mulps C, %xmm4
  279. mulps S, %xmm5
  280. mulps C, %xmm6
  281. mulps S, %xmm7
  282. addps %xmm1, %xmm0
  283. addps %xmm3, %xmm2
  284. subps %xmm5, %xmm4
  285. subps %xmm7, %xmm6
  286. movaps %xmm0, 16 * SIZE(X)
  287. movaps %xmm2, 20 * SIZE(X)
  288. movaps %xmm4, 16 * SIZE(Y)
  289. movaps %xmm6, 20 * SIZE(Y)
  290. movaps %xmm9, %xmm4
  291. mulps S, %xmm9
  292. movaps %xmm8, %xmm5
  293. mulps C, %xmm8
  294. movaps %xmm11, %xmm6
  295. mulps S, %xmm11
  296. movaps %xmm10, %xmm7
  297. mulps C, %xmm10
  298. mulps C, %xmm4
  299. mulps S, %xmm5
  300. mulps C, %xmm6
  301. mulps S, %xmm7
  302. addps %xmm9, %xmm8
  303. addps %xmm11, %xmm10
  304. subps %xmm5, %xmm4
  305. subps %xmm7, %xmm6
  306. movaps %xmm8, 24 * SIZE(X)
  307. movaps %xmm10, 28 * SIZE(X)
  308. movaps %xmm4, 24 * SIZE(Y)
  309. movaps %xmm6, 28 * SIZE(Y)
  310. addq $32 * SIZE, X
  311. addq $32 * SIZE, Y
  312. ALIGN_3
  313. .L14:
  314. testq $15, N
  315. jle .L999
  316. testq $8, N
  317. jle .L15
  318. movaps 0 * SIZE(Y), %xmm1
  319. movaps 0 * SIZE(X), %xmm0
  320. movaps 4 * SIZE(Y), %xmm3
  321. movaps 4 * SIZE(X), %xmm2
  322. movaps %xmm1, %xmm4
  323. movaps %xmm0, %xmm5
  324. movaps %xmm3, %xmm6
  325. movaps %xmm2, %xmm7
  326. mulps C, %xmm0
  327. mulps S, %xmm1
  328. mulps C, %xmm2
  329. mulps S, %xmm3
  330. mulps C, %xmm4
  331. mulps S, %xmm5
  332. mulps C, %xmm6
  333. mulps S, %xmm7
  334. addps %xmm1, %xmm0
  335. addps %xmm3, %xmm2
  336. subps %xmm5, %xmm4
  337. subps %xmm7, %xmm6
  338. movaps %xmm0, 0 * SIZE(X)
  339. movaps %xmm2, 4 * SIZE(X)
  340. movaps %xmm4, 0 * SIZE(Y)
  341. movaps %xmm6, 4 * SIZE(Y)
  342. movaps 8 * SIZE(Y), %xmm1
  343. movaps 8 * SIZE(X), %xmm0
  344. movaps 12 * SIZE(Y), %xmm3
  345. movaps 12 * SIZE(X), %xmm2
  346. movaps %xmm1, %xmm4
  347. movaps %xmm0, %xmm5
  348. movaps %xmm3, %xmm6
  349. movaps %xmm2, %xmm7
  350. mulps C, %xmm0
  351. mulps S, %xmm1
  352. mulps C, %xmm2
  353. mulps S, %xmm3
  354. mulps C, %xmm4
  355. mulps S, %xmm5
  356. mulps C, %xmm6
  357. mulps S, %xmm7
  358. addps %xmm1, %xmm0
  359. addps %xmm3, %xmm2
  360. subps %xmm5, %xmm4
  361. subps %xmm7, %xmm6
  362. movaps %xmm0, 8 * SIZE(X)
  363. movaps %xmm2, 12 * SIZE(X)
  364. movaps %xmm4, 8 * SIZE(Y)
  365. movaps %xmm6, 12 * SIZE(Y)
  366. addq $16 * SIZE, X
  367. addq $16 * SIZE, Y
  368. ALIGN_3
  369. .L15:
  370. testq $4, N
  371. jle .L16
  372. movaps 0 * SIZE(Y), %xmm1
  373. movaps 0 * SIZE(X), %xmm0
  374. movaps 4 * SIZE(Y), %xmm3
  375. movaps 4 * SIZE(X), %xmm2
  376. movaps %xmm1, %xmm4
  377. movaps %xmm0, %xmm5
  378. movaps %xmm3, %xmm6
  379. movaps %xmm2, %xmm7
  380. mulps C, %xmm0
  381. mulps S, %xmm1
  382. mulps C, %xmm2
  383. mulps S, %xmm3
  384. mulps C, %xmm4
  385. mulps S, %xmm5
  386. mulps C, %xmm6
  387. mulps S, %xmm7
  388. addps %xmm1, %xmm0
  389. addps %xmm3, %xmm2
  390. subps %xmm5, %xmm4
  391. subps %xmm7, %xmm6
  392. movaps %xmm0, 0 * SIZE(X)
  393. movaps %xmm2, 4 * SIZE(X)
  394. movaps %xmm4, 0 * SIZE(Y)
  395. movaps %xmm6, 4 * SIZE(Y)
  396. addq $8 * SIZE, X
  397. addq $8 * SIZE, Y
  398. ALIGN_3
  399. .L16:
  400. testq $2, N
  401. jle .L17
  402. movaps 0 * SIZE(Y), %xmm1
  403. movaps 0 * SIZE(X), %xmm0
  404. movaps %xmm1, %xmm2
  405. movaps %xmm0, %xmm3
  406. mulps C, %xmm0
  407. mulps S, %xmm1
  408. mulps C, %xmm2
  409. mulps S, %xmm3
  410. addps %xmm1, %xmm0
  411. subps %xmm3, %xmm2
  412. movaps %xmm0, 0 * SIZE(X)
  413. movaps %xmm2, 0 * SIZE(Y)
  414. addq $4 * SIZE, X
  415. addq $4 * SIZE, Y
  416. ALIGN_3
  417. .L17:
  418. testq $1, N
  419. jle .L999
  420. movsd 0 * SIZE(Y), %xmm1
  421. movsd 0 * SIZE(X), %xmm0
  422. movaps %xmm1, %xmm2
  423. movaps %xmm0, %xmm3
  424. mulps C, %xmm0
  425. mulps S, %xmm1
  426. mulps C, %xmm2
  427. mulps S, %xmm3
  428. addps %xmm1, %xmm0
  429. subps %xmm3, %xmm2
  430. movlps %xmm0, 0 * SIZE(X)
  431. movlps %xmm2, 0 * SIZE(Y)
  432. jmp .L999
  433. ALIGN_3
  434. .L20:
  435. movq N, %rax
  436. sarq $4, %rax
  437. jle .L24
  438. movsd 0 * SIZE(Y), %xmm1
  439. movhps 2 * SIZE(Y), %xmm1
  440. movsd 4 * SIZE(Y), %xmm3
  441. movhps 6 * SIZE(Y), %xmm3
  442. movsd 8 * SIZE(Y), %xmm9
  443. movhps 10 * SIZE(Y), %xmm9
  444. movsd 12 * SIZE(Y), %xmm11
  445. movhps 14 * SIZE(Y), %xmm11
  446. movaps 0 * SIZE(X), %xmm0
  447. movaps 4 * SIZE(X), %xmm2
  448. movaps 8 * SIZE(X), %xmm8
  449. movaps 12 * SIZE(X), %xmm10
  450. decq %rax
  451. jle .L22
  452. ALIGN_3
  453. .L21:
  454. #if defined(PREFETCHW)
  455. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  456. #endif
  457. movaps %xmm1, %xmm4
  458. mulps S, %xmm1
  459. movaps %xmm3, %xmm6
  460. mulps S, %xmm3
  461. movaps %xmm0, %xmm5
  462. mulps C, %xmm0
  463. movaps %xmm2, %xmm7
  464. mulps C, %xmm2
  465. mulps C, %xmm4
  466. mulps S, %xmm5
  467. mulps C, %xmm6
  468. mulps S, %xmm7
  469. addps %xmm1, %xmm0
  470. movsd 16 * SIZE(Y), %xmm1
  471. movhps 18 * SIZE(Y), %xmm1
  472. addps %xmm3, %xmm2
  473. movsd 20 * SIZE(Y), %xmm3
  474. movhps 22 * SIZE(Y), %xmm3
  475. subps %xmm5, %xmm4
  476. subps %xmm7, %xmm6
  477. #if defined(PREFETCHW)
  478. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  479. #endif
  480. movaps %xmm0, 0 * SIZE(X)
  481. movaps 16 * SIZE(X), %xmm0
  482. movaps %xmm2, 4 * SIZE(X)
  483. movaps 20 * SIZE(X), %xmm2
  484. movlps %xmm4, 0 * SIZE(Y)
  485. movhps %xmm4, 2 * SIZE(Y)
  486. movlps %xmm6, 4 * SIZE(Y)
  487. movhps %xmm6, 6 * SIZE(Y)
  488. movaps %xmm9, %xmm4
  489. mulps S, %xmm9
  490. movaps %xmm8, %xmm5
  491. mulps C, %xmm8
  492. movaps %xmm11, %xmm6
  493. mulps S, %xmm11
  494. movaps %xmm10, %xmm7
  495. mulps C, %xmm10
  496. mulps C, %xmm4
  497. mulps S, %xmm5
  498. mulps C, %xmm6
  499. mulps S, %xmm7
  500. addps %xmm9, %xmm8
  501. movsd 24 * SIZE(Y), %xmm9
  502. movhps 26 * SIZE(Y), %xmm9
  503. addps %xmm11, %xmm10
  504. movsd 28 * SIZE(Y), %xmm11
  505. movhps 30 * SIZE(Y), %xmm11
  506. subps %xmm5, %xmm4
  507. subps %xmm7, %xmm6
  508. movaps %xmm8, 8 * SIZE(X)
  509. movaps 24 * SIZE(X), %xmm8
  510. movaps %xmm10,12 * SIZE(X)
  511. movaps 28 * SIZE(X), %xmm10
  512. movlps %xmm4, 8 * SIZE(Y)
  513. movhps %xmm4, 10 * SIZE(Y)
  514. movlps %xmm6, 12 * SIZE(Y)
  515. movhps %xmm6, 14 * SIZE(Y)
  516. #if defined(PREFETCHW) && !defined(FETCH128)
  517. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  518. #endif
  519. movaps %xmm1, %xmm4
  520. mulps S, %xmm1
  521. movaps %xmm3, %xmm6
  522. mulps S, %xmm3
  523. movaps %xmm0, %xmm5
  524. mulps C, %xmm0
  525. movaps %xmm2, %xmm7
  526. mulps C, %xmm2
  527. mulps C, %xmm4
  528. mulps S, %xmm5
  529. mulps C, %xmm6
  530. mulps S, %xmm7
  531. addps %xmm1, %xmm0
  532. movsd 32 * SIZE(Y), %xmm1
  533. movhps 34 * SIZE(Y), %xmm1
  534. addps %xmm3, %xmm2
  535. movsd 36 * SIZE(Y), %xmm3
  536. movhps 38 * SIZE(Y), %xmm3
  537. subps %xmm5, %xmm4
  538. subps %xmm7, %xmm6
  539. movaps %xmm0, 16 * SIZE(X)
  540. movaps 32 * SIZE(X), %xmm0
  541. movaps %xmm2, 20 * SIZE(X)
  542. movaps 36 * SIZE(X), %xmm2
  543. movlps %xmm4, 16 * SIZE(Y)
  544. movhps %xmm4, 18 * SIZE(Y)
  545. movlps %xmm6, 20 * SIZE(Y)
  546. movhps %xmm6, 22 * SIZE(Y)
  547. #if defined(PREFETCHW) && !defined(FETCH128)
  548. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  549. #endif
  550. movaps %xmm9, %xmm4
  551. mulps S, %xmm9
  552. movaps %xmm8, %xmm5
  553. mulps C, %xmm8
  554. movaps %xmm11, %xmm6
  555. mulps S, %xmm11
  556. movaps %xmm10, %xmm7
  557. mulps C, %xmm10
  558. mulps C, %xmm4
  559. mulps S, %xmm5
  560. mulps C, %xmm6
  561. mulps S, %xmm7
  562. addps %xmm9, %xmm8
  563. movsd 40 * SIZE(Y), %xmm9
  564. movhps 42 * SIZE(Y), %xmm9
  565. addps %xmm11, %xmm10
  566. movsd 44 * SIZE(Y), %xmm11
  567. movhps 46 * SIZE(Y), %xmm11
  568. subps %xmm5, %xmm4
  569. subps %xmm7, %xmm6
  570. movaps %xmm8, 24 * SIZE(X)
  571. movaps 40 * SIZE(X), %xmm8
  572. movaps %xmm10, 28 * SIZE(X)
  573. movaps 44 * SIZE(X), %xmm10
  574. movlps %xmm4, 24 * SIZE(Y)
  575. movhps %xmm4, 26 * SIZE(Y)
  576. movlps %xmm6, 28 * SIZE(Y)
  577. movhps %xmm6, 30 * SIZE(Y)
  578. addq $32 * SIZE, X
  579. addq $32 * SIZE, Y
  580. decq %rax
  581. jg .L21
  582. ALIGN_3
  583. .L22:
  584. movaps %xmm1, %xmm4
  585. mulps S, %xmm1
  586. movaps %xmm3, %xmm6
  587. mulps S, %xmm3
  588. movaps %xmm0, %xmm5
  589. mulps C, %xmm0
  590. movaps %xmm2, %xmm7
  591. mulps C, %xmm2
  592. mulps C, %xmm4
  593. mulps S, %xmm5
  594. mulps C, %xmm6
  595. mulps S, %xmm7
  596. addps %xmm1, %xmm0
  597. movsd 16 * SIZE(Y), %xmm1
  598. movhps 18 * SIZE(Y), %xmm1
  599. addps %xmm3, %xmm2
  600. movsd 20 * SIZE(Y), %xmm3
  601. movhps 22 * SIZE(Y), %xmm3
  602. subps %xmm5, %xmm4
  603. subps %xmm7, %xmm6
  604. movaps %xmm0, 0 * SIZE(X)
  605. movaps 16 * SIZE(X), %xmm0
  606. movaps %xmm2, 4 * SIZE(X)
  607. movaps 20 * SIZE(X), %xmm2
  608. movsd %xmm4, 0 * SIZE(Y)
  609. movhps %xmm4, 2 * SIZE(Y)
  610. movsd %xmm6, 4 * SIZE(Y)
  611. movhps %xmm6, 6 * SIZE(Y)
  612. movaps %xmm9, %xmm4
  613. mulps S, %xmm9
  614. movaps %xmm8, %xmm5
  615. mulps C, %xmm8
  616. movaps %xmm11, %xmm6
  617. mulps S, %xmm11
  618. movaps %xmm10, %xmm7
  619. mulps C, %xmm10
  620. mulps C, %xmm4
  621. mulps S, %xmm5
  622. mulps C, %xmm6
  623. mulps S, %xmm7
  624. addps %xmm9, %xmm8
  625. movsd 24 * SIZE(Y), %xmm9
  626. movhps 26 * SIZE(Y), %xmm9
  627. addps %xmm11, %xmm10
  628. movsd 28 * SIZE(Y), %xmm11
  629. movhps 30 * SIZE(Y), %xmm11
  630. subps %xmm5, %xmm4
  631. subps %xmm7, %xmm6
  632. movaps %xmm8, 8 * SIZE(X)
  633. movaps 24 * SIZE(X), %xmm8
  634. movaps %xmm10,12 * SIZE(X)
  635. movaps 28 * SIZE(X), %xmm10
  636. movlps %xmm4, 8 * SIZE(Y)
  637. movhps %xmm4, 10 * SIZE(Y)
  638. movlps %xmm6, 12 * SIZE(Y)
  639. movhps %xmm6, 14 * SIZE(Y)
  640. movaps %xmm1, %xmm4
  641. mulps S, %xmm1
  642. movaps %xmm3, %xmm6
  643. mulps S, %xmm3
  644. movaps %xmm0, %xmm5
  645. mulps C, %xmm0
  646. movaps %xmm2, %xmm7
  647. mulps C, %xmm2
  648. mulps C, %xmm4
  649. mulps S, %xmm5
  650. mulps C, %xmm6
  651. mulps S, %xmm7
  652. addps %xmm1, %xmm0
  653. addps %xmm3, %xmm2
  654. subps %xmm5, %xmm4
  655. subps %xmm7, %xmm6
  656. movaps %xmm0, 16 * SIZE(X)
  657. movaps %xmm2, 20 * SIZE(X)
  658. movlps %xmm4, 16 * SIZE(Y)
  659. movhps %xmm4, 18 * SIZE(Y)
  660. movlps %xmm6, 20 * SIZE(Y)
  661. movhps %xmm6, 22 * SIZE(Y)
  662. movaps %xmm9, %xmm4
  663. mulps S, %xmm9
  664. movaps %xmm8, %xmm5
  665. mulps C, %xmm8
  666. movaps %xmm11, %xmm6
  667. mulps S, %xmm11
  668. movaps %xmm10, %xmm7
  669. mulps C, %xmm10
  670. mulps C, %xmm4
  671. mulps S, %xmm5
  672. mulps C, %xmm6
  673. mulps S, %xmm7
  674. addps %xmm9, %xmm8
  675. addps %xmm11, %xmm10
  676. subps %xmm5, %xmm4
  677. subps %xmm7, %xmm6
  678. movaps %xmm8, 24 * SIZE(X)
  679. movaps %xmm10, 28 * SIZE(X)
  680. movlps %xmm4, 24 * SIZE(Y)
  681. movhps %xmm4, 26 * SIZE(Y)
  682. movlps %xmm6, 28 * SIZE(Y)
  683. movhps %xmm6, 30 * SIZE(Y)
  684. addq $32 * SIZE, X
  685. addq $32 * SIZE, Y
  686. ALIGN_3
  687. .L24:
  688. testq $15, N
  689. jle .L999
  690. testq $8, N
  691. jle .L25
  692. movsd 0 * SIZE(Y), %xmm1
  693. movhps 2 * SIZE(Y), %xmm1
  694. movaps 0 * SIZE(X), %xmm0
  695. movsd 4 * SIZE(Y), %xmm3
  696. movhps 6 * SIZE(Y), %xmm3
  697. movaps 4 * SIZE(X), %xmm2
  698. movaps %xmm1, %xmm4
  699. movaps %xmm0, %xmm5
  700. movaps %xmm3, %xmm6
  701. movaps %xmm2, %xmm7
  702. mulps C, %xmm0
  703. mulps S, %xmm1
  704. mulps C, %xmm2
  705. mulps S, %xmm3
  706. mulps C, %xmm4
  707. mulps S, %xmm5
  708. mulps C, %xmm6
  709. mulps S, %xmm7
  710. addps %xmm1, %xmm0
  711. addps %xmm3, %xmm2
  712. subps %xmm5, %xmm4
  713. subps %xmm7, %xmm6
  714. movaps %xmm0, 0 * SIZE(X)
  715. movaps %xmm2, 4 * SIZE(X)
  716. movlps %xmm4, 0 * SIZE(Y)
  717. movhps %xmm4, 2 * SIZE(Y)
  718. movlps %xmm6, 4 * SIZE(Y)
  719. movhps %xmm6, 6 * SIZE(Y)
  720. movsd 8 * SIZE(Y), %xmm1
  721. movhps 10 * SIZE(Y), %xmm1
  722. movaps 8 * SIZE(X), %xmm0
  723. movsd 12 * SIZE(Y), %xmm3
  724. movhps 14 * SIZE(Y), %xmm3
  725. movaps 12 * SIZE(X), %xmm2
  726. movaps %xmm1, %xmm4
  727. movaps %xmm0, %xmm5
  728. movaps %xmm3, %xmm6
  729. movaps %xmm2, %xmm7
  730. mulps C, %xmm0
  731. mulps S, %xmm1
  732. mulps C, %xmm2
  733. mulps S, %xmm3
  734. mulps C, %xmm4
  735. mulps S, %xmm5
  736. mulps C, %xmm6
  737. mulps S, %xmm7
  738. addps %xmm1, %xmm0
  739. addps %xmm3, %xmm2
  740. subps %xmm5, %xmm4
  741. subps %xmm7, %xmm6
  742. movaps %xmm0, 8 * SIZE(X)
  743. movaps %xmm2, 12 * SIZE(X)
  744. movlps %xmm4, 8 * SIZE(Y)
  745. movhps %xmm4, 10 * SIZE(Y)
  746. movlps %xmm6, 12 * SIZE(Y)
  747. movhps %xmm6, 14 * SIZE(Y)
  748. addq $16 * SIZE, X
  749. addq $16 * SIZE, Y
  750. ALIGN_3
  751. .L25:
  752. testq $4, N
  753. jle .L26
  754. movsd 0 * SIZE(Y), %xmm1
  755. movhps 2 * SIZE(Y), %xmm1
  756. movaps 0 * SIZE(X), %xmm0
  757. movsd 4 * SIZE(Y), %xmm3
  758. movhps 6 * SIZE(Y), %xmm3
  759. movaps 4 * SIZE(X), %xmm2
  760. movaps %xmm1, %xmm4
  761. movaps %xmm0, %xmm5
  762. movaps %xmm3, %xmm6
  763. movaps %xmm2, %xmm7
  764. mulps C, %xmm0
  765. mulps S, %xmm1
  766. mulps C, %xmm2
  767. mulps S, %xmm3
  768. mulps C, %xmm4
  769. mulps S, %xmm5
  770. mulps C, %xmm6
  771. mulps S, %xmm7
  772. addps %xmm1, %xmm0
  773. addps %xmm3, %xmm2
  774. subps %xmm5, %xmm4
  775. subps %xmm7, %xmm6
  776. movaps %xmm0, 0 * SIZE(X)
  777. movaps %xmm2, 4 * SIZE(X)
  778. movlps %xmm4, 0 * SIZE(Y)
  779. movhps %xmm4, 2 * SIZE(Y)
  780. movlps %xmm6, 4 * SIZE(Y)
  781. movhps %xmm6, 6 * SIZE(Y)
  782. addq $8 * SIZE, X
  783. addq $8 * SIZE, Y
  784. ALIGN_3
  785. .L26:
  786. testq $2, N
  787. jle .L27
  788. movsd 0 * SIZE(Y), %xmm1
  789. movhps 2 * SIZE(Y), %xmm1
  790. movaps 0 * SIZE(X), %xmm0
  791. movaps %xmm1, %xmm2
  792. movaps %xmm0, %xmm3
  793. mulps C, %xmm0
  794. mulps S, %xmm1
  795. mulps C, %xmm2
  796. mulps S, %xmm3
  797. addps %xmm1, %xmm0
  798. subps %xmm3, %xmm2
  799. movaps %xmm0, 0 * SIZE(X)
  800. movlps %xmm2, 0 * SIZE(Y)
  801. movhps %xmm2, 2 * SIZE(Y)
  802. addq $4 * SIZE, X
  803. addq $4 * SIZE, Y
  804. ALIGN_3
  805. .L27:
  806. testq $1, N
  807. jle .L999
  808. movsd 0 * SIZE(Y), %xmm1
  809. movsd 0 * SIZE(X), %xmm0
  810. movaps %xmm1, %xmm2
  811. movaps %xmm0, %xmm3
  812. mulps C, %xmm0
  813. mulps S, %xmm1
  814. mulps C, %xmm2
  815. mulps S, %xmm3
  816. addps %xmm1, %xmm0
  817. subps %xmm3, %xmm2
  818. movlps %xmm0, 0 * SIZE(X)
  819. movlps %xmm2, 0 * SIZE(Y)
  820. jmp .L999
  821. ALIGN_3
  822. .L30:
  823. movq N, %rax
  824. sarq $4, %rax
  825. jle .L34
  826. movsd 0 * SIZE(Y), %xmm1
  827. movhps 2 * SIZE(Y), %xmm1
  828. movsd 4 * SIZE(Y), %xmm3
  829. movhps 6 * SIZE(Y), %xmm3
  830. movsd 8 * SIZE(Y), %xmm9
  831. movhps 10 * SIZE(Y), %xmm9
  832. movsd 12 * SIZE(Y), %xmm11
  833. movhps 14 * SIZE(Y), %xmm11
  834. movsd 0 * SIZE(X), %xmm0
  835. movhps 2 * SIZE(X), %xmm0
  836. movsd 4 * SIZE(X), %xmm2
  837. movhps 6 * SIZE(X), %xmm2
  838. movsd 8 * SIZE(X), %xmm8
  839. movhps 10 * SIZE(X), %xmm8
  840. movsd 12 * SIZE(X), %xmm10
  841. movhps 14 * SIZE(X), %xmm10
  842. decq %rax
  843. jle .L32
  844. ALIGN_3
  845. .L31:
  846. #if defined(PREFETCHW)
  847. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  848. #endif
  849. movaps %xmm1, %xmm4
  850. mulps S, %xmm1
  851. movaps %xmm3, %xmm6
  852. mulps S, %xmm3
  853. movaps %xmm0, %xmm5
  854. mulps C, %xmm0
  855. movaps %xmm2, %xmm7
  856. mulps C, %xmm2
  857. mulps C, %xmm4
  858. mulps S, %xmm5
  859. mulps C, %xmm6
  860. mulps S, %xmm7
  861. addps %xmm1, %xmm0
  862. movsd 16 * SIZE(Y), %xmm1
  863. movhps 18 * SIZE(Y), %xmm1
  864. addps %xmm3, %xmm2
  865. movsd 20 * SIZE(Y), %xmm3
  866. movhps 22 * SIZE(Y), %xmm3
  867. subps %xmm5, %xmm4
  868. subps %xmm7, %xmm6
  869. #if defined(PREFETCHW)
  870. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  871. #endif
  872. movlps %xmm0, 0 * SIZE(X)
  873. movhps %xmm0, 2 * SIZE(X)
  874. movsd 16 * SIZE(X), %xmm0
  875. movhps 18 * SIZE(X), %xmm0
  876. movlps %xmm2, 4 * SIZE(X)
  877. movhps %xmm2, 6 * SIZE(X)
  878. movsd 20 * SIZE(X), %xmm2
  879. movhps 22 * SIZE(X), %xmm2
  880. movlps %xmm4, 0 * SIZE(Y)
  881. movhps %xmm4, 2 * SIZE(Y)
  882. movlps %xmm6, 4 * SIZE(Y)
  883. movhps %xmm6, 6 * SIZE(Y)
  884. movaps %xmm9, %xmm4
  885. mulps S, %xmm9
  886. movaps %xmm8, %xmm5
  887. mulps C, %xmm8
  888. movaps %xmm11, %xmm6
  889. mulps S, %xmm11
  890. movaps %xmm10, %xmm7
  891. mulps C, %xmm10
  892. mulps C, %xmm4
  893. mulps S, %xmm5
  894. mulps C, %xmm6
  895. mulps S, %xmm7
  896. addps %xmm9, %xmm8
  897. movsd 24 * SIZE(Y), %xmm9
  898. movhps 26 * SIZE(Y), %xmm9
  899. addps %xmm11, %xmm10
  900. movsd 28 * SIZE(Y), %xmm11
  901. movhps 30 * SIZE(Y), %xmm11
  902. subps %xmm5, %xmm4
  903. subps %xmm7, %xmm6
  904. movlps %xmm8, 8 * SIZE(X)
  905. movhps %xmm8, 10 * SIZE(X)
  906. movsd 24 * SIZE(X), %xmm8
  907. movhps 26 * SIZE(X), %xmm8
  908. movlps %xmm10, 12 * SIZE(X)
  909. movhps %xmm10, 14 * SIZE(X)
  910. movsd 28 * SIZE(X), %xmm10
  911. movhps 30 * SIZE(X), %xmm10
  912. movlps %xmm4, 8 * SIZE(Y)
  913. movhps %xmm4, 10 * SIZE(Y)
  914. movlps %xmm6, 12 * SIZE(Y)
  915. movhps %xmm6, 14 * SIZE(Y)
  916. #if defined(PREFETCHW) && !defined(FETCH128)
  917. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  918. #endif
  919. movaps %xmm1, %xmm4
  920. mulps S, %xmm1
  921. movaps %xmm3, %xmm6
  922. mulps S, %xmm3
  923. movaps %xmm0, %xmm5
  924. mulps C, %xmm0
  925. movaps %xmm2, %xmm7
  926. mulps C, %xmm2
  927. mulps C, %xmm4
  928. mulps S, %xmm5
  929. mulps C, %xmm6
  930. mulps S, %xmm7
  931. addps %xmm1, %xmm0
  932. movsd 32 * SIZE(Y), %xmm1
  933. movhps 34 * SIZE(Y), %xmm1
  934. addps %xmm3, %xmm2
  935. movsd 36 * SIZE(Y), %xmm3
  936. movhps 38 * SIZE(Y), %xmm3
  937. subps %xmm5, %xmm4
  938. subps %xmm7, %xmm6
  939. movlps %xmm0, 16 * SIZE(X)
  940. movhps %xmm0, 18 * SIZE(X)
  941. movsd 32 * SIZE(X), %xmm0
  942. movhps 34 * SIZE(X), %xmm0
  943. movlps %xmm2, 20 * SIZE(X)
  944. movhps %xmm2, 22 * SIZE(X)
  945. movsd 36 * SIZE(X), %xmm2
  946. movhps 38 * SIZE(X), %xmm2
  947. movlps %xmm4, 16 * SIZE(Y)
  948. movhps %xmm4, 18 * SIZE(Y)
  949. movlps %xmm6, 20 * SIZE(Y)
  950. movhps %xmm6, 22 * SIZE(Y)
  951. #if defined(PREFETCHW) && !defined(FETCH128)
  952. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
  953. #endif
  954. movaps %xmm9, %xmm4
  955. mulps S, %xmm9
  956. movaps %xmm8, %xmm5
  957. mulps C, %xmm8
  958. movaps %xmm11, %xmm6
  959. mulps S, %xmm11
  960. movaps %xmm10, %xmm7
  961. mulps C, %xmm10
  962. mulps C, %xmm4
  963. mulps S, %xmm5
  964. mulps C, %xmm6
  965. mulps S, %xmm7
  966. addps %xmm9, %xmm8
  967. movsd 40 * SIZE(Y), %xmm9
  968. movhps 42 * SIZE(Y), %xmm9
  969. addps %xmm11, %xmm10
  970. movsd 44 * SIZE(Y), %xmm11
  971. movhps 46 * SIZE(Y), %xmm11
  972. subps %xmm5, %xmm4
  973. subps %xmm7, %xmm6
  974. movlps %xmm8, 24 * SIZE(X)
  975. movhps %xmm8, 26 * SIZE(X)
  976. movsd 40 * SIZE(X), %xmm8
  977. movhps 42 * SIZE(X), %xmm8
  978. movlps %xmm10, 28 * SIZE(X)
  979. movhps %xmm10, 30 * SIZE(X)
  980. movsd 44 * SIZE(X), %xmm10
  981. movhps 46 * SIZE(X), %xmm10
  982. movlps %xmm4, 24 * SIZE(Y)
  983. movhps %xmm4, 26 * SIZE(Y)
  984. movlps %xmm6, 28 * SIZE(Y)
  985. movhps %xmm6, 30 * SIZE(Y)
  986. addq $32 * SIZE, X
  987. addq $32 * SIZE, Y
  988. decq %rax
  989. jg .L31
  990. ALIGN_3
  /*
   * .L32: drain stage that follows the unrolled .L31 loop.
   *
   * Every group below applies the same packed-float update:
   *     X_new = C * X + S * Y
   *     Y_new = C * Y - S * X
   * Register roles, as established by the loads in this block:
   *     xmm0/xmm2 and xmm8/xmm10  hold data read from X
   *     xmm1/xmm3 and xmm9/xmm11  hold data read from Y
   *     xmm4..xmm7                are scratch copies
   * The first two groups use registers without loading them here, so the
   * data for offsets 0..15 * SIZE is expected to be in registers already on
   * entry. The block covers offsets 0..31 * SIZE of X and Y and then
   * advances both pointers by 32 * SIZE.
   * C and S are macros defined outside this chunk - presumably xmm registers
   * holding the broadcast rotation coefficients; TODO confirm.
   */
  991. .L32:
  /* Group 1: offsets 0..7 * SIZE, from the data already in xmm0..xmm3. */
  992. movaps %xmm1, %xmm4
  993. mulps S, %xmm1
  994. movaps %xmm3, %xmm6
  995. mulps S, %xmm3
  996. movaps %xmm0, %xmm5
  997. mulps C, %xmm0
  998. movaps %xmm2, %xmm7
  999. mulps C, %xmm2
  1000. mulps C, %xmm4
  1001. mulps S, %xmm5
  1002. mulps C, %xmm6
  1003. mulps S, %xmm7
  /* Finish the sums; each source register is reloaded (offsets 16..23) as
     soon as its old value has been consumed. */
  1004. addps %xmm1, %xmm0
  1005. movsd 16 * SIZE(Y), %xmm1
  1006. movhps 18 * SIZE(Y), %xmm1
  1007. addps %xmm3, %xmm2
  1008. movsd 20 * SIZE(Y), %xmm3
  1009. movhps 22 * SIZE(Y), %xmm3
  1010. subps %xmm5, %xmm4
  1011. subps %xmm7, %xmm6
  1012. movlps %xmm0, 0 * SIZE(X)
  1013. movhps %xmm0, 2 * SIZE(X)
  1014. movsd 16 * SIZE(X), %xmm0
  1015. movhps 18 * SIZE(X), %xmm0
  1016. movlps %xmm2, 4 * SIZE(X)
  1017. movhps %xmm2, 6 * SIZE(X)
  1018. movsd 20 * SIZE(X), %xmm2
  1019. movhps 22 * SIZE(X), %xmm2
  /* NOTE(review): the two low-half stores below use movsd where every other
     store in this chunk uses movlps. Both forms write the low 64 bits of the
     register, assuming movsd is not redefined by a macro outside this view. */
  1020. movsd %xmm4, 0 * SIZE(Y)
  1021. movhps %xmm4, 2 * SIZE(Y)
  1022. movsd %xmm6, 4 * SIZE(Y)
  1023. movhps %xmm6, 6 * SIZE(Y)
  /* Group 2: offsets 8..15 * SIZE, from the data already in xmm8..xmm11. */
  1024. movaps %xmm9, %xmm4
  1025. mulps S, %xmm9
  1026. movaps %xmm8, %xmm5
  1027. mulps C, %xmm8
  1028. movaps %xmm11, %xmm6
  1029. mulps S, %xmm11
  1030. movaps %xmm10, %xmm7
  1031. mulps C, %xmm10
  1032. mulps C, %xmm4
  1033. mulps S, %xmm5
  1034. mulps C, %xmm6
  1035. mulps S, %xmm7
  /* Finish the sums and reload xmm8..xmm11 with offsets 24..31. */
  1036. addps %xmm9, %xmm8
  1037. movsd 24 * SIZE(Y), %xmm9
  1038. movhps 26 * SIZE(Y), %xmm9
  1039. addps %xmm11, %xmm10
  1040. movsd 28 * SIZE(Y), %xmm11
  1041. movhps 30 * SIZE(Y), %xmm11
  1042. subps %xmm5, %xmm4
  1043. subps %xmm7, %xmm6
  1044. movlps %xmm8, 8 * SIZE(X)
  1045. movhps %xmm8, 10 * SIZE(X)
  1046. movsd 24 * SIZE(X), %xmm8
  1047. movhps 26 * SIZE(X), %xmm8
  1048. movlps %xmm10, 12 * SIZE(X)
  1049. movhps %xmm10, 14 * SIZE(X)
  1050. movsd 28 * SIZE(X), %xmm10
  1051. movhps 30 * SIZE(X), %xmm10
  1052. movlps %xmm4, 8 * SIZE(Y)
  1053. movhps %xmm4, 10 * SIZE(Y)
  1054. movlps %xmm6, 12 * SIZE(Y)
  1055. movhps %xmm6, 14 * SIZE(Y)
  /* Group 3: offsets 16..23 * SIZE. No further loads are issued from here on. */
  1056. movaps %xmm1, %xmm4
  1057. mulps S, %xmm1
  1058. movaps %xmm3, %xmm6
  1059. mulps S, %xmm3
  1060. movaps %xmm0, %xmm5
  1061. mulps C, %xmm0
  1062. movaps %xmm2, %xmm7
  1063. mulps C, %xmm2
  1064. mulps C, %xmm4
  1065. mulps S, %xmm5
  1066. mulps C, %xmm6
  1067. mulps S, %xmm7
  1068. addps %xmm1, %xmm0
  1069. addps %xmm3, %xmm2
  1070. subps %xmm5, %xmm4
  1071. subps %xmm7, %xmm6
  1072. movlps %xmm0, 16 * SIZE(X)
  1073. movhps %xmm0, 18 * SIZE(X)
  1074. movlps %xmm2, 20 * SIZE(X)
  1075. movhps %xmm2, 22 * SIZE(X)
  1076. movlps %xmm4, 16 * SIZE(Y)
  1077. movhps %xmm4, 18 * SIZE(Y)
  1078. movlps %xmm6, 20 * SIZE(Y)
  1079. movhps %xmm6, 22 * SIZE(Y)
  /* Group 4: offsets 24..31 * SIZE. */
  1080. movaps %xmm9, %xmm4
  1081. mulps S, %xmm9
  1082. movaps %xmm8, %xmm5
  1083. mulps C, %xmm8
  1084. movaps %xmm11, %xmm6
  1085. mulps S, %xmm11
  1086. movaps %xmm10, %xmm7
  1087. mulps C, %xmm10
  1088. mulps C, %xmm4
  1089. mulps S, %xmm5
  1090. mulps C, %xmm6
  1091. mulps S, %xmm7
  1092. addps %xmm9, %xmm8
  1093. addps %xmm11, %xmm10
  1094. subps %xmm5, %xmm4
  1095. subps %xmm7, %xmm6
  1096. movlps %xmm8, 24 * SIZE(X)
  1097. movhps %xmm8, 26 * SIZE(X)
  1098. movlps %xmm10, 28 * SIZE(X)
  1099. movhps %xmm10, 30 * SIZE(X)
  1100. movlps %xmm4, 24 * SIZE(Y)
  1101. movhps %xmm4, 26 * SIZE(Y)
  1102. movlps %xmm6, 28 * SIZE(Y)
  1103. movhps %xmm6, 30 * SIZE(Y)
  /* Step both pointers past the 32 * SIZE span handled above. */
  1104. addq $32 * SIZE, X
  1105. addq $32 * SIZE, Y
  1106. ALIGN_3
  /*
   * .L34: start of the remainder handling for the low four bits of N.
   *
   * The tail blocks that follow advance X and Y by 16, 8 and 4 * SIZE for
   * bits 3, 2 and 1 of N, so one counted unit of N spans 2 * SIZE, i.e. one
   * 64-bit movsd/movlps/movhps transfer (presumably SIZE is 4 bytes and a
   * unit is one pair of floats - TODO confirm against the SIZE definition).
   *
   * This block handles bit 3: eight units, done as two identical passes of
   * four units each, using X_new = C * X + S * Y and Y_new = C * Y - S * X.
   */
  1107. .L34:
  /* TEST clears OF and the masked result cannot have its sign bit set, so
     jle is taken exactly when (N & 15) == 0: nothing left to do. */
  1108. testq $15, N
  1109. jle .L999
  /* Bit 3 clear: skip the eight-unit pass. */
  1110. testq $8, N
  1111. jle .L35
  /* First pass: load offsets 0..7 * SIZE of Y (xmm1, xmm3) and X (xmm0, xmm2). */
  1112. movsd 0 * SIZE(Y), %xmm1
  1113. movhps 2 * SIZE(Y), %xmm1
  1114. movsd 0 * SIZE(X), %xmm0
  1115. movhps 2 * SIZE(X), %xmm0
  1116. movsd 4 * SIZE(Y), %xmm3
  1117. movhps 6 * SIZE(Y), %xmm3
  1118. movsd 4 * SIZE(X), %xmm2
  1119. movhps 6 * SIZE(X), %xmm2
  /* Keep untouched copies: xmm4/xmm6 = Y data, xmm5/xmm7 = X data. */
  1120. movaps %xmm1, %xmm4
  1121. movaps %xmm0, %xmm5
  1122. movaps %xmm3, %xmm6
  1123. movaps %xmm2, %xmm7
  1124. mulps C, %xmm0
  1125. mulps S, %xmm1
  1126. mulps C, %xmm2
  1127. mulps S, %xmm3
  1128. mulps C, %xmm4
  1129. mulps S, %xmm5
  1130. mulps C, %xmm6
  1131. mulps S, %xmm7
  /* xmm0/xmm2 = C*X + S*Y (new X); xmm4/xmm6 = C*Y - S*X (new Y). */
  1132. addps %xmm1, %xmm0
  1133. addps %xmm3, %xmm2
  1134. subps %xmm5, %xmm4
  1135. subps %xmm7, %xmm6
  1136. movlps %xmm0, 0 * SIZE(X)
  1137. movhps %xmm0, 2 * SIZE(X)
  1138. movlps %xmm2, 4 * SIZE(X)
  1139. movhps %xmm2, 6 * SIZE(X)
  1140. movlps %xmm4, 0 * SIZE(Y)
  1141. movhps %xmm4, 2 * SIZE(Y)
  1142. movlps %xmm6, 4 * SIZE(Y)
  1143. movhps %xmm6, 6 * SIZE(Y)
  /* Second pass: the same sequence for offsets 8..15 * SIZE. */
  1144. movsd 8 * SIZE(Y), %xmm1
  1145. movhps 10 * SIZE(Y), %xmm1
  1146. movsd 8 * SIZE(X), %xmm0
  1147. movhps 10 * SIZE(X), %xmm0
  1148. movsd 12 * SIZE(Y), %xmm3
  1149. movhps 14 * SIZE(Y), %xmm3
  1150. movsd 12 * SIZE(X), %xmm2
  1151. movhps 14 * SIZE(X), %xmm2
  1152. movaps %xmm1, %xmm4
  1153. movaps %xmm0, %xmm5
  1154. movaps %xmm3, %xmm6
  1155. movaps %xmm2, %xmm7
  1156. mulps C, %xmm0
  1157. mulps S, %xmm1
  1158. mulps C, %xmm2
  1159. mulps S, %xmm3
  1160. mulps C, %xmm4
  1161. mulps S, %xmm5
  1162. mulps C, %xmm6
  1163. mulps S, %xmm7
  1164. addps %xmm1, %xmm0
  1165. addps %xmm3, %xmm2
  1166. subps %xmm5, %xmm4
  1167. subps %xmm7, %xmm6
  1168. movlps %xmm0, 8 * SIZE(X)
  1169. movhps %xmm0, 10 * SIZE(X)
  1170. movlps %xmm2, 12 * SIZE(X)
  1171. movhps %xmm2, 14 * SIZE(X)
  1172. movlps %xmm4, 8 * SIZE(Y)
  1173. movhps %xmm4, 10 * SIZE(Y)
  1174. movlps %xmm6, 12 * SIZE(Y)
  1175. movhps %xmm6, 14 * SIZE(Y)
  /* Step both pointers past the 16 * SIZE span just processed. */
  1176. addq $16 * SIZE, X
  1177. addq $16 * SIZE, Y
  1178. ALIGN_3
  /*
   * .L35: remainder step for bit 2 of N. Processes offsets 0..7 * SIZE of X
   * and Y in a single pass (X_new = C * X + S * Y, Y_new = C * Y - S * X)
   * and then advances both pointers by 8 * SIZE.
   */
  1179. .L35:
  /* TEST leaves OF = 0 and SF = 0 for this mask, so jle acts as "bit clear". */
  1180. testq $4, N
  1181. jle .L36
  /* xmm1/xmm3 receive Y data, xmm0/xmm2 receive X data. */
  1182. movsd 0 * SIZE(Y), %xmm1
  1183. movhps 2 * SIZE(Y), %xmm1
  1184. movsd 0 * SIZE(X), %xmm0
  1185. movhps 2 * SIZE(X), %xmm0
  1186. movsd 4 * SIZE(Y), %xmm3
  1187. movhps 6 * SIZE(Y), %xmm3
  1188. movsd 4 * SIZE(X), %xmm2
  1189. movhps 6 * SIZE(X), %xmm2
  /* Copies of the original values, needed for the second product of each sum. */
  1190. movaps %xmm1, %xmm4
  1191. movaps %xmm0, %xmm5
  1192. movaps %xmm3, %xmm6
  1193. movaps %xmm2, %xmm7
  1194. mulps C, %xmm0
  1195. mulps S, %xmm1
  1196. mulps C, %xmm2
  1197. mulps S, %xmm3
  1198. mulps C, %xmm4
  1199. mulps S, %xmm5
  1200. mulps C, %xmm6
  1201. mulps S, %xmm7
  /* xmm0/xmm2 = new X values, xmm4/xmm6 = new Y values. */
  1202. addps %xmm1, %xmm0
  1203. addps %xmm3, %xmm2
  1204. subps %xmm5, %xmm4
  1205. subps %xmm7, %xmm6
  1206. movlps %xmm0, 0 * SIZE(X)
  1207. movhps %xmm0, 2 * SIZE(X)
  1208. movlps %xmm2, 4 * SIZE(X)
  1209. movhps %xmm2, 6 * SIZE(X)
  1210. movlps %xmm4, 0 * SIZE(Y)
  1211. movhps %xmm4, 2 * SIZE(Y)
  1212. movlps %xmm6, 4 * SIZE(Y)
  1213. movhps %xmm6, 6 * SIZE(Y)
  1214. addq $8 * SIZE, X
  1215. addq $8 * SIZE, Y
  1216. ALIGN_3
  /*
   * .L36: remainder step for bit 1 of N. Processes one full xmm register of
   * X and of Y (offsets 0..3 * SIZE) and advances both pointers by 4 * SIZE.
   */
  1217. .L36:
  /* Bit 1 clear: skip to the single-unit step. */
  1218. testq $2, N
  1219. jle .L37
  1220. movsd 0 * SIZE(Y), %xmm1
  1221. movhps 2 * SIZE(Y), %xmm1
  1222. movsd 0 * SIZE(X), %xmm0
  1223. movhps 2 * SIZE(X), %xmm0
  /* xmm2 = copy of Y data, xmm3 = copy of X data. */
  1224. movaps %xmm1, %xmm2
  1225. movaps %xmm0, %xmm3
  1226. mulps C, %xmm0
  1227. mulps S, %xmm1
  1228. mulps C, %xmm2
  1229. mulps S, %xmm3
  /* xmm0 = C*X + S*Y (new X); xmm2 = C*Y - S*X (new Y). */
  1230. addps %xmm1, %xmm0
  1231. subps %xmm3, %xmm2
  1232. movlps %xmm0, 0 * SIZE(X)
  1233. movhps %xmm0, 2 * SIZE(X)
  1234. movlps %xmm2, 0 * SIZE(Y)
  1235. movhps %xmm2, 2 * SIZE(Y)
  1236. addq $4 * SIZE, X
  1237. addq $4 * SIZE, Y
  1238. ALIGN_3
  /*
   * .L37: final remainder step for bit 0 of N. Handles a single 64-bit unit
   * of X and Y, then leaves through .L999. The pointers are not advanced
   * because nothing follows.
   */
  1239. .L37:
  1240. testq $1, N
  1241. jle .L999
  /* Only the low 64 bits of each register carry data from memory here. */
  1242. movsd 0 * SIZE(Y), %xmm1
  1243. movsd 0 * SIZE(X), %xmm0
  1244. movaps %xmm1, %xmm2
  1245. movaps %xmm0, %xmm3
  /* The packed ops work on all four lanes, but only the low half is stored. */
  1246. mulps C, %xmm0
  1247. mulps S, %xmm1
  1248. mulps C, %xmm2
  1249. mulps S, %xmm3
  1250. addps %xmm1, %xmm0
  1251. subps %xmm3, %xmm2
  1252. movlps %xmm0, 0 * SIZE(X)
  1253. movlps %xmm2, 0 * SIZE(Y)
  1254. jmp .L999
  /* Never executed (follows an unconditional jmp). ALIGN_3 appears twice;
     presumably it is an alignment directive for the next label - TODO confirm. */
  1255. ALIGN_3
  1256. ALIGN_3
  /*
   * .L50: entry of the path that addresses X and Y through INCX and INCY.
   *
   * rax is the loop counter. If either increment is zero, control goes
   * straight to the one-unit-per-iteration loop at .L56 with rax = N
   * (presumably so that repeated updates of the same location are applied
   * one after another - TODO confirm). Otherwise rax becomes N / 4, the
   * trip count of the four-unit loop at .L53.
   */
  1257. .L50:
  1258. movq N, %rax
  1259. cmpq $0, INCX
  1260. je .L56
  1261. cmpq $0, INCY
  1262. je .L56
  /* NOTE(review): jle also consults OF, which the ISA manuals leave undefined
     for shift counts other than 1 - confirm this is acceptable on all targets. */
  1263. sarq $2, %rax
  1264. jle .L55
  1265. ALIGN_3
  /*
   * .L53: strided main loop, four units per iteration, as two identical
   * halves of two units each. rax holds the remaining iteration count.
   *
   * In each half the low 64 bits of a register come from the current
   * address and the high 64 bits from the address plus the increment.
   * INCX and INCY are used with scale 1 in the addressing modes, so they
   * are byte offsets at this point.
   * Update applied: X_new = C * X + S * Y, Y_new = C * Y - S * X.
   */
  1266. .L53:
  /* First half: gather two units of Y into xmm1 and two of X into xmm0. */
  1267. movsd (Y), %xmm1
  1268. movhps (Y, INCY), %xmm1
  1269. movsd (X), %xmm0
  1270. movhps (X, INCX), %xmm0
  1271. movaps %xmm1, %xmm2
  1272. movaps %xmm0, %xmm3
  1273. mulps C, %xmm0
  1274. mulps S, %xmm1
  1275. mulps C, %xmm2
  1276. mulps S, %xmm3
  /* xmm0 = new X pair, xmm2 = new Y pair. */
  1277. addps %xmm1, %xmm0
  1278. subps %xmm3, %xmm2
  /* Scatter the results back to the two addresses they were read from. */
  1279. movlps %xmm0, (X)
  1280. movhps %xmm0, (X, INCX)
  1281. movlps %xmm2, (Y)
  1282. movhps %xmm2, (Y, INCY)
  /* Advance each pointer by twice its increment. */
  1283. leaq (X, INCX, 2), X
  1284. leaq (Y, INCY, 2), Y
  /* Second half: same sequence for the next two units. */
  1285. movsd (Y), %xmm1
  1286. movhps (Y, INCY), %xmm1
  1287. movsd (X), %xmm0
  1288. movhps (X, INCX), %xmm0
  1289. movaps %xmm1, %xmm2
  1290. movaps %xmm0, %xmm3
  1291. mulps C, %xmm0
  1292. mulps S, %xmm1
  1293. mulps C, %xmm2
  1294. mulps S, %xmm3
  1295. addps %xmm1, %xmm0
  1296. subps %xmm3, %xmm2
  1297. movlps %xmm0, (X)
  1298. movhps %xmm0, (X, INCX)
  1299. movlps %xmm2, (Y)
  1300. movhps %xmm2, (Y, INCY)
  1301. leaq (X, INCX, 2), X
  1302. leaq (Y, INCY, 2), Y
  /* Loop while the counter is still positive after the decrement. */
  1303. decq %rax
  1304. jg .L53
  1305. ALIGN_3
  /*
   * .L55: compute the leftover count for the strided path, rax = N & 3, and
   * exit through .L999 if it is zero; otherwise fall through into .L56.
   */
  1306. .L55:
  1307. movq N, %rax
  /* AND clears OF and the masked result is non-negative, so jle means "zero". */
  1308. andq $3, %rax
  1309. jle .L999
  1310. ALIGN_3
  /*
   * .L56: strided loop that handles one 64-bit unit of X and Y per
   * iteration. rax is the iteration count: N & 3 when reached by falling
   * through from .L55, or N when reached from .L50 because an increment is
   * zero. Update applied: X_new = C * X + S * Y, Y_new = C * Y - S * X.
   */
  1311. .L56:
  1312. movsd (Y), %xmm1
  1313. movsd (X), %xmm0
  /* xmm2 = copy of Y data, xmm3 = copy of X data. */
  1314. movaps %xmm1, %xmm2
  1315. movaps %xmm0, %xmm3
  1316. mulps C, %xmm0
  1317. mulps S, %xmm1
  1318. mulps C, %xmm2
  1319. mulps S, %xmm3
  1320. addps %xmm1, %xmm0
  1321. subps %xmm3, %xmm2
  /* Only the low 64 bits of each result are written back. */
  1322. movlps %xmm0, (X)
  1323. movlps %xmm2, (Y)
  /* Step each pointer by its own increment (a byte offset). */
  1324. addq INCX, X
  1325. addq INCY, Y
  1326. decq %rax
  1327. jg .L56
  1328. ALIGN_3
  /*
   * .L999: common exit, the target of every early-out branch in the tail
   * and strided paths above. RESTOREREGISTERS and EPILOGUE are macros
   * defined outside this chunk - presumably they undo the register saves of
   * the function prologue and close the function definition; TODO confirm.
   */
  1329. .L999:
  1330. RESTOREREGISTERS
  1331. ret
  1332. EPILOGUE