/* rot_sse.S */
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
  /*
   * Symbolic names for the SSE plane-rotation kernel (AT&T syntax, 32-bit
   * x86, run through the C preprocessor).
   *
   * The ordinal prefixes that the page extraction had glued in front of
   * every directive have been removed; with them none of these lines was a
   * valid preprocessor directive.
   */
#define ASSEMBLER
#include "common.h"

/* Bytes occupied by the three 4-byte registers pushed at function entry. */
#define STACK           12
/* Extra local stack space reserved by this kernel: none. */
#define ARGS            0

/*
 * Incoming arguments, addressed from %esp once the entry pushes are done.
 * Slots are 4 bytes apart; the leading 4 in STACK_N steps over the slot
 * below the first argument (the return address on a normal call).
 */
#define STACK_N         4 + STACK + ARGS(%esp)
#define STACK_X         8 + STACK + ARGS(%esp)
#define STACK_INCX      12 + STACK + ARGS(%esp)
#define STACK_Y         16 + STACK + ARGS(%esp)
#define STACK_INCY      20 + STACK + ARGS(%esp)
#define STACK_C         24 + STACK + ARGS(%esp)
#define STACK_S         28 + STACK + ARGS(%esp)

/* Register roles for the whole kernel. */
#define N               %ebx    /* elements still to process               */
#define X               %esi    /* current position in the first vector    */
#define INCX            %ecx    /* X stride, converted to bytes at entry   */
#define Y               %edi    /* current position in the second vector   */
#define INCY            %edx    /* Y stride, converted to bytes at entry   */
#define I               %eax    /* loop trip counter                       */

#define C               %xmm6   /* c replicated into all four lanes        */
#define S               %xmm7   /* s replicated into all four lanes        */

/* NOTE(review): presumably the source of PREFETCHSIZE / PREOFFSET - confirm. */
#include "l1param.h"
  /*
   * Function entry.
   *
   * Computes, for every element pair, in place:
   *     x' = c * x + s * y
   *     y' = c * y - s * x
   *
   * Stack arguments (see the STACK_* macros): n, x, incx, y, incy, c, s.
   * %edi, %esi and %ebx are saved here and restored at .L999.
   * Scratch: %eax, %ecx, %edx, %xmm0-%xmm3, %xmm6, %xmm7, flags.
   *
   * The scraped ordinal prefixes in front of each instruction were removed
   * so the block assembles again; the instructions are unchanged.
   */
        PROLOGUE
        PROFCODE                        /* macro defined outside this file */

        pushl   %edi
        pushl   %esi
        pushl   %ebx

        movl    STACK_N, N
        movl    STACK_X, X
        movl    STACK_INCX, INCX
        movl    STACK_Y, Y
        movl    STACK_INCY, INCY

        /* Convert the element strides into byte strides. */
        leal    (, INCX, SIZE), INCX
        leal    (, INCY, SIZE), INCY

        /* Load the scalars and replicate lane 0 into all four lanes. */
        movss   STACK_C, C
        movss   STACK_S, S

        shufps  $0x0, C, C
        shufps  $0x0, S, S

        /* Nothing to do for n <= 0. */
        cmpl    $0, N
        jle     .L999

        /* Any non-unit stride goes to the scalar strided loop at .L50. */
        cmpl    $SIZE, INCX
        jne     .L50
        cmpl    $SIZE, INCY
        jne     .L50
  /*
   * Unit-stride head peel.
   *
   * The packed loops that follow use movaps on X, which needs a 16-byte
   * aligned address.  The SIZE bit and then the 2*SIZE bit of X are tested
   * and, when set, one resp. two elements are rotated with narrower
   * accesses so that X moves past that bit.  (This yields 16-byte alignment
   * assuming SIZE is 4 and X is element-aligned - SIZE is defined outside
   * this file.)
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        testl   $SIZE, X
        je      .L05

        /* Rotate a single element. */
        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2            /* xmm2 = y */
        movaps  %xmm0, %xmm3            /* xmm3 = x */

        mulss   C, %xmm0                /* c * x */
        mulss   S, %xmm1                /* s * y */
        mulss   C, %xmm2                /* c * y */
        mulss   S, %xmm3                /* s * x */

        addss   %xmm1, %xmm0            /* c * x + s * y */
        subss   %xmm3, %xmm2            /* c * y - s * x */

        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)

        addl    $1 * SIZE, X
        addl    $1 * SIZE, Y
        decl    N
        jle     .L999                   /* that was the only element */

.L05:
        testl   $2 * SIZE, X
        je      .L10

        /* With one element left the scalar tail (via .L17) finishes it. */
        cmpl    $1, N
        je      .L17

        /*
         * Only when `movsd` is a preprocessor macro: clear the destination
         * registers first.  NOTE(review): presumably the substituted load
         * leaves the upper lanes untouched - confirm in common.h.
         */
#ifdef movsd
        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
#endif
        /* Rotate two elements; only the low 64 bits are stored back. */
        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movlps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)

        addl    $2 * SIZE, X
        addl    $2 * SIZE, Y
        subl    $2, N
        jle     .L999
  /*
   * Unit-stride main loop, taken when (Y & 3*SIZE) == 0; otherwise control
   * goes to .L20.  Each trip rotates 32 elements as eight groups of four:
   * X is read and written with movaps, Y with movsd/movhps 64-bit halves.
   * I = N >> 5 is the trip count; the loop is skipped when it is zero.
   *
   * The PREFETCHW hints are only assembled when that macro is defined;
   * PREFETCHSIZE and PREOFFSET come from outside this file.
   *
   * Scraped ordinal prefixes removed; instructions and their order are
   * unchanged.
   */
        ALIGN_2

.L10:
        testl   $3 * SIZE, Y
        jne     .L20

        movl    N, I
        sarl    $5, I
        jle     .L14
        ALIGN_3

.L11:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        /* elements 0..3 */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        /* elements 4..7 */
        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

        /* elements 8..11 */
        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movaps  8 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 8 * SIZE(X)
        movlps  %xmm2, 8 * SIZE(Y)
        movhps  %xmm2, 10 * SIZE(Y)

        /* elements 12..15 */
        movsd   12 * SIZE(Y), %xmm1
        movhps  14 * SIZE(Y), %xmm1
        movaps  12 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 12 * SIZE(X)
        movlps  %xmm2, 12 * SIZE(Y)
        movhps  %xmm2, 14 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        /* elements 16..19 */
        movsd   16 * SIZE(Y), %xmm1
        movhps  18 * SIZE(Y), %xmm1
        movaps  16 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 16 * SIZE(X)
        movlps  %xmm2, 16 * SIZE(Y)
        movhps  %xmm2, 18 * SIZE(Y)

        /* elements 20..23 */
        movsd   20 * SIZE(Y), %xmm1
        movhps  22 * SIZE(Y), %xmm1
        movaps  20 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 20 * SIZE(X)
        movlps  %xmm2, 20 * SIZE(Y)
        movhps  %xmm2, 22 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        /* elements 24..27 */
        movsd   24 * SIZE(Y), %xmm1
        movhps  26 * SIZE(Y), %xmm1
        movaps  24 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 24 * SIZE(X)
        movlps  %xmm2, 24 * SIZE(Y)
        movhps  %xmm2, 26 * SIZE(Y)

        /* elements 28..31 */
        movsd   28 * SIZE(Y), %xmm1
        movhps  30 * SIZE(Y), %xmm1
        movaps  28 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 28 * SIZE(X)
        movlps  %xmm2, 28 * SIZE(Y)
        movhps  %xmm2, 30 * SIZE(Y)

        addl    $32 * SIZE, X
        addl    $32 * SIZE, Y

        decl    I
        jg      .L11
  /*
   * Unit-stride remainder after the .L11 loop, part 1: the low five bits of
   * N select leftover chunks of 16, 8 and 4 elements.
   *
   * Each `testl $mask, N` is followed by `jle`: TEST clears OF and the
   * masked result is never negative, so the branch is taken exactly when
   * the tested bits are all zero.
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L14:
        testl   $31, N
        jle     .L999                   /* no remainder at all */

        testl   $16, N
        jle     .L15

        /* 16 elements: four groups of four. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movaps  8 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 8 * SIZE(X)
        movlps  %xmm2, 8 * SIZE(Y)
        movhps  %xmm2, 10 * SIZE(Y)

        movsd   12 * SIZE(Y), %xmm1
        movhps  14 * SIZE(Y), %xmm1
        movaps  12 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 12 * SIZE(X)
        movlps  %xmm2, 12 * SIZE(Y)
        movhps  %xmm2, 14 * SIZE(Y)

        addl    $16 * SIZE, X
        addl    $16 * SIZE, Y
        ALIGN_3

.L15:
        testl   $8, N
        jle     .L16

        /* 8 elements: two groups of four. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

        addl    $8 * SIZE, X
        addl    $8 * SIZE, Y
        ALIGN_3

.L16:
        testl   $4, N
        jle     .L17

        /* 4 elements: one group. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        addl    $4 * SIZE, X
        addl    $4 * SIZE, Y
  /*
   * Unit-stride remainder, part 2: a leftover pair (bit 1 of N) and a final
   * single element (bit 0 of N).  .L17 is also the target of the N == 1
   * shortcut in the head-peel code at .L05.
   *
   * `jle` after `testl` is taken exactly when the tested bit is zero (TEST
   * clears OF and the masked result is non-negative).
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L17:
        testl   $2, N
        jle     .L18

        /*
         * Only when `movsd` is a preprocessor macro: clear the destination
         * registers first.  NOTE(review): presumably the substituted load
         * leaves the upper lanes untouched - confirm in common.h.
         */
#ifdef movsd
        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
#endif
        /* Two elements; only the low 64 bits are written back. */
        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movlps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)

        addl    $2 * SIZE, X
        addl    $2 * SIZE, Y
        ALIGN_3

.L18:
        testl   $1, N
        jle     .L999

        /* Last single element, scalar arithmetic. */
        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)
        jmp     .L999
  /*
   * Unit-stride main loop used when the `testl $3 * SIZE, Y` check at .L10
   * is non-zero.  It rotates 32 elements per trip exactly like .L11: X via
   * movaps, Y via movsd/movhps 64-bit halves.  In this file the two loops
   * differ only in where the first Y prefetch hint sits.
   *
   * The PREFETCHW hints are only assembled when that macro is defined;
   * PREFETCHSIZE and PREOFFSET come from outside this file.
   *
   * Scraped ordinal prefixes removed; instructions and their order are
   * unchanged.
   */
        ALIGN_3

.L20:
        movl    N, I
        sarl    $5, I
        jle     .L24
        ALIGN_3

.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        /* elements 0..3 */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        /* elements 4..7 */
        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        /* elements 8..11 */
        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movaps  8 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 8 * SIZE(X)
        movlps  %xmm2, 8 * SIZE(Y)
        movhps  %xmm2, 10 * SIZE(Y)

        /* elements 12..15 */
        movsd   12 * SIZE(Y), %xmm1
        movhps  14 * SIZE(Y), %xmm1
        movaps  12 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 12 * SIZE(X)
        movlps  %xmm2, 12 * SIZE(Y)
        movhps  %xmm2, 14 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        /* elements 16..19 */
        movsd   16 * SIZE(Y), %xmm1
        movhps  18 * SIZE(Y), %xmm1
        movaps  16 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 16 * SIZE(X)
        movlps  %xmm2, 16 * SIZE(Y)
        movhps  %xmm2, 18 * SIZE(Y)

        /* elements 20..23 */
        movsd   20 * SIZE(Y), %xmm1
        movhps  22 * SIZE(Y), %xmm1
        movaps  20 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 20 * SIZE(X)
        movlps  %xmm2, 20 * SIZE(Y)
        movhps  %xmm2, 22 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        /* elements 24..27 */
        movsd   24 * SIZE(Y), %xmm1
        movhps  26 * SIZE(Y), %xmm1
        movaps  24 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 24 * SIZE(X)
        movlps  %xmm2, 24 * SIZE(Y)
        movhps  %xmm2, 26 * SIZE(Y)

        /* elements 28..31 */
        movsd   28 * SIZE(Y), %xmm1
        movhps  30 * SIZE(Y), %xmm1
        movaps  28 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 28 * SIZE(X)
        movlps  %xmm2, 28 * SIZE(Y)
        movhps  %xmm2, 30 * SIZE(Y)

        addl    $32 * SIZE, X
        addl    $32 * SIZE, Y

        decl    I
        jg      .L21
  /*
   * Remainder after the .L21 loop, part 1: the low five bits of N select
   * leftover chunks of 16, 8 and 4 elements.
   *
   * Each `testl $mask, N` is followed by `jle`: TEST clears OF and the
   * masked result is never negative, so the branch is taken exactly when
   * the tested bits are all zero.
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L24:
        testl   $31, N
        jle     .L999                   /* no remainder at all */

        testl   $16, N
        jle     .L25

        /* 16 elements: four groups of four. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

        movsd   8 * SIZE(Y), %xmm1
        movhps  10 * SIZE(Y), %xmm1
        movaps  8 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 8 * SIZE(X)
        movlps  %xmm2, 8 * SIZE(Y)
        movhps  %xmm2, 10 * SIZE(Y)

        movsd   12 * SIZE(Y), %xmm1
        movhps  14 * SIZE(Y), %xmm1
        movaps  12 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 12 * SIZE(X)
        movlps  %xmm2, 12 * SIZE(Y)
        movhps  %xmm2, 14 * SIZE(Y)

        addl    $16 * SIZE, X
        addl    $16 * SIZE, Y
        ALIGN_3

.L25:
        testl   $8, N
        jle     .L26

        /* 8 elements: two groups of four. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        movsd   4 * SIZE(Y), %xmm1
        movhps  6 * SIZE(Y), %xmm1
        movaps  4 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 4 * SIZE(X)
        movlps  %xmm2, 4 * SIZE(Y)
        movhps  %xmm2, 6 * SIZE(Y)

        addl    $8 * SIZE, X
        addl    $8 * SIZE, Y
        ALIGN_3

.L26:
        testl   $4, N
        jle     .L27

        /* 4 elements: one group. */
        movsd   0 * SIZE(Y), %xmm1
        movhps  2 * SIZE(Y), %xmm1
        movaps  0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movaps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)
        movhps  %xmm2, 2 * SIZE(Y)

        addl    $4 * SIZE, X
        addl    $4 * SIZE, Y
  /*
   * Remainder after the .L21 loop, part 2: a leftover pair (bit 1 of N) and
   * a final single element (bit 0 of N).
   *
   * `jle` after `testl` is taken exactly when the tested bit is zero (TEST
   * clears OF and the masked result is non-negative).
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L27:
        testl   $2, N
        jle     .L28

        /*
         * Only when `movsd` is a preprocessor macro: clear the destination
         * registers first.  NOTE(review): presumably the substituted load
         * leaves the upper lanes untouched - confirm in common.h.
         */
#ifdef movsd
        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
#endif
        /* Two elements; only the low 64 bits are written back. */
        movsd   0 * SIZE(Y), %xmm1
        movsd   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulps   C, %xmm0
        mulps   S, %xmm1
        mulps   C, %xmm2
        mulps   S, %xmm3

        addps   %xmm1, %xmm0
        subps   %xmm3, %xmm2

        movlps  %xmm0, 0 * SIZE(X)
        movlps  %xmm2, 0 * SIZE(Y)

        addl    $2 * SIZE, X
        addl    $2 * SIZE, Y
        ALIGN_3

.L28:
        testl   $1, N
        jle     .L999

        /* Last single element, scalar arithmetic. */
        movss   0 * SIZE(Y), %xmm1
        movss   0 * SIZE(X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, 0 * SIZE(X)
        movss   %xmm2, 0 * SIZE(Y)
        jmp     .L999
  /*
   * General-stride path, reached when INCX or INCY is not one element.
   * Everything is scalar: one element of X and one of Y are loaded,
   * rotated, stored, and both pointers advance by their byte strides.
   * The loop is unrolled four times; I = N >> 2 is the trip count and the
   * loop is skipped when it is zero.
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L50:
        movl    N, I
        sarl    $2, I
        jle     .L55
        ALIGN_3

.L53:
        /* element 0 of this trip */
        movss   (Y), %xmm1
        movss   (X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, (X)
        movss   %xmm2, (Y)

        addl    INCX, X
        addl    INCY, Y

        /* element 1 */
        movss   (Y), %xmm1
        movss   (X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, (X)
        movss   %xmm2, (Y)

        addl    INCX, X
        addl    INCY, Y

        /* element 2 */
        movss   (Y), %xmm1
        movss   (X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, (X)
        movss   %xmm2, (Y)

        addl    INCX, X
        addl    INCY, Y

        /* element 3 */
        movss   (Y), %xmm1
        movss   (X), %xmm0

        movaps  %xmm1, %xmm2
        movaps  %xmm0, %xmm3

        mulss   C, %xmm0
        mulss   S, %xmm1
        mulss   C, %xmm2
        mulss   S, %xmm3

        addss   %xmm1, %xmm0
        subss   %xmm3, %xmm2

        movss   %xmm0, (X)
        movss   %xmm2, (Y)

        addl    INCX, X
        addl    INCY, Y

        decl    I
        jg      .L53
  /*
   * General-stride remainder: the N & 3 elements left over by the four-way
   * unrolled loop at .L53, handled one per trip.
   *
   * `jle` after `andl $3` is taken exactly when the result is zero (AND
   * clears OF and the masked result is non-negative).
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L55:
        movl    N, I
        andl    $3, I
        jle     .L999
        ALIGN_3

.L56:
        movss   (Y), %xmm1
        movss   (X), %xmm0

        movaps  %xmm1, %xmm2            /* xmm2 = y */
        movaps  %xmm0, %xmm3            /* xmm3 = x */

        mulss   C, %xmm0                /* c * x */
        mulss   S, %xmm1                /* s * y */
        mulss   C, %xmm2                /* c * y */
        mulss   S, %xmm3                /* s * x */

        addss   %xmm1, %xmm0            /* c * x + s * y */
        subss   %xmm3, %xmm2            /* c * y - s * x */

        movss   %xmm0, (X)
        movss   %xmm2, (Y)

        addl    INCX, X
        addl    INCY, Y

        decl    I
        jg      .L56
  /*
   * Common exit: restore the three registers pushed at function entry, in
   * reverse push order, and return.  %eax is not given a defined value
   * here.
   *
   * Scraped ordinal prefixes removed; instructions unchanged.
   */
        ALIGN_3

.L999:
        popl    %ebx
        popl    %esi
        popl    %edi
        ret

        EPILOGUE