
zrot_sse.S 23 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
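
/*
 * zrot_sse.S: SSE kernel for the complex single-precision plane (Givens)
 * rotation.  For each of the n complex elements it computes, in place,
 *
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]
 *
 * where x and y are interleaved (re, im) float pairs and c, s are real
 * scalars.  Separate code paths below handle aligned, misaligned, strided,
 * and zero-increment inputs.
 */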
#define ASSEMBLER
#include "common.h"

#define STACK 12
#define ARGS  0

#define STACK_N     4 + STACK + ARGS(%esp)
#define STACK_X     8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
#define STACK_Y    16 + STACK + ARGS(%esp)
#define STACK_INCY 20 + STACK + ARGS(%esp)
#define STACK_C    24 + STACK + ARGS(%esp)
#define STACK_S    28 + STACK + ARGS(%esp)

#define N    %ebx
#define X    %esi
#define INCX %ecx
#define Y    %edi
#define INCY %edx
#define I    %eax

#include "l1param.h"

#define C %xmm6
#define S %xmm7

PROLOGUE

        pushl %edi
        pushl %esi
        pushl %ebx

        PROFCODE

        movl STACK_N, N
        movl STACK_X, X
        movl STACK_INCX, INCX
        movl STACK_Y, Y
        movl STACK_INCY, INCY

        sall $ZBASE_SHIFT, INCX
        sall $ZBASE_SHIFT, INCY

        movss STACK_C, C
        movss STACK_S, S

        shufps $0x0, C, C
        shufps $0x0, S, S
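
/*
 * C = {c, c, c, c} and S = {s, s, s, s}: each packed multiply below
 * rotates two complex (re, im) elements at once.
 */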
        cmpl $0, N
        jle .L999

        cmpl $2 * SIZE, INCX
        jne .L50
        cmpl $2 * SIZE, INCY
        jne .L50

        testl $2 * SIZE, X
        je .L10
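
/*
 * X starts 8 bytes off a 16-byte boundary: rotate one complex element with
 * scalar 8-byte moves so the unit-stride loops below can use aligned
 * 16-byte accesses on X.
 */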
#ifndef HAVE_SSE2
        xorps %xmm0, %xmm0
        xorps %xmm1, %xmm1
#endif
        movsd 0 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)

        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        decl N
        jle .L999

.L10:
        testl $1 * SIZE, X
        jne .L30
        testl $3 * SIZE, Y
        jne .L20

        movl N, I
        sarl $4, I
        jle .L14
        ALIGN_3
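
/*
 * Both X and Y are 16-byte aligned: main loop, 16 complex elements
 * (32 floats) per iteration, with software prefetch when PREFETCHW
 * is defined.
 */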
.L11:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movaps 0 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movaps %xmm2, 0 * SIZE(Y)

        movaps 4 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movaps %xmm2, 4 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
        movaps 8 * SIZE(Y), %xmm1
        movaps 8 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 8 * SIZE(X)
        movaps %xmm2, 8 * SIZE(Y)

        movaps 12 * SIZE(Y), %xmm1
        movaps 12 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 12 * SIZE(X)
        movaps %xmm2, 12 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        movaps 16 * SIZE(Y), %xmm1
        movaps 16 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 16 * SIZE(X)
        movaps %xmm2, 16 * SIZE(Y)

        movaps 20 * SIZE(Y), %xmm1
        movaps 20 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 20 * SIZE(X)
        movaps %xmm2, 20 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
        movaps 24 * SIZE(Y), %xmm1
        movaps 24 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 24 * SIZE(X)
        movaps %xmm2, 24 * SIZE(Y)

        movaps 28 * SIZE(Y), %xmm1
        movaps 28 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 28 * SIZE(X)
        movaps %xmm2, 28 * SIZE(Y)

        addl $32 * SIZE, X
        addl $32 * SIZE, Y
        decl I
        jg .L11
        ALIGN_3

.L14:
        testl $15, N
        jle .L999

        testl $8, N
        jle .L15

        movaps 0 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movaps %xmm2, 0 * SIZE(Y)

        movaps 4 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movaps %xmm2, 4 * SIZE(Y)

        movaps 8 * SIZE(Y), %xmm1
        movaps 8 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 8 * SIZE(X)
        movaps %xmm2, 8 * SIZE(Y)

        movaps 12 * SIZE(Y), %xmm1
        movaps 12 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 12 * SIZE(X)
        movaps %xmm2, 12 * SIZE(Y)

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L15:
        testl $4, N
        jle .L16

        movaps 0 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movaps %xmm2, 0 * SIZE(Y)

        movaps 4 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movaps %xmm2, 4 * SIZE(Y)

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L16:
        testl $2, N
        jle .L17

        movaps 0 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movaps %xmm2, 0 * SIZE(Y)

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L17:
        testl $1, N
        jle .L999

#ifndef HAVE_SSE2
        xorps %xmm0, %xmm0
        xorps %xmm1, %xmm1
#endif
        movsd 0 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        jmp .L999
        ALIGN_3

.L20:
        movl N, I
        sarl $4, I
        jle .L24
        ALIGN_3
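
/*
 * X is 16-byte aligned but Y is not: X uses aligned movaps, Y is loaded
 * and stored through movsd/movhps pairs.  16 complex elements per
 * iteration, then the same remainder ladder as above.
 */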
.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
        movsd 8 * SIZE(Y), %xmm1
        movhps 10 * SIZE(Y), %xmm1
        movaps 8 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 8 * SIZE(X)
        movlps %xmm2, 8 * SIZE(Y)
        movhps %xmm2, 10 * SIZE(Y)

        movsd 12 * SIZE(Y), %xmm1
        movhps 14 * SIZE(Y), %xmm1
        movaps 12 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 12 * SIZE(X)
        movlps %xmm2, 12 * SIZE(Y)
        movhps %xmm2, 14 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        movsd 16 * SIZE(Y), %xmm1
        movhps 18 * SIZE(Y), %xmm1
        movaps 16 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 16 * SIZE(X)
        movlps %xmm2, 16 * SIZE(Y)
        movhps %xmm2, 18 * SIZE(Y)

        movsd 20 * SIZE(Y), %xmm1
        movhps 22 * SIZE(Y), %xmm1
        movaps 20 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 20 * SIZE(X)
        movlps %xmm2, 20 * SIZE(Y)
        movhps %xmm2, 22 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
        movsd 24 * SIZE(Y), %xmm1
        movhps 26 * SIZE(Y), %xmm1
        movaps 24 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 24 * SIZE(X)
        movlps %xmm2, 24 * SIZE(Y)
        movhps %xmm2, 26 * SIZE(Y)

        movsd 28 * SIZE(Y), %xmm1
        movhps 30 * SIZE(Y), %xmm1
        movaps 28 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 28 * SIZE(X)
        movlps %xmm2, 28 * SIZE(Y)
        movhps %xmm2, 30 * SIZE(Y)

        addl $32 * SIZE, X
        addl $32 * SIZE, Y
        decl I
        jg .L21
        ALIGN_3

.L24:
        testl $15, N
        jle .L999

        testl $8, N
        jle .L25

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

        movsd 8 * SIZE(Y), %xmm1
        movhps 10 * SIZE(Y), %xmm1
        movaps 8 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 8 * SIZE(X)
        movlps %xmm2, 8 * SIZE(Y)
        movhps %xmm2, 10 * SIZE(Y)

        movsd 12 * SIZE(Y), %xmm1
        movhps 14 * SIZE(Y), %xmm1
        movaps 12 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 12 * SIZE(X)
        movlps %xmm2, 12 * SIZE(Y)
        movhps %xmm2, 14 * SIZE(Y)

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L25:
        testl $4, N
        jle .L26

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movaps 4 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 4 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L26:
        testl $2, N
        jle .L27

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movaps 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movaps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L27:
        testl $1, N
        jle .L999

        movsd 0 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        jmp .L999
        ALIGN_3

.L30:
        movl N, I
        sarl $4, I
        jle .L34
        ALIGN_3
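
/*
 * Neither X nor Y is 16-byte aligned: both vectors go through
 * movsd/movhps pairs.  Same unrolling and remainder structure as the
 * aligned paths above.
 */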
.L31:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movhps 2 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movhps %xmm0, 2 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movsd 4 * SIZE(X), %xmm0
        movhps 6 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 4 * SIZE(X)
        movhps %xmm0, 6 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
        movsd 8 * SIZE(Y), %xmm1
        movhps 10 * SIZE(Y), %xmm1
        movsd 8 * SIZE(X), %xmm0
        movhps 10 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 8 * SIZE(X)
        movhps %xmm0, 10 * SIZE(X)
        movlps %xmm2, 8 * SIZE(Y)
        movhps %xmm2, 10 * SIZE(Y)

        movsd 12 * SIZE(Y), %xmm1
        movhps 14 * SIZE(Y), %xmm1
        movsd 12 * SIZE(X), %xmm0
        movhps 14 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 12 * SIZE(X)
        movhps %xmm0, 14 * SIZE(X)
        movlps %xmm2, 12 * SIZE(Y)
        movhps %xmm2, 14 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
        movsd 16 * SIZE(Y), %xmm1
        movhps 18 * SIZE(Y), %xmm1
        movsd 16 * SIZE(X), %xmm0
        movhps 18 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 16 * SIZE(X)
        movhps %xmm0, 18 * SIZE(X)
        movlps %xmm2, 16 * SIZE(Y)
        movhps %xmm2, 18 * SIZE(Y)

        movsd 20 * SIZE(Y), %xmm1
        movhps 22 * SIZE(Y), %xmm1
        movsd 20 * SIZE(X), %xmm0
        movhps 22 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 20 * SIZE(X)
        movhps %xmm0, 22 * SIZE(X)
        movlps %xmm2, 20 * SIZE(Y)
        movhps %xmm2, 22 * SIZE(Y)

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
        movsd 24 * SIZE(Y), %xmm1
        movhps 26 * SIZE(Y), %xmm1
        movsd 24 * SIZE(X), %xmm0
        movhps 26 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 24 * SIZE(X)
        movhps %xmm0, 26 * SIZE(X)
        movlps %xmm2, 24 * SIZE(Y)
        movhps %xmm2, 26 * SIZE(Y)

        movsd 28 * SIZE(Y), %xmm1
        movhps 30 * SIZE(Y), %xmm1
        movsd 28 * SIZE(X), %xmm0
        movhps 30 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 28 * SIZE(X)
        movhps %xmm0, 30 * SIZE(X)
        movlps %xmm2, 28 * SIZE(Y)
        movhps %xmm2, 30 * SIZE(Y)

        addl $32 * SIZE, X
        addl $32 * SIZE, Y
        decl I
        jg .L31
        ALIGN_3

.L34:
        testl $15, N
        jle .L999

        testl $8, N
        jle .L35

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movhps 2 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movhps %xmm0, 2 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movsd 4 * SIZE(X), %xmm0
        movhps 6 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 4 * SIZE(X)
        movhps %xmm0, 6 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

        movsd 8 * SIZE(Y), %xmm1
        movhps 10 * SIZE(Y), %xmm1
        movsd 8 * SIZE(X), %xmm0
        movhps 10 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 8 * SIZE(X)
        movhps %xmm0, 10 * SIZE(X)
        movlps %xmm2, 8 * SIZE(Y)
        movhps %xmm2, 10 * SIZE(Y)

        movsd 12 * SIZE(Y), %xmm1
        movhps 14 * SIZE(Y), %xmm1
        movsd 12 * SIZE(X), %xmm0
        movhps 14 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 12 * SIZE(X)
        movhps %xmm0, 14 * SIZE(X)
        movlps %xmm2, 12 * SIZE(Y)
        movhps %xmm2, 14 * SIZE(Y)

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L35:
        testl $4, N
        jle .L36

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movhps 2 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movhps %xmm0, 2 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        movsd 4 * SIZE(Y), %xmm1
        movhps 6 * SIZE(Y), %xmm1
        movsd 4 * SIZE(X), %xmm0
        movhps 6 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 4 * SIZE(X)
        movhps %xmm0, 6 * SIZE(X)
        movlps %xmm2, 4 * SIZE(Y)
        movhps %xmm2, 6 * SIZE(Y)

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L36:
        testl $2, N
        jle .L37

        movsd 0 * SIZE(Y), %xmm1
        movhps 2 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movhps 2 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movhps %xmm0, 2 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        movhps %xmm2, 2 * SIZE(Y)

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L37:
        testl $1, N
        jle .L999

#ifndef HAVE_SSE2
        xorps %xmm0, %xmm0
        xorps %xmm1, %xmm1
#endif
        movsd 0 * SIZE(Y), %xmm1
        movsd 0 * SIZE(X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, 0 * SIZE(X)
        movlps %xmm2, 0 * SIZE(Y)
        jmp .L999
        ALIGN_3
        ALIGN_3
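
/*
 * Non-unit-stride path (INCX/INCY already scaled to bytes): four complex
 * elements per iteration using indexed addressing.  If either increment
 * is zero, the whole vector is handled one element at a time in .L56.
 */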
.L50:
        movl N, I

        // if incx == 0 || incy == 0, jump to the tail
        cmpl $0, INCX
        je .L56
        cmpl $0, INCY
        je .L56

        sarl $2, I
        jle .L55
        ALIGN_3

.L53:
        movsd (Y), %xmm1
        movhps (Y, INCY), %xmm1
        movsd (X), %xmm0
        movhps (X, INCX), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, (X)
        movhps %xmm0, (X, INCX)
        movlps %xmm2, (Y)
        movhps %xmm2, (Y, INCY)

        leal (X, INCX, 2), X
        leal (Y, INCY, 2), Y

        movsd (Y), %xmm1
        movhps (Y, INCY), %xmm1
        movsd (X), %xmm0
        movhps (X, INCX), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, (X)
        movhps %xmm0, (X, INCX)
        movlps %xmm2, (Y)
        movhps %xmm2, (Y, INCY)

        leal (X, INCX, 2), X
        leal (Y, INCY, 2), Y

        decl I
        jg .L53
        ALIGN_3

.L55:
#ifndef HAVE_SSE2
        xorps %xmm0, %xmm0
        xorps %xmm1, %xmm1
#endif
        movl N, I
        andl $3, I
        jle .L999
        ALIGN_3
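
/*
 * Tail: one complex element per iteration.  Handles the remainder of the
 * strided loop above and the INCX == 0 / INCY == 0 case.
 */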
.L56:
        movsd (Y), %xmm1
        movsd (X), %xmm0
        movaps %xmm1, %xmm2
        movaps %xmm0, %xmm3
        mulps C, %xmm0
        mulps S, %xmm1
        mulps C, %xmm2
        mulps S, %xmm3
        addps %xmm1, %xmm0
        subps %xmm3, %xmm2
        movlps %xmm0, (X)
        movlps %xmm2, (Y)

        addl INCX, X
        addl INCY, Y
        decl I
        jg .L56
        ALIGN_3

.L999:
        popl %ebx
        popl %esi
        popl %edi
        ret

EPILOGUE