You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

zsymv_L_sse.S 16 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
#define ASSEMBLER
#include "common.h"

/* ------------------------------------------------------------------ */
/* Per-microarchitecture prefetch tuning: which prefetch opcode to    */
/* use and how far ahead (bytes) to fetch.  Values are empirical per  */
/* CPU family; only one branch is active per build.                   */
/* ------------------------------------------------------------------ */
#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
/* Opteron: replace every movsd below with movlpd -- presumably to
 * avoid movsd's register-merge penalty on that core; TODO confirm. */
#define movsd		movlpd
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

/* ------------------------------------------------------------------ */
/* Argument locations.  SysV: first 6 integer args in registers, the  */
/* rest (y, incy, buffer) on the stack above our STACKSIZE frame.     */
/* Win64: only 4 register args, so a/lda/x/incx also come from the    */
/* stack (the OLD_* offsets below).                                   */
/* ------------------------------------------------------------------ */
#ifndef WINDOWS_ABI
#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6
#else
#define STACKSIZE	256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define OLD_INCX	64 + STACKSIZE(%rsp)
#define OLD_Y		72 + STACKSIZE(%rsp)
#define OLD_INCY	80 + STACKSIZE(%rsp)
#define OLD_BUFFER	88 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi
#endif

/* General-purpose register roles for the kernel body. */
#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12
#define TEMP	%rax
#define I	%rax
#define A1	%rbx		/* column pointer, even column  */
#define A2	%rbp		/* column pointer, odd column   */
#define XX	%r13		/* cursor into the x buffer     */
#define YY	%r14		/* cursor into y (or y buffer)  */
#define IS	%r15		/* current diagonal index "is"  */

/* After the setup passes, x lives (scaled by alpha) in BUFFER and y  */
/* may be redirected into an aligned buffer; X itself is reused.      */
#define NEW_X	BUFFER
#define NEW_Y	X

/* SSE register roles.  ALPHA_R/ALPHA_I alias xtemp1/xtemp2: alpha is */
/* only needed during the first copy pass, before xtemp* are live.    */
#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define xtemp3	%xmm2
#define xtemp4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9

#define yy1	%xmm10
#define yy2	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define xt1	%xmm15

/* MOVDDUP(offset, baseptr, dst): broadcast one double to both lanes. */
/* Real movddup where SSE3 is available; otherwise emulated with a    */
/* movlpd/movhpd pair from the same address.                          */
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif
  145. PROLOGUE
  146. PROFCODE
  147. subq $STACKSIZE, %rsp
  148. movq %rbx, 0(%rsp)
  149. movq %rbp, 8(%rsp)
  150. movq %r12, 16(%rsp)
  151. movq %r13, 24(%rsp)
  152. movq %r14, 32(%rsp)
  153. movq %r15, 40(%rsp)
  154. movq OLD_Y, Y
  155. movq OLD_INCY, INCY
  156. movq OLD_BUFFER, BUFFER
  157. salq $ZBASE_SHIFT, INCX
  158. salq $ZBASE_SHIFT, INCY
  159. salq $ZBASE_SHIFT, LDA
  160. testq M, M
  161. jle .L999
  162. pcmpeqb %xmm2, %xmm2
  163. xorpd %xmm3, %xmm3
  164. psllq $63, %xmm2
  165. unpcklpd %xmm3, %xmm2
  166. unpcklpd ALPHA_I, ALPHA_R
  167. unpcklpd ALPHA_R, ALPHA_I
  168. xorpd %xmm2, ALPHA_I
  169. movq BUFFER, XX
  170. movq M, %rax
  171. sarq $2, %rax
  172. jle .L02
  173. ALIGN_3
/* .L01: main copy loop -- scale x by complex alpha and store it into
 * BUFFER, 4 elements per iteration.  For each element two 16-byte
 * slots are written:
 *   slot 0: [re, im]   of alpha*x[i]
 *   slot 1: [-im, re]  (lanes swapped by SHUFPD_1, low lane sign-
 *                       flipped via the %xmm2 mask)
 * so the kernel below can form complex products with plain mulpd. */
.L01:
	MOVDDUP(0 * SIZE, X, %xmm3)	/* dup(xr), element i   */
	MOVDDUP(1 * SIZE, X, %xmm4)	/* dup(xi)              */
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)	/* element i + 1        */
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	/* [xr,xr]*[ar,ai] + [xi,xi]*[-ai,ar] = alpha * x */
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3		/* swap lanes            */
	pxor	%xmm2, %xmm3		/* negate low lane       */
	movapd	%xmm3, 2 * SIZE(XX)

	movapd	%xmm5, 4 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 6 * SIZE(XX)

	/* Elements i + 2 and i + 3, same pattern. */
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3, 8 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 10 * SIZE(XX)

	movapd	%xmm5, 12 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 14 * SIZE(XX)

	subq	$-16 * SIZE, XX		/* advance 4 elements (2 slots each) */
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Tail: remaining M % 4 elements, one at a time. */
	movq	M, %rax
	andq	$3, %rax
	jle	.L05
	ALIGN_3

.L03:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4

	addpd	%xmm4, %xmm3

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	addq	$4 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	/* Default: operate on the caller's y in place ... */
	movq	Y, NEW_Y

	/* ... but round XX up to the next 512-byte boundary; if y is not
	 * contiguous (incy != 1 element), copy it there so the kernel
	 * can use aligned, unit-stride accesses. */
	addq	$512, XX
	andq	$-512, XX

	cmpq	$2 * SIZE, INCY
	je	.L10

	movq	Y,  YY
	movq	XX, NEW_Y

	/* .L06: copy 4 y elements per iteration into the buffer. */
	movq	M, %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	movhpd	1 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	movhpd	1 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	movhpd	1 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	/* Tail: remaining M % 4 y elements. */
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3
/* .L10/.L11: main blocked loop.  Processes two columns of A per
 * iteration (A1 = column is, A2 = column is+1), starting at the
 * diagonal.  For each pair:
 *   - the 2x2 diagonal block contribution goes into xsum1/xsum2,
 *   - the below-diagonal part both updates y (y += a*atemp) and
 *     accumulates the symmetric dot products into xsum1/xsum2.
 * atemp1..4 hold alpha*x[is..is+1] in the normal/swapped layout
 * prepared by .L01; xtemp1..4 stream the same layout for the rows.
 * The .L12 loop is software-pipelined: each MOVDDUP loads the a-value
 * for a *later* multiply, so statement order is load-latency tuned --
 * do not reorder. */
.L10:
	xorq	IS, IS			# is = 0

	cmpq	$2, N
	jl	.L20			/* fewer than 2 columns left */
	ALIGN_3

.L11:
	movq	A,  A1			/* A1 = column is           */
	leaq	(A, LDA, 1), A2		/* A2 = column is + 1       */
	leaq	4 * SIZE(A, LDA, 2), A	/* A  -> next column pair,  */
					/*      past the diagonal   */

	leaq	(, IS, SIZE), I
	leaq	0 * SIZE(NEW_X, I, 4), XX	/* x buffer: 2 slots/elem */
	leaq	4 * SIZE(NEW_Y, I, 2), YY	/* y, just below diagonal */

	/* alpha*x for the two diagonal elements (normal + swapped). */
	movapd	0 * SIZE(XX), atemp1
	movapd	2 * SIZE(XX), atemp2
	movapd	4 * SIZE(XX), atemp3
	movapd	6 * SIZE(XX), atemp4

	/* 2x2 diagonal block: initialize the column sums. */
	MOVDDUP(0 * SIZE, A1, xsum1)
	MOVDDUP(2 * SIZE, A1, xsum2)

	mulpd	atemp1, xsum1
	mulpd	atemp1, xsum2

	MOVDDUP(1 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A1, a2)

	mulpd	atemp2, a1
	mulpd	atemp2, a2

	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(2 * SIZE, A1, a1)
	MOVDDUP(2 * SIZE, A2, a2)

	mulpd	atemp3, a1
	mulpd	atemp3, a2

	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(3 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A2, a2)

	mulpd	atemp4, a1
	mulpd	atemp4, a2

	addpd	a1, xsum1
	addpd	a2, xsum2

	/* Prime the pipeline: first below-diagonal a-values, first two
	 * y elements, first four x slots. */
	MOVDDUP(4 * SIZE, A1, a1)
	MOVDDUP(6 * SIZE, A2, a2)

	movsd	0 * SIZE(YY), yy1
	movhpd	1 * SIZE(YY), yy1
	movsd	2 * SIZE(YY), yy2
	movhpd	3 * SIZE(YY), yy2

	movapd	 8 * SIZE(XX), xtemp1
	movapd	10 * SIZE(XX), xtemp2
	movapd	12 * SIZE(XX), xtemp3
	movapd	14 * SIZE(XX), xtemp4

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	/* I = (rows below the 2x2 diagonal block) / 4 */
	movq	M, I
	subq	IS, I
	subq	$2, I
	sarq	$2, I
	jle	.L15
	ALIGN_3

/* .L12: 4 rows (2 row pairs) per iteration.  Each group is:
 *   xt1 = xtemp * a   -> accumulate into xsum (dot-product half)
 *   a   = a * atemp   -> accumulate into yy   (axpy half)
 * with yy1/yy2 written back and reloaded two rows ahead. */
.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	PREFETCH PREFETCHSIZE(A1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	PREFETCH PREFETCHSIZE(XX)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3	/* rotate in next x slot */
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	/* Store finished y pair, load the pair two rows ahead. */
	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(6 * SIZE, A2, a2)

	PREFETCH PREFETCHSIZE(A2)

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	/* Second row pair of this iteration, same pattern. */
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(5 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(7 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(6 * SIZE, A1, a1)

	PREFETCHW PREFETCHSIZE(YY)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(4 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	20 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(7 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	16 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(5 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	22 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP( 8 * SIZE, A1, a1)

	movlpd	yy2,  6 * SIZE(YY)
	movhpd	yy2,  7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	18 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(10 * SIZE, A2, a2)

	movlpd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	subq	$-16 * SIZE, XX		/* 4 elements * 2 slots   */
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

/* .L15: handle 2 leftover rows (when (M - is - 2) & 2). */
.L15:
	movq	M, I
	subq	IS, I
	subq	$2, I
	testq	$2, I
	jle	.L16

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1, yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

/* .L16: one final row when M is odd (IS is always even, so the
 * remaining-row parity equals the parity of M). */
.L16:
	testq	$1, M
	jle	.L18

	MOVDDUP(1 * SIZE, A1, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1, yy1
	MOVDDUP(0 * SIZE, A2, a1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum1
	addpd	a2, yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum2
	addpd	a1, yy1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	ALIGN_3

/* .L18: fold the accumulated column sums into y[is] and y[is+1],
 * then advance to the next column pair. */
.L18:
	leaq	(, IS, SIZE), I

	movsd	0 * SIZE(NEW_Y, I, 2), yy1
	movhpd	1 * SIZE(NEW_Y, I, 2), yy1
	movsd	2 * SIZE(NEW_Y, I, 2), yy2
	movhpd	3 * SIZE(NEW_Y, I, 2), yy2

	addpd	xsum1, yy1
	addpd	xsum2, yy2

	movlpd	yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd	yy1, 1 * SIZE(NEW_Y, I, 2)
	movlpd	yy2, 2 * SIZE(NEW_Y, I, 2)
	movhpd	yy2, 3 * SIZE(NEW_Y, I, 2)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M, I
	jle	.L11			/* at least 2 columns remain */
	ALIGN_3
  592. .L20:
  593. HALT
  594. testq $1, N
  595. jle .L990
  596. leaq (, IS, SIZE), I
  597. movapd 0 * SIZE(NEW_X, I, 4), atemp1
  598. movapd 2 * SIZE(NEW_X, I, 4), atemp2
  599. movsd 0 * SIZE(NEW_Y, I, 2), yy1
  600. movhpd 1 * SIZE(NEW_Y, I, 2), yy1
  601. MOVDDUP(0 * SIZE, A, a1)
  602. MOVDDUP(1 * SIZE, A, a2)
  603. mulpd atemp1, a1
  604. mulpd atemp2, a2
  605. addpd a1, yy1
  606. addpd a2, yy1
  607. movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
  608. movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
  609. ALIGN_3
/* .L990: if y was staged in the aligned buffer (INCY != one element),
 * copy the results back to the caller's strided y. */
.L990:
	cmpq	$2 * SIZE, INCY
	je	.L999			/* y was updated in place */

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

/* .L996: 4 complex elements per iteration. */
.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	movhpd	%xmm1, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	movhpd	%xmm2, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	movhpd	%xmm3, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

/* .L997/.L998: remaining M % 4 elements, one at a time. */
.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movapd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$2 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3

/* .L999: restore callee-saved registers and return. */
.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15
	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE