You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

zaxpy_atom.S 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef WINDOWS_ABI
  41. #define M ARG1
  42. #define X ARG4
  43. #define INCX ARG5
  44. #define Y ARG6
  45. #define INCY ARG2
  46. #else
  47. #define M ARG1
  48. #define X ARG2
  49. #define INCX ARG3
  50. #define Y ARG4
  51. #define INCY %r10
  52. #endif
  53. #define YY %r11
  54. #define ALPHA_R %xmm14
  55. #define ALPHA_I %xmm15
  56. #include "l1param.h"
  57. PROLOGUE
  58. PROFCODE
  59. #ifndef WINDOWS_ABI
  60. #ifndef XDOUBLE
  61. movq 8(%rsp), INCY
  62. #else
  63. movq 40(%rsp), INCY
  64. #endif
  65. #else
  66. movaps %xmm3, %xmm0
  67. movsd 40(%rsp), %xmm1
  68. movq 48(%rsp), X
  69. movq 56(%rsp), INCX
  70. movq 64(%rsp), Y
  71. movq 72(%rsp), INCY
  72. #endif
  73. SAVEREGISTERS
  74. #ifndef CONJ
  75. #define ADD1 subsd
  76. #define ADD2 addsd
  77. #else
  78. #define ADD1 addsd
  79. #define ADD2 subsd
  80. #endif
  81. salq $ZBASE_SHIFT, INCX
  82. movaps %xmm0, ALPHA_R
  83. salq $ZBASE_SHIFT, INCY
  84. movaps %xmm1, ALPHA_I
  85. testq M, M
  86. jle .L999
  87. cmpq $2 * SIZE, INCX
  88. jne .L20
  89. cmpq $2 * SIZE, INCY
  90. jne .L20
  91. movq M, %rax
  92. sarq $2, %rax
  93. jle .L15
  94. movsd 0 * SIZE(X), %xmm0
  95. movsd 1 * SIZE(X), %xmm1
  96. movsd 0 * SIZE(Y), %xmm8
  97. movsd 1 * SIZE(Y), %xmm9
  98. movsd 2 * SIZE(X), %xmm4
  99. movsd 3 * SIZE(X), %xmm5
  100. movsd 2 * SIZE(Y), %xmm10
  101. movsd 3 * SIZE(Y), %xmm11
  102. movaps %xmm0, %xmm2
  103. mulsd ALPHA_R, %xmm0
  104. movaps %xmm1, %xmm3
  105. mulsd ALPHA_R, %xmm1
  106. mulsd ALPHA_I, %xmm3
  107. mulsd ALPHA_I, %xmm2
  108. movaps %xmm4, %xmm6
  109. mulsd ALPHA_R, %xmm4
  110. addsd %xmm0, %xmm8
  111. movsd 4 * SIZE(X), %xmm0
  112. movaps %xmm5, %xmm7
  113. mulsd ALPHA_R, %xmm5
  114. ADD2 %xmm1, %xmm9
  115. movsd 5 * SIZE(X), %xmm1
  116. decq %rax
  117. jle .L12
  118. ALIGN_3
  119. .L11:
  120. #ifdef PREFETCH
  121. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  122. #endif
  123. mulsd ALPHA_I, %xmm7
  124. movsd 4 * SIZE(Y), %xmm12
  125. ADD1 %xmm3, %xmm8
  126. mulsd ALPHA_I, %xmm6
  127. movsd 5 * SIZE(Y), %xmm13
  128. addsd %xmm2, %xmm9
  129. addsd %xmm4, %xmm10
  130. movsd 6 * SIZE(X), %xmm4
  131. movaps %xmm0, %xmm2
  132. mulsd ALPHA_R, %xmm0
  133. ADD2 %xmm5, %xmm11
  134. movsd 7 * SIZE(X), %xmm5
  135. movaps %xmm1, %xmm3
  136. mulsd ALPHA_R, %xmm1
  137. ADD1 %xmm7, %xmm10
  138. movsd %xmm8, 0 * SIZE(Y)
  139. mulsd ALPHA_I, %xmm3
  140. addsd %xmm6, %xmm11
  141. movsd %xmm9, 1 * SIZE(Y)
  142. mulsd ALPHA_I, %xmm2
  143. movaps %xmm4, %xmm6
  144. movsd %xmm10, 2 * SIZE(Y)
  145. mulsd ALPHA_R, %xmm4
  146. movsd 6 * SIZE(Y), %xmm10
  147. addsd %xmm0, %xmm12
  148. movsd 8 * SIZE(X), %xmm0
  149. #ifdef PREFETCHW
  150. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
  151. #endif
  152. movaps %xmm5, %xmm7
  153. movsd %xmm11, 3 * SIZE(Y)
  154. mulsd ALPHA_R, %xmm5
  155. movsd 7 * SIZE(Y), %xmm11
  156. ADD2 %xmm1, %xmm13
  157. movsd 9 * SIZE(X), %xmm1
  158. mulsd ALPHA_I, %xmm7
  159. movsd 8 * SIZE(Y), %xmm8
  160. ADD1 %xmm3, %xmm12
  161. mulsd ALPHA_I, %xmm6
  162. movsd 9 * SIZE(Y), %xmm9
  163. addsd %xmm2, %xmm13
  164. movaps %xmm0, %xmm2
  165. mulsd ALPHA_R, %xmm0
  166. addsd %xmm4, %xmm10
  167. movsd 10 * SIZE(X), %xmm4
  168. movaps %xmm1, %xmm3
  169. mulsd ALPHA_R, %xmm1
  170. ADD2 %xmm5, %xmm11
  171. movsd 11 * SIZE(X), %xmm5
  172. mulsd ALPHA_I, %xmm3
  173. movsd %xmm12, 4 * SIZE(Y)
  174. ADD1 %xmm7, %xmm10
  175. mulsd ALPHA_I, %xmm2
  176. movsd %xmm13, 5 * SIZE(Y)
  177. addsd %xmm6, %xmm11
  178. movaps %xmm4, %xmm6
  179. movsd %xmm10, 6 * SIZE(Y)
  180. mulsd ALPHA_R, %xmm4
  181. addsd %xmm0, %xmm8
  182. movsd 10 * SIZE(Y), %xmm10
  183. movsd 12 * SIZE(X), %xmm0
  184. movaps %xmm5, %xmm7
  185. movsd %xmm11, 7 * SIZE(Y)
  186. mulsd ALPHA_R, %xmm5
  187. movsd 11 * SIZE(Y), %xmm11
  188. ADD2 %xmm1, %xmm9
  189. movsd 13 * SIZE(X), %xmm1
  190. addq $8 * SIZE, X
  191. addq $8 * SIZE, Y
  192. decq %rax
  193. jg .L11
  194. ALIGN_3
  195. .L12:
  196. mulsd ALPHA_I, %xmm7
  197. movsd 4 * SIZE(Y), %xmm12
  198. ADD1 %xmm3, %xmm8
  199. mulsd ALPHA_I, %xmm6
  200. movsd 5 * SIZE(Y), %xmm13
  201. addsd %xmm2, %xmm9
  202. addsd %xmm4, %xmm10
  203. movsd 6 * SIZE(X), %xmm4
  204. movaps %xmm0, %xmm2
  205. mulsd ALPHA_R, %xmm0
  206. ADD2 %xmm5, %xmm11
  207. movsd 7 * SIZE(X), %xmm5
  208. movaps %xmm1, %xmm3
  209. mulsd ALPHA_R, %xmm1
  210. ADD1 %xmm7, %xmm10
  211. movsd %xmm8, 0 * SIZE(Y)
  212. mulsd ALPHA_I, %xmm3
  213. addsd %xmm6, %xmm11
  214. movsd %xmm9, 1 * SIZE(Y)
  215. mulsd ALPHA_I, %xmm2
  216. movaps %xmm4, %xmm6
  217. movsd %xmm10, 2 * SIZE(Y)
  218. mulsd ALPHA_R, %xmm4
  219. movsd 6 * SIZE(Y), %xmm10
  220. addsd %xmm0, %xmm12
  221. movaps %xmm5, %xmm7
  222. movsd %xmm11, 3 * SIZE(Y)
  223. mulsd ALPHA_R, %xmm5
  224. ADD2 %xmm1, %xmm13
  225. movsd 7 * SIZE(Y), %xmm11
  226. mulsd ALPHA_I, %xmm7
  227. ADD1 %xmm3, %xmm12
  228. mulsd ALPHA_I, %xmm6
  229. addsd %xmm2, %xmm13
  230. movaps %xmm0, %xmm2
  231. mulsd ALPHA_R, %xmm0
  232. addsd %xmm4, %xmm10
  233. movaps %xmm1, %xmm3
  234. mulsd ALPHA_R, %xmm1
  235. ADD2 %xmm5, %xmm11
  236. mulsd ALPHA_I, %xmm3
  237. ADD1 %xmm7, %xmm10
  238. addsd %xmm6, %xmm11
  239. mulsd ALPHA_I, %xmm2
  240. movsd %xmm12, 4 * SIZE(Y)
  241. movsd %xmm13, 5 * SIZE(Y)
  242. movsd %xmm10, 6 * SIZE(Y)
  243. movsd %xmm11, 7 * SIZE(Y)
  244. addq $8 * SIZE, X
  245. addq $8 * SIZE, Y
  246. ALIGN_3
  247. .L15:
  248. movq M, %rax
  249. andq $2, %rax
  250. jle .L17
  251. movsd 0 * SIZE(X), %xmm0
  252. movsd 1 * SIZE(X), %xmm1
  253. movsd 2 * SIZE(X), %xmm4
  254. movsd 3 * SIZE(X), %xmm5
  255. movaps %xmm0, %xmm2
  256. movsd 0 * SIZE(Y), %xmm8
  257. mulsd ALPHA_R, %xmm0
  258. movaps %xmm1, %xmm3
  259. movsd 1 * SIZE(Y), %xmm9
  260. mulsd ALPHA_R, %xmm1
  261. movsd 2 * SIZE(Y), %xmm10
  262. mulsd ALPHA_I, %xmm3
  263. movsd 3 * SIZE(Y), %xmm11
  264. mulsd ALPHA_I, %xmm2
  265. movaps %xmm4, %xmm6
  266. mulsd ALPHA_R, %xmm4
  267. addsd %xmm0, %xmm8
  268. movaps %xmm5, %xmm7
  269. mulsd ALPHA_R, %xmm5
  270. ADD2 %xmm1, %xmm9
  271. mulsd ALPHA_I, %xmm7
  272. ADD1 %xmm3, %xmm8
  273. mulsd ALPHA_I, %xmm6
  274. addsd %xmm2, %xmm9
  275. addsd %xmm4, %xmm10
  276. movsd %xmm8, 0 * SIZE(Y)
  277. ADD2 %xmm5, %xmm11
  278. movsd %xmm9, 1 * SIZE(Y)
  279. ADD1 %xmm7, %xmm10
  280. addsd %xmm6, %xmm11
  281. movsd %xmm10, 2 * SIZE(Y)
  282. movsd %xmm11, 3 * SIZE(Y)
  283. addq $4 * SIZE, X
  284. addq $4 * SIZE, Y
  285. ALIGN_3
  286. .L17:
  287. movq M, %rax
  288. andq $1, %rax
  289. jle .L999
  290. movsd 0 * SIZE(X), %xmm0
  291. movsd 1 * SIZE(X), %xmm1
  292. movsd 0 * SIZE(Y), %xmm8
  293. movsd 1 * SIZE(Y), %xmm9
  294. movaps %xmm0, %xmm2
  295. mulsd ALPHA_R, %xmm0
  296. movaps %xmm1, %xmm3
  297. mulsd ALPHA_R, %xmm1
  298. mulsd ALPHA_I, %xmm3
  299. mulsd ALPHA_I, %xmm2
  300. addsd %xmm0, %xmm8
  301. ADD2 %xmm1, %xmm9
  302. ADD1 %xmm3, %xmm8
  303. addsd %xmm2, %xmm9
  304. movsd %xmm8, 0 * SIZE(Y)
  305. movsd %xmm9, 1 * SIZE(Y)
  306. jmp .L999
  307. ALIGN_3
  308. .L20:
  309. movq Y, YY
  310. movq M, %rax
  311. sarq $2, %rax
  312. jle .L25
  313. movsd 0 * SIZE(X), %xmm0
  314. movsd 1 * SIZE(X), %xmm1
  315. addq INCX, X
  316. movsd 0 * SIZE(Y), %xmm8
  317. movsd 1 * SIZE(Y), %xmm9
  318. addq INCY, Y
  319. movsd 0 * SIZE(X), %xmm4
  320. movsd 1 * SIZE(X), %xmm5
  321. addq INCX, X
  322. movsd 0 * SIZE(Y), %xmm10
  323. movsd 1 * SIZE(Y), %xmm11
  324. addq INCY, Y
  325. movaps %xmm0, %xmm2
  326. mulsd ALPHA_R, %xmm0
  327. movaps %xmm1, %xmm3
  328. mulsd ALPHA_R, %xmm1
  329. mulsd ALPHA_I, %xmm3
  330. mulsd ALPHA_I, %xmm2
  331. movaps %xmm4, %xmm6
  332. mulsd ALPHA_R, %xmm4
  333. addsd %xmm0, %xmm8
  334. movsd 0 * SIZE(X), %xmm0
  335. movaps %xmm5, %xmm7
  336. mulsd ALPHA_R, %xmm5
  337. ADD2 %xmm1, %xmm9
  338. movsd 1 * SIZE(X), %xmm1
  339. addq INCX, X
  340. decq %rax
  341. jle .L22
  342. ALIGN_3
  343. .L21:
  344. mulsd ALPHA_I, %xmm7
  345. movsd 0 * SIZE(Y), %xmm12
  346. ADD1 %xmm3, %xmm8
  347. mulsd ALPHA_I, %xmm6
  348. movsd 1 * SIZE(Y), %xmm13
  349. addsd %xmm2, %xmm9
  350. addq INCY, Y
  351. addsd %xmm4, %xmm10
  352. movsd 0 * SIZE(X), %xmm4
  353. movaps %xmm0, %xmm2
  354. mulsd ALPHA_R, %xmm0
  355. ADD2 %xmm5, %xmm11
  356. movsd 1 * SIZE(X), %xmm5
  357. movaps %xmm1, %xmm3
  358. addq INCX, X
  359. mulsd ALPHA_R, %xmm1
  360. ADD1 %xmm7, %xmm10
  361. movsd %xmm8, 0 * SIZE(YY)
  362. mulsd ALPHA_I, %xmm3
  363. addsd %xmm6, %xmm11
  364. movsd %xmm9, 1 * SIZE(YY)
  365. mulsd ALPHA_I, %xmm2
  366. addq INCY, YY
  367. movaps %xmm4, %xmm6
  368. movsd %xmm10, 0 * SIZE(YY)
  369. mulsd ALPHA_R, %xmm4
  370. movsd 0 * SIZE(Y), %xmm10
  371. addsd %xmm0, %xmm12
  372. movsd 0 * SIZE(X), %xmm0
  373. movaps %xmm5, %xmm7
  374. movsd %xmm11, 1 * SIZE(YY)
  375. addq INCY, YY
  376. mulsd ALPHA_R, %xmm5
  377. movsd 1 * SIZE(Y), %xmm11
  378. addq INCY, Y
  379. ADD2 %xmm1, %xmm13
  380. movsd 1 * SIZE(X), %xmm1
  381. addq INCX, X
  382. mulsd ALPHA_I, %xmm7
  383. movsd 0 * SIZE(Y), %xmm8
  384. ADD1 %xmm3, %xmm12
  385. mulsd ALPHA_I, %xmm6
  386. movsd 1 * SIZE(Y), %xmm9
  387. addsd %xmm2, %xmm13
  388. addq INCY, Y
  389. movaps %xmm0, %xmm2
  390. mulsd ALPHA_R, %xmm0
  391. addsd %xmm4, %xmm10
  392. movsd 0 * SIZE(X), %xmm4
  393. movaps %xmm1, %xmm3
  394. mulsd ALPHA_R, %xmm1
  395. ADD2 %xmm5, %xmm11
  396. movsd 1 * SIZE(X), %xmm5
  397. addq INCX, X
  398. mulsd ALPHA_I, %xmm3
  399. movsd %xmm12, 0 * SIZE(YY)
  400. ADD1 %xmm7, %xmm10
  401. mulsd ALPHA_I, %xmm2
  402. movsd %xmm13, 1 * SIZE(YY)
  403. addsd %xmm6, %xmm11
  404. addq INCY, YY
  405. movaps %xmm4, %xmm6
  406. movsd %xmm10, 0 * SIZE(YY)
  407. mulsd ALPHA_R, %xmm4
  408. addsd %xmm0, %xmm8
  409. movsd 0 * SIZE(Y), %xmm10
  410. movsd 0 * SIZE(X), %xmm0
  411. movaps %xmm5, %xmm7
  412. movsd %xmm11, 1 * SIZE(YY)
  413. addq INCY, YY
  414. mulsd ALPHA_R, %xmm5
  415. movsd 1 * SIZE(Y), %xmm11
  416. addq INCY, Y
  417. ADD2 %xmm1, %xmm9
  418. movsd 1 * SIZE(X), %xmm1
  419. addq INCX, X
  420. decq %rax
  421. jg .L21
  422. ALIGN_3
  423. .L22:
  424. mulsd ALPHA_I, %xmm7
  425. movsd 0 * SIZE(Y), %xmm12
  426. ADD1 %xmm3, %xmm8
  427. mulsd ALPHA_I, %xmm6
  428. movsd 1 * SIZE(Y), %xmm13
  429. addsd %xmm2, %xmm9
  430. addq INCY, Y
  431. addsd %xmm4, %xmm10
  432. movsd 0 * SIZE(X), %xmm4
  433. movaps %xmm0, %xmm2
  434. mulsd ALPHA_R, %xmm0
  435. ADD2 %xmm5, %xmm11
  436. movsd 1 * SIZE(X), %xmm5
  437. movaps %xmm1, %xmm3
  438. addq INCX, X
  439. mulsd ALPHA_R, %xmm1
  440. ADD1 %xmm7, %xmm10
  441. movsd %xmm8, 0 * SIZE(YY)
  442. mulsd ALPHA_I, %xmm3
  443. addsd %xmm6, %xmm11
  444. movsd %xmm9, 1 * SIZE(YY)
  445. mulsd ALPHA_I, %xmm2
  446. addq INCY, YY
  447. movaps %xmm4, %xmm6
  448. movsd %xmm10, 0 * SIZE(YY)
  449. mulsd ALPHA_R, %xmm4
  450. movsd 0 * SIZE(Y), %xmm10
  451. addsd %xmm0, %xmm12
  452. movaps %xmm5, %xmm7
  453. movsd %xmm11, 1 * SIZE(YY)
  454. mulsd ALPHA_R, %xmm5
  455. addq INCY, YY
  456. ADD2 %xmm1, %xmm13
  457. movsd 1 * SIZE(Y), %xmm11
  458. mulsd ALPHA_I, %xmm7
  459. addq INCY, Y
  460. ADD1 %xmm3, %xmm12
  461. mulsd ALPHA_I, %xmm6
  462. addsd %xmm2, %xmm13
  463. movaps %xmm0, %xmm2
  464. mulsd ALPHA_R, %xmm0
  465. addsd %xmm4, %xmm10
  466. movaps %xmm1, %xmm3
  467. mulsd ALPHA_R, %xmm1
  468. ADD2 %xmm5, %xmm11
  469. mulsd ALPHA_I, %xmm3
  470. ADD1 %xmm7, %xmm10
  471. addsd %xmm6, %xmm11
  472. mulsd ALPHA_I, %xmm2
  473. movsd %xmm12, 0 * SIZE(YY)
  474. movsd %xmm13, 1 * SIZE(YY)
  475. addq INCY, YY
  476. movsd %xmm10, 0 * SIZE(YY)
  477. movsd %xmm11, 1 * SIZE(YY)
  478. addq INCY, YY
  479. ALIGN_3
  480. .L25:
  481. movq M, %rax
  482. andq $2, %rax
  483. jle .L27
  484. movsd 0 * SIZE(X), %xmm0
  485. movsd 1 * SIZE(X), %xmm1
  486. addq INCX, X
  487. movsd 0 * SIZE(X), %xmm4
  488. movsd 1 * SIZE(X), %xmm5
  489. addq INCX, X
  490. movaps %xmm0, %xmm2
  491. movsd 0 * SIZE(Y), %xmm8
  492. mulsd ALPHA_R, %xmm0
  493. movaps %xmm1, %xmm3
  494. movsd 1 * SIZE(Y), %xmm9
  495. addq INCY, Y
  496. mulsd ALPHA_R, %xmm1
  497. movsd 0 * SIZE(Y), %xmm10
  498. mulsd ALPHA_I, %xmm3
  499. movsd 1 * SIZE(Y), %xmm11
  500. mulsd ALPHA_I, %xmm2
  501. addq INCY, Y
  502. movaps %xmm4, %xmm6
  503. mulsd ALPHA_R, %xmm4
  504. addsd %xmm0, %xmm8
  505. movaps %xmm5, %xmm7
  506. mulsd ALPHA_R, %xmm5
  507. ADD2 %xmm1, %xmm9
  508. mulsd ALPHA_I, %xmm7
  509. ADD1 %xmm3, %xmm8
  510. mulsd ALPHA_I, %xmm6
  511. addsd %xmm2, %xmm9
  512. addsd %xmm4, %xmm10
  513. movsd %xmm8, 0 * SIZE(YY)
  514. ADD2 %xmm5, %xmm11
  515. movsd %xmm9, 1 * SIZE(YY)
  516. ADD1 %xmm7, %xmm10
  517. addq INCY, YY
  518. addsd %xmm6, %xmm11
  519. movsd %xmm10, 0 * SIZE(YY)
  520. movsd %xmm11, 1 * SIZE(YY)
  521. addq INCY, YY
  522. ALIGN_3
  523. .L27:
  524. movq M, %rax
  525. andq $1, %rax
  526. jle .L999
  527. movsd 0 * SIZE(X), %xmm0
  528. movsd 1 * SIZE(X), %xmm1
  529. movsd 0 * SIZE(Y), %xmm8
  530. movsd 1 * SIZE(Y), %xmm9
  531. movaps %xmm0, %xmm2
  532. mulsd ALPHA_R, %xmm0
  533. movaps %xmm1, %xmm3
  534. mulsd ALPHA_R, %xmm1
  535. mulsd ALPHA_I, %xmm3
  536. mulsd ALPHA_I, %xmm2
  537. addsd %xmm0, %xmm8
  538. ADD2 %xmm1, %xmm9
  539. ADD1 %xmm3, %xmm8
  540. addsd %xmm2, %xmm9
  541. movsd %xmm8, 0 * SIZE(YY)
  542. movsd %xmm9, 1 * SIZE(YY)
  543. ALIGN_3
  544. .L999:
  545. xorq %rax, %rax
  546. RESTOREREGISTERS
  547. ret
  548. EPILOGUE