
axpy_sse2.S 17 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

#include "l1param.h"
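
/* AXPY kernel (double precision, SSE2): computes y(i) += alpha * x(i) for i = 0..m-1. */
/* Unit-stride vectors take the unrolled vector paths below; any other stride (or a    */
/* zero stride) falls back to the strided/scalar code at .L40 and .L46.                */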

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	movaps	%xmm0, ALPHA
#else
	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

#ifdef WINDOWS_ABI
	movaps	%xmm3, ALPHA
#endif

	unpcklpd ALPHA, ALPHA

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L47

	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40
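
/* If Y is not 16-byte aligned, handle one element with scalar code first so that */
/* the stores in the unrolled loops below can use aligned movaps.                 */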
	testq	$SIZE, Y
	je	.L10

	movsd	(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4

.L10:
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	testq	$SIZE, X
	jne	.L20
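
/* X and Y are both 16-byte aligned here: aligned movaps loads and stores, unrolled by 16. */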
	movq	M, %rax
	sarq	$4, %rax
	jle	.L13

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	movaps	-8 * SIZE(X), %xmm4
	movaps	-6 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-4 * SIZE(X), %xmm6
	movaps	-2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	0 * SIZE(X), %xmm0
	movaps	2 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	4 * SIZE(X), %xmm2
	movaps	6 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X

	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	movaps	-8 * SIZE(X), %xmm4
	movaps	-6 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-4 * SIZE(X), %xmm6
	movaps	-2 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L13:
	movq	M, %rax
	andq	$8, %rax
	jle	.L14
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	movq	M, %rax
	andq	$4, %rax
	jle	.L15
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	movq	M, %rax
	andq	$2, %rax
	jle	.L16
	ALIGN_3

	movaps	-16 * SIZE(X), %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	movq	M, %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3
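
/* X is misaligned relative to Y (offset by one double). With ALIGNED_ACCESS, X is read   */
/* with aligned movaps at the shifted position and element pairs are recombined with      */
/* SHUFPD_1 so stores to Y stay aligned; otherwise the #else branch below loads X with    */
/* unaligned movsd/movhps pairs.                                                          */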
.L20:
#ifdef ALIGNED_ACCESS

	movhps	-16 * SIZE(X), %xmm0

	movq	M, %rax
	sarq	$4, %rax
	jle	.L23

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3

	decq	%rax
	jle	.L22
	ALIGN_4

.L21:
	movaps	-9 * SIZE(X), %xmm4
	movaps	-7 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm6
	movaps	-3 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	-1 * SIZE(X), %xmm0
	movaps	1 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movaps	3 * SIZE(X), %xmm2
	movaps	5 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movaps	-9 * SIZE(X), %xmm4
	movaps	-7 * SIZE(X), %xmm5

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	movaps	-1 * SIZE(X), %xmm0

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	-5 * SIZE(X), %xmm6
	movaps	-3 * SIZE(X), %xmm7

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm4, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	SHUFPD_1 %xmm5, %xmm4
	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	SHUFPD_1 %xmm6, %xmm5
	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	SHUFPD_1 %xmm7, %xmm6
	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	SHUFPD_1 %xmm0, %xmm7
	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L23:
	movq	M, %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2
	movaps	-11 * SIZE(X), %xmm3
	movaps	-9 * SIZE(X), %xmm8

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	SHUFPD_1 %xmm2, %xmm1
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	SHUFPD_1 %xmm3, %xmm2
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	SHUFPD_1 %xmm8, %xmm3
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movaps	%xmm8, %xmm0

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1
	movaps	-13 * SIZE(X), %xmm2

	SHUFPD_1 %xmm1, %xmm0
	SHUFPD_1 %xmm2, %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movaps	-15 * SIZE(X), %xmm1

	SHUFPD_1 %xmm1, %xmm0
	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3

#else
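/* ALIGNED_ACCESS not defined: read the misaligned X with movsd/movhps pairs instead. */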
	movq	M, %rax
	sarq	$4, %rax
	jle	.L23

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	-9 * SIZE(X), %xmm3

	decq	%rax
	jle	.L22
	ALIGN_3

.L21:
	movsd	-8 * SIZE(X), %xmm4
	movhps	-7 * SIZE(X), %xmm4
	movsd	-6 * SIZE(X), %xmm5
	movhps	-5 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-4 * SIZE(X), %xmm6
	movhps	-3 * SIZE(X), %xmm6
	movsd	-2 * SIZE(X), %xmm7
	movhps	-1 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	movsd	0 * SIZE(X), %xmm0
	movhps	1 * SIZE(X), %xmm0
	movsd	2 * SIZE(X), %xmm1
	movhps	3 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	movsd	4 * SIZE(X), %xmm2
	movhps	5 * SIZE(X), %xmm2
	movsd	6 * SIZE(X), %xmm3
	movhps	7 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X

	decq	%rax
	jg	.L21
	ALIGN_3

.L22:
	movsd	-8 * SIZE(X), %xmm4
	movhps	-7 * SIZE(X), %xmm4
	movsd	-6 * SIZE(X), %xmm5
	movhps	-5 * SIZE(X), %xmm5

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-4 * SIZE(X), %xmm6
	movhps	-3 * SIZE(X), %xmm6
	movsd	-2 * SIZE(X), %xmm7
	movhps	-1 * SIZE(X), %xmm7

	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	mulpd	ALPHA, %xmm4
	addpd	-8 * SIZE(Y), %xmm4
	movaps	%xmm4, -8 * SIZE(Y)

	mulpd	ALPHA, %xmm5
	addpd	-6 * SIZE(Y), %xmm5
	movaps	%xmm5, -6 * SIZE(Y)

	mulpd	ALPHA, %xmm6
	addpd	-4 * SIZE(Y), %xmm6
	movaps	%xmm6, -4 * SIZE(Y)

	mulpd	ALPHA, %xmm7
	addpd	-2 * SIZE(Y), %xmm7
	movaps	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L23:
	movq	M, %rax
	andq	$8, %rax
	jle	.L24
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	-9 * SIZE(X), %xmm3

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	mulpd	ALPHA, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	mulpd	ALPHA, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	mulpd	ALPHA, %xmm3
	addpd	-10 * SIZE(Y), %xmm3

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L24:
	movq	M, %rax
	andq	$4, %rax
	jle	.L25
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1

	mulpd	ALPHA, %xmm0
	mulpd	ALPHA, %xmm1

	addpd	-16 * SIZE(Y), %xmm0
	addpd	-14 * SIZE(Y), %xmm1

	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L25:
	movq	M, %rax
	andq	$2, %rax
	jle	.L26
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0

	mulpd	ALPHA, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L26:
	movq	M, %rax
	andq	$1, %rax
	jle	.L29
	ALIGN_3

	movsd	-16 * SIZE(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	-16 * SIZE(Y), %xmm0
	movsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L29:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	ALIGN_3
#endif

.L40:
	movq	Y, YY
	movq	M, %rax
	// If incx == 0 or incy == 0, skip the unrolled loop and use the scalar loop at .L46.
	cmpq	$0, INCX
	je	.L46
	cmpq	$0, INCY
	je	.L46

	sarq	$3, %rax
	jle	.L45
	ALIGN_3

.L41:
	movsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm0
	addq	INCX, X
	mulpd	ALPHA, %xmm0

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm0

	movsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulpd	ALPHA, %xmm1

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm1

	movsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm2
	addq	INCX, X
	mulpd	ALPHA, %xmm2

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm2

	movsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	movhpd	0 * SIZE(X), %xmm3
	addq	INCX, X
	mulpd	ALPHA, %xmm3

	movsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movhpd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	addpd	%xmm6, %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M, %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	movsd	(X), %xmm0
	addq	INCX, X

	mulsd	%xmm15, %xmm0
	addsd	(Y), %xmm0

	movsd	%xmm0, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS

	ret
	EPILOGUE