
gemv_n_sse2.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/*    The views and conclusions contained in the software and        */
/*    documentation are those of the authors and should not be       */
/*    interpreted as representing official policies, either          */
/*    expressed or implied, of The University of Texas at Austin.    */
/*********************************************************************/
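
/*
 * gemv_n kernel: double precision, SSE2, 32-bit x86.
 * Computes y += alpha * A * x for a column-major matrix A (leading
 * dimension LDA) and strided vectors x (INCX) and y (INCY).
 * A contiguous BUFFER accumulates alpha * A * x and is added back
 * into the strided y at the end; beta scaling of y is assumed to be
 * handled by the caller, since this kernel only ever adds into y.
 */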
#define ASSEMBLER
#include "common.h"
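
/*
 * Per-target prefetch tuning: each supported microarchitecture picks
 * a prefetch instruction and a prefetch distance (PREFETCHSIZE, in
 * elements ahead of the current access). OPTERON additionally aliases
 * movsd to movlps, presumably because movlps is cheaper there.
 */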
#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 3)
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetch
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 4)
#endif
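
/*
 * Stack frame: arguments live above the return address, the four
 * callee-saved register pushes (STACKSIZE bytes), and ARGS bytes of
 * local scratch. ALPHA is a double and occupies the 8 bytes at
 * offsets 16..23; the 4-byte slot at offset 12 is not referenced by
 * this kernel. MMM, YY and AA are locals used by the outer blocking
 * loop to track the remaining row count and the original A and Y.
 */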
#define STACKSIZE	16
#define ARGS		16

#define M		 4 + STACKSIZE+ARGS(%esp)
#define N		 8 + STACKSIZE+ARGS(%esp)
#define ALPHA		16 + STACKSIZE+ARGS(%esp)
#define A		24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA	28 + STACKSIZE+ARGS(%esp)
#define STACK_X		32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX	36 + STACKSIZE+ARGS(%esp)
#define Y		40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY	44 + STACKSIZE+ARGS(%esp)
#define BUFFER		48 + STACKSIZE+ARGS(%esp)

#define MMM	0+ARGS(%esp)
#define YY	4+ARGS(%esp)
#define AA	8+ARGS(%esp)
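
/*
 * Register roles. INCY shares %ebx with the column counter J: INCY
 * is only needed in the final copy-back phase, after the loop over
 * columns has finished.
 */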
#define I	%eax
#define J	%ebx
#define INCX	%ecx
#define INCY	J
#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp
	PROLOGUE

	subl	$ARGS,%esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	Y,J
	movl	J,YY	# backup Y
	movl	A,J
	movl	J,AA	# backup A
	movl	M,J
	movl	J,MMM	# backup M
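
/*
 * Outer blocking loop: rows are processed in blocks of 2^20 so the
 * BUFFER accumulator stays a bounded size. The subl sets the flags
 * that jge tests (movl does not change them); once the remaining
 * count MMM goes negative, the final partial block of MMM + 2^20
 * rows is processed, and .L999x is reached when nothing is left.
 */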
.L0t:
	xorl	J,J
	addl	$1,J
	sall	$20,J
	subl	J,MMM
	movl	J,M
	jge	.L00t
	ALIGN_4

	movl	MMM,%eax
	addl	J,%eax
	jle	.L999x
	movl	%eax,M

.L00t:
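	/* Per-block setup: reload this block's A and Y, convert INCX
	   and LDA from element counts to byte strides, and bias A by
	   16 elements so the loops below can address operands with
	   small negative displacements (-16 * SIZE upward). */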
	movl	AA,%eax
	movl	%eax,A
	movl	YY,J
	movl	J,Y

	movl	STACK_LDA, LDA
	movl	STACK_X, X
	movl	STACK_INCX, INCX

	leal	(,INCX, SIZE), INCX
	leal	(,LDA, SIZE), LDA

	subl	$-16 * SIZE, A

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999
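
	/* Clear the accumulation buffer in blocks of 16 doubles
	   ((M + 16) >> 4 iterations, i.e. slightly more than M
	   elements). */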
	movl	BUFFER, Y1

	pxor	%xmm7, %xmm7

	movl	M, %eax
	addl	$16, %eax
	sarl	$4, %eax
	ALIGN_3

.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)

	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3
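
/*
 * Loop over columns, two at a time (J = N >> 1). For each pair,
 * alpha * x[j] and alpha * x[j+1] are broadcast into %xmm6 and %xmm7
 * (movddup when SSE3 is available, movsd + unpcklpd otherwise), and
 * the buffer is updated with
 * buffer += alpha*x[j] * A(:,j) + alpha*x[j+1] * A(:,j+1).
 */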
.L10:
	movl	N, J
	sarl	$1, J
	jle	.L20
	ALIGN_3

.L11:
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A, A1
	leal	(A1, LDA, 2), %eax
	movl	%eax, A

#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X
	movddup	(X), %xmm7
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
	mulpd	%xmm0, %xmm7
#else
	movsd	(X), %xmm6
	addl	INCX, X
	movsd	(X), %xmm7
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	mulsd	%xmm0, %xmm7

	unpcklpd %xmm6, %xmm6
	unpcklpd %xmm7, %xmm7
#endif
	ALIGN_3
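
	/* Rows in chunks of 8, software pipelined: the loads for the
	   next chunk (movsd/movhpd pairs, since columns of A are not
	   guaranteed 16-byte aligned) are issued while the current
	   chunk is multiplied and accumulated into the aligned
	   buffer. */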
	movl	M, I
	sarl	$3, I
	jle	.L15

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	movsd	-14 * SIZE(A1, LDA), %xmm5
	movhpd	-13 * SIZE(A1, LDA), %xmm5

	decl	I
	jle	.L14
	ALIGN_3

.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm5
	movhpd	 -9 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	movhpd	 -7 * SIZE(A1), %xmm2

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	 -6 * SIZE(A1), %xmm3
	movhpd	 -5 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	 -8 * SIZE(A1, LDA), %xmm4
	movhpd	 -7 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	 -8 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	 -6 * SIZE(A1, LDA), %xmm5
	movhpd	 -5 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	 -6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3
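	/* Loop tail: consume the operands already loaded for the last
	   8-row chunk without issuing further loads. */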
.L14:
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	movhpd	-11 * SIZE(A1, LDA), %xmm4

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movsd	-10 * SIZE(A1, LDA), %xmm5
	movhpd	 -9 * SIZE(A1, LDA), %xmm5

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)

	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3
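	/* Remaining rows for this column pair: 4, then 2, then 1. */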
.L15:
	testl	$4, M
	je	.L16

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	movsd	-16 * SIZE(A1, LDA), %xmm4
	movhpd	-15 * SIZE(A1, LDA), %xmm4
	movsd	-14 * SIZE(A1, LDA), %xmm5
	movhpd	-13 * SIZE(A1, LDA), %xmm5

	mulpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm0
	mulpd	%xmm7, %xmm5
	addpd	%xmm5, %xmm1

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L16:
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm3
	movhpd	-15 * SIZE(A1, LDA), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm7, %xmm3
	addpd	%xmm3, %xmm0

	movapd	%xmm0, -16 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

.L17:
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm3

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm3
	addsd	%xmm3, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L11
	ALIGN_4
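
/*
 * Odd column count: one final single-column update,
 * buffer += alpha * x[last] * A(:,last).
 */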
.L20:
	testl	$1, N
	jle	.L990

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A, A1

#ifdef HAVE_SSE3
	movddup	(X), %xmm6
	addl	INCX, X

	movddup	ALPHA, %xmm0

	mulpd	%xmm0, %xmm6
#else
	movsd	(X), %xmm6
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6

	unpcklpd %xmm6, %xmm6
#endif
	ALIGN_3
	movl	M, I
	sarl	$3, I
	jle	.L25

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	decl	I
	jle	.L24
	ALIGN_3

.L23:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	movhpd	 -7 * SIZE(A1), %xmm2

	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	 -8 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	 -6 * SIZE(A1), %xmm3
	movhpd	 -5 * SIZE(A1), %xmm3

	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	 -6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L23
	ALIGN_3

.L24:
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movsd	-10 * SIZE(A1), %xmm3
	movhpd	 -9 * SIZE(A1), %xmm3

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)

	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3
.L25:
	testl	$4, M
	je	.L26

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm3
	movhpd	-13 * SIZE(A1), %xmm3

	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm6, %xmm3
	addpd	%xmm3, %xmm1

	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	%xmm1, -14 * SIZE(Y1)

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L26:
	testl	$2, M
	je	.L27

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2

	movapd	-16 * SIZE(Y1), %xmm0

	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0

	movapd	%xmm0, -16 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

.L27:
	testl	$1, M
	je	.L990

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3
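
/*
 * Copy-back: add the contiguous buffer into the user's strided y,
 * two elements per movsd/movhpd pair, eight per iteration, followed
 * by 4/2/1 tails.
 */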
.L990:
	movl	Y, Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$BASE_SHIFT, INCY

	movl	M, %eax
	sarl	$3, %eax
	jle	.L994
	ALIGN_3

.L992:
	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	0 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	2 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	4 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	6 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3
.L994:
	testl	$7, M
	jle	.L999

	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	0 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	2 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	movhpd	(Y1, INCY), %xmm0
	addpd	0 * SIZE(X), %xmm0
	movlpd	%xmm0, (Y1)
	movhpd	%xmm0, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	addl	$2 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

	movsd	(Y1), %xmm0
	movsd	0 * SIZE(X), %xmm4

	addsd	%xmm4, %xmm0

	movlpd	%xmm0, (Y1)
	ALIGN_3
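	/* Advance the saved pointers past this block: A by M elements,
	   Y by M * INCY elements, then start the next row block. */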
.L999:
	movl	M,J
	leal	(,J,SIZE),%eax
	addl	%eax,AA
	movl	STACK_INCY,INCY
	imull	INCY,%eax
	addl	%eax,YY
	jmp	.L0t
	ALIGN_4
.L999x:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS,%esp
	ret

	EPILOGUE