You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gemv_n_sse.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifdef movsd
  41. #undef movsd
  42. #endif
  43. #ifdef PENTIUM3
  44. #ifdef HAVE_SSE
  45. #define PREFETCH prefetcht0
  46. #define PREFETCHW prefetcht0
  47. #define PREFETCHSIZE (16 * 2)
  48. #endif
  49. #define movsd movlps
  50. #endif
  51. #ifdef PENTIUM4
  52. #define PREFETCH prefetcht0
  53. #define PREFETCHW prefetcht0
  54. #define PREFETCHSIZE (16 * 4)
  55. #endif
  56. #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
  57. #define PREFETCH prefetcht0
  58. #define PREFETCHW prefetcht0
  59. #define PREFETCHSIZE (16 * 7)
  60. #endif
  61. #ifdef OPTERON
  62. #define PREFETCH prefetchnta
  63. #define PREFETCHW prefetchw
  64. #define PREFETCHSIZE (16 * 3)
  65. #define movsd movlps
  66. #endif
  67. #ifdef BARCELONA
  68. #define PREFETCH prefetchnta
  69. #define PREFETCHW prefetchw
  70. #define PREFETCHSIZE (16 * 5)
  71. #endif
  72. #ifdef ATOM
  73. #define PREFETCH prefetchnta
  74. #define PREFETCHW prefetcht0
  75. #define PREFETCHSIZE (16 * 6)
  76. #endif
  77. #ifdef NANO
  78. #define PREFETCH prefetcht0
  79. #define PREFETCHSIZE (16 * 4)
  80. #endif
  81. #define STACKSIZE 16
  82. #define ARGS 16
  83. #define M 4 + STACKSIZE+ARGS(%esp)
  84. #define N 8 + STACKSIZE+ARGS(%esp)
  85. #define ALPHA 16 + STACKSIZE+ARGS(%esp)
  86. #define A 20 + STACKSIZE+ARGS(%esp)
  87. #define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
  88. #define STACK_X 28 + STACKSIZE+ARGS(%esp)
  89. #define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
  90. #define Y 36 + STACKSIZE+ARGS(%esp)
  91. #define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
  92. #define BUFFER 44 + STACKSIZE+ARGS(%esp)
  93. #define MMM 0+ARGS(%esp)
  94. #define YY 4+ARGS(%esp)
  95. #define AA 8+ARGS(%esp)
  96. #define I %eax
  97. #define J %ebx
  98. #define INCX %ecx
  99. #define INCY J
  100. #define A1 %esi
  101. #define X %edx
  102. #define Y1 %edi
  103. #define LDA %ebp
  104. PROLOGUE
  105. subl $ARGS,%esp
  106. pushl %ebp
  107. pushl %edi
  108. pushl %esi
  109. pushl %ebx
  110. PROFCODE
  111. movl Y,J
  112. movl J,YY # backup Y
  113. movl A,J
  114. movl J,AA # backup A
  115. movl M,J
  116. movl J,MMM # backup MM
  117. .L0t:
  118. xorl J,J
  119. addl $1,J
  120. sall $21,J
  121. subl J,MMM
  122. movl J,M
  123. jge .L00t
  124. ALIGN_4
  125. movl MMM,%eax
  126. addl J,%eax
  127. jle .L999x
  128. movl %eax,M
  129. .L00t:
  130. movl AA,%eax
  131. movl %eax,A
  132. movl YY,J
  133. movl J,Y
  134. movl STACK_LDA, LDA
  135. movl STACK_X, X
  136. movl STACK_INCX, INCX
  137. leal (,INCX, SIZE), INCX
  138. leal (,LDA, SIZE), LDA
  139. subl $-32 * SIZE, A
  140. cmpl $0, N
  141. jle .L999
  142. cmpl $0, M
  143. jle .L999
  144. movl BUFFER, Y1
  145. xorps %xmm7, %xmm7
  146. movl M, %eax
  147. addl $16, %eax
  148. sarl $4, %eax
  149. ALIGN_3
  150. .L01:
  151. movaps %xmm7, 0 * SIZE(Y1)
  152. movaps %xmm7, 4 * SIZE(Y1)
  153. movaps %xmm7, 8 * SIZE(Y1)
  154. movaps %xmm7, 12 * SIZE(Y1)
  155. subl $-16 * SIZE, Y1
  156. decl %eax
  157. jg .L01
  158. ALIGN_3
  159. .L10:
  160. movl N, J
  161. sarl $1, J
  162. jle .L20
  163. ALIGN_3
  164. .L11:
  165. movl BUFFER, Y1
  166. addl $32 * SIZE, Y1
  167. movl A, A1
  168. leal (A1, LDA, 2), %eax
  169. movl %eax, A
  170. movss (X), %xmm6
  171. addl INCX, X
  172. movss (X), %xmm7
  173. addl INCX, X
  174. movss ALPHA, %xmm0
  175. mulss %xmm0, %xmm6
  176. mulss %xmm0, %xmm7
  177. shufps $0, %xmm6, %xmm6
  178. shufps $0, %xmm7, %xmm7
  179. ALIGN_3
  180. movl M, I
  181. sarl $4, I
  182. jle .L15
  183. movsd -32 * SIZE(A1), %xmm2
  184. movhps -30 * SIZE(A1), %xmm2
  185. movsd -28 * SIZE(A1), %xmm3
  186. movhps -26 * SIZE(A1), %xmm3
  187. movaps -32 * SIZE(Y1), %xmm0
  188. movaps -28 * SIZE(Y1), %xmm1
  189. movsd -32 * SIZE(A1, LDA), %xmm4
  190. movhps -30 * SIZE(A1, LDA), %xmm4
  191. movsd -28 * SIZE(A1, LDA), %xmm5
  192. movhps -26 * SIZE(A1, LDA), %xmm5
  193. decl I
  194. jle .L14
  195. ALIGN_3
  196. .L13:
  197. #ifdef PREFETCH
  198. PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
  199. #endif
  200. mulps %xmm6, %xmm2
  201. addps %xmm2, %xmm0
  202. movsd -24 * SIZE(A1), %xmm2
  203. movhps -22 * SIZE(A1), %xmm2
  204. mulps %xmm6, %xmm3
  205. addps %xmm3, %xmm1
  206. movsd -20 * SIZE(A1), %xmm3
  207. movhps -18 * SIZE(A1), %xmm3
  208. mulps %xmm7, %xmm4
  209. addps %xmm4, %xmm0
  210. movsd -24 * SIZE(A1, LDA), %xmm4
  211. movhps -22 * SIZE(A1, LDA), %xmm4
  212. movaps %xmm0, -32 * SIZE(Y1)
  213. movaps -24 * SIZE(Y1), %xmm0
  214. mulps %xmm7, %xmm5
  215. addps %xmm5, %xmm1
  216. movsd -20 * SIZE(A1, LDA), %xmm5
  217. movhps -18 * SIZE(A1, LDA), %xmm5
  218. movaps %xmm1, -28 * SIZE(Y1)
  219. movaps -20 * SIZE(Y1), %xmm1
  220. #ifdef PREFETCH
  221. PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA)
  222. #endif
  223. mulps %xmm6, %xmm2
  224. addps %xmm2, %xmm0
  225. movsd -16 * SIZE(A1), %xmm2
  226. movhps -14 * SIZE(A1), %xmm2
  227. mulps %xmm6, %xmm3
  228. addps %xmm3, %xmm1
  229. movsd -12 * SIZE(A1), %xmm3
  230. movhps -10 * SIZE(A1), %xmm3
  231. mulps %xmm7, %xmm4
  232. addps %xmm4, %xmm0
  233. movsd -16 * SIZE(A1, LDA), %xmm4
  234. movhps -14 * SIZE(A1, LDA), %xmm4
  235. movaps %xmm0, -24 * SIZE(Y1)
  236. movaps -16 * SIZE(Y1), %xmm0
  237. mulps %xmm7, %xmm5
  238. addps %xmm5, %xmm1
  239. movsd -12 * SIZE(A1, LDA), %xmm5
  240. movhps -10 * SIZE(A1, LDA), %xmm5
  241. movaps %xmm1, -20 * SIZE(Y1)
  242. movaps -12 * SIZE(Y1), %xmm1
  243. subl $-16 * SIZE, A1
  244. subl $-16 * SIZE, Y1
  245. subl $1, I
  246. BRANCH
  247. jg .L13
  248. ALIGN_3
  249. .L14:
  250. mulps %xmm6, %xmm2
  251. addps %xmm2, %xmm0
  252. movsd -24 * SIZE(A1), %xmm2
  253. movhps -22 * SIZE(A1), %xmm2
  254. mulps %xmm6, %xmm3
  255. addps %xmm3, %xmm1
  256. movsd -20 * SIZE(A1), %xmm3
  257. movhps -18 * SIZE(A1), %xmm3
  258. mulps %xmm7, %xmm4
  259. addps %xmm4, %xmm0
  260. movsd -24 * SIZE(A1, LDA), %xmm4
  261. movhps -22 * SIZE(A1, LDA), %xmm4
  262. movaps %xmm0, -32 * SIZE(Y1)
  263. movaps -24 * SIZE(Y1), %xmm0
  264. mulps %xmm7, %xmm5
  265. addps %xmm5, %xmm1
  266. movsd -20 * SIZE(A1, LDA), %xmm5
  267. movhps -18 * SIZE(A1, LDA), %xmm5
  268. movaps %xmm1, -28 * SIZE(Y1)
  269. movaps -20 * SIZE(Y1), %xmm1
  270. mulps %xmm6, %xmm2
  271. addps %xmm2, %xmm0
  272. mulps %xmm6, %xmm3
  273. addps %xmm3, %xmm1
  274. mulps %xmm7, %xmm4
  275. addps %xmm4, %xmm0
  276. movaps %xmm0, -24 * SIZE(Y1)
  277. mulps %xmm7, %xmm5
  278. addps %xmm5, %xmm1
  279. movaps %xmm1, -20 * SIZE(Y1)
  280. subl $-16 * SIZE, A1
  281. subl $-16 * SIZE, Y1
  282. ALIGN_3
  283. .L15:
  284. testl $8, M
  285. je .L16
  286. movsd -32 * SIZE(A1), %xmm2
  287. movhps -30 * SIZE(A1), %xmm2
  288. movsd -28 * SIZE(A1), %xmm3
  289. movhps -26 * SIZE(A1), %xmm3
  290. movaps -32 * SIZE(Y1), %xmm0
  291. movaps -28 * SIZE(Y1), %xmm1
  292. mulps %xmm6, %xmm2
  293. addps %xmm2, %xmm0
  294. mulps %xmm6, %xmm3
  295. addps %xmm3, %xmm1
  296. movsd -32 * SIZE(A1, LDA), %xmm4
  297. movhps -30 * SIZE(A1, LDA), %xmm4
  298. movsd -28 * SIZE(A1, LDA), %xmm5
  299. movhps -26 * SIZE(A1, LDA), %xmm5
  300. mulps %xmm7, %xmm4
  301. addps %xmm4, %xmm0
  302. mulps %xmm7, %xmm5
  303. addps %xmm5, %xmm1
  304. movaps %xmm0, -32 * SIZE(Y1)
  305. movaps %xmm1, -28 * SIZE(Y1)
  306. addl $8 * SIZE, A1
  307. addl $8 * SIZE, Y1
  308. ALIGN_3
  309. .L16:
  310. testl $4, M
  311. je .L17
  312. movsd -32 * SIZE(A1), %xmm2
  313. movhps -30 * SIZE(A1), %xmm2
  314. movsd -32 * SIZE(A1, LDA), %xmm3
  315. movhps -30 * SIZE(A1, LDA), %xmm3
  316. movaps -32 * SIZE(Y1), %xmm0
  317. mulps %xmm6, %xmm2
  318. addps %xmm2, %xmm0
  319. mulps %xmm7, %xmm3
  320. addps %xmm3, %xmm0
  321. movaps %xmm0, -32 * SIZE(Y1)
  322. addl $4 * SIZE, A1
  323. addl $4 * SIZE, Y1
  324. ALIGN_3
  325. .L17:
  326. testl $2, M
  327. je .L18
  328. movsd -32 * SIZE(A1), %xmm2
  329. movsd -32 * SIZE(A1, LDA), %xmm3
  330. movsd -32 * SIZE(Y1), %xmm0
  331. mulps %xmm6, %xmm2
  332. addps %xmm2, %xmm0
  333. mulps %xmm7, %xmm3
  334. addps %xmm3, %xmm0
  335. movlps %xmm0, -32 * SIZE(Y1)
  336. addl $2 * SIZE, A1
  337. addl $2 * SIZE, Y1
  338. ALIGN_3
  339. .L18:
  340. testl $1, M
  341. je .L19
  342. movss -32 * SIZE(A1), %xmm2
  343. movss -32 * SIZE(A1, LDA), %xmm3
  344. movss -32 * SIZE(Y1), %xmm0
  345. mulss %xmm6, %xmm2
  346. addss %xmm2, %xmm0
  347. mulss %xmm7, %xmm3
  348. addss %xmm3, %xmm0
  349. movss %xmm0, -32 * SIZE(Y1)
  350. ALIGN_3
  351. .L19:
  352. decl J
  353. jg .L11
  354. ALIGN_4
  355. .L20:
  356. testl $1, N
  357. jle .L990
  358. movl BUFFER, Y1
  359. addl $32 * SIZE, Y1
  360. movl A, A1
  361. movss (X), %xmm6
  362. addl INCX, X
  363. movss ALPHA, %xmm0
  364. mulss %xmm0, %xmm6
  365. shufps $0, %xmm6, %xmm6
  366. ALIGN_3
  367. movl M, I
  368. sarl $4, I
  369. jle .L25
  370. movsd -32 * SIZE(A1), %xmm2
  371. movhps -30 * SIZE(A1), %xmm2
  372. movsd -28 * SIZE(A1), %xmm3
  373. movhps -26 * SIZE(A1), %xmm3
  374. movaps -32 * SIZE(Y1), %xmm0
  375. movaps -28 * SIZE(Y1), %xmm1
  376. decl I
  377. jle .L24
  378. ALIGN_3
  379. .L23:
  380. #ifdef PREFETCH
  381. PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
  382. #endif
  383. mulps %xmm6, %xmm2
  384. addps %xmm2, %xmm0
  385. movsd -24 * SIZE(A1), %xmm2
  386. movhps -22 * SIZE(A1), %xmm2
  387. movaps %xmm0, -32 * SIZE(Y1)
  388. movaps -24 * SIZE(Y1), %xmm0
  389. mulps %xmm6, %xmm3
  390. addps %xmm3, %xmm1
  391. movsd -20 * SIZE(A1), %xmm3
  392. movhps -18 * SIZE(A1), %xmm3
  393. movaps %xmm1, -28 * SIZE(Y1)
  394. movaps -20 * SIZE(Y1), %xmm1
  395. mulps %xmm6, %xmm2
  396. addps %xmm2, %xmm0
  397. movsd -16 * SIZE(A1), %xmm2
  398. movhps -14 * SIZE(A1), %xmm2
  399. movaps %xmm0, -24 * SIZE(Y1)
  400. movaps -16 * SIZE(Y1), %xmm0
  401. mulps %xmm6, %xmm3
  402. addps %xmm3, %xmm1
  403. movsd -12 * SIZE(A1), %xmm3
  404. movhps -10 * SIZE(A1), %xmm3
  405. movaps %xmm1, -20 * SIZE(Y1)
  406. movaps -12 * SIZE(Y1), %xmm1
  407. subl $-16 * SIZE, A1
  408. subl $-16 * SIZE, Y1
  409. subl $1, I
  410. BRANCH
  411. jg .L23
  412. ALIGN_3
  413. .L24:
  414. mulps %xmm6, %xmm2
  415. addps %xmm2, %xmm0
  416. movsd -24 * SIZE(A1), %xmm2
  417. movhps -22 * SIZE(A1), %xmm2
  418. mulps %xmm6, %xmm3
  419. addps %xmm3, %xmm1
  420. movsd -20 * SIZE(A1), %xmm3
  421. movhps -18 * SIZE(A1), %xmm3
  422. movaps %xmm0, -32 * SIZE(Y1)
  423. movaps -24 * SIZE(Y1), %xmm0
  424. movaps %xmm1, -28 * SIZE(Y1)
  425. movaps -20 * SIZE(Y1), %xmm1
  426. mulps %xmm6, %xmm2
  427. addps %xmm2, %xmm0
  428. movaps %xmm0, -24 * SIZE(Y1)
  429. mulps %xmm6, %xmm3
  430. addps %xmm3, %xmm1
  431. movaps %xmm1, -20 * SIZE(Y1)
  432. subl $-16 * SIZE, A1
  433. subl $-16 * SIZE, Y1
  434. ALIGN_3
  435. .L25:
  436. testl $8, M
  437. je .L26
  438. movsd -32 * SIZE(A1), %xmm2
  439. movhps -30 * SIZE(A1), %xmm2
  440. movsd -28 * SIZE(A1), %xmm3
  441. movhps -26 * SIZE(A1), %xmm3
  442. movaps -32 * SIZE(Y1), %xmm0
  443. movaps -28 * SIZE(Y1), %xmm1
  444. mulps %xmm6, %xmm2
  445. addps %xmm2, %xmm0
  446. mulps %xmm6, %xmm3
  447. addps %xmm3, %xmm1
  448. movaps %xmm0, -32 * SIZE(Y1)
  449. movaps %xmm1, -28 * SIZE(Y1)
  450. addl $8 * SIZE, A1
  451. addl $8 * SIZE, Y1
  452. ALIGN_3
  453. .L26:
  454. testl $4, M
  455. je .L27
  456. movsd -32 * SIZE(A1), %xmm2
  457. movhps -30 * SIZE(A1), %xmm2
  458. movaps -32 * SIZE(Y1), %xmm0
  459. mulps %xmm6, %xmm2
  460. addps %xmm2, %xmm0
  461. movaps %xmm0, -32 * SIZE(Y1)
  462. addl $4 * SIZE, A1
  463. addl $4 * SIZE, Y1
  464. ALIGN_3
  465. .L27:
  466. testl $2, M
  467. je .L28
  468. movsd -32 * SIZE(A1), %xmm2
  469. movsd -32 * SIZE(Y1), %xmm0
  470. mulps %xmm6, %xmm2
  471. addps %xmm2, %xmm0
  472. movlps %xmm0, -32 * SIZE(Y1)
  473. addl $2 * SIZE, A1
  474. addl $2 * SIZE, Y1
  475. ALIGN_3
  476. .L28:
  477. testl $1, M
  478. je .L990
  479. movss -32 * SIZE(A1), %xmm2
  480. movss -32 * SIZE(Y1), %xmm0
  481. mulss %xmm6, %xmm2
  482. addss %xmm2, %xmm0
  483. movss %xmm0, -32 * SIZE(Y1)
  484. ALIGN_3
  485. .L990:
  486. movl Y, Y1
  487. movl BUFFER, X
  488. movl STACK_INCY, INCY
  489. sall $BASE_SHIFT, INCY
  490. movl M, %eax
  491. sarl $2, %eax
  492. jle .L994
  493. ALIGN_3
  494. .L992:
  495. movss (Y1), %xmm0
  496. addss 0 * SIZE(X), %xmm0
  497. movss %xmm0, (Y1)
  498. addl INCY, Y1
  499. movss (Y1), %xmm0
  500. addss 1 * SIZE(X), %xmm0
  501. movss %xmm0, (Y1)
  502. addl INCY, Y1
  503. movss (Y1), %xmm0
  504. addss 2 * SIZE(X), %xmm0
  505. movss %xmm0, (Y1)
  506. addl INCY, Y1
  507. movss (Y1), %xmm0
  508. addss 3 * SIZE(X), %xmm0
  509. movss %xmm0, (Y1)
  510. addl INCY, Y1
  511. addl $4 * SIZE, X
  512. decl %eax
  513. jg .L992
  514. ALIGN_3
  515. .L994:
  516. testl $2, M
  517. jle .L996
  518. movss (Y1), %xmm0
  519. addss 0 * SIZE(X), %xmm0
  520. movss %xmm0, (Y1)
  521. addl INCY, Y1
  522. movss (Y1), %xmm0
  523. addss 1 * SIZE(X), %xmm0
  524. movss %xmm0, (Y1)
  525. addl INCY, Y1
  526. addl $2 * SIZE, X
  527. ALIGN_3
  528. .L996:
  529. testl $1, M
  530. jle .L999
  531. movss (Y1), %xmm0
  532. addss 0 * SIZE(X), %xmm0
  533. movss %xmm0, (Y1)
  534. ALIGN_3
  535. .L999:
  536. movl M,J
  537. leal (,J,SIZE),%eax
  538. addl %eax,AA
  539. movl STACK_INCY,INCY
  540. imull INCY,%eax
  541. addl %eax,YY
  542. jmp .L0t
  543. ALIGN_4
  544. .L999x:
  545. popl %ebx
  546. popl %esi
  547. popl %edi
  548. popl %ebp
  549. addl $ARGS,%esp
  550. ret
  551. EPILOGUE