
gemv_t_sse.S 13 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
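
/* sgemv kernel for the transposed case (y += alpha * A^T * x) on 32-bit
   x86 with SSE.  The vector x is first gathered into a contiguous,
   zero-padded buffer so the inner loops can use aligned 4-wide loads
   regardless of INCX. */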
#define ASSEMBLER
#include "common.h"

#ifdef movsd
#undef movsd
#endif
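
/* Per-architecture choice of prefetch instruction and lookahead distance.
   PENTIUM3 and OPTERON also remap movsd to the equivalent movlps. */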
#ifdef PENTIUM3
#ifdef HAVE_SSE
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 2)
#endif
#define movsd		movlps
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 3)
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(16 * 4)
#endif
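
/* Stack offsets of the sgemv arguments, relative to %esp after the
   prologue reserves ARGS bytes of scratch space and pushes four
   registers (STACKSIZE = 4 * 4 bytes).  MMM, AA and XX are scratch
   slots used to split a large M into buffer-sized chunks. */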
#define STACKSIZE	16
#define ARGS		20

#define M		 4 + STACKSIZE+ARGS(%esp)
#define N		 8 + STACKSIZE+ARGS(%esp)
#define ALPHA		16 + STACKSIZE+ARGS(%esp)
#define A		20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA	24 + STACKSIZE+ARGS(%esp)
#define STACK_X		28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX	32 + STACKSIZE+ARGS(%esp)
#define Y		36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY	40 + STACKSIZE+ARGS(%esp)
#define BUFFER		44 + STACKSIZE+ARGS(%esp)

#define MMM	0+ARGS(%esp)
#define AA	4+ARGS(%esp)
#define XX	8+ARGS(%esp)
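
/* Register assignments used throughout the kernel. */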
#define I	%eax
#define J	%ebx

#define INCX	J
#define INCY	%ecx

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

	PROLOGUE

	subl	$ARGS, %esp
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_X, X
	movl	X, XX
	movl	A, J
	movl	J, AA		# back up A
	movl	M, J
	movl	J, MMM		# copy M to MMM
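
/* Split the problem into chunks of at most 2^22 - 8 rows so that x fits
   in the 16 MB buffer; .L999 loops back here until MMM is exhausted. */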
.L0t:
	xorl	J, J
	addl	$1, J
	sall	$22, J		# J = 2^22; 2^22 * sizeof(float) = 16 MB buffer
	subl	$8, J		# don't use the last 8 floats in the buffer
	subl	J, MMM		# MMM = MMM - J
	movl	J, M
	jge	.L00t		# a full chunk remains
	ALIGN_4

	movl	MMM, %eax
	addl	J, %eax		# partial final chunk
	jle	.L999x
	movl	%eax, M
.L00t:
	movl	AA, %eax
	movl	%eax, A		# restore A from AA
	movl	XX, %eax
	movl	%eax, X

	movl	STACK_LDA,  LDA
	movl	STACK_INCX, INCX
	movl	STACK_INCY, INCY

	leal	(,INCX, SIZE), INCX
	leal	(,INCY, SIZE), INCY
	leal	(,LDA,  SIZE), LDA

	subl	$-32 * SIZE, A

	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999
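
/* Gather x (stride INCX) into the contiguous buffer, eight elements per
   iteration of .L02, then one at a time in .L06 for the remainder. */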
	movl	BUFFER, Y1

	movl	M, I
	sarl	$3, I
	jle	.L05
	ALIGN_4

.L02:
	movss	(X), %xmm0
	addl	INCX, X
	movss	(X), %xmm1
	addl	INCX, X
	unpcklps %xmm1, %xmm0

	movss	(X), %xmm2
	addl	INCX, X
	movss	(X), %xmm3
	addl	INCX, X
	unpcklps %xmm3, %xmm2

	movss	(X), %xmm4
	addl	INCX, X
	movss	(X), %xmm5
	addl	INCX, X
	unpcklps %xmm5, %xmm4

	movss	(X), %xmm6
	addl	INCX, X
	movss	(X), %xmm7
	addl	INCX, X
	unpcklps %xmm7, %xmm6

	movlps	%xmm0, 0 * SIZE(Y1)
	movlps	%xmm2, 2 * SIZE(Y1)
	movlps	%xmm4, 4 * SIZE(Y1)
	movlps	%xmm6, 6 * SIZE(Y1)

	addl	$8 * SIZE, Y1
	decl	I
	jg	.L02
	ALIGN_4

.L05:
	movl	M, I
	andl	$7, I
	jle	.L10
	ALIGN_2

.L06:
	movss	(X), %xmm0
	addl	INCX, X
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	I
	jg	.L06
	ALIGN_4
// Zero-pad the buffer tail so the vector loads below never pick up stale data.
	movl	M, I
	movl	$8, J
	andl	$7, I
	xorps	%xmm0, %xmm0
	subl	I, J
	ALIGN_2

.L07:
	movss	%xmm0, 0 * SIZE(Y1)
	addl	$SIZE, Y1
	decl	J
	jg	.L07
	ALIGN_4
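
/* Main loop: each pass of .L11 forms the dot products of two consecutive
   columns of A with the buffered x, sixteen elements per iteration of
   .L12, accumulating partial sums in %xmm0 and %xmm1. */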
.L10:
	movl	Y, Y1

	movl	N, J
	sarl	$1, J
	jle	.L20
	ALIGN_3

.L11:
	movl	BUFFER, X
	addl	$32 * SIZE, X

	movl	A, A1
	leal	(A1, LDA, 2), %eax
	movl	%eax, A

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	movaps	-32 * SIZE(X), %xmm2
	movaps	-28 * SIZE(X), %xmm3

	movl	M, I
	sarl	$4, I
	jle	.L15

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	decl	I
	jle	.L13
	ALIGN_4

.L12:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1, LDA)
#endif

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-16 * SIZE(A1, LDA), %xmm5
	movhps	-14 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-12 * SIZE(A1, LDA), %xmm7
	movhps	-10 * SIZE(A1, LDA), %xmm7

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L12
	ALIGN_4

.L13:
	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1
	movsd	-24 * SIZE(A1, LDA), %xmm5
	movhps	-22 * SIZE(A1, LDA), %xmm5

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1
	movsd	-20 * SIZE(A1, LDA), %xmm7
	movhps	-18 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4
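
/* Tail handling for m mod 16: blocks of 8, 4, 2 and 1 elements. */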
.L15:
	testl	$8, M
	jle	.L16

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6
	movsd	-28 * SIZE(A1, LDA), %xmm7
	movhps	-26 * SIZE(A1, LDA), %xmm7

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm5, %xmm1

	mulps	%xmm3, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm3, %xmm7
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm7, %xmm1

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L16:
	testl	$4, M
	jle	.L17

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-32 * SIZE(A1, LDA), %xmm5
	movhps	-30 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movaps	%xmm3, %xmm2

	addl	$4 * SIZE, A1
	ALIGN_4

.L17:
	testl	$2, M
	jle	.L18

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4
#ifdef movsd
	xorps	%xmm5, %xmm5
#endif
	movsd	-32 * SIZE(A1, LDA), %xmm5

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm2, %xmm5
	addps	%xmm5, %xmm1
	movhlps	%xmm2, %xmm2

	addl	$2 * SIZE, A1
	ALIGN_4

.L18:
	testl	$1, M
	jle	.L19

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	movss	-32 * SIZE(A1, LDA), %xmm5
	mulss	%xmm2, %xmm5
	addss	%xmm5, %xmm1
	ALIGN_4
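
/* Reduce the four partial sums in %xmm0 and %xmm1 to scalars (haddps
   where SSE3 is available), scale by alpha, and accumulate into y. */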
.L19:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1
	haddps	%xmm0, %xmm0
	haddps	%xmm1, %xmm1
#else
	movhlps	%xmm0, %xmm2
	movhlps	%xmm1, %xmm3
	addps	%xmm2, %xmm0
	addps	%xmm3, %xmm1
	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0
	movaps	%xmm1, %xmm3
	shufps	$1, %xmm1, %xmm1
	addss	%xmm2, %xmm0
	addss	%xmm3, %xmm1
#endif

	movss	ALPHA, %xmm7
	mulss	%xmm7, %xmm0
	mulss	%xmm7, %xmm1

	addss	(Y1), %xmm0
	addss	(Y1, INCY), %xmm1
	movss	%xmm0, (Y1)
	movss	%xmm1, (Y1, INCY)
	leal	(Y1, INCY, 2), Y1

	decl	J
	jg	.L11
	ALIGN_4
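
/* If N is odd, process the single remaining column. */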
.L20:
	testl	$1, N
	jle	.L999

	movl	BUFFER, X
	addl	$32 * SIZE, X

	movl	A, A1

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	movaps	-32 * SIZE(X), %xmm2
	movaps	-28 * SIZE(X), %xmm3

	movl	M, I
	sarl	$4, I
	jle	.L25

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	decl	I
	jle	.L23
	ALIGN_4

.L22:
#ifdef PREFETCH
	PREFETCH PREFETCHSIZE * SIZE(A1)
#endif

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-16 * SIZE(A1), %xmm4
	movhps	-14 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-12 * SIZE(A1), %xmm6
	movhps	-10 * SIZE(A1), %xmm6

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X

	decl	I
	jg	.L22
	ALIGN_4

.L23:
	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0
	movsd	-24 * SIZE(A1), %xmm4
	movhps	-22 * SIZE(A1), %xmm4

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0
	movsd	-20 * SIZE(A1), %xmm6
	movhps	-18 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-16 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-12 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$16 * SIZE, A1
	addl	$16 * SIZE, X
	ALIGN_4

.L25:
	testl	$8, M
	jle	.L26

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4
	movsd	-28 * SIZE(A1), %xmm6
	movhps	-26 * SIZE(A1), %xmm6

	mulps	%xmm2, %xmm4
	movaps	-24 * SIZE(X), %xmm2
	addps	%xmm4, %xmm0

	mulps	%xmm3, %xmm6
	movaps	-20 * SIZE(X), %xmm3
	addps	%xmm6, %xmm0

	addl	$8 * SIZE, A1
	addl	$8 * SIZE, X
	ALIGN_4

.L26:
	testl	$4, M
	jle	.L27

	movsd	-32 * SIZE(A1), %xmm4
	movhps	-30 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movaps	%xmm3, %xmm2

	addl	$4 * SIZE, A1
	ALIGN_4

.L27:
	testl	$2, M
	jle	.L28

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(A1), %xmm4

	mulps	%xmm2, %xmm4
	addps	%xmm4, %xmm0
	movhlps	%xmm2, %xmm2

	addl	$2 * SIZE, A1
	ALIGN_4

.L28:
	testl	$1, M
	jle	.L29

	movss	-32 * SIZE(A1), %xmm4
	mulss	%xmm2, %xmm4
	addss	%xmm4, %xmm0
	ALIGN_4

.L29:
#ifdef HAVE_SSE3
	haddps	%xmm0, %xmm0
	haddps	%xmm0, %xmm0
#else
	movhlps	%xmm0, %xmm2
	addps	%xmm2, %xmm0
	movaps	%xmm0, %xmm2
	shufps	$1, %xmm0, %xmm0
	addss	%xmm2, %xmm0
#endif

	movss	ALPHA, %xmm7
	mulss	%xmm7, %xmm0
	addss	(Y1), %xmm0
	movss	%xmm0, (Y1)
	ALIGN_4
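
/* Advance the saved A and x pointers past the M rows just processed and
   loop back for the next chunk. */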
.L999:
	movl	M, J
	leal	(,J, SIZE), %eax
	addl	%eax, AA
	movl	STACK_INCX, INCX
	imull	INCX, %eax
	addl	%eax, XX
	jmp	.L0t
	ALIGN_4

.L999x:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE