
axpy_sse.S (28 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

/* Stack offsets of the arguments (32-bit cdecl). */
#define STACK_M      4 + STACK + ARGS(%esp)
#define STACK_ALPHA 16 + STACK + ARGS(%esp)
#define STACK_X     20 + STACK + ARGS(%esp)
#define STACK_INCX  24 + STACK + ARGS(%esp)
#define STACK_Y     28 + STACK + ARGS(%esp)
#define STACK_INCY  32 + STACK + ARGS(%esp)

/* Register aliases used throughout the kernel. */
#define M %ebx
#define X %esi
#define Y %edi
#define INCX %ecx
#define INCY %edx
#define YY %ebp

#define ALPHA %xmm7

#include "l1param.h"
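
/*
 * SAXPY: y(i) := y(i) + alpha * x(i) for i = 0 .. m-1, single
 * precision, with arbitrary strides incx/incy.  As a reference, a
 * minimal C sketch of the semantics this kernel implements (the
 * function name and signature are illustrative only, not part of
 * this file):
 *
 *     void saxpy(long m, float alpha, const float *x, long incx,
 *                float *y, long incy)
 *     {
 *         for (long i = 0; i < m; i++)
 *             y[i * incy] += alpha * x[i * incx];
 *     }
 *
 * The vector paths below require incx == incy == 1; they peel
 * elements until Y is 16-byte aligned, then pick one of several SSE
 * loops according to the residual alignment of X.
 */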
        PROLOGUE
        PROFCODE

        pushl %edi
        pushl %esi
        pushl %ebx
        pushl %ebp

        movl STACK_M, M
        movss STACK_ALPHA, ALPHA
        movl STACK_X, X
        movl STACK_INCX, INCX
        movl STACK_Y, Y
        movl STACK_INCY, INCY

        /* Broadcast alpha into all four lanes. */
        shufps $0, ALPHA, ALPHA

        /* Convert the strides from elements to bytes. */
        leal (, INCX, SIZE), INCX
        leal (, INCY, SIZE), INCY

        testl M, M
        jle .L19

        /* Non-unit strides are handled by the scalar code at .L50. */
        cmpl $SIZE, INCX
        jne .L50
        cmpl $SIZE, INCY
        jne .L50

        /* Bias both pointers 32 elements ahead so the unrolled loops
           can reach their operands with small negative displacements. */
        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y

        cmpl $3, M
        jle .L16

        /* Peel one element if Y is not 8-byte aligned. */
        testl $SIZE, Y
        je .L00

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        addl $1 * SIZE, X
        addl $1 * SIZE, Y
        decl M
        jle .L19
        ALIGN_3

.L00:
        /* Peel two more elements if Y is not yet 16-byte aligned. */
        testl $SIZE * 2, Y
        je .L10

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        subl $2, M
        jle .L19
        ALIGN_3

.L10:
        /* Y is now 16-byte aligned; branch away if X is not. */
        testl $SIZE * 3, X
        jne .L20

        movl M, %eax
        sarl $5, %eax        /* 32 elements per iteration */
        jle .L13

        movaps -32 * SIZE(X), %xmm0
        movaps -28 * SIZE(X), %xmm1
        movaps -24 * SIZE(X), %xmm2
        movaps -20 * SIZE(X), %xmm3

        decl %eax
        jle .L12
        ALIGN_4

.L11:
        /* Main loop, X and Y both 16-byte aligned: eight packed
           multiply/add/store groups per pass, loading one group
           ahead of the arithmetic. */
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -16 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -12 * SIZE(X), %xmm1

#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -8 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -4 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps 0 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)
        movaps 4 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)
        movaps 8 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)
        movaps 12 * SIZE(X), %xmm3

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        decl %eax
        jg .L11
        ALIGN_3

.L12:
        /* Drain the four vectors loaded ahead by the main loop. */
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -16 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -12 * SIZE(X), %xmm1

        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -8 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -4 * SIZE(X), %xmm3

        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)

        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)

        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)

        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        ALIGN_3

.L13:
        /* Tail: 16 remaining elements. */
        movl M, %eax
        andl $16, %eax
        jle .L14
        ALIGN_3

        movaps -32 * SIZE(X), %xmm0
        movaps -28 * SIZE(X), %xmm1
        movaps -24 * SIZE(X), %xmm2
        movaps -20 * SIZE(X), %xmm3

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3

        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, -24 * SIZE(Y)
        movaps %xmm3, -20 * SIZE(Y)

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L14:
        /* Tail: 8 remaining elements. */
        movl M, %eax
        andl $8, %eax
        jle .L15
        ALIGN_3

        movaps -32 * SIZE(X), %xmm0
        movaps -28 * SIZE(X), %xmm1
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L15:
        /* Tail: 4 remaining elements. */
        movl M, %eax
        andl $4, %eax
        jle .L16
        ALIGN_3

        movaps -32 * SIZE(X), %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L16:
        /* Tail: 2 remaining elements. */
        movl M, %eax
        andl $2, %eax
        jle .L17
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        ALIGN_3

.L17:
        /* Tail: last element. */
        movl M, %eax
        andl $1, %eax
        jle .L19
        ALIGN_3

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        ALIGN_3

.L19:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
        ALIGN_3
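
/*
 * Reached when X is not 16-byte aligned (Y already is).  With
 * ALIGNED_ACCESS the kernel keeps issuing only aligned 16-byte loads
 * from X and rebuilds each unaligned vector in registers; the three
 * sub-cases below handle X being off by 2, 1 or 3 floats.  For the
 * 2-float case handled first, a rough C intrinsics sketch of the
 * SHUFPD_1 stitch (illustrative only, variable names assumed):
 *
 *     __m128 v0 = _mm_loadh_pi(v0, (const __m64 *)x); // x0,x1 in high half
 *     __m128 v1 = _mm_load_ps(x + 2);                 // aligned: x2..x5
 *     __m128 v  = _mm_castpd_ps(_mm_shuffle_pd(       // high(v0) | low(v1)
 *                     _mm_castps_pd(v0), _mm_castps_pd(v1), 1));
 *     // v now holds x0..x3
 *
 * Without ALIGNED_ACCESS, the #else branch further down simply uses
 * unaligned movsd/movhps loads.
 */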
.L20:
#ifdef ALIGNED_ACCESS
        testl $SIZE, X
        jne .L30

        /* X off by 2 elements: preload x0,x1 into the high half of
           %xmm0, then stitch each vector with SHUFPD_1. */
        movhps -32 * SIZE(X), %xmm0

        movl M, %eax
        sarl $5, %eax
        jle .L23

        movaps -30 * SIZE(X), %xmm1
        movaps -26 * SIZE(X), %xmm2
        movaps -22 * SIZE(X), %xmm3

        decl %eax
        jle .L22
        ALIGN_4

.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -18 * SIZE(X), %xmm0

        SHUFPD_1 %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -14 * SIZE(X), %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        SHUFPD_1 %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -10 * SIZE(X), %xmm2

        SHUFPD_1 %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -6 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -2 * SIZE(X), %xmm0

        SHUFPD_1 %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)
        movaps 2 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        SHUFPD_1 %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)
        movaps 6 * SIZE(X), %xmm2

        SHUFPD_1 %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)
        movaps 10 * SIZE(X), %xmm3

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        decl %eax
        jg .L21
        ALIGN_3

.L22:
        /* Drain the vectors loaded ahead by the loop above. */
        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -18 * SIZE(X), %xmm0

        SHUFPD_1 %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -14 * SIZE(X), %xmm1

        SHUFPD_1 %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -10 * SIZE(X), %xmm2

        SHUFPD_1 %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -6 * SIZE(X), %xmm3

        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -2 * SIZE(X), %xmm0

        SHUFPD_1 %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)

        SHUFPD_1 %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)

        SHUFPD_1 %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        ALIGN_3

.L23:
        /* Tail: 16 remaining elements. */
        movl M, %eax
        andl $16, %eax
        jle .L24
        ALIGN_3

        movaps -30 * SIZE(X), %xmm1
        movaps -26 * SIZE(X), %xmm2
        movaps -22 * SIZE(X), %xmm3
        movaps -18 * SIZE(X), %xmm4

        SHUFPD_1 %xmm1, %xmm0
        SHUFPD_1 %xmm2, %xmm1
        SHUFPD_1 %xmm3, %xmm2
        SHUFPD_1 %xmm4, %xmm3

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3

        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, -24 * SIZE(Y)
        movaps %xmm3, -20 * SIZE(Y)
        movaps %xmm4, %xmm0

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L24:
        /* Tail: 8 remaining elements. */
        movl M, %eax
        andl $8, %eax
        jle .L25
        ALIGN_3

        movaps -30 * SIZE(X), %xmm1
        movaps -26 * SIZE(X), %xmm2
        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        SHUFPD_1 %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, %xmm0
        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L25:
        /* Tail: 4 remaining elements. */
        movl M, %eax
        andl $4, %eax
        jle .L26
        ALIGN_3

        movaps -30 * SIZE(X), %xmm1
        SHUFPD_1 %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L26:
        /* Tail: 2 remaining elements (unaligned movsd is safe here). */
        movl M, %eax
        andl $2, %eax
        jle .L27
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        ALIGN_3

.L27:
        /* Tail: last element. */
        movl M, %eax
        andl $1, %eax
        jle .L29
        ALIGN_3

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        addl $SIZE, Y
        ALIGN_3

.L29:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
        ALIGN_3
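
/*
 * X off by one float: every aligned load overlaps the previous
 * vector by three elements.  movss grafts the next vector's first
 * element into lane 0, and SHUFPS_39 (shufps $0x39) rotates the
 * lanes down by one.  Rough intrinsics sketch (illustrative names):
 *
 *     __m128 a = _mm_load_ps(x - 1);   // [x-1, x0, x1, x2], aligned
 *     __m128 b = _mm_load_ps(x + 3);   // [x3, x4, x5, x6], aligned
 *     __m128 v = _mm_move_ss(a, b);    // [x3, x0, x1, x2]
 *     v = _mm_shuffle_ps(v, v, 0x39);  // [x0, x1, x2, x3]
 */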
.L30:
        testl $2 * SIZE, X
        jne .L40

        /* X off by 1 element: start from the aligned vector that
           contains the first element in its upper three lanes. */
        movaps -33 * SIZE(X), %xmm0

        movl M, %eax
        sarl $5, %eax
        jle .L33

        movaps -29 * SIZE(X), %xmm1
        movaps -25 * SIZE(X), %xmm2
        movaps -21 * SIZE(X), %xmm3

        decl %eax
        jle .L32
        ALIGN_4

.L31:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -17 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -13 * SIZE(X), %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        movss %xmm3, %xmm2
        SHUFPS_39 %xmm2, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -9 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        SHUFPS_39 %xmm3, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -5 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -1 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)
        movaps 3 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movss %xmm3, %xmm2
        SHUFPS_39 %xmm2, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)
        movaps 7 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        SHUFPS_39 %xmm3, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)
        movaps 11 * SIZE(X), %xmm3

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        decl %eax
        jg .L31
        ALIGN_3

.L32:
        /* Drain the vectors loaded ahead by the loop above. */
        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -17 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -13 * SIZE(X), %xmm1

        movss %xmm3, %xmm2
        SHUFPS_39 %xmm2, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -9 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        SHUFPS_39 %xmm3, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -5 * SIZE(X), %xmm3

        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -1 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)

        movss %xmm3, %xmm2
        SHUFPS_39 %xmm2, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)

        movss %xmm0, %xmm3
        SHUFPS_39 %xmm3, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        ALIGN_3

.L33:
        /* Tail: 16 remaining elements. */
        movl M, %eax
        andl $16, %eax
        jle .L34
        ALIGN_3

        movaps -29 * SIZE(X), %xmm1
        movaps -25 * SIZE(X), %xmm2
        movaps -21 * SIZE(X), %xmm3
        movaps -17 * SIZE(X), %xmm4

        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0

        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1

        movss %xmm3, %xmm2
        SHUFPS_39 %xmm2, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2

        movss %xmm4, %xmm3
        SHUFPS_39 %xmm3, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3

        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, -24 * SIZE(Y)
        movaps %xmm3, -20 * SIZE(Y)
        movaps %xmm4, %xmm0

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L34:
        /* Tail: 8 remaining elements. */
        movl M, %eax
        andl $8, %eax
        jle .L35
        ALIGN_3

        movaps -29 * SIZE(X), %xmm1
        movaps -25 * SIZE(X), %xmm2
        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movss %xmm2, %xmm1
        SHUFPS_39 %xmm1, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, %xmm0
        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L35:
        /* Tail: 4 remaining elements. */
        movl M, %eax
        andl $4, %eax
        jle .L36
        ALIGN_3

        movaps -29 * SIZE(X), %xmm1
        movss %xmm1, %xmm0
        SHUFPS_39 %xmm0, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L36:
        /* Tail: 2 remaining elements. */
        movl M, %eax
        andl $2, %eax
        jle .L37
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        ALIGN_3

.L37:
        /* Tail: last element. */
        movl M, %eax
        andl $1, %eax
        jle .L39
        ALIGN_3

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        addl $SIZE, Y
        ALIGN_3

.L39:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
        ALIGN_3
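
/*
 * X off by three floats: the same movss graft, but shufps $0x93
 * rotates and merges in one step, taking lanes 3 and 0 of the
 * destination followed by lanes 1 and 2 of the source.  Rough
 * intrinsics sketch (illustrative names):
 *
 *     __m128 a = _mm_load_ps(x - 3);   // [x-3, x-2, x-1, x0], aligned
 *     __m128 b = _mm_load_ps(x + 1);   // [x1, x2, x3, x4], aligned
 *     __m128 v = _mm_move_ss(a, b);    // [x1, x-2, x-1, x0]
 *     v = _mm_shuffle_ps(v, b, 0x93);  // [x0, x1, x2, x3]
 */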
.L40:
        /* X off by 3 elements: start from the aligned vector whose
           last lane is the first element. */
        movaps -35 * SIZE(X), %xmm0

        movl M, %eax
        sarl $5, %eax
        jle .L43

        movaps -31 * SIZE(X), %xmm1
        movaps -27 * SIZE(X), %xmm2
        movaps -23 * SIZE(X), %xmm3

        decl %eax
        jle .L42
        ALIGN_4

.L41:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -19 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -15 * SIZE(X), %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        movss %xmm3, %xmm2
        shufps $0x93, %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -11 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        shufps $0x93, %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -7 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -3 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)
        movaps 1 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movss %xmm3, %xmm2
        shufps $0x93, %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)
        movaps 5 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        shufps $0x93, %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)
        movaps 9 * SIZE(X), %xmm3

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        decl %eax
        jg .L41
        ALIGN_3

.L42:
        /* Drain the vectors loaded ahead by the loop above. */
        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movaps -19 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movaps -15 * SIZE(X), %xmm1

        movss %xmm3, %xmm2
        shufps $0x93, %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movaps -11 * SIZE(X), %xmm2

        movss %xmm0, %xmm3
        shufps $0x93, %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movaps -7 * SIZE(X), %xmm3

        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movaps -3 * SIZE(X), %xmm0

        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)

        movss %xmm3, %xmm2
        shufps $0x93, %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)

        movss %xmm0, %xmm3
        shufps $0x93, %xmm0, %xmm3
        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        ALIGN_3

.L43:
        /* Tail: 16 remaining elements. */
        movl M, %eax
        andl $16, %eax
        jle .L44
        ALIGN_3

        movaps -31 * SIZE(X), %xmm1
        movaps -27 * SIZE(X), %xmm2
        movaps -23 * SIZE(X), %xmm3
        movaps -19 * SIZE(X), %xmm4

        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0

        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1

        movss %xmm3, %xmm2
        shufps $0x93, %xmm3, %xmm2
        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2

        movss %xmm4, %xmm3
        shufps $0x93, %xmm4, %xmm3
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3

        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, -24 * SIZE(Y)
        movaps %xmm3, -20 * SIZE(Y)
        movaps %xmm4, %xmm0

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L44:
        /* Tail: 8 remaining elements. */
        movl M, %eax
        andl $8, %eax
        jle .L45
        ALIGN_3

        movaps -31 * SIZE(X), %xmm1
        movaps -27 * SIZE(X), %xmm2
        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movss %xmm2, %xmm1
        shufps $0x93, %xmm2, %xmm1
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)
        movaps %xmm2, %xmm0
        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L45:
        /* Tail: 4 remaining elements. */
        movl M, %eax
        andl $4, %eax
        jle .L46
        ALIGN_3

        movaps -31 * SIZE(X), %xmm1
        movss %xmm1, %xmm0
        shufps $0x93, %xmm1, %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L46:
        /* Tail: 2 remaining elements. */
        movl M, %eax
        andl $2, %eax
        jle .L47
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        ALIGN_3

.L47:
        /* Tail: last element. */
        movl M, %eax
        andl $1, %eax
        jle .L49
        ALIGN_3

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        addl $SIZE, Y
        ALIGN_3

.L49:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
#else
        /* Without ALIGNED_ACCESS: load X unaligned in 8-byte halves
           (movsd/movhps); stores to Y stay aligned. */
        movl M, %eax
        sarl $5, %eax
        jle .L23

        movsd -32 * SIZE(X), %xmm0
        movhps -30 * SIZE(X), %xmm0
        movsd -28 * SIZE(X), %xmm1
        movhps -26 * SIZE(X), %xmm1
        movsd -24 * SIZE(X), %xmm2
        movhps -22 * SIZE(X), %xmm2
        movsd -20 * SIZE(X), %xmm3
        movhps -18 * SIZE(X), %xmm3

        decl %eax
        jle .L22
        ALIGN_4

.L21:
#ifdef PREFETCHW
        PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movsd -16 * SIZE(X), %xmm0
        movhps -14 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movsd -12 * SIZE(X), %xmm1
        movhps -10 * SIZE(X), %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movsd -8 * SIZE(X), %xmm2
        movhps -6 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movsd -4 * SIZE(X), %xmm3
        movhps -2 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
        PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)
        movsd 0 * SIZE(X), %xmm0
        movhps 2 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)
        movsd 4 * SIZE(X), %xmm1
        movhps 6 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)
        movsd 8 * SIZE(X), %xmm2
        movhps 10 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)
        movsd 12 * SIZE(X), %xmm3
        movhps 14 * SIZE(X), %xmm3

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        decl %eax
        jg .L21
        ALIGN_3

.L22:
        /* Drain the vectors loaded ahead by the loop above. */
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        movsd -16 * SIZE(X), %xmm0
        movhps -14 * SIZE(X), %xmm0

        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)
        movsd -12 * SIZE(X), %xmm1
        movhps -10 * SIZE(X), %xmm1

        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        movsd -8 * SIZE(X), %xmm2
        movhps -6 * SIZE(X), %xmm2

        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)
        movsd -4 * SIZE(X), %xmm3
        movhps -2 * SIZE(X), %xmm3

        mulps ALPHA, %xmm0
        addps -16 * SIZE(Y), %xmm0
        movaps %xmm0, -16 * SIZE(Y)

        mulps ALPHA, %xmm1
        addps -12 * SIZE(Y), %xmm1
        movaps %xmm1, -12 * SIZE(Y)

        mulps ALPHA, %xmm2
        addps -8 * SIZE(Y), %xmm2
        movaps %xmm2, -8 * SIZE(Y)

        mulps ALPHA, %xmm3
        addps -4 * SIZE(Y), %xmm3
        movaps %xmm3, -4 * SIZE(Y)

        subl $-32 * SIZE, X
        subl $-32 * SIZE, Y
        ALIGN_3

.L23:
        /* Tail: 16 remaining elements. */
        movl M, %eax
        andl $16, %eax
        jle .L24
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movhps -30 * SIZE(X), %xmm0
        movsd -28 * SIZE(X), %xmm1
        movhps -26 * SIZE(X), %xmm1

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm1, -28 * SIZE(Y)

        movsd -24 * SIZE(X), %xmm2
        movhps -22 * SIZE(X), %xmm2
        movsd -20 * SIZE(X), %xmm3
        movhps -18 * SIZE(X), %xmm3

        mulps ALPHA, %xmm2
        addps -24 * SIZE(Y), %xmm2
        movaps %xmm2, -24 * SIZE(Y)
        mulps ALPHA, %xmm3
        addps -20 * SIZE(Y), %xmm3
        movaps %xmm3, -20 * SIZE(Y)

        addl $16 * SIZE, X
        addl $16 * SIZE, Y
        ALIGN_3

.L24:
        /* Tail: 8 remaining elements. */
        movl M, %eax
        andl $8, %eax
        jle .L25
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movhps -30 * SIZE(X), %xmm0
        movsd -28 * SIZE(X), %xmm1
        movhps -26 * SIZE(X), %xmm1

        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        mulps ALPHA, %xmm1
        addps -28 * SIZE(Y), %xmm1
        movaps %xmm0, -32 * SIZE(Y)
        movaps %xmm1, -28 * SIZE(Y)

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L25:
        /* Tail: 4 remaining elements. */
        movl M, %eax
        andl $4, %eax
        jle .L26
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movhps -30 * SIZE(X), %xmm0
        mulps ALPHA, %xmm0
        addps -32 * SIZE(Y), %xmm0
        movaps %xmm0, -32 * SIZE(Y)
        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L26:
        /* Tail: 2 remaining elements. */
        movl M, %eax
        andl $2, %eax
        jle .L27
        ALIGN_3

        movsd -32 * SIZE(X), %xmm0
        movsd -32 * SIZE(Y), %xmm4
        mulps ALPHA, %xmm0
        addps %xmm4, %xmm0
        movsd %xmm0, -32 * SIZE(Y)
        addl $2 * SIZE, X
        addl $2 * SIZE, Y
        ALIGN_3

.L27:
        /* Tail: last element. */
        movl M, %eax
        andl $1, %eax
        jle .L29
        ALIGN_3

        movss -32 * SIZE(X), %xmm0
        mulss ALPHA, %xmm0
        addss -32 * SIZE(Y), %xmm0
        movss %xmm0, -32 * SIZE(Y)
        addl $SIZE, Y
        ALIGN_3

.L29:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret
#endif
        ALIGN_3
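
/*
 * General strided case (incx != 1 or incy != 1), processed one
 * element at a time with scalar SSE.  Equivalent C sketch
 * (illustrative):
 *
 *     for (long i = 0; i < m; i++) {
 *         *y += alpha * *x;
 *         x += incx;
 *         y += incy;
 *     }
 *
 * The unrolled loop keeps a separate read pointer YY so loads of y
 * can run ahead of the stores through Y.
 */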
.L50:
        movl M, %eax
        movl Y, YY

        /* If incx == 0 or incy == 0, skip the unrolled loop: it reads
           y ahead through YY before storing through Y, so with a zero
           stride the reads would miss the updates just written. */
        cmpl $0, INCX
        je .L56
        cmpl $0, INCY
        je .L56

        sarl $3, %eax        /* 8 elements per iteration */
        jle .L55
        ALIGN_3

.L51:
        movss (X), %xmm0
        addl INCX, X
        mulss ALPHA, %xmm0
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm0

        movss (X), %xmm1
        addl INCX, X
        mulss ALPHA, %xmm1
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm1

        movss (X), %xmm2
        addl INCX, X
        mulss ALPHA, %xmm2
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm2

        movss (X), %xmm3
        addl INCX, X
        mulss ALPHA, %xmm3
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm3

        movss %xmm0, (Y)
        addl INCY, Y
        movss %xmm1, (Y)
        addl INCY, Y
        movss %xmm2, (Y)
        addl INCY, Y
        movss %xmm3, (Y)
        addl INCY, Y

        movss (X), %xmm0
        addl INCX, X
        mulss ALPHA, %xmm0
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm0

        movss (X), %xmm1
        addl INCX, X
        mulss ALPHA, %xmm1
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm1

        movss (X), %xmm2
        addl INCX, X
        mulss ALPHA, %xmm2
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm2

        movss (X), %xmm3
        addl INCX, X
        mulss ALPHA, %xmm3
        movss (YY), %xmm6
        addl INCY, YY
        addss %xmm6, %xmm3

        movss %xmm0, (Y)
        addl INCY, Y
        movss %xmm1, (Y)
        addl INCY, Y
        movss %xmm2, (Y)
        addl INCY, Y
        movss %xmm3, (Y)
        addl INCY, Y

        decl %eax
        jg .L51
        ALIGN_3

.L55:
        movl M, %eax
        andl $7, %eax
        jle .L59
        ALIGN_3

.L56:
        /* Scalar remainder loop (also the whole loop when a stride
           is zero). */
        movss (X), %xmm0
        addl INCX, X
        mulss ALPHA, %xmm0
        movss (Y), %xmm6
        addss %xmm6, %xmm0
        movss %xmm0, (Y)
        addl INCY, Y
        decl %eax
        jg .L56
        ALIGN_3

.L59:
        popl %ebp
        popl %ebx
        popl %esi
        popl %edi
        ret

        EPILOGUE