
zaxpy_sse2.S 32 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
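
/* ZAXPY kernel for 32-bit x86 with SSE2: y(i) := alpha * x(i) + y(i) for   */
/* double-precision complex vectors, with separate code paths for aligned   */
/* unit-stride, unaligned unit-stride, and arbitrary-stride operands.       */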
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	 0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I	24 + STACK + ARGS(%esp)
#define STACK_X		32 + STACK + ARGS(%esp)
#define STACK_INCX	36 + STACK + ARGS(%esp)
#define STACK_Y		40 + STACK + ARGS(%esp)
#define STACK_INCY	44 + STACK + ARGS(%esp)

#define M	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx
#define YY	%ebp
#define ALPHA_R	%xmm6
#define ALPHA_I	%xmm7
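
/* MOVDDUP(offset, base, dst) broadcasts one double from memory into both   */
/* halves of dst: a real movddup when SSE3 is available (and the target is  */
/* not Opteron), otherwise a movlpd/movhpd pair.                            */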
#if defined(HAVE_SSE3) && !defined(CORE_OPTERON)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif
#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M,       M
	movsd	STACK_ALPHA_R, %xmm0
	movsd	STACK_ALPHA_I, %xmm1
	movl	STACK_X,       X
	movl	STACK_INCX,    INCX
	movl	STACK_Y,       Y
	movl	STACK_INCY,    INCY

	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	testl	M, M
	jle	.L999

	cmpl	$2 * SIZE, INCX
	jne	.L50
	cmpl	$2 * SIZE, INCY
	jne	.L50
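
/*
 * Unit-stride path (INCX == INCY == 2 * SIZE).  X and Y are advanced by
 * 16 * SIZE up front so the unrolled loops can address elements with
 * negative displacements.  ALPHA_R and ALPHA_I hold alpha broadcast to
 * both lanes, with the sign of one lane flipped via the 0x8000000000000000
 * mask built in %xmm5, so that a pshufd real/imag swap plus two mulpd/addpd
 * implement the complex multiply; CONJ selects the conjugated variant.
 */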
	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5

#ifdef HAVE_SSE3
	movddup	%xmm0, ALPHA_R
	movddup	%xmm1, ALPHA_I
#else
	pshufd	$0x44, %xmm0, ALPHA_R
	pshufd	$0x44, %xmm1, ALPHA_I
#endif

#ifndef CONJ
	shufps	$0x0c, %xmm5, %xmm5
	xorpd	%xmm5, ALPHA_I
#else
	shufps	$0xc0, %xmm5, %xmm5
	xorpd	%xmm5, ALPHA_R
#endif
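
/* Dispatch on alignment: .L30 handles a Y that is not 16-byte aligned,     */
/* .L20 handles an unaligned X with an aligned Y; otherwise both streams    */
/* are 16-byte aligned and the main loop below is used.                     */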
	testl	$SIZE, Y
	jne	.L30

	testl	$SIZE, X
	jne	.L20

	movl	M, %eax
	sarl	$3, %eax
	jle	.L15

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	decl	%eax
	jle	.L12
	ALIGN_3
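/* Main loop: 8 complex elements (16 doubles) per iteration.  For each      */
/* 16-byte operand, pshufd $0x4e swaps the real and imaginary halves into   */
/* %xmm5; the products with ALPHA_R and ALPHA_I are then accumulated into Y. */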
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	 -8 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	 -6 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	 -4 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	 -8 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movaps	  0 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	 -6 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)
	movaps	  2 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	 -4 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)
	movaps	  4 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	 -2 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)
	movaps	  6 * SIZE(X), %xmm3

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L11
	ALIGN_3

.L12:
	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	 -8 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	 -6 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	 -4 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	 -8 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	 -6 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	 -4 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	 -2 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3
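/* Tail of the aligned path: handle the remaining 4, 2 and 1 complex elements. */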
.L15:
	movl	M, %eax
	andl	$4, %eax
	jle	.L16

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1
	movaps	-12 * SIZE(X), %xmm2
	movaps	-10 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L16:
	movl	M, %eax
	andl	$2, %eax
	jle	.L17

	movaps	-16 * SIZE(X), %xmm0
	movaps	-14 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L17:
	movl	M, %eax
	andl	$1, %eax
	jle	.L999

	movaps	-16 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	jmp	.L999
	ALIGN_3
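/* X unaligned, Y 16-byte aligned: same computation, with X loaded through  */
/* movsd/movhps pairs instead of movaps.                                    */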
.L20:
	movl	M, %eax
	sarl	$3, %eax
	jle	.L25

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1
	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	decl	%eax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	 -8 * SIZE(X), %xmm0
	movhps	 -7 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movsd	 -6 * SIZE(X), %xmm1
	movhps	 -5 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movsd	 -4 * SIZE(X), %xmm2
	movhps	 -3 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movsd	 -2 * SIZE(X), %xmm3
	movhps	 -1 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	 -8 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movsd	  0 * SIZE(X), %xmm0
	movhps	  1 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	 -6 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)
	movsd	  2 * SIZE(X), %xmm1
	movhps	  3 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	 -4 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)
	movsd	  4 * SIZE(X), %xmm2
	movhps	  5 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	 -2 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)
	movsd	  6 * SIZE(X), %xmm3
	movhps	  7 * SIZE(X), %xmm3

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L21
	ALIGN_3

.L22:
	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	 -8 * SIZE(X), %xmm0
	movhps	 -7 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movsd	 -6 * SIZE(X), %xmm1
	movhps	 -5 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movsd	 -4 * SIZE(X), %xmm2
	movhps	 -3 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movsd	 -2 * SIZE(X), %xmm3
	movhps	 -1 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	 -8 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	 -6 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	 -4 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	 -2 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L25:
	movl	M, %eax
	andl	$4, %eax
	jle	.L26

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0
	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-12 * SIZE(X), %xmm2
	movhps	-11 * SIZE(X), %xmm2
	movsd	-10 * SIZE(X), %xmm3
	movhps	 -9 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	-12 * SIZE(Y), %xmm2
	addpd	%xmm5, %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	-10 * SIZE(Y), %xmm3
	addpd	%xmm5, %xmm3
	movaps	%xmm3, -10 * SIZE(Y)

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L26:
	movl	M, %eax
	andl	$2, %eax
	jle	.L27

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	movsd	-14 * SIZE(X), %xmm1
	movhps	-13 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	-14 * SIZE(Y), %xmm1
	addpd	%xmm5, %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L27:
	movl	M, %eax
	andl	$1, %eax
	jle	.L999

	movsd	-16 * SIZE(X), %xmm0
	movhps	-15 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	-16 * SIZE(Y), %xmm0
	addpd	%xmm5, %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	jmp	.L999
	ALIGN_3
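/* Y not 16-byte aligned (X aligned): the first complex element is written  */
/* with an 8-byte store so that Y becomes aligned; after that, SHUFPD_1     */
/* joins the high half of the previous product with the low half of the     */
/* current one before each aligned store, and .L39 writes the leftover half. */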
.L30:
	testl	$SIZE, X
	jne	.L40

	movaps	-16 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	xorps	%xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm0

	xorps	%xmm4, %xmm4
	movhps	-16 * SIZE(Y), %xmm4
	addpd	%xmm0, %xmm4
	movhps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm1, %xmm0

	addl	$2 * SIZE, X
	addl	$1 * SIZE, Y

	decl	M
	jle	.L39

	movl	M, %eax
	sarl	$3, %eax
	jle	.L35

	movaps	-16 * SIZE(X), %xmm1
	movaps	-14 * SIZE(X), %xmm2
	movaps	-12 * SIZE(X), %xmm3

	decl	%eax
	jle	.L32
	ALIGN_3

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-10 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	 -8 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	 -6 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	 -4 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	 -8 * SIZE(Y), %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	 -6 * SIZE(Y), %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)
	movaps	  0 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	 -4 * SIZE(Y), %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)
	movaps	  2 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	 -2 * SIZE(Y), %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)
	movaps	  4 * SIZE(X), %xmm3

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L31
	ALIGN_3

.L32:
	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-10 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	 -8 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movaps	 -6 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	 -4 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	 -8 * SIZE(Y), %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movaps	 -2 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	 -6 * SIZE(Y), %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	 -4 * SIZE(Y), %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	 -2 * SIZE(Y), %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L35:
	movl	M, %eax
	andl	$4, %eax
	jle	.L36

	movaps	-16 * SIZE(X), %xmm1
	movaps	-14 * SIZE(X), %xmm2
	movaps	-12 * SIZE(X), %xmm3
	movaps	-10 * SIZE(X), %xmm4

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	pshufd	$0x4e, %xmm4,  %xmm5
	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm4

	SHUFPD_1 %xmm4, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	%xmm4, %xmm0

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L36:
	movl	M, %eax
	andl	$2, %eax
	jle	.L37

	movaps	-16 * SIZE(X), %xmm1
	movaps	-14 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L37:
	movl	M, %eax
	andl	$1, %eax
	jle	.L39

	movaps	-16 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, %xmm0

	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	ALIGN_3

.L39:
	SHUFPD_1 %xmm0, %xmm0
	addsd	-16 * SIZE(Y), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	jmp	.L999
	ALIGN_3
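/* Both X and Y unaligned: combines the movsd/movhps loads of .L20 with the */
/* shifted stores of .L30.                                                   */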
.L40:
	movsd	-16 * SIZE(X), %xmm1
	movhps	-15 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	xorps	%xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm0

	xorps	%xmm4, %xmm4
	movhps	-16 * SIZE(Y), %xmm4
	addpd	%xmm0, %xmm4
	movhps	%xmm4, -16 * SIZE(Y)
	movaps	%xmm1, %xmm0

	addl	$2 * SIZE, X
	addl	$1 * SIZE, Y

	decl	M
	jle	.L49

	movl	M, %eax
	sarl	$3, %eax
	jle	.L45

	movsd	-16 * SIZE(X), %xmm1
	movhps	-15 * SIZE(X), %xmm1
	movsd	-14 * SIZE(X), %xmm2
	movhps	-13 * SIZE(X), %xmm2
	movsd	-12 * SIZE(X), %xmm3
	movhps	-11 * SIZE(X), %xmm3

	decl	%eax
	jle	.L42
	ALIGN_3

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	-10 * SIZE(X), %xmm0
	movhps	 -9 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movsd	 -8 * SIZE(X), %xmm1
	movhps	 -7 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movsd	 -6 * SIZE(X), %xmm2
	movhps	 -5 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movsd	 -4 * SIZE(X), %xmm3
	movhps	 -3 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	 -8 * SIZE(Y), %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movsd	 -2 * SIZE(X), %xmm0
	movhps	 -1 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	 -6 * SIZE(Y), %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)
	movsd	  0 * SIZE(X), %xmm1
	movhps	  1 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	 -4 * SIZE(Y), %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)
	movsd	  2 * SIZE(X), %xmm2
	movhps	  3 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	 -2 * SIZE(Y), %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)
	movsd	  4 * SIZE(X), %xmm3
	movhps	  5 * SIZE(X), %xmm3

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L41
	ALIGN_3

.L42:
	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	-10 * SIZE(X), %xmm0
	movhps	 -9 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movsd	 -8 * SIZE(X), %xmm1
	movhps	 -7 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)
	movsd	 -6 * SIZE(X), %xmm2
	movhps	 -5 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movsd	 -4 * SIZE(X), %xmm3
	movhps	 -3 * SIZE(X), %xmm3

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	 -8 * SIZE(Y), %xmm0
	movaps	%xmm0,  -8 * SIZE(Y)
	movsd	 -2 * SIZE(X), %xmm0
	movhps	 -1 * SIZE(X), %xmm0

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	 -6 * SIZE(Y), %xmm1
	movaps	%xmm1,  -6 * SIZE(Y)

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	 -4 * SIZE(Y), %xmm2
	movaps	%xmm2,  -4 * SIZE(Y)

	pshufd	$0x4e, %xmm0,  %xmm5
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm0

	SHUFPD_1 %xmm0, %xmm3
	addpd	 -2 * SIZE(Y), %xmm3
	movaps	%xmm3,  -2 * SIZE(Y)

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L45:
	movl	M, %eax
	andl	$4, %eax
	jle	.L46

	movsd	-16 * SIZE(X), %xmm1
	movhps	-15 * SIZE(X), %xmm1
	movsd	-14 * SIZE(X), %xmm2
	movhps	-13 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)

	movsd	-12 * SIZE(X), %xmm3
	movhps	-11 * SIZE(X), %xmm3
	movsd	-10 * SIZE(X), %xmm4
	movhps	 -9 * SIZE(X), %xmm4

	pshufd	$0x4e, %xmm3,  %xmm5
	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm3

	SHUFPD_1 %xmm3, %xmm2
	addpd	-12 * SIZE(Y), %xmm2
	movaps	%xmm2, -12 * SIZE(Y)

	pshufd	$0x4e, %xmm4,  %xmm5
	mulpd	ALPHA_R, %xmm4
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm4

	SHUFPD_1 %xmm4, %xmm3
	addpd	-10 * SIZE(Y), %xmm3
	movaps	%xmm3, -10 * SIZE(Y)
	movaps	%xmm4, %xmm0

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L46:
	movl	M, %eax
	andl	$2, %eax
	jle	.L47

	movsd	-16 * SIZE(X), %xmm1
	movhps	-15 * SIZE(X), %xmm1
	movsd	-14 * SIZE(X), %xmm2
	movhps	-13 * SIZE(X), %xmm2

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)

	pshufd	$0x4e, %xmm2,  %xmm5
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm2

	SHUFPD_1 %xmm2, %xmm1
	addpd	-14 * SIZE(Y), %xmm1
	movaps	%xmm1, -14 * SIZE(Y)
	movaps	%xmm2, %xmm0

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L47:
	movl	M, %eax
	andl	$1, %eax
	jle	.L49

	movsd	-16 * SIZE(X), %xmm1
	movhps	-15 * SIZE(X), %xmm1

	pshufd	$0x4e, %xmm1,  %xmm5
	mulpd	ALPHA_R, %xmm1
	mulpd	ALPHA_I, %xmm5
	addpd	%xmm5, %xmm1

	SHUFPD_1 %xmm1, %xmm0
	addpd	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	%xmm1, %xmm0

	addl	$2 * SIZE, Y
	ALIGN_3

.L49:
	SHUFPD_1 %xmm0, %xmm0
	addsd	-16 * SIZE(Y), %xmm0
	movlps	%xmm0, -16 * SIZE(Y)
	jmp	.L999
	ALIGN_3
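/* General-stride path: alpha is repacked as (alpha_r, alpha_i) in ALPHA_R  */
/* and (-alpha_i, alpha_r) in ALPHA_I (the sign is placed differently under */
/* CONJ); each x element is loaded as a duplicated real part and a          */
/* duplicated imaginary part, so two mulpd/addpd pairs accumulate           */
/* alpha * x(i) into y(i).  Unrolled by 4 elements, with tails of 2 and 1;  */
/* a zero INCX or INCY goes straight to the one-at-a-time loop at .L58.     */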
.L50:
#ifndef CONJ
	movaps	%xmm0, ALPHA_R

	pxor	ALPHA_I, ALPHA_I
	subsd	%xmm1,   ALPHA_I

	unpcklpd ALPHA_R, ALPHA_I
	unpcklpd %xmm1,   ALPHA_R
#else
	movaps	%xmm0, ALPHA_R
	movaps	%xmm1, ALPHA_I

	pxor	%xmm5, %xmm5
	subsd	%xmm0, %xmm5

	unpcklpd %xmm5, ALPHA_I
	unpcklpd %xmm1, ALPHA_R
#endif

	movl	Y, YY
	movl	M, %eax

// If incx == 0 or incy == 0, skip the unrolled loop and jump to the
// element-at-a-time loop at .L58.
	cmpl	$0, INCX
	je	.L58
	cmpl	$0, INCY
	je	.L58

	sarl	$2, %eax
	jle	.L55

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addl	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addl	INCX, X

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addl	INCY, Y

	decl	%eax
	jle	.L52
	ALIGN_3

.L51:
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_I, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm1, %xmm4
	addpd	%xmm3, %xmm5

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)
	addl	INCY, YY
	movlpd	%xmm5,  0 * SIZE(YY)
	movhpd	%xmm5,  1 * SIZE(YY)
	addl	INCY, YY

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addl	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addl	INCX, X

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_I, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm1, %xmm4
	addpd	%xmm3, %xmm5

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)
	addl	INCY, YY
	movlpd	%xmm5,  0 * SIZE(YY)
	movhpd	%xmm5,  1 * SIZE(YY)
	addl	INCY, YY

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addl	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addl	INCX, X

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addl	INCY, Y

	decl	%eax
	jg	.L51
	ALIGN_3

.L52:
	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_I, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm1, %xmm4
	addpd	%xmm3, %xmm5

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)
	addl	INCY, YY
	movlpd	%xmm5,  0 * SIZE(YY)
	movhpd	%xmm5,  1 * SIZE(YY)
	addl	INCY, YY

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addl	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addl	INCX, X

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_I, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm1, %xmm4
	addpd	%xmm3, %xmm5

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)
	addl	INCY, YY
	movlpd	%xmm5,  0 * SIZE(YY)
	movhpd	%xmm5,  1 * SIZE(YY)
	addl	INCY, YY
	ALIGN_3

.L55:
	movl	M, %eax
	andl	$2, %eax
	jle	.L57

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addl	INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addl	INCX, X

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4
	addl	INCY, Y
	movsd	 0 * SIZE(Y), %xmm5
	movhpd	 1 * SIZE(Y), %xmm5
	addl	INCY, Y

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_R, %xmm2
	mulpd	ALPHA_I, %xmm1
	mulpd	ALPHA_I, %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm2, %xmm5
	addpd	%xmm1, %xmm4
	addpd	%xmm3, %xmm5

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)
	addl	INCY, YY
	movlpd	%xmm5,  0 * SIZE(YY)
	movhpd	%xmm5,  1 * SIZE(YY)
	addl	INCY, YY
	ALIGN_3

.L57:
	movl	M, %eax
	andl	$1, %eax
	jle	.L999

.L58:
	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)

	movsd	 0 * SIZE(Y), %xmm4
	movhpd	 1 * SIZE(Y), %xmm4

	mulpd	ALPHA_R, %xmm0
	mulpd	ALPHA_I, %xmm1
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm4

	movlpd	%xmm4,  0 * SIZE(YY)
	movhpd	%xmm4,  1 * SIZE(YY)

	decl	%eax
	jg	.L58
	ALIGN_3
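/* Restore callee-saved registers and return. */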
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE