
zaxpy_sse2.S 36 kB

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA_R	%xmm14
#define ALPHA_I	%xmm15

#define USE_PSHUFD

#if defined(HAVE_SSE3) && !defined(CORE_OPTERON)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c; movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c; movhpd	a##b, c
#endif

#include "l1param.h"
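
/* ZAXPY kernel: y(i) += alpha * x(i) for M double-precision complex
   elements (y(i) += alpha * conj(x(i)) when CONJ is defined; the sign
   flip is folded into the ALPHA_R / ALPHA_I constants built in the
   prologue).  alpha_r / alpha_i end up in %xmm0 / %xmm1 after the
   ABI-specific argument shuffling below, and INCX / INCY are element
   strides that the prologue scales to byte offsets with ZBASE_SHIFT. */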
	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
	movq 8(%rsp), INCY
#else
	movaps %xmm3, %xmm0
	movsd 40(%rsp), %xmm1
	movq 48(%rsp), X
	movq 56(%rsp), INCX
	movq 64(%rsp), Y
	movq 72(%rsp), INCY
#endif

	SAVEREGISTERS

	salq $ZBASE_SHIFT, INCX
	salq $ZBASE_SHIFT, INCY

	testq M, M
	jle .L999

	cmpq $2 * SIZE, INCX
	jne .L50
	cmpq $2 * SIZE, INCY
	jne .L50

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y

	pcmpeqb %xmm7, %xmm7
	psllq $63, %xmm7

#ifdef HAVE_SSE3
	movddup %xmm0, ALPHA_R
	movddup %xmm1, ALPHA_I
#else
	pshufd $0x44, %xmm0, ALPHA_R
	pshufd $0x44, %xmm1, ALPHA_I
#endif

#ifndef CONJ
	shufps $0x0c, %xmm7, %xmm7
	xorpd %xmm7, ALPHA_I
#else
	shufps $0xc0, %xmm7, %xmm7
	xorpd %xmm7, ALPHA_R
#endif

	testq $SIZE, Y
	jne .L30
	testq $SIZE, X
	jne .L20
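
/* Unit-stride path with X and Y both 16-byte aligned: the main loop at
   .L11 handles eight complex elements per iteration, using pshufd to get
   the (imag, real) swapped copy of each element for the cross terms. */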
	movq M, %rax
	sarq $3, %rax
	jle .L15

	movaps -16 * SIZE(X), %xmm0
	movaps -14 * SIZE(X), %xmm1
	movaps -12 * SIZE(X), %xmm2
	movaps -10 * SIZE(X), %xmm3

	decq %rax
	jle .L12
	ALIGN_3

.L11:
	movaps -8 * SIZE(X), %xmm4
	movaps -6 * SIZE(X), %xmm5

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
	pshufd $0x4e, %xmm0, %xmm8
#else
	movsd -15 * SIZE(X), %xmm8
	movhps -16 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

#ifdef USE_PSHUFD
	pshufd $0x4e, %xmm1, %xmm8
#else
	movsd -13 * SIZE(X), %xmm8
	movhps -14 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	movaps -4 * SIZE(X), %xmm6
	movaps -2 * SIZE(X), %xmm7

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
	pshufd $0x4e, %xmm2, %xmm8
#else
	movsd -11 * SIZE(X), %xmm8
	movhps -12 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

#ifdef USE_PSHUFD
	pshufd $0x4e, %xmm3, %xmm8
#else
	movsd -9 * SIZE(X), %xmm8
	movhps -10 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	movaps 0 * SIZE(X), %xmm0
	movaps 2 * SIZE(X), %xmm1

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
	pshufd $0x4e, %xmm4, %xmm8
#else
	movsd -7 * SIZE(X), %xmm8
	movhps -8 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd -8 * SIZE(Y), %xmm4
	addpd %xmm8, %xmm4
	movaps %xmm4, -8 * SIZE(Y)

#ifdef USE_PSHUFD
	pshufd $0x4e, %xmm5, %xmm8
#else
	movsd -5 * SIZE(X), %xmm8
	movhps -6 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm5
	mulpd ALPHA_I, %xmm8
	addpd -6 * SIZE(Y), %xmm5
	addpd %xmm8, %xmm5
	movaps %xmm5, -6 * SIZE(Y)

	movaps 4 * SIZE(X), %xmm2
	movaps 6 * SIZE(X), %xmm3

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
	pshufd $0x4e, %xmm6, %xmm8
#else
	movsd -3 * SIZE(X), %xmm8
	movhps -4 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm6
	mulpd ALPHA_I, %xmm8
	addpd -4 * SIZE(Y), %xmm6
	addpd %xmm8, %xmm6
	movaps %xmm6, -4 * SIZE(Y)

#ifdef USE_PSHUFD
	pshufd $0x4e, %xmm7, %xmm8
#else
	movsd -1 * SIZE(X), %xmm8
	movhps -2 * SIZE(X), %xmm8
#endif
	mulpd ALPHA_R, %xmm7
	mulpd ALPHA_I, %xmm8
	addpd -2 * SIZE(Y), %xmm7
	addpd %xmm8, %xmm7
	movaps %xmm7, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L11
	ALIGN_3

.L12:
	movaps -8 * SIZE(X), %xmm4
	movaps -6 * SIZE(X), %xmm5

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	movaps -4 * SIZE(X), %xmm6
	movaps -2 * SIZE(X), %xmm7

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	pshufd $0x4e, %xmm4, %xmm8
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd -8 * SIZE(Y), %xmm4
	addpd %xmm8, %xmm4
	movaps %xmm4, -8 * SIZE(Y)

	pshufd $0x4e, %xmm5, %xmm8
	mulpd ALPHA_R, %xmm5
	mulpd ALPHA_I, %xmm8
	addpd -6 * SIZE(Y), %xmm5
	addpd %xmm8, %xmm5
	movaps %xmm5, -6 * SIZE(Y)

	pshufd $0x4e, %xmm6, %xmm8
	mulpd ALPHA_R, %xmm6
	mulpd ALPHA_I, %xmm8
	addpd -4 * SIZE(Y), %xmm6
	addpd %xmm8, %xmm6
	movaps %xmm6, -4 * SIZE(Y)

	pshufd $0x4e, %xmm7, %xmm8
	mulpd ALPHA_R, %xmm7
	mulpd ALPHA_I, %xmm8
	addpd -2 * SIZE(Y), %xmm7
	addpd %xmm8, %xmm7
	movaps %xmm7, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3
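
/* Tail of the aligned path: the blocks below handle the remaining
   4, 2 and 1 complex elements left over from the 8-way unrolled loop. */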
.L15:
	movq M, %rax
	andq $4, %rax
	jle .L16

	movaps -16 * SIZE(X), %xmm0
	movaps -14 * SIZE(X), %xmm1
	movaps -12 * SIZE(X), %xmm2
	movaps -10 * SIZE(X), %xmm3

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L16:
	movq M, %rax
	andq $2, %rax
	jle .L17

	movaps -16 * SIZE(X), %xmm0
	movaps -14 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L17:
	movq M, %rax
	andq $1, %rax
	jle .L999

	movaps -16 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	jmp .L999
	ALIGN_3
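
/* Unit-stride path with Y 16-byte aligned but X offset by one double:
   X is fetched with movsd/movhps pairs instead of aligned movaps loads. */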
.L20:
	movq M, %rax
	sarq $3, %rax
	jle .L25

	movsd -16 * SIZE(X), %xmm0
	movhps -15 * SIZE(X), %xmm0
	movsd -14 * SIZE(X), %xmm1
	movhps -13 * SIZE(X), %xmm1
	movsd -12 * SIZE(X), %xmm2
	movhps -11 * SIZE(X), %xmm2
	movsd -10 * SIZE(X), %xmm3
	movhps -9 * SIZE(X), %xmm3

	decq %rax
	jle .L22
	ALIGN_3

.L21:
	movsd -8 * SIZE(X), %xmm4
	movhps -7 * SIZE(X), %xmm4

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	movsd -6 * SIZE(X), %xmm5
	movhps -5 * SIZE(X), %xmm5

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	movsd -4 * SIZE(X), %xmm6
	movhps -3 * SIZE(X), %xmm6

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	movsd -2 * SIZE(X), %xmm7
	movhps -1 * SIZE(X), %xmm7

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	movsd 0 * SIZE(X), %xmm0
	movhps 1 * SIZE(X), %xmm0

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm4, %xmm8
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd -8 * SIZE(Y), %xmm4
	addpd %xmm8, %xmm4
	movaps %xmm4, -8 * SIZE(Y)

	movsd 2 * SIZE(X), %xmm1
	movhps 3 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm5, %xmm8
	mulpd ALPHA_R, %xmm5
	mulpd ALPHA_I, %xmm8
	addpd -6 * SIZE(Y), %xmm5
	addpd %xmm8, %xmm5
	movaps %xmm5, -6 * SIZE(Y)

	movsd 4 * SIZE(X), %xmm2
	movhps 5 * SIZE(X), %xmm2

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm6, %xmm8
	mulpd ALPHA_R, %xmm6
	mulpd ALPHA_I, %xmm8
	addpd -4 * SIZE(Y), %xmm6
	addpd %xmm8, %xmm6
	movaps %xmm6, -4 * SIZE(Y)

	movsd 6 * SIZE(X), %xmm3
	movhps 7 * SIZE(X), %xmm3

	pshufd $0x4e, %xmm7, %xmm8
	mulpd ALPHA_R, %xmm7
	mulpd ALPHA_I, %xmm8
	addpd -2 * SIZE(Y), %xmm7
	addpd %xmm8, %xmm7
	movaps %xmm7, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	movsd -8 * SIZE(X), %xmm4
	movhps -7 * SIZE(X), %xmm4

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	movsd -6 * SIZE(X), %xmm5
	movhps -5 * SIZE(X), %xmm5

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	movsd -4 * SIZE(X), %xmm6
	movhps -3 * SIZE(X), %xmm6

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	movsd -2 * SIZE(X), %xmm7
	movhps -1 * SIZE(X), %xmm7

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	pshufd $0x4e, %xmm4, %xmm8
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd -8 * SIZE(Y), %xmm4
	addpd %xmm8, %xmm4
	movaps %xmm4, -8 * SIZE(Y)

	pshufd $0x4e, %xmm5, %xmm8
	mulpd ALPHA_R, %xmm5
	mulpd ALPHA_I, %xmm8
	addpd -6 * SIZE(Y), %xmm5
	addpd %xmm8, %xmm5
	movaps %xmm5, -6 * SIZE(Y)

	pshufd $0x4e, %xmm6, %xmm8
	mulpd ALPHA_R, %xmm6
	mulpd ALPHA_I, %xmm8
	addpd -4 * SIZE(Y), %xmm6
	addpd %xmm8, %xmm6
	movaps %xmm6, -4 * SIZE(Y)

	pshufd $0x4e, %xmm7, %xmm8
	mulpd ALPHA_R, %xmm7
	mulpd ALPHA_I, %xmm8
	addpd -2 * SIZE(Y), %xmm7
	addpd %xmm8, %xmm7
	movaps %xmm7, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L25:
	movq M, %rax
	andq $4, %rax
	jle .L26

	movsd -16 * SIZE(X), %xmm0
	movhps -15 * SIZE(X), %xmm0
	movsd -14 * SIZE(X), %xmm1
	movhps -13 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	movsd -12 * SIZE(X), %xmm2
	movhps -11 * SIZE(X), %xmm2
	movsd -10 * SIZE(X), %xmm3
	movhps -9 * SIZE(X), %xmm3

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd -12 * SIZE(Y), %xmm2
	addpd %xmm8, %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd -10 * SIZE(Y), %xmm3
	addpd %xmm8, %xmm3
	movaps %xmm3, -10 * SIZE(Y)

	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L26:
	movq M, %rax
	andq $2, %rax
	jle .L27

	movsd -16 * SIZE(X), %xmm0
	movhps -15 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	movsd -14 * SIZE(X), %xmm1
	movhps -13 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd -14 * SIZE(Y), %xmm1
	addpd %xmm8, %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L27:
	movq M, %rax
	andq $1, %rax
	jle .L999

	movsd -16 * SIZE(X), %xmm0
	movhps -15 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd -16 * SIZE(Y), %xmm0
	addpd %xmm8, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	jmp .L999
	ALIGN_3
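
/* Unit-stride path with Y misaligned by one double (X aligned; if X is
   also misaligned, control branches to .L40 below).  The first element is
   split so that subsequent stores to Y are 16-byte aligned: half of each
   result is carried to the next iteration via SHUFPD_1 and the final half
   is flushed at .L39. */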
.L30:
	testq $SIZE, X
	jne .L40

	movaps -16 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1

	xorps %xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm0

	xorps %xmm4, %xmm4
	movhps -16 * SIZE(Y), %xmm4
	addpd %xmm0, %xmm4
	movhps %xmm4, -16 * SIZE(Y)
	movaps %xmm1, %xmm0

	addq $2 * SIZE, X
	addq $1 * SIZE, Y
	decq M
	jle .L39

	movq M, %rax
	sarq $3, %rax
	jle .L35

	movaps -16 * SIZE(X), %xmm1
	movaps -14 * SIZE(X), %xmm2
	movaps -12 * SIZE(X), %xmm3

	decq %rax
	jle .L32
	ALIGN_3

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps -10 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)
	movaps -6 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -8 * SIZE(Y), %xmm0
	movaps %xmm0, -8 * SIZE(Y)
	movaps -2 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -6 * SIZE(Y), %xmm1
	movaps %xmm1, -6 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -4 * SIZE(Y), %xmm2
	movaps %xmm2, -4 * SIZE(Y)
	movaps 2 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -2 * SIZE(Y), %xmm3
	movaps %xmm3, -2 * SIZE(Y)
	movaps 4 * SIZE(X), %xmm3

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L31
	ALIGN_3

.L32:
	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps -10 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)
	movaps -6 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -8 * SIZE(Y), %xmm0
	movaps %xmm0, -8 * SIZE(Y)
	movaps -2 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -6 * SIZE(Y), %xmm1
	movaps %xmm1, -6 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -4 * SIZE(Y), %xmm2
	movaps %xmm2, -4 * SIZE(Y)

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -2 * SIZE(Y), %xmm3
	movaps %xmm3, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L35:
	movq M, %rax
	andq $4, %rax
	jle .L36

	movaps -16 * SIZE(X), %xmm1
	movaps -14 * SIZE(X), %xmm2
	movaps -12 * SIZE(X), %xmm3
	movaps -10 * SIZE(X), %xmm4

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	pshufd $0x4e, %xmm4, %xmm8
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm4
	SHUFPD_1 %xmm4, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movaps %xmm4, %xmm0

	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L36:
	movq M, %rax
	andq $2, %rax
	jle .L37

	movaps -16 * SIZE(X), %xmm1
	movaps -14 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movaps %xmm2, %xmm0

	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L37:
	movq M, %rax
	andq $1, %rax
	jle .L39

	movaps -16 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps %xmm1, %xmm0

	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L39:
	SHUFPD_1 %xmm0, %xmm0
	addsd -16 * SIZE(Y), %xmm0
	movlps %xmm0, -16 * SIZE(Y)
	jmp .L999
	ALIGN_3
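
/* Unit-stride path with both X and Y misaligned by one double: unaligned
   movsd/movhps loads from X combined with the same Y re-alignment scheme
   as the .L30 path. */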
.L40:
	movsd -16 * SIZE(X), %xmm1
	movhps -15 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1

	xorps %xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm0

	xorps %xmm4, %xmm4
	movhps -16 * SIZE(Y), %xmm4
	addpd %xmm0, %xmm4
	movhps %xmm4, -16 * SIZE(Y)
	movaps %xmm1, %xmm0

	addq $2 * SIZE, X
	addq $1 * SIZE, Y
	decq M
	jle .L49

	movq M, %rax
	sarq $3, %rax
	jle .L45

	movsd -16 * SIZE(X), %xmm1
	movhps -15 * SIZE(X), %xmm1
	movsd -14 * SIZE(X), %xmm2
	movhps -13 * SIZE(X), %xmm2
	movsd -12 * SIZE(X), %xmm3
	movhps -11 * SIZE(X), %xmm3

	decq %rax
	jle .L42
	ALIGN_3

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movsd -10 * SIZE(X), %xmm0
	movhps -9 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm1
	movhps -7 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)
	movsd -6 * SIZE(X), %xmm2
	movhps -5 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movsd -4 * SIZE(X), %xmm3
	movhps -3 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -8 * SIZE(Y), %xmm0
	movaps %xmm0, -8 * SIZE(Y)
	movsd -2 * SIZE(X), %xmm0
	movhps -1 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -6 * SIZE(Y), %xmm1
	movaps %xmm1, -6 * SIZE(Y)
	movsd 0 * SIZE(X), %xmm1
	movhps 1 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -4 * SIZE(Y), %xmm2
	movaps %xmm2, -4 * SIZE(Y)
	movsd 2 * SIZE(X), %xmm2
	movhps 3 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -2 * SIZE(Y), %xmm3
	movaps %xmm3, -2 * SIZE(Y)
	movsd 4 * SIZE(X), %xmm3
	movhps 5 * SIZE(X), %xmm3

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L41
	ALIGN_3

.L42:
	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movsd -10 * SIZE(X), %xmm0
	movhps -9 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm1
	movhps -7 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)
	movsd -6 * SIZE(X), %xmm2
	movhps -5 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movsd -4 * SIZE(X), %xmm3
	movhps -3 * SIZE(X), %xmm3

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -8 * SIZE(Y), %xmm0
	movaps %xmm0, -8 * SIZE(Y)
	movsd -2 * SIZE(X), %xmm0
	movhps -1 * SIZE(X), %xmm0

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -6 * SIZE(Y), %xmm1
	movaps %xmm1, -6 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -4 * SIZE(Y), %xmm2
	movaps %xmm2, -4 * SIZE(Y)

	pshufd $0x4e, %xmm0, %xmm8
	mulpd ALPHA_R, %xmm0
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm0
	SHUFPD_1 %xmm0, %xmm3
	addpd -2 * SIZE(Y), %xmm3
	movaps %xmm3, -2 * SIZE(Y)

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L45:
	movq M, %rax
	andq $4, %rax
	jle .L46

	movsd -16 * SIZE(X), %xmm1
	movhps -15 * SIZE(X), %xmm1
	movsd -14 * SIZE(X), %xmm2
	movhps -13 * SIZE(X), %xmm2
	movsd -12 * SIZE(X), %xmm3
	movhps -11 * SIZE(X), %xmm3
	movsd -10 * SIZE(X), %xmm4
	movhps -9 * SIZE(X), %xmm4

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)

	pshufd $0x4e, %xmm3, %xmm8
	mulpd ALPHA_R, %xmm3
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm3
	SHUFPD_1 %xmm3, %xmm2
	addpd -12 * SIZE(Y), %xmm2
	movaps %xmm2, -12 * SIZE(Y)

	pshufd $0x4e, %xmm4, %xmm8
	mulpd ALPHA_R, %xmm4
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm4
	SHUFPD_1 %xmm4, %xmm3
	addpd -10 * SIZE(Y), %xmm3
	movaps %xmm3, -10 * SIZE(Y)
	movaps %xmm4, %xmm0

	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L46:
	movq M, %rax
	andq $2, %rax
	jle .L47

	movsd -16 * SIZE(X), %xmm1
	movhps -15 * SIZE(X), %xmm1
	movsd -14 * SIZE(X), %xmm2
	movhps -13 * SIZE(X), %xmm2

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	pshufd $0x4e, %xmm2, %xmm8
	mulpd ALPHA_R, %xmm2
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm2
	SHUFPD_1 %xmm2, %xmm1
	addpd -14 * SIZE(Y), %xmm1
	movaps %xmm1, -14 * SIZE(Y)
	movaps %xmm2, %xmm0

	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L47:
	movq M, %rax
	andq $1, %rax
	jle .L49

	movsd -16 * SIZE(X), %xmm1
	movhps -15 * SIZE(X), %xmm1

	pshufd $0x4e, %xmm1, %xmm8
	mulpd ALPHA_R, %xmm1
	mulpd ALPHA_I, %xmm8
	addpd %xmm8, %xmm1
	SHUFPD_1 %xmm1, %xmm0
	addpd -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps %xmm1, %xmm0

	addq $2 * SIZE, X
	addq $2 * SIZE, Y
	ALIGN_3

.L49:
	SHUFPD_1 %xmm0, %xmm0
	addsd -16 * SIZE(Y), %xmm0
	movlps %xmm0, -16 * SIZE(Y)
	jmp .L999
	ALIGN_3
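
/* General strided path (INCX or INCY is not 2 * SIZE).  alpha is packed as
   xmm14 = (alpha_r, alpha_i) and xmm15 = (-alpha_i, alpha_r) (signs
   adjusted when CONJ is defined), and elements are loaded and stored one
   at a time through the INCX / INCY byte strides, unrolled by eight. */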
.L50:
#ifndef CONJ
	movaps %xmm0, %xmm14	# a 0
	pxor %xmm15, %xmm15	# 0 0
	subsd %xmm1, %xmm15	# -b 0
	unpcklpd %xmm14, %xmm15	# -b a
	unpcklpd %xmm1, %xmm14	# a b
#else
	movaps %xmm0, %xmm14	# a 0
	movaps %xmm1, %xmm15	# b 0
	pxor %xmm13, %xmm13	# 0 0
	subsd %xmm0, %xmm13	# -a 0
	unpcklpd %xmm13, %xmm15	# b -a
	unpcklpd %xmm1, %xmm14	# a b
#endif

	movq Y, YY
	movq M, %rax

	// If incx == 0 || incy == 0, avoid the unrolled loop and jump to the end.
	cmpq $0, INCX
	jne .L59
	cmpq $0, INCY
	je .L58

.L59:
	sarq $3, %rax
	jle .L55

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq INCX, X

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm10
	movhpd 1 * SIZE(Y), %xmm10
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm11
	movhpd 1 * SIZE(Y), %xmm11
	addq INCY, Y

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm6

	decq %rax
	jle .L52
	ALIGN_3

.L51:
	addpd %xmm0, %xmm8
	mulpd %xmm15, %xmm1
	addpd %xmm2, %xmm9
	mulpd %xmm15, %xmm3
	addpd %xmm4, %xmm10
	mulpd %xmm15, %xmm5
	addpd %xmm6, %xmm11
	mulpd %xmm15, %xmm7

	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9
	addpd %xmm5, %xmm10
	addpd %xmm7, %xmm11

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq INCX, X

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm6

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm10, 0 * SIZE(YY)
	movhpd %xmm10, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm11, 0 * SIZE(YY)
	movhpd %xmm11, 1 * SIZE(YY)
	addq INCY, YY

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm10
	movhpd 1 * SIZE(Y), %xmm10
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm11
	movhpd 1 * SIZE(Y), %xmm11
	addq INCY, Y

	addpd %xmm0, %xmm8
	mulpd %xmm15, %xmm1
	addpd %xmm2, %xmm9
	mulpd %xmm15, %xmm3
	addpd %xmm4, %xmm10
	mulpd %xmm15, %xmm5
	addpd %xmm6, %xmm11
	mulpd %xmm15, %xmm7

	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9
	addpd %xmm5, %xmm10
	addpd %xmm7, %xmm11

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq INCX, X

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm6

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm10, 0 * SIZE(YY)
	movhpd %xmm10, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm11, 0 * SIZE(YY)
	movhpd %xmm11, 1 * SIZE(YY)
	addq INCY, YY

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm10
	movhpd 1 * SIZE(Y), %xmm10
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm11
	movhpd 1 * SIZE(Y), %xmm11
	addq INCY, Y

	decq %rax
	jg .L51
	ALIGN_3

.L52:
	addpd %xmm0, %xmm8
	mulpd %xmm15, %xmm1
	addpd %xmm2, %xmm9
	mulpd %xmm15, %xmm3
	addpd %xmm4, %xmm10
	mulpd %xmm15, %xmm5
	addpd %xmm6, %xmm11
	mulpd %xmm15, %xmm7

	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9
	addpd %xmm5, %xmm10
	addpd %xmm7, %xmm11

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq INCX, X

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm6

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm10, 0 * SIZE(YY)
	movhpd %xmm10, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm11, 0 * SIZE(YY)
	movhpd %xmm11, 1 * SIZE(YY)
	addq INCY, YY

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm10
	movhpd 1 * SIZE(Y), %xmm10
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm11
	movhpd 1 * SIZE(Y), %xmm11
	addq INCY, Y

	addpd %xmm0, %xmm8
	mulpd %xmm15, %xmm1
	addpd %xmm2, %xmm9
	mulpd %xmm15, %xmm3
	addpd %xmm4, %xmm10
	mulpd %xmm15, %xmm5
	addpd %xmm6, %xmm11
	mulpd %xmm15, %xmm7

	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9
	addpd %xmm5, %xmm10
	addpd %xmm7, %xmm11

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm10, 0 * SIZE(YY)
	movhpd %xmm10, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm11, 0 * SIZE(YY)
	movhpd %xmm11, 1 * SIZE(YY)
	addq INCY, YY
	ALIGN_3

.L55:
	movq M, %rax
	andq $4, %rax
	jle .L56

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm4)
	MOVDDUP( 1 * SIZE, X, %xmm5)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm6)
	MOVDDUP( 1 * SIZE, X, %xmm7)
	addq INCX, X

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm10
	movhpd 1 * SIZE(Y), %xmm10
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm11
	movhpd 1 * SIZE(Y), %xmm11
	addq INCY, Y

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm14, %xmm4
	mulpd %xmm14, %xmm6

	addpd %xmm0, %xmm8
	mulpd %xmm15, %xmm1
	addpd %xmm2, %xmm9
	mulpd %xmm15, %xmm3
	addpd %xmm4, %xmm10
	mulpd %xmm15, %xmm5
	addpd %xmm6, %xmm11
	mulpd %xmm15, %xmm7

	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9
	addpd %xmm5, %xmm10
	addpd %xmm7, %xmm11

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm10, 0 * SIZE(YY)
	movhpd %xmm10, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm11, 0 * SIZE(YY)
	movhpd %xmm11, 1 * SIZE(YY)
	addq INCY, YY
	ALIGN_3

.L56:
	movq M, %rax
	andq $2, %rax
	jle .L57

	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)
	addq INCX, X
	MOVDDUP( 0 * SIZE, X, %xmm2)
	MOVDDUP( 1 * SIZE, X, %xmm3)
	addq INCX, X

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8
	addq INCY, Y
	movsd 0 * SIZE(Y), %xmm9
	movhpd 1 * SIZE(Y), %xmm9
	addq INCY, Y

	mulpd %xmm14, %xmm0
	mulpd %xmm14, %xmm2
	mulpd %xmm15, %xmm1
	mulpd %xmm15, %xmm3

	addpd %xmm0, %xmm8
	addpd %xmm2, %xmm9
	addpd %xmm1, %xmm8
	addpd %xmm3, %xmm9

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)
	addq INCY, YY
	movlpd %xmm9, 0 * SIZE(YY)
	movhpd %xmm9, 1 * SIZE(YY)
	addq INCY, YY
	ALIGN_3

.L57:
	movq M, %rax
	andq $1, %rax
	jle .L999

.L58:
	MOVDDUP( 0 * SIZE, X, %xmm0)
	MOVDDUP( 1 * SIZE, X, %xmm1)

	movsd 0 * SIZE(Y), %xmm8
	movhpd 1 * SIZE(Y), %xmm8

	mulpd %xmm14, %xmm0
	mulpd %xmm15, %xmm1
	addpd %xmm0, %xmm8
	addpd %xmm1, %xmm8

	movlpd %xmm8, 0 * SIZE(YY)
	movhpd %xmm8, 1 * SIZE(YY)

	decq %rax
	jg .L58
	ALIGN_3

.L999:
	xorq %rax, %rax

	RESTOREREGISTERS

	ret
	EPILOGUE