
zdot_sse2.S (31 kB)

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK 12
#define ARGS 0

/* cdecl stack arguments: the complex result is written through a
   hidden pointer (RESULT), followed by n, x, incx, y, incy. */
#define RESULT 4 + STACK + ARGS(%esp)
#define STACK_N 8 + STACK + ARGS(%esp)
#define STACK_X 12 + STACK + ARGS(%esp)
#define STACK_INCX 16 + STACK + ARGS(%esp)
#define STACK_Y 20 + STACK + ARGS(%esp)
#define STACK_INCY 24 + STACK + ARGS(%esp)

#define N %ebx
#define X %esi
#define INCX %ecx
#define Y %edi
#define INCY %edx

#include "l1param.h"

#undef movsd

/* choose the 8-byte load used for unaligned data: movsd normally,
   movlps on Opteron */
#ifndef OPTERON
#define MOVLPS movsd
#else
#define MOVLPS movlps
#endif
PROLOGUE
PROFCODE

        pushl %edi
        pushl %esi
        pushl %ebx

        movl STACK_N, N
        movl STACK_X, X
        movl STACK_INCX, INCX
        movl STACK_Y, Y
        movl STACK_INCY, INCY

        sall $ZBASE_SHIFT, INCX   /* convert strides from complex elements to bytes */
        sall $ZBASE_SHIFT, INCY

        xorps %xmm0, %xmm0        /* accumulators: xmm0 = (xr*yr, xi*yi) partial sums */
        xorps %xmm1, %xmm1        /*               xmm1 = (xr*yi, xi*yr) partial sums */

        cmpl $0, N                /* n <= 0: result is zero */
        jle .L999

        cmpl $2 * SIZE, INCX      /* non-unit stride on either vector: */
        jne .L50                  /* take the general strided path */
        cmpl $2 * SIZE, INCY
        jne .L50

        subl $-16 * SIZE, X       /* bias pointers so the loops index from -16*SIZE */
        subl $-16 * SIZE, Y

        testl $SIZE, Y            /* dispatch on 16-byte alignment of Y, then X */
        jne .L30
        testl $SIZE, X
        jne .L20
/* both X and Y 16-byte aligned: pure movaps loads, 8 complex elements per pass */
        movl N, %eax
        sarl $3, %eax
        jle .L15

        movaps -16 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6
        movaps -14 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        decl %eax
        jle .L12
        ALIGN_3

.L11:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        /* pshufd $0x4e swaps the two doubles of the y element, so xmm0
           gathers (xr*yr, xi*yi) while xmm1 gathers (xr*yi, xi*yr) */
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -12 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -10 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -8 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -6 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -4 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -2 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps 0 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps 0 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps 2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps 2 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y

        decl %eax
        jg .L11
        ALIGN_3

.L12:
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -12 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -10 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -8 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -6 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -4 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -2 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y
        ALIGN_3

.L15:
        testl $4, N
        jle .L16

        movaps -16 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6
        movaps -14 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -12 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -10 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L16:
        testl $2, N
        jle .L17

        movaps -16 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6
        movaps -14 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L17:
        testl $1, N
        jle .L98

        movaps -16 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1
        jmp .L98
        ALIGN_3
.L20:
/* X unaligned, Y aligned: X is loaded as movlps/movhps halves */
        movl N, %eax
        sarl $3, %eax
        jle .L25

        MOVLPS -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6
        MOVLPS -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        decl %eax
        jle .L22
        ALIGN_3

.L21:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(X), %xmm4
        movhps -11 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(X), %xmm5
        movhps -9 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -8 * SIZE(X), %xmm4
        movhps -7 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -6 * SIZE(X), %xmm5
        movhps -5 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -4 * SIZE(X), %xmm4
        movhps -3 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -2 * SIZE(X), %xmm5
        movhps -1 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps 0 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps 2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS 2 * SIZE(X), %xmm5
        movhps 3 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y

        decl %eax
        jg .L21
        ALIGN_3

.L22:
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(X), %xmm4
        movhps -11 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(X), %xmm5
        movhps -9 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -8 * SIZE(X), %xmm4
        movhps -7 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -6 * SIZE(X), %xmm5
        movhps -5 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -4 * SIZE(X), %xmm4
        movhps -3 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -2 * SIZE(X), %xmm5
        movhps -1 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y
        ALIGN_3

.L25:
        testl $4, N
        jle .L26

        MOVLPS -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6
        MOVLPS -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(X), %xmm4
        movhps -11 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(X), %xmm5
        movhps -9 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L26:
        testl $2, N
        jle .L27

        MOVLPS -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        MOVLPS -14 * SIZE(X), %xmm5
        movhps -13 * SIZE(X), %xmm5
        movaps -14 * SIZE(Y), %xmm7

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L27:
        testl $1, N
        jle .L98

        MOVLPS -16 * SIZE(X), %xmm4
        movhps -15 * SIZE(X), %xmm4
        movaps -16 * SIZE(Y), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1
        jmp .L98
        ALIGN_3
.L30:
/* Y unaligned: if X is unaligned too, take the doubly-unaligned path */
        testl $SIZE, X
        jne .L40

/* X aligned, Y unaligned: same as .L20 with the load roles of X and Y swapped */
        movl N, %eax
        sarl $3, %eax
        jle .L35

        MOVLPS -16 * SIZE(Y), %xmm4
        movhps -15 * SIZE(Y), %xmm4
        movaps -16 * SIZE(X), %xmm6
        MOVLPS -14 * SIZE(Y), %xmm5
        movhps -13 * SIZE(Y), %xmm5
        movaps -14 * SIZE(X), %xmm7

        decl %eax
        jle .L32
        ALIGN_3

.L31:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(Y), %xmm4
        movhps -11 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(Y), %xmm5
        movhps -9 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -8 * SIZE(Y), %xmm4
        movhps -7 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -6 * SIZE(Y), %xmm5
        movhps -5 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -4 * SIZE(Y), %xmm4
        movhps -3 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -2 * SIZE(Y), %xmm5
        movhps -1 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps 0 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(Y), %xmm4
        movhps 1 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps 2 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS 2 * SIZE(Y), %xmm5
        movhps 3 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y

        decl %eax
        jg .L31
        ALIGN_3

.L32:
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(Y), %xmm4
        movhps -11 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(Y), %xmm5
        movhps -9 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -8 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -8 * SIZE(Y), %xmm4
        movhps -7 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -6 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -6 * SIZE(Y), %xmm5
        movhps -5 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -4 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -4 * SIZE(Y), %xmm4
        movhps -3 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -2 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -2 * SIZE(Y), %xmm5
        movhps -1 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y
        ALIGN_3

.L35:
        testl $4, N
        jle .L36

        MOVLPS -16 * SIZE(Y), %xmm4
        movhps -15 * SIZE(Y), %xmm4
        movaps -16 * SIZE(X), %xmm6
        MOVLPS -14 * SIZE(Y), %xmm5
        movhps -13 * SIZE(Y), %xmm5
        movaps -14 * SIZE(X), %xmm7

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -12 * SIZE(X), %xmm6
        mulpd %xmm4, %xmm3
        MOVLPS -12 * SIZE(Y), %xmm4
        movhps -11 * SIZE(Y), %xmm4
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -10 * SIZE(X), %xmm7
        mulpd %xmm5, %xmm3
        MOVLPS -10 * SIZE(Y), %xmm5
        movhps -9 * SIZE(Y), %xmm5
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L36:
        testl $2, N
        jle .L37

        MOVLPS -16 * SIZE(Y), %xmm4
        movhps -15 * SIZE(Y), %xmm4
        movaps -16 * SIZE(X), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        MOVLPS -14 * SIZE(Y), %xmm5
        movhps -13 * SIZE(Y), %xmm5
        movaps -14 * SIZE(X), %xmm7

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L37:
        SHUFPD_1 %xmm1, %xmm1
        SHUFPD_1 %xmm3, %xmm3

        testl $1, N
        jle .L98

        MOVLPS -16 * SIZE(Y), %xmm4
        movhps -15 * SIZE(Y), %xmm4
        movaps -16 * SIZE(X), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        SHUFPD_1 %xmm3, %xmm3
        addpd %xmm3, %xmm1
        jmp .L98
        ALIGN_3
.L40:
/* X and Y both misaligned by one double: preload the high halves, then
   merge consecutive aligned loads with movsd so each register again
   holds one full (re, im) pair */
        movhps -16 * SIZE(X), %xmm4
        addl $SIZE, X
        movhps -16 * SIZE(Y), %xmm6
        addl $SIZE, Y

        movl N, %eax
        sarl $3, %eax
        jle .L45

        movaps -16 * SIZE(X), %xmm5
        movaps -16 * SIZE(Y), %xmm7

        decl %eax
        jle .L42
        ALIGN_3

.L41:
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
#ifdef PREFETCH
        PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
        PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -14 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -14 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -12 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -12 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -10 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -10 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -8 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -8 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -6 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -6 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -4 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -4 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -2 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -2 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps 0 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps 0 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y

        decl %eax
        jg .L41
        ALIGN_3

.L42:
        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -14 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -14 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -12 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -12 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -10 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -10 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -8 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -8 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -6 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -6 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -4 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -4 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -2 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -2 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        subl $-16 * SIZE, X
        subl $-16 * SIZE, Y
        ALIGN_3

.L45:
        testl $4, N
        jle .L46

        movaps -16 * SIZE(X), %xmm5
        movaps -16 * SIZE(Y), %xmm7

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -14 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -14 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        movaps -12 * SIZE(Y), %xmm7
        mulpd %xmm5, %xmm3
        movaps -12 * SIZE(X), %xmm5
        addpd %xmm3, %xmm1

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -10 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -10 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $8 * SIZE, X
        addl $8 * SIZE, Y
        ALIGN_3

.L46:
        testl $2, N
        jle .L47

        movaps -16 * SIZE(X), %xmm5
        movaps -16 * SIZE(Y), %xmm7

        movsd %xmm7, %xmm6
        pshufd $0x4e, %xmm6, %xmm3
        movsd %xmm5, %xmm4
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        movaps -14 * SIZE(Y), %xmm6
        mulpd %xmm4, %xmm3
        movaps -14 * SIZE(X), %xmm4
        addpd %xmm3, %xmm1

        movsd %xmm6, %xmm7
        pshufd $0x4e, %xmm7, %xmm3
        movsd %xmm4, %xmm5
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1

        addl $4 * SIZE, X
        addl $4 * SIZE, Y
        ALIGN_3

.L47:
        testl $1, N
        jle .L48

        movlpd -16 * SIZE(X), %xmm4
        movlpd -16 * SIZE(Y), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1
        ALIGN_3

.L48:
        SHUFPD_1 %xmm0, %xmm0
        SHUFPD_1 %xmm1, %xmm1
        SHUFPD_1 %xmm2, %xmm2
        SHUFPD_1 %xmm3, %xmm3
        jmp .L98
        ALIGN_3
.L50:
/* general path: arbitrary INCX / INCY (byte strides after the shifts above) */
        movl N, %eax
        sarl $3, %eax
        jle .L55

        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y

        decl %eax
        jle .L54
        ALIGN_3

.L53:
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        decl %eax
        jg .L53
        ALIGN_3

.L54:
        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1
        ALIGN_3

.L55:
        testl $4, N
        jle .L56

        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y
        mulpd %xmm4, %xmm3
        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y
        mulpd %xmm5, %xmm3
        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1
        ALIGN_3

.L56:
        testl $2, N
        jle .L57

        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6
        addl INCY, Y

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1

        MOVLPS 0 * SIZE(X), %xmm5
        movhps 1 * SIZE(X), %xmm5
        addl INCX, X
        MOVLPS 0 * SIZE(Y), %xmm7
        movhps 1 * SIZE(Y), %xmm7
        addl INCY, Y

        pshufd $0x4e, %xmm7, %xmm3
        mulpd %xmm5, %xmm7
        addpd %xmm7, %xmm0
        mulpd %xmm5, %xmm3
        addpd %xmm3, %xmm1
        ALIGN_3

.L57:
        testl $1, N
        jle .L98

        MOVLPS 0 * SIZE(X), %xmm4
        movhps 1 * SIZE(X), %xmm4
        MOVLPS 0 * SIZE(Y), %xmm6
        movhps 1 * SIZE(Y), %xmm6

        pshufd $0x4e, %xmm6, %xmm3
        mulpd %xmm4, %xmm6
        addpd %xmm6, %xmm0
        mulpd %xmm4, %xmm3
        addpd %xmm3, %xmm1
        ALIGN_3
.L98:
/* horizontal combine of the partial sums */
        pshufd $0x4e, %xmm0, %xmm2
        pshufd $0x4e, %xmm1, %xmm3
#ifndef CONJ
        subsd %xmm2, %xmm0        /* re = sum(xr*yr) - sum(xi*yi)  (zdotu) */
        addsd %xmm3, %xmm1        /* im = sum(xr*yi) + sum(xi*yr) */
#else
        addsd %xmm2, %xmm0        /* re = sum(xr*yr) + sum(xi*yi)  (zdotc) */
        subsd %xmm3, %xmm1        /* im = sum(xr*yi) - sum(xi*yr) */
#endif

.L999:
        movl RESULT, %eax
        MOVLPS %xmm0, 0 * SIZE(%eax)   /* store the complex result through */
        MOVLPS %xmm1, 1 * SIZE(%eax)   /* the hidden pointer argument */

        popl %ebx
        popl %esi
        popl %edi

        /* remove the hidden return value address from the stack. */
        popl %ecx
        xchgl %ecx, 0(%esp)
        ret
EPILOGUE
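
As a reading aid (my own illustration, not part of the OpenBLAS source), here is a scalar C sketch of the arithmetic the kernel above performs. The register xmm0 accumulates the pairwise products (xr*yr, xi*yi) and xmm1 the cross products (xr*yi, xi*yr); the .L98 block then combines them according to CONJ. The function name zdot_ref and the positive-increment assumption are mine; BLAS also allows zero and negative increments, which this sketch omits.

#include <complex.h>
#include <stddef.h>

/* Scalar model of the kernel above (illustrative sketch, not an OpenBLAS API).
 * conj = 0 mirrors the #ifndef CONJ path (ZDOTU: sum x[k]*y[k]);
 * conj = 1 mirrors the CONJ path (ZDOTC: sum conj(x[k])*y[k]).
 * Assumes incx, incy >= 1, measured in complex elements. */
static double _Complex zdot_ref(size_t n, const double *x, size_t incx,
                                const double *y, size_t incy, int conj)
{
    double rr = 0.0, ii = 0.0;   /* xmm0: xr*yr and xi*yi partial sums */
    double ri = 0.0, ir = 0.0;   /* xmm1: xr*yi and xi*yr partial sums */
    for (size_t k = 0; k < n; k++) {
        const double *xp = x + 2 * k * incx;   /* element k is (re, im) */
        const double *yp = y + 2 * k * incy;
        rr += xp[0] * yp[0];
        ii += xp[1] * yp[1];
        ri += xp[0] * yp[1];
        ir += xp[1] * yp[0];
    }
    /* the .L98 combine step: subsd/addsd, or addsd/subsd under CONJ */
    double re = conj ? rr + ii : rr - ii;
    double im = conj ? ri - ir : ri + ir;
    return re + im * I;
}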

OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
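
A minimal calling sketch, assuming the standard CBLAS interface that OpenBLAS exports (cblas_zdotu_sub and cblas_zdotc_sub are the standard CBLAS entry points for the unconjugated and conjugated complex-double dot product; on 32-bit x86 builds they are backed by kernels like the one above). Compile with something like cc demo.c -lopenblas:

#include <stdio.h>
#include <complex.h>
#include <cblas.h>

int main(void)
{
    double _Complex x[3] = { 1 + 2*I, 3 - 1*I, 0.5 + 0*I };
    double _Complex y[3] = { 2 - 1*I, 1 + 4*I, -2 + 2*I };
    double _Complex dotu, dotc;

    /* unconjugated dot product: sum x[i]*y[i] (the #ifndef CONJ path) */
    cblas_zdotu_sub(3, x, 1, y, 1, &dotu);
    /* conjugated dot product: sum conj(x[i])*y[i] (the CONJ path) */
    cblas_zdotc_sub(3, x, 1, y, 1, &dotc);

    printf("zdotu = %g%+gi\n", creal(dotu), cimag(dotu));
    printf("zdotc = %g%+gi\n", creal(dotc), cimag(dotc));
    return 0;
}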