You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dot_sse2.S 14 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 12
  41. #define ARGS 0
  42. #define STACK_N 4 + STACK + ARGS(%esp)
  43. #define STACK_X 8 + STACK + ARGS(%esp)
  44. #define STACK_INCX 12 + STACK + ARGS(%esp)
  45. #define STACK_Y 16 + STACK + ARGS(%esp)
  46. #define STACK_INCY 20 + STACK + ARGS(%esp)
  47. #define N %ecx
  48. #define X %esi
  49. #define INCX %ebx
  50. #define Y %edi
  51. #define INCY %edx
  52. #include "l1param.h"
  53. PROLOGUE
  54. PROFCODE
  55. pushl %edi
  56. pushl %esi
  57. pushl %ebx
  58. movl STACK_N, N
  59. movl STACK_X, X
  60. movl STACK_INCX, INCX
  61. movl STACK_Y, Y
  62. movl STACK_INCY, INCY
  63. leal (, INCX, SIZE), INCX
  64. leal (, INCY, SIZE), INCY
  65. xorps %xmm0, %xmm0
  66. xorps %xmm1, %xmm1
  67. xorps %xmm2, %xmm2
  68. xorps %xmm3, %xmm3
  69. cmpl $0, N
  70. jle .L999
  71. cmpl $SIZE, INCX
  72. jne .L50
  73. cmpl $SIZE, INCY
  74. jne .L50
  75. subl $-16 * SIZE, X
  76. subl $-16 * SIZE, Y
  77. testl $SIZE, Y
  78. je .L10
  79. movsd -16 * SIZE(X), %xmm0
  80. mulsd -16 * SIZE(Y), %xmm0
  81. addl $1 * SIZE, X
  82. addl $1 * SIZE, Y
  83. decl N
  84. ALIGN_2
  85. .L10:
  86. testl $SIZE, X
  87. jne .L20
  88. movl N, %eax
  89. sarl $4, %eax
  90. jle .L14
  91. movaps -16 * SIZE(X), %xmm4
  92. movaps -14 * SIZE(X), %xmm5
  93. movaps -12 * SIZE(X), %xmm6
  94. movaps -10 * SIZE(X), %xmm7
  95. decl %eax
  96. jle .L12
  97. ALIGN_3
  98. .L11:
  99. #ifdef PREFETCH
  100. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  101. #endif
  102. mulpd -16 * SIZE(Y), %xmm4
  103. addpd %xmm4, %xmm0
  104. movaps -8 * SIZE(X), %xmm4
  105. mulpd -14 * SIZE(Y), %xmm5
  106. addpd %xmm5, %xmm1
  107. movaps -6 * SIZE(X), %xmm5
  108. #ifdef PREFETCH
  109. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  110. #endif
  111. mulpd -12 * SIZE(Y), %xmm6
  112. addpd %xmm6, %xmm2
  113. movaps -4 * SIZE(X), %xmm6
  114. mulpd -10 * SIZE(Y), %xmm7
  115. addpd %xmm7, %xmm3
  116. movaps -2 * SIZE(X), %xmm7
  117. #if defined(PREFETCH) && !defined(FETCH128)
  118. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  119. #endif
  120. mulpd -8 * SIZE(Y), %xmm4
  121. addpd %xmm4, %xmm0
  122. movaps 0 * SIZE(X), %xmm4
  123. mulpd -6 * SIZE(Y), %xmm5
  124. addpd %xmm5, %xmm1
  125. movaps 2 * SIZE(X), %xmm5
  126. #if defined(PREFETCH) && !defined(FETCH128)
  127. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  128. #endif
  129. mulpd -4 * SIZE(Y), %xmm6
  130. addpd %xmm6, %xmm2
  131. movaps 4 * SIZE(X), %xmm6
  132. mulpd -2 * SIZE(Y), %xmm7
  133. addpd %xmm7, %xmm3
  134. movaps 6 * SIZE(X), %xmm7
  135. subl $-16 * SIZE, X
  136. subl $-16 * SIZE, Y
  137. decl %eax
  138. jg .L11
  139. ALIGN_3
  140. .L12:
  141. mulpd -16 * SIZE(Y), %xmm4
  142. addpd %xmm4, %xmm0
  143. movaps -8 * SIZE(X), %xmm4
  144. mulpd -14 * SIZE(Y), %xmm5
  145. addpd %xmm5, %xmm1
  146. movaps -6 * SIZE(X), %xmm5
  147. mulpd -12 * SIZE(Y), %xmm6
  148. addpd %xmm6, %xmm2
  149. movaps -4 * SIZE(X), %xmm6
  150. mulpd -10 * SIZE(Y), %xmm7
  151. addpd %xmm7, %xmm3
  152. movaps -2 * SIZE(X), %xmm7
  153. mulpd -8 * SIZE(Y), %xmm4
  154. addpd %xmm4, %xmm0
  155. mulpd -6 * SIZE(Y), %xmm5
  156. addpd %xmm5, %xmm1
  157. mulpd -4 * SIZE(Y), %xmm6
  158. addpd %xmm6, %xmm2
  159. mulpd -2 * SIZE(Y), %xmm7
  160. addpd %xmm7, %xmm3
  161. subl $-16 * SIZE, X
  162. subl $-16 * SIZE, Y
  163. ALIGN_3
  164. .L14:
  165. testl $15, N
  166. jle .L999
  167. testl $8, N
  168. jle .L15
  169. movaps -16 * SIZE(X), %xmm4
  170. movaps -14 * SIZE(X), %xmm5
  171. movaps -12 * SIZE(X), %xmm6
  172. movaps -10 * SIZE(X), %xmm7
  173. mulpd -16 * SIZE(Y), %xmm4
  174. addpd %xmm4, %xmm0
  175. mulpd -14 * SIZE(Y), %xmm5
  176. addpd %xmm5, %xmm1
  177. mulpd -12 * SIZE(Y), %xmm6
  178. addpd %xmm6, %xmm2
  179. mulpd -10 * SIZE(Y), %xmm7
  180. addpd %xmm7, %xmm3
  181. addl $8 * SIZE, X
  182. addl $8 * SIZE, Y
  183. ALIGN_3
  184. .L15:
  185. testl $4, N
  186. jle .L16
  187. movaps -16 * SIZE(X), %xmm4
  188. movaps -14 * SIZE(X), %xmm5
  189. mulpd -16 * SIZE(Y), %xmm4
  190. addpd %xmm4, %xmm0
  191. mulpd -14 * SIZE(Y), %xmm5
  192. addpd %xmm5, %xmm1
  193. addl $4 * SIZE, X
  194. addl $4 * SIZE, Y
  195. ALIGN_3
  196. .L16:
  197. testl $2, N
  198. jle .L17
  199. movaps -16 * SIZE(X), %xmm4
  200. mulpd -16 * SIZE(Y), %xmm4
  201. addpd %xmm4, %xmm0
  202. addl $2 * SIZE, X
  203. addl $2 * SIZE, Y
  204. ALIGN_3
  205. .L17:
  206. testl $1, N
  207. jle .L999
  208. movsd -16 * SIZE(X), %xmm4
  209. mulsd -16 * SIZE(Y), %xmm4
  210. addsd %xmm4, %xmm0
  211. jmp .L999
  212. ALIGN_3
  213. .L20:
  214. #ifdef ALIGNED_ACCESS
  215. movhps -16 * SIZE(X), %xmm4
  216. addl $SIZE, X
  217. movl N, %eax
  218. sarl $4, %eax
  219. jle .L24
  220. movaps -16 * SIZE(X), %xmm5
  221. movaps -14 * SIZE(X), %xmm6
  222. movaps -12 * SIZE(X), %xmm7
  223. decl %eax
  224. jle .L22
  225. ALIGN_3
  226. .L21:
  227. #ifdef PREFETCH
  228. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  229. #endif
  230. SHUFPD_1 %xmm5, %xmm4
  231. mulpd -16 * SIZE(Y), %xmm4
  232. addpd %xmm4, %xmm0
  233. movaps -10 * SIZE(X), %xmm4
  234. SHUFPD_1 %xmm6, %xmm5
  235. mulpd -14 * SIZE(Y), %xmm5
  236. addpd %xmm5, %xmm1
  237. movaps -8 * SIZE(X), %xmm5
  238. #ifdef PREFETCH
  239. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  240. #endif
  241. SHUFPD_1 %xmm7, %xmm6
  242. mulpd -12 * SIZE(Y), %xmm6
  243. addpd %xmm6, %xmm2
  244. movaps -6 * SIZE(X), %xmm6
  245. SHUFPD_1 %xmm4, %xmm7
  246. mulpd -10 * SIZE(Y), %xmm7
  247. addpd %xmm7, %xmm3
  248. movaps -4 * SIZE(X), %xmm7
  249. #if defined(PREFETCH) && !defined(FETCH128)
  250. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  251. #endif
  252. SHUFPD_1 %xmm5, %xmm4
  253. mulpd -8 * SIZE(Y), %xmm4
  254. addpd %xmm4, %xmm0
  255. movaps -2 * SIZE(X), %xmm4
  256. SHUFPD_1 %xmm6, %xmm5
  257. mulpd -6 * SIZE(Y), %xmm5
  258. addpd %xmm5, %xmm1
  259. movaps 0 * SIZE(X), %xmm5
  260. #if defined(PREFETCH) && !defined(FETCH128)
  261. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  262. #endif
  263. SHUFPD_1 %xmm7, %xmm6
  264. mulpd -4 * SIZE(Y), %xmm6
  265. addpd %xmm6, %xmm2
  266. movaps 2 * SIZE(X), %xmm6
  267. SHUFPD_1 %xmm4, %xmm7
  268. mulpd -2 * SIZE(Y), %xmm7
  269. addpd %xmm7, %xmm3
  270. movaps 4 * SIZE(X), %xmm7
  271. subl $-16 * SIZE, X
  272. subl $-16 * SIZE, Y
  273. decl %eax
  274. jg .L21
  275. ALIGN_3
  276. .L22:
  277. SHUFPD_1 %xmm5, %xmm4
  278. mulpd -16 * SIZE(Y), %xmm4
  279. addpd %xmm4, %xmm0
  280. movaps -10 * SIZE(X), %xmm4
  281. SHUFPD_1 %xmm6, %xmm5
  282. mulpd -14 * SIZE(Y), %xmm5
  283. addpd %xmm5, %xmm1
  284. movaps -8 * SIZE(X), %xmm5
  285. SHUFPD_1 %xmm7, %xmm6
  286. mulpd -12 * SIZE(Y), %xmm6
  287. addpd %xmm6, %xmm2
  288. movaps -6 * SIZE(X), %xmm6
  289. SHUFPD_1 %xmm4, %xmm7
  290. mulpd -10 * SIZE(Y), %xmm7
  291. addpd %xmm7, %xmm3
  292. movaps -4 * SIZE(X), %xmm7
  293. SHUFPD_1 %xmm5, %xmm4
  294. mulpd -8 * SIZE(Y), %xmm4
  295. addpd %xmm4, %xmm0
  296. movaps -2 * SIZE(X), %xmm4
  297. SHUFPD_1 %xmm6, %xmm5
  298. mulpd -6 * SIZE(Y), %xmm5
  299. addpd %xmm5, %xmm1
  300. SHUFPD_1 %xmm7, %xmm6
  301. mulpd -4 * SIZE(Y), %xmm6
  302. addpd %xmm6, %xmm2
  303. SHUFPD_1 %xmm4, %xmm7
  304. mulpd -2 * SIZE(Y), %xmm7
  305. addpd %xmm7, %xmm3
  306. subl $-16 * SIZE, X
  307. subl $-16 * SIZE, Y
  308. ALIGN_3
  309. .L24:
  310. testl $15, N
  311. jle .L999
  312. testl $8, N
  313. jle .L25
  314. movaps -16 * SIZE(X), %xmm5
  315. movaps -14 * SIZE(X), %xmm6
  316. movaps -12 * SIZE(X), %xmm7
  317. SHUFPD_1 %xmm5, %xmm4
  318. mulpd -16 * SIZE(Y), %xmm4
  319. addpd %xmm4, %xmm0
  320. movaps -10 * SIZE(X), %xmm4
  321. SHUFPD_1 %xmm6, %xmm5
  322. mulpd -14 * SIZE(Y), %xmm5
  323. addpd %xmm5, %xmm1
  324. SHUFPD_1 %xmm7, %xmm6
  325. mulpd -12 * SIZE(Y), %xmm6
  326. addpd %xmm6, %xmm2
  327. SHUFPD_1 %xmm4, %xmm7
  328. mulpd -10 * SIZE(Y), %xmm7
  329. addpd %xmm7, %xmm3
  330. addl $8 * SIZE, X
  331. addl $8 * SIZE, Y
  332. ALIGN_3
  333. .L25:
  334. testl $4, N
  335. jle .L26
  336. movaps -16 * SIZE(X), %xmm5
  337. movaps -14 * SIZE(X), %xmm6
  338. SHUFPD_1 %xmm5, %xmm4
  339. mulpd -16 * SIZE(Y), %xmm4
  340. addpd %xmm4, %xmm0
  341. SHUFPD_1 %xmm6, %xmm5
  342. mulpd -14 * SIZE(Y), %xmm5
  343. addpd %xmm5, %xmm1
  344. movapd %xmm6, %xmm4
  345. addl $4 * SIZE, X
  346. addl $4 * SIZE, Y
  347. ALIGN_3
  348. .L26:
  349. testl $2, N
  350. jle .L27
  351. movaps -16 * SIZE(X), %xmm5
  352. SHUFPD_1 %xmm5, %xmm4
  353. mulpd -16 * SIZE(Y), %xmm4
  354. addpd %xmm4, %xmm0
  355. movapd %xmm5, %xmm4
  356. addl $2 * SIZE, X
  357. addl $2 * SIZE, Y
  358. ALIGN_3
  359. .L27:
  360. testl $1, N
  361. jle .L999
  362. SHUFPD_1 %xmm4, %xmm4
  363. mulsd -16 * SIZE(Y), %xmm4
  364. addsd %xmm4, %xmm0
  365. jmp .L999
  366. ALIGN_3
  367. #else
  368. movl N, %eax
  369. sarl $4, %eax
  370. jle .L24
  371. movlps -16 * SIZE(X), %xmm4
  372. movhps -15 * SIZE(X), %xmm4
  373. movlps -14 * SIZE(X), %xmm5
  374. movhps -13 * SIZE(X), %xmm5
  375. movlps -12 * SIZE(X), %xmm6
  376. movhps -11 * SIZE(X), %xmm6
  377. movlps -10 * SIZE(X), %xmm7
  378. movhps -9 * SIZE(X), %xmm7
  379. decl %eax
  380. jle .L22
  381. ALIGN_3
  382. .L21:
  383. #ifdef PREFETCH
  384. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
  385. #endif
  386. mulpd -16 * SIZE(Y), %xmm4
  387. addpd %xmm4, %xmm0
  388. movlps -8 * SIZE(X), %xmm4
  389. movhps -7 * SIZE(X), %xmm4
  390. mulpd -14 * SIZE(Y), %xmm5
  391. addpd %xmm5, %xmm1
  392. movlps -6 * SIZE(X), %xmm5
  393. movhps -5 * SIZE(X), %xmm5
  394. #ifdef PREFETCH
  395. PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
  396. #endif
  397. mulpd -12 * SIZE(Y), %xmm6
  398. addpd %xmm6, %xmm2
  399. movlps -4 * SIZE(X), %xmm6
  400. movhps -3 * SIZE(X), %xmm6
  401. mulpd -10 * SIZE(Y), %xmm7
  402. addpd %xmm7, %xmm3
  403. movlps -2 * SIZE(X), %xmm7
  404. movhps -1 * SIZE(X), %xmm7
  405. #if defined(PREFETCH) && !defined(FETCH128)
  406. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
  407. #endif
  408. mulpd -8 * SIZE(Y), %xmm4
  409. addpd %xmm4, %xmm0
  410. movlps 0 * SIZE(X), %xmm4
  411. movhps 1 * SIZE(X), %xmm4
  412. mulpd -6 * SIZE(Y), %xmm5
  413. addpd %xmm5, %xmm1
  414. movlps 2 * SIZE(X), %xmm5
  415. movhps 3 * SIZE(X), %xmm5
  416. #if defined(PREFETCH) && !defined(FETCH128)
  417. PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
  418. #endif
  419. mulpd -4 * SIZE(Y), %xmm6
  420. addpd %xmm6, %xmm2
  421. movlps 4 * SIZE(X), %xmm6
  422. movhps 5 * SIZE(X), %xmm6
  423. mulpd -2 * SIZE(Y), %xmm7
  424. addpd %xmm7, %xmm3
  425. movlps 6 * SIZE(X), %xmm7
  426. movhps 7 * SIZE(X), %xmm7
  427. subl $-16 * SIZE, X
  428. subl $-16 * SIZE, Y
  429. decl %eax
  430. jg .L21
  431. ALIGN_3
  432. .L22:
  433. mulpd -16 * SIZE(Y), %xmm4
  434. addpd %xmm4, %xmm0
  435. movlps -8 * SIZE(X), %xmm4
  436. movhps -7 * SIZE(X), %xmm4
  437. mulpd -14 * SIZE(Y), %xmm5
  438. addpd %xmm5, %xmm1
  439. movlps -6 * SIZE(X), %xmm5
  440. movhps -5 * SIZE(X), %xmm5
  441. mulpd -12 * SIZE(Y), %xmm6
  442. addpd %xmm6, %xmm2
  443. movlps -4 * SIZE(X), %xmm6
  444. movhps -3 * SIZE(X), %xmm6
  445. mulpd -10 * SIZE(Y), %xmm7
  446. addpd %xmm7, %xmm3
  447. movlps -2 * SIZE(X), %xmm7
  448. movhps -1 * SIZE(X), %xmm7
  449. mulpd -8 * SIZE(Y), %xmm4
  450. addpd %xmm4, %xmm0
  451. mulpd -6 * SIZE(Y), %xmm5
  452. addpd %xmm5, %xmm1
  453. mulpd -4 * SIZE(Y), %xmm6
  454. addpd %xmm6, %xmm2
  455. mulpd -2 * SIZE(Y), %xmm7
  456. addpd %xmm7, %xmm3
  457. subl $-16 * SIZE, X
  458. subl $-16 * SIZE, Y
  459. ALIGN_3
  460. .L24:
  461. testl $15, N
  462. jle .L999
  463. testl $8, N
  464. jle .L25
  465. movlps -16 * SIZE(X), %xmm4
  466. movhps -15 * SIZE(X), %xmm4
  467. movlps -14 * SIZE(X), %xmm5
  468. movhps -13 * SIZE(X), %xmm5
  469. movlps -12 * SIZE(X), %xmm6
  470. movhps -11 * SIZE(X), %xmm6
  471. movlps -10 * SIZE(X), %xmm7
  472. movhps -9 * SIZE(X), %xmm7
  473. mulpd -16 * SIZE(Y), %xmm4
  474. addpd %xmm4, %xmm0
  475. mulpd -14 * SIZE(Y), %xmm5
  476. addpd %xmm5, %xmm1
  477. mulpd -12 * SIZE(Y), %xmm6
  478. addpd %xmm6, %xmm2
  479. mulpd -10 * SIZE(Y), %xmm7
  480. addpd %xmm7, %xmm3
  481. addl $8 * SIZE, X
  482. addl $8 * SIZE, Y
  483. ALIGN_3
  484. .L25:
  485. testl $4, N
  486. jle .L26
  487. movlps -16 * SIZE(X), %xmm4
  488. movhps -15 * SIZE(X), %xmm4
  489. movlps -14 * SIZE(X), %xmm5
  490. movhps -13 * SIZE(X), %xmm5
  491. mulpd -16 * SIZE(Y), %xmm4
  492. addpd %xmm4, %xmm0
  493. mulpd -14 * SIZE(Y), %xmm5
  494. addpd %xmm5, %xmm1
  495. addl $4 * SIZE, X
  496. addl $4 * SIZE, Y
  497. ALIGN_3
  498. .L26:
  499. testl $2, N
  500. jle .L27
  501. movlps -16 * SIZE(X), %xmm4
  502. movhps -15 * SIZE(X), %xmm4
  503. mulpd -16 * SIZE(Y), %xmm4
  504. addpd %xmm4, %xmm0
  505. addl $2 * SIZE, X
  506. addl $2 * SIZE, Y
  507. ALIGN_3
  508. .L27:
  509. testl $1, N
  510. jle .L999
  511. movsd -16 * SIZE(X), %xmm4
  512. mulsd -16 * SIZE(Y), %xmm4
  513. addsd %xmm4, %xmm0
  514. jmp .L999
  515. ALIGN_3
  516. #endif
  517. .L50:
  518. movl N, %eax
  519. sarl $2, %eax
  520. jle .L55
  521. ALIGN_3
  522. .L53:
  523. movsd (X), %xmm4
  524. addl INCX, X
  525. mulsd (Y), %xmm4
  526. addl INCY, Y
  527. movsd (X), %xmm5
  528. addl INCX, X
  529. mulsd (Y), %xmm5
  530. addl INCY, Y
  531. movsd (X), %xmm6
  532. addl INCX, X
  533. mulsd (Y), %xmm6
  534. addl INCY, Y
  535. movsd (X), %xmm7
  536. addl INCX, X
  537. mulsd (Y), %xmm7
  538. addl INCY, Y
  539. addsd %xmm4, %xmm0
  540. addsd %xmm5, %xmm1
  541. addsd %xmm6, %xmm2
  542. addsd %xmm7, %xmm3
  543. decl %eax
  544. jg .L53
  545. ALIGN_3
  546. .L55:
  547. movl N, %eax
  548. andl $3, %eax
  549. jle .L999
  550. ALIGN_3
  551. .L56:
  552. movsd (X), %xmm4
  553. addl INCX, X
  554. mulsd (Y), %xmm4
  555. addl INCY, Y
  556. addsd %xmm4, %xmm0
  557. decl %eax
  558. jg .L56
  559. ALIGN_3
  560. .L999:
  561. addpd %xmm1, %xmm0
  562. addpd %xmm3, %xmm2
  563. addpd %xmm2, %xmm0
  564. #ifndef HAVE_SSE3
  565. pshufd $0xe, %xmm0, %xmm1
  566. addsd %xmm1, %xmm0
  567. #else
  568. haddpd %xmm0, %xmm0
  569. #endif
  570. movlps %xmm0, STACK_N
  571. fldl STACK_N
  572. popl %ebx
  573. popl %esi
  574. popl %edi
  575. ret
  576. EPILOGUE