
dot_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
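
/* dot_sse.S implements the single-precision dot product (SDOT) kernel
   for 32-bit x86 with SSE. A rough C equivalent, assuming standard BLAS
   call semantics (names here are illustrative only):

       float sdot(int n, const float *x, int incx,
                  const float *y, int incy) {
           float s = 0.0f;
           for (int i = 0; i < n; i++)
               s += x[i * incx] * y[i * incy];
           return s;
       }

   The unit-stride path below keeps four SSE accumulators and picks a
   load strategy based on the relative 16-byte alignment of X and Y;
   strided inputs fall back to a scalar loop at .L50. */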
#define ASSEMBLER
#include "common.h"
#define STACK 12
#define ARGS 0
#define STACK_N 4 + STACK + ARGS(%esp)
#define STACK_X 8 + STACK + ARGS(%esp)
#define STACK_INCX 12 + STACK + ARGS(%esp)
#define STACK_Y 16 + STACK + ARGS(%esp)
#define STACK_INCY 20 + STACK + ARGS(%esp)
#define N %ecx
#define X %esi
#define INCX %ebx
#define Y %edi
#define INCY %edx
#include "l1param.h"
PROLOGUE
PROFCODE
pushl %edi
pushl %esi
pushl %ebx
movl STACK_N, N
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_Y, Y
movl STACK_INCY, INCY
#ifdef F_INTERFACE
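/* Fortran passes scalar arguments by reference, so load the values. */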
movl (N), N # N
movl (INCX), INCX # INCX
movl (INCY), INCY # INCY
#endif
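/* Scale the increments from elements to bytes (SIZE = sizeof(float)). */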
leal (, INCX, SIZE), INCX
leal (, INCY, SIZE), INCY
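/* Four independent partial-sum accumulators hide addps latency. */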
xorps %xmm0, %xmm0
xorps %xmm1, %xmm1
xorps %xmm2, %xmm2
xorps %xmm3, %xmm3
cmpl $0, N
jle .L999
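/* Only unit-stride vectors take the SIMD path; otherwise jump to .L50. */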
cmpl $SIZE, INCX
jne .L50
cmpl $SIZE, INCY
jne .L50
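/* Bias both pointers by 32 elements so the unrolled loop can reach all
   of its operands with small negative displacements. */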
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
cmpl $3, N
jle .L17
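/* For longer vectors, peel up to three leading elements so Y becomes
   16-byte aligned. */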
testl $SIZE, Y
je .L05
movss -32 * SIZE(X), %xmm0
mulss -32 * SIZE(Y), %xmm0
addl $1 * SIZE, X
addl $1 * SIZE, Y
decl N
ALIGN_2
.L05:
testl $2 * SIZE, Y
je .L10
#ifdef movsd
xorps %xmm4, %xmm4
#endif
movsd -32 * SIZE(X), %xmm4
#ifdef movsd
xorps %xmm1, %xmm1
#endif
movsd -32 * SIZE(Y), %xmm1
mulps %xmm4, %xmm1
addl $2 * SIZE, X
addl $2 * SIZE, Y
subl $2, N
jle .L999
ALIGN_2
.L10:
#ifdef ALIGNED_ACCESS
testl $2 * SIZE, X
jne .L30
testl $SIZE, X
jne .L20
#else
testl $3 * SIZE, X
jne .L20
#endif
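/* X and Y share 16-byte alignment: the main loop uses aligned loads and
   processes 32 floats per iteration. */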
movl N, %eax
sarl $5, %eax
jle .L14
movaps -32 * SIZE(X), %xmm4
movaps -28 * SIZE(X), %xmm5
movaps -24 * SIZE(X), %xmm6
movaps -20 * SIZE(X), %xmm7
decl %eax
jle .L12
ALIGN_3
.L11:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -16 * SIZE(X), %xmm4
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -12 * SIZE(X), %xmm5
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -8 * SIZE(X), %xmm6
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -4 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps 0 * SIZE(X), %xmm4
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps 4 * SIZE(X), %xmm5
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps 8 * SIZE(X), %xmm6
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps 12 * SIZE(X), %xmm7
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
decl %eax
jg .L11
ALIGN_3
.L12:
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -16 * SIZE(X), %xmm4
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -12 * SIZE(X), %xmm5
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -8 * SIZE(X), %xmm6
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -4 * SIZE(X), %xmm7
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
ALIGN_3
.L14:
testl $31, N
jle .L999
testl $16, N
jle .L15
movaps -32 * SIZE(X), %xmm4
movaps -28 * SIZE(X), %xmm5
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -24 * SIZE(X), %xmm6
movaps -20 * SIZE(X), %xmm7
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
.L15:
testl $8, N
jle .L16
movaps -32 * SIZE(X), %xmm4
movaps -28 * SIZE(X), %xmm5
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
addl $8 * SIZE, X
addl $8 * SIZE, Y
ALIGN_3
.L16:
testl $4, N
jle .L17
movaps -32 * SIZE(X), %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm2
addl $4 * SIZE, X
addl $4 * SIZE, Y
ALIGN_3
.L17:
testl $2, N
jle .L18
#ifdef movsd
xorps %xmm4, %xmm4
#endif
movsd -32 * SIZE(X), %xmm4
#ifdef movsd
xorps %xmm6, %xmm6
#endif
movsd -32 * SIZE(Y), %xmm6
mulps %xmm6, %xmm4
addps %xmm4, %xmm3
addl $2 * SIZE, X
addl $2 * SIZE, Y
ALIGN_3
.L18:
testl $1, N
jle .L999
movss -32 * SIZE(X), %xmm4
mulss -32 * SIZE(Y), %xmm4
addss %xmm4, %xmm0
jmp .L999
ALIGN_3
#ifdef ALIGNED_ACCESS
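/* X is one float past a 16-byte boundary: load X aligned, then splice
   consecutive vectors with movss and rotate with PSHUFD1($0x39, ...). */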
.L20:
movaps -33 * SIZE(X), %xmm4
addl $3 * SIZE, X
movl N, %eax
sarl $5, %eax
jle .L24
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
decl %eax
jle .L22
ALIGN_3
.L21:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
movss %xmm7, %xmm6
PSHUFD1($0x39, %xmm6)
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
movss %xmm4, %xmm7
PSHUFD1($0x39, %xmm7)
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps 0 * SIZE(X), %xmm5
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
movss %xmm7, %xmm6
PSHUFD1($0x39, %xmm6)
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps 4 * SIZE(X), %xmm6
movss %xmm4, %xmm7
PSHUFD1($0x39, %xmm7)
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps 8 * SIZE(X), %xmm7
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
decl %eax
jg .L21
ALIGN_3
.L22:
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
movss %xmm7, %xmm6
PSHUFD1($0x39, %xmm6)
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
movss %xmm4, %xmm7
PSHUFD1($0x39, %xmm7)
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movss %xmm7, %xmm6
PSHUFD1($0x39, %xmm6)
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movss %xmm4, %xmm7
PSHUFD1($0x39, %xmm7)
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
ALIGN_3
.L24:
testl $31, N
jle .L999
testl $16, N
jle .L25
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movss %xmm7, %xmm6
PSHUFD1($0x39, %xmm6)
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movss %xmm4, %xmm7
PSHUFD1($0x39, %xmm7)
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
.L25:
testl $8, N
jle .L26
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movss %xmm6, %xmm5
PSHUFD1($0x39, %xmm5)
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps %xmm6, %xmm4
addl $8 * SIZE, X
addl $8 * SIZE, Y
ALIGN_3
.L26:
testl $4, N
jle .L27
movaps -32 * SIZE(X), %xmm5
movss %xmm5, %xmm4
PSHUFD1($0x39, %xmm4)
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm2
movaps %xmm5, %xmm4
addl $4 * SIZE, X
addl $4 * SIZE, Y
ALIGN_3
.L27:
testl $2, N
jle .L28
#ifdef movsd
xorps %xmm6, %xmm6
#endif
movsd -32 * SIZE(Y), %xmm6
PSHUFD2($0x39, %xmm4, %xmm5)
mulps %xmm6, %xmm5
addps %xmm5, %xmm3
movhlps %xmm4, %xmm4
addl $2 * SIZE, X
addl $2 * SIZE, Y
ALIGN_3
.L28:
testl $1, N
jle .L999
PSHUFD1($0x39, %xmm4)
mulss -32 * SIZE(Y), %xmm4
addss %xmm4, %xmm0
jmp .L999
ALIGN_3
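/* X is two floats past a 16-byte boundary: halves of consecutive
   aligned vectors are stitched together with SHUFPD_1. */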
.L30:
testl $SIZE, X
jne .L40
movhps -32 * SIZE(X), %xmm4
addl $2 * SIZE, X
movl N, %eax
sarl $5, %eax
jle .L34
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
decl %eax
jle .L32
ALIGN_3
.L31:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
SHUFPD_1 %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
SHUFPD_1 %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
SHUFPD_1 %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
SHUFPD_1 %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
SHUFPD_1 %xmm5, %xmm4
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
SHUFPD_1 %xmm6, %xmm5
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps 0 * SIZE(X), %xmm5
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
SHUFPD_1 %xmm7, %xmm6
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps 4 * SIZE(X), %xmm6
SHUFPD_1 %xmm4, %xmm7
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps 8 * SIZE(X), %xmm7
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
decl %eax
jg .L31
ALIGN_3
.L32:
SHUFPD_1 %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
SHUFPD_1 %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
SHUFPD_1 %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
SHUFPD_1 %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
SHUFPD_1 %xmm5, %xmm4
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
SHUFPD_1 %xmm6, %xmm5
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
SHUFPD_1 %xmm7, %xmm6
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
SHUFPD_1 %xmm4, %xmm7
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
ALIGN_3
.L34:
testl $31, N
jle .L999
testl $16, N
jle .L35
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
SHUFPD_1 %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
SHUFPD_1 %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
SHUFPD_1 %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
SHUFPD_1 %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
.L35:
testl $8, N
jle .L36
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
SHUFPD_1 %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
SHUFPD_1 %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps %xmm6, %xmm4
addl $8 * SIZE, X
addl $8 * SIZE, Y
ALIGN_3
.L36:
testl $4, N
jle .L37
movaps -32 * SIZE(X), %xmm5
SHUFPD_1 %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps %xmm5, %xmm4
addl $4 * SIZE, X
addl $4 * SIZE, Y
ALIGN_3
.L37:
testl $2, N
jle .L38
xorps %xmm5, %xmm5
movhlps %xmm4, %xmm5
mulps -32 * SIZE(Y), %xmm5
addps %xmm5, %xmm0
addl $2 * SIZE, X
addl $2 * SIZE, Y
ALIGN_3
.L38:
testl $1, N
jle .L999
movss -34 * SIZE(X), %xmm4
mulss -32 * SIZE(Y), %xmm4
addss %xmm4, %xmm0
jmp .L999
ALIGN_3
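/* X is three floats past a 16-byte boundary: splice with movss and
   rotate with shufps $0x93. */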
.L40:
movaps -35 * SIZE(X), %xmm4
addl $SIZE, X
movl N, %eax
sarl $5, %eax
jle .L44
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
decl %eax
jle .L42
ALIGN_3
.L41:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
movss %xmm7, %xmm6
shufps $0x93, %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
movss %xmm4, %xmm7
shufps $0x93, %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps 0 * SIZE(X), %xmm5
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
movss %xmm7, %xmm6
shufps $0x93, %xmm7, %xmm6
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps 4 * SIZE(X), %xmm6
movss %xmm4, %xmm7
shufps $0x93, %xmm4, %xmm7
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps 8 * SIZE(X), %xmm7
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
decl %eax
jg .L41
ALIGN_3
.L42:
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps -16 * SIZE(X), %xmm5
movss %xmm7, %xmm6
shufps $0x93, %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movaps -12 * SIZE(X), %xmm6
movss %xmm4, %xmm7
shufps $0x93, %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movaps -8 * SIZE(X), %xmm7
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -4 * SIZE(X), %xmm4
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movss %xmm7, %xmm6
shufps $0x93, %xmm7, %xmm6
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movss %xmm4, %xmm7
shufps $0x93, %xmm4, %xmm7
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
ALIGN_3
.L44:
testl $31, N
jle .L999
testl $16, N
jle .L45
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movaps -24 * SIZE(X), %xmm7
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movaps -20 * SIZE(X), %xmm4
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movss %xmm7, %xmm6
shufps $0x93, %xmm7, %xmm6
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movss %xmm4, %xmm7
shufps $0x93, %xmm4, %xmm7
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
.L45:
testl $8, N
jle .L46
movaps -32 * SIZE(X), %xmm5
movaps -28 * SIZE(X), %xmm6
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movss %xmm6, %xmm5
shufps $0x93, %xmm6, %xmm5
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movaps %xmm6, %xmm4
addl $8 * SIZE, X
addl $8 * SIZE, Y
ALIGN_3
.L46:
testl $4, N
jle .L47
movaps -32 * SIZE(X), %xmm5
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm2
movaps %xmm5, %xmm4
addl $4 * SIZE, X
addl $4 * SIZE, Y
ALIGN_3
.L47:
testl $2, N
jle .L48
movaps -32 * SIZE(X), %xmm5
#ifdef movsd
xorps %xmm7, %xmm7
#endif
movsd -32 * SIZE(Y), %xmm7
movss %xmm5, %xmm4
shufps $0x93, %xmm5, %xmm4
mulps %xmm7, %xmm4
addps %xmm4, %xmm3
movlhps %xmm5, %xmm4
addl $2 * SIZE, X
addl $2 * SIZE, Y
ALIGN_3
.L48:
testl $1, N
jle .L999
PSHUFD1($0x93, %xmm4)
mulss -32 * SIZE(Y), %xmm4
addss %xmm4, %xmm0
jmp .L999
ALIGN_4
#else
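/* Without ALIGNED_ACCESS, X is loaded unaligned via movlps/movhps pairs
   instead of being realigned by shuffles. */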
.L20:
movl N, %eax
sarl $5, %eax
jle .L24
movlps -32 * SIZE(X), %xmm4
movhps -30 * SIZE(X), %xmm4
movlps -28 * SIZE(X), %xmm5
movhps -26 * SIZE(X), %xmm5
movlps -24 * SIZE(X), %xmm6
movhps -22 * SIZE(X), %xmm6
movlps -20 * SIZE(X), %xmm7
movhps -18 * SIZE(X), %xmm7
decl %eax
jle .L22
ALIGN_3
.L21:
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movlps -16 * SIZE(X), %xmm4
movhps -14 * SIZE(X), %xmm4
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movlps -12 * SIZE(X), %xmm5
movhps -10 * SIZE(X), %xmm5
#ifdef PREFETCH
PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movlps -8 * SIZE(X), %xmm6
movhps -6 * SIZE(X), %xmm6
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movlps -4 * SIZE(X), %xmm7
movhps -2 * SIZE(X), %xmm7
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movlps 0 * SIZE(X), %xmm4
movhps 2 * SIZE(X), %xmm4
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movlps 4 * SIZE(X), %xmm5
movhps 6 * SIZE(X), %xmm5
#if defined(PREFETCH) && !defined(FETCH128)
PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movlps 8 * SIZE(X), %xmm6
movhps 10 * SIZE(X), %xmm6
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movlps 12 * SIZE(X), %xmm7
movhps 14 * SIZE(X), %xmm7
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
decl %eax
jg .L21
ALIGN_3
.L22:
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
movlps -16 * SIZE(X), %xmm4
movhps -14 * SIZE(X), %xmm4
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
movlps -12 * SIZE(X), %xmm5
movhps -10 * SIZE(X), %xmm5
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
movlps -8 * SIZE(X), %xmm6
movhps -6 * SIZE(X), %xmm6
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
movlps -4 * SIZE(X), %xmm7
movhps -2 * SIZE(X), %xmm7
mulps -16 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -12 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
mulps -8 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
mulps -4 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
subl $-32 * SIZE, X
subl $-32 * SIZE, Y
ALIGN_3
.L24:
testl $31, N
jle .L999
testl $16, N
jle .L25
movlps -32 * SIZE(X), %xmm4
movhps -30 * SIZE(X), %xmm4
movlps -28 * SIZE(X), %xmm5
movhps -26 * SIZE(X), %xmm5
movlps -24 * SIZE(X), %xmm6
movhps -22 * SIZE(X), %xmm6
movlps -20 * SIZE(X), %xmm7
movhps -18 * SIZE(X), %xmm7
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
mulps -24 * SIZE(Y), %xmm6
addps %xmm6, %xmm2
mulps -20 * SIZE(Y), %xmm7
addps %xmm7, %xmm3
addl $16 * SIZE, X
addl $16 * SIZE, Y
ALIGN_3
.L25:
testl $8, N
jle .L26
movlps -32 * SIZE(X), %xmm4
movhps -30 * SIZE(X), %xmm4
movlps -28 * SIZE(X), %xmm5
movhps -26 * SIZE(X), %xmm5
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm0
mulps -28 * SIZE(Y), %xmm5
addps %xmm5, %xmm1
addl $8 * SIZE, X
addl $8 * SIZE, Y
ALIGN_3
.L26:
testl $4, N
jle .L27
movlps -32 * SIZE(X), %xmm4
movhps -30 * SIZE(X), %xmm4
mulps -32 * SIZE(Y), %xmm4
addps %xmm4, %xmm2
addl $4 * SIZE, X
addl $4 * SIZE, Y
ALIGN_3
.L27:
testl $2, N
jle .L28
#ifdef movsd
xorps %xmm4, %xmm4
#endif
movsd -32 * SIZE(X), %xmm4
#ifdef movsd
xorps %xmm6, %xmm6
#endif
movsd -32 * SIZE(Y), %xmm6
mulps %xmm6, %xmm4
addps %xmm4, %xmm3
addl $2 * SIZE, X
addl $2 * SIZE, Y
ALIGN_3
.L28:
testl $1, N
jle .L999
movss -32 * SIZE(X), %xmm4
mulss -32 * SIZE(Y), %xmm4
addss %xmm4, %xmm0
jmp .L999
ALIGN_3
#endif
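/* General-stride path: scalar multiplies, four per iteration, then the
   remainder (N mod 4) one element at a time. */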
.L50:
movl N, %eax
sarl $2, %eax
jle .L55
ALIGN_3
.L53:
movss 0 * SIZE(X), %xmm4
addl INCX, X
mulss 0 * SIZE(Y), %xmm4
addl INCY, Y
movss 0 * SIZE(X), %xmm5
addl INCX, X
mulss 0 * SIZE(Y), %xmm5
addl INCY, Y
movss 0 * SIZE(X), %xmm6
addl INCX, X
mulss 0 * SIZE(Y), %xmm6
addl INCY, Y
movss 0 * SIZE(X), %xmm7
addl INCX, X
mulss 0 * SIZE(Y), %xmm7
addl INCY, Y
addss %xmm4, %xmm0
addss %xmm5, %xmm1
addss %xmm6, %xmm2
addss %xmm7, %xmm3
decl %eax
jg .L53
ALIGN_3
.L55:
movl N, %eax
andl $3, %eax
jle .L999
ALIGN_3
.L56:
movss 0 * SIZE(X), %xmm4
addl INCX, X
mulss 0 * SIZE(Y), %xmm4
addl INCY, Y
addss %xmm4, %xmm0
decl %eax
jg .L56
ALIGN_3
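/* Reduce the four accumulators to a single scalar and return it in
   st(0) via the stack slot, as the 32-bit ABI expects for a float
   result. */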
.L999:
addps %xmm1, %xmm0
addps %xmm3, %xmm2
addps %xmm2, %xmm0
#if defined(HAVE_SSE3) && !defined(__INTERIX)
haddps %xmm0, %xmm0
haddps %xmm0, %xmm0
#elif defined(HAVE_SSE2)
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
PSHUFD2($1, %xmm0, %xmm1)
addss %xmm1, %xmm0
#else
movhlps %xmm0, %xmm1
addps %xmm1, %xmm0
movaps %xmm0, %xmm1
shufps $1, %xmm0, %xmm0
addss %xmm1, %xmm0
#endif
movss %xmm0, STACK_N
flds STACK_N
popl %ebx
popl %esi
popl %edi
ret
EPILOGUE