
zscal_sse.S 25 kB
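This is the OpenBLAS x86 (32-bit) SSE kernel that scales a single-precision complex vector in place, x := alpha * x, with a store-only fast path that writes zeros when alpha is exactly 0+0i. For orientation, here is a minimal C sketch of the operation the assembly below implements; the function name and signature are illustrative only and not part of OpenBLAS (positive incx assumed, given in complex elements, which the kernel converts to bytes with ZBASE_SHIFT):

/* Reference sketch only: x := alpha * x for m single-precision
 * complex elements stored as interleaved (re, im) float pairs. */
void cscal_ref(int m, float alpha_r, float alpha_i, float *x, int incx)
{
    for (int i = 0; i < m; i++) {
        float xr = x[0], xi = x[1];
        x[0] = alpha_r * xr - alpha_i * xi;  /* real part      */
        x[1] = alpha_r * xi + alpha_i * xr;  /* imaginary part */
        x += 2 * incx;
    }
}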

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 20 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
#define STACK_INCX 28 + STACK + ARGS(%esp)

#define M %ebx
#define X %ecx
#define INCX %edx
#define I %esi
#define XX %edi
#define FLAG %ebp

#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
#define USE_PSHUFD
#else
#define USE_PSHUFD_HALF
#endif

#include "l1param.h"

	PROLOGUE
	PROFCODE

	pushl %edi
	pushl %esi
	pushl %ebx
	pushl %ebp

	movl STACK_M, M
	movl STACK_X, X
	movl STACK_INCX, INCX
	movss STACK_ALPHA_R, %xmm0
	movss STACK_ALPHA_I, %xmm1

	sall $ZBASE_SHIFT, INCX
	xor FLAG, FLAG

	testl M, M
	jle .L999

	xorps %xmm7, %xmm7
	comiss %xmm0, %xmm7
	jne .L100	# Alpha_r != ZERO
	jp .L100	# Alpha_r NaN

	comiss %xmm1, %xmm7
	jne .L100	# Alpha_i != ZERO
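/* comiss sets ZF on equality and PF on an unordered (NaN) compare, so
   jne catches a nonzero alpha component and jp a NaN alpha_r; anything
   other than an exact 0+0i falls through to the general scaling path
   at .L100. */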
/* Alpha == ZERO */
	cmpl $2 * SIZE, INCX
	jne .L50

/* INCX == 1 */
	cmpl $3, M
	jle .L13
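/* Bump X up to a 16-byte boundary for the aligned movaps stores below:
   if X is only 4-byte aligned, store one zero float, shift the element
   grid by one float, and set FLAG so the dangling last float is written
   at .L19; then, if still not 16-byte aligned, clear one full complex
   element. */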
	testl $4, X
	je .L05
	movss %xmm7, 0 * SIZE(X)
	addl $SIZE, X
	movl $1, FLAG
	decl M
	ALIGN_3

.L05:
	testl $8, X
	je .L06
	movlps %xmm7, 0 * SIZE(X)
	addl $2 * SIZE, X
	subl $1, M
	ALIGN_3

.L06:
	movl M, I	# I = M
	sarl $3, I
	jle .L12
	ALIGN_4
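/* Main zero-fill loop: four aligned 16-byte stores per iteration,
   i.e. eight complex (sixteen float) elements. */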
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movaps %xmm7, 0 * SIZE(X)
	movaps %xmm7, 4 * SIZE(X)
	movaps %xmm7, 8 * SIZE(X)
	movaps %xmm7, 12 * SIZE(X)
	addl $16 * SIZE, X
	decl I
	jg .L11
	ALIGN_4

.L12:
	testl $7, M
	je .L19
	testl $4, M
	je .L13
	movaps %xmm7, 0 * SIZE(X)
	movaps %xmm7, 4 * SIZE(X)
	addl $8 * SIZE, X
	ALIGN_3

.L13:
	testl $2, M
	je .L14
	movlps %xmm7, 0 * SIZE(X)
	movhps %xmm7, 2 * SIZE(X)
	addl $4 * SIZE, X
	ALIGN_3

.L14:
	testl $1, M
	je .L19
	movlps %xmm7, 0 * SIZE(X)
	addl $2 * SIZE, X
	ALIGN_3

.L19:
	testl $1, FLAG
	je .L999
	movss %xmm7, 0 * SIZE(X)
	jmp .L999
	ALIGN_4

/* incx != 1 */
.L50:
	movl M, I	# I = M
	sarl $2, I
	jle .L52
	ALIGN_4

.L51:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	decl I
	jg .L51
	ALIGN_4

.L52:
	testl $2, M
	je .L53
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	movsd %xmm7, 0 * SIZE(X)
	addl INCX, X
	ALIGN_3

.L53:
	testl $1, M
	je .L999
	movsd %xmm7, 0 * SIZE(X)
	jmp .L999
	ALIGN_4

/* Alpha != ZERO */
.L100:
	testl $SIZE, X
	jne .L130
	cmpl $2 * SIZE, INCX
	jne .L120

	movaps %xmm0, %xmm6
	shufps $0, %xmm6, %xmm6
	shufps $0, %xmm1, %xmm1
	subps %xmm1, %xmm7
	unpcklps %xmm1, %xmm7
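/* Constant setup for the interleaved complex multiply:
   xmm6 = { ar, ar, ar, ar }, xmm7 = { -ai, ai, -ai, ai }.
   With x = { xr0, xi0, xr1, xi1 }, PSHUFD $0xb1 swaps each real/imag
   pair, so x*xmm6 + swap(x)*xmm7 yields
   { ar*xr0 - ai*xi0, ar*xi0 + ai*xr0, ... }, i.e. alpha*x for two
   complex elements per register. */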
	subl $-32 * SIZE, X

	testl $2 * SIZE, X
	je .L105

	movsd -32 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	addl $2 * SIZE, X
	decl M
	jle .L999
	ALIGN_3

.L105:
	movl M, I
	sarl $4, I
	jle .L115

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3
	decl I
	jle .L112
	ALIGN_4

.L111:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -12 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movaps -8 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	movaps -4 * SIZE(X), %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(X)
	movaps 4 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(X)
	movaps 8 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(X)
	movaps 12 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	decl I
	jg .L111
	ALIGN_4

.L112:
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -12 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movaps -8 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	movaps -4 * SIZE(X), %xmm3

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(X)

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(X)

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(X)

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(X)

	subl $-32 * SIZE, X
	ALIGN_4

.L115:
	testl $8, M
	je .L116
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(X)

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(X)

	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(X)

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(X)

	addl $16 * SIZE, X
	ALIGN_3

.L116:
	testl $4, M
	je .L117
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(X)

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(X)

	addl $8 * SIZE, X
	ALIGN_3

.L117:
	testl $2, M
	je .L118
	movaps -32 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	addl $4 * SIZE, X
	ALIGN_3

.L118:
	testl $1, M
	je .L999
	movsd -32 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	jmp .L999
	ALIGN_3

.L120:
	PSHUFD2($0, %xmm0, %xmm6)
	PSHUFD2($0, %xmm1, %xmm1)
	subps %xmm1, %xmm7
	unpcklps %xmm1, %xmm7

	movl X, XX
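/* General-stride path: X walks the loads while XX trails for the
   stores.  Two complex elements are gathered per register with
   movsd/movhps, scaled as above, and scattered back with
   movlps/movhps. */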
	movl M, I
	sarl $3, I
	jle .L125

	movsd (X), %xmm0
	addl INCX, X
	movhps (X), %xmm0
	addl INCX, X
	movsd (X), %xmm1
	addl INCX, X
	movhps (X), %xmm1
	addl INCX, X
	movsd (X), %xmm2
	addl INCX, X
	movhps (X), %xmm2
	addl INCX, X
	movsd (X), %xmm3
	addl INCX, X
	movhps (X), %xmm3
	addl INCX, X
	decl I
	jle .L122
	ALIGN_4

.L121:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, (XX)
	addl INCX, XX
	movhps %xmm0, (XX)
	addl INCX, XX
	movsd (X), %xmm0
	addl INCX, X
	movhps (X), %xmm0
	addl INCX, X

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, (XX)
	addl INCX, XX
	movhps %xmm1, (XX)
	addl INCX, XX
	movsd (X), %xmm1
	addl INCX, X
	movhps (X), %xmm1
	addl INCX, X

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, (XX)
	addl INCX, XX
	movhps %xmm2, (XX)
	addl INCX, XX
	movsd (X), %xmm2
	addl INCX, X
	movhps (X), %xmm2
	addl INCX, X

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, (XX)
	addl INCX, XX
	movhps %xmm3, (XX)
	addl INCX, XX
	movsd (X), %xmm3
	addl INCX, X
	movhps (X), %xmm3
	addl INCX, X

	decl I
	jg .L121
	ALIGN_4

.L122:
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, (XX)
	addl INCX, XX
	movhps %xmm0, (XX)
	addl INCX, XX

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, (XX)
	addl INCX, XX
	movhps %xmm1, (XX)
	addl INCX, XX

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, (XX)
	addl INCX, XX
	movhps %xmm2, (XX)
	addl INCX, XX

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, (XX)
	addl INCX, XX
	movhps %xmm3, (XX)
	addl INCX, XX
	ALIGN_4

.L125:
	testl $4, M
	je .L127
	movsd (X), %xmm0
	addl INCX, X
	movhps (X), %xmm0
	addl INCX, X

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, (XX)
	addl INCX, XX
	movhps %xmm0, (XX)
	addl INCX, XX

	movsd (X), %xmm1
	addl INCX, X
	movhps (X), %xmm1
	addl INCX, X

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, (XX)
	addl INCX, XX
	movhps %xmm1, (XX)
	addl INCX, XX
	ALIGN_3

.L127:
	testl $2, M
	je .L128
	movsd (X), %xmm0
	addl INCX, X
	movhps (X), %xmm0
	addl INCX, X

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, (XX)
	addl INCX, XX
	movhps %xmm0, (XX)
	addl INCX, XX
	ALIGN_3

.L128:
	testl $1, M
	je .L999
	movsd (X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, (XX)
	jmp .L999
	ALIGN_3

.L130:
	cmpl $2 * SIZE, INCX
	jne .L120

#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
	PSHUFD2($0, %xmm0, %xmm6)
	PSHUFD2($0, %xmm1, %xmm1)
	subps %xmm1, %xmm7
	unpcklps %xmm1, %xmm7

	subl $-31 * SIZE, X

	testl $2 * SIZE, X
	je .L130x

	movsd -31 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -31 * SIZE(X)
	addl $2 * SIZE, X
	decl M
	jle .L999
	ALIGN_3

.L130x:
	shufps $0xb1, %xmm7, %xmm7
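/* ALIGNED_ACCESS variant for X that is 4-byte but not 8-byte aligned:
   the 16-byte movaps blocks then straddle element boundaries, so the
   real/imag lanes sit one float off.  xmm7 is lane-swapped to match,
   the shuffle immediate becomes $0x1b, and movss stitches the boundary
   float (carried in xmm2/xmm4) across adjacent blocks. */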
	movaps -32 * SIZE(X), %xmm0
	movaps %xmm0, %xmm4

	movl M, I
	sarl $4, I
	jle .L135

	movaps -28 * SIZE(X), %xmm1
	decl I
	jle .L132
	ALIGN_4

.L131:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -20 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -24 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -20 * SIZE(X)
	movaps -12 * SIZE(X), %xmm1

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -16 * SIZE(X)
	movaps -8 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -12 * SIZE(X)
	movaps -4 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -8 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -4 * SIZE(X)
	movaps 4 * SIZE(X), %xmm1

	subl $-32 * SIZE, X
	decl I
	jg .L131
	ALIGN_4

.L132:
	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -20 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -24 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -20 * SIZE(X)
	movaps -12 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -16 * SIZE(X)
	movaps -8 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -12 * SIZE(X)
	movaps -4 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -8 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -4 * SIZE(X)

	subl $-32 * SIZE, X
	ALIGN_4

.L135:
	testl $8, M
	je .L136
	movaps -28 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -20 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -24 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -20 * SIZE(X)

	addl $16 * SIZE, X
	ALIGN_3

.L136:
	testl $4, M
	je .L137
	movaps -28 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm0

	movss %xmm0, %xmm1
	PSHUFD2($0x1b, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movaps %xmm1, %xmm4
	movss %xmm2, %xmm1
	movaps %xmm1, -28 * SIZE(X)

	addl $8 * SIZE, X
	ALIGN_3

.L137:
	testl $2, M
	je .L138
	movaps -28 * SIZE(X), %xmm1

	movss %xmm1, %xmm0
	PSHUFD2($0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps %xmm0, %xmm2
	movss %xmm4, %xmm0
	movaps %xmm0, -32 * SIZE(X)

	movaps %xmm2, %xmm4
	movaps %xmm1, %xmm0
	addl $4 * SIZE, X
	ALIGN_3

.L138:
	movss %xmm4, -32 * SIZE(X)

	testl $1, M
	je .L999

	PSHUFD2( $0x1b, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	PSHUFD1( $0x39, %xmm0)
	movlps %xmm0, -31 * SIZE(X)
	jmp .L999
	ALIGN_3

#else
	PSHUFD2($0, %xmm0, %xmm6)
	PSHUFD2($0, %xmm1, %xmm1)
	subps %xmm1, %xmm7
	unpcklps %xmm1, %xmm7
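/* Without ALIGNED_ACCESS the odd-aligned case simply uses unaligned
   movsd/movhps halves in place of movaps; the arithmetic is identical
   to the aligned unit-stride loop. */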
	subl $-32 * SIZE, X

	testl $2 * SIZE, X
	je .L130x

#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd -32 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	addl $2 * SIZE, X
	decl M
	jle .L999
	ALIGN_3

.L130x:
	movl M, I
	sarl $4, I
	jle .L135

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3
	decl I
	jle .L132
	ALIGN_4

.L131:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	movsd -16 * SIZE(X), %xmm0
	movhps -14 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	movsd -12 * SIZE(X), %xmm1
	movhps -10 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)
	movsd -8 * SIZE(X), %xmm2
	movhps -6 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)
	movsd -4 * SIZE(X), %xmm3
	movhps -2 * SIZE(X), %xmm3

#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -16 * SIZE(X)
	movhps %xmm0, -14 * SIZE(X)
	movsd 0 * SIZE(X), %xmm0
	movhps 2 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -12 * SIZE(X)
	movhps %xmm1, -10 * SIZE(X)
	movsd 4 * SIZE(X), %xmm1
	movhps 6 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, -8 * SIZE(X)
	movhps %xmm2, -6 * SIZE(X)
	movsd 8 * SIZE(X), %xmm2
	movhps 10 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, -4 * SIZE(X)
	movhps %xmm3, -2 * SIZE(X)
	movsd 12 * SIZE(X), %xmm3
	movhps 14 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	decl I
	jg .L131
	ALIGN_4

.L132:
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	movsd -16 * SIZE(X), %xmm0
	movhps -14 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	movsd -12 * SIZE(X), %xmm1
	movhps -10 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)
	movsd -8 * SIZE(X), %xmm2
	movhps -6 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)
	movsd -4 * SIZE(X), %xmm3
	movhps -2 * SIZE(X), %xmm3

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -16 * SIZE(X)
	movhps %xmm0, -14 * SIZE(X)

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -12 * SIZE(X)
	movhps %xmm1, -10 * SIZE(X)

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, -8 * SIZE(X)
	movhps %xmm2, -6 * SIZE(X)

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, -4 * SIZE(X)
	movhps %xmm3, -2 * SIZE(X)

	subl $-32 * SIZE, X
	ALIGN_4

.L135:
	testl $8, M
	je .L136
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)

	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)

	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2

	PSHUFD2( $0xb1, %xmm2, %xmm5)
	mulps %xmm6, %xmm2
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)

	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3

	PSHUFD2( $0xb1, %xmm3, %xmm5)
	mulps %xmm6, %xmm3
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)

	addl $16 * SIZE, X
	ALIGN_3

.L136:
	testl $4, M
	je .L137
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1

	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)

	PSHUFD2( $0xb1, %xmm1, %xmm5)
	mulps %xmm6, %xmm1
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)

	addl $8 * SIZE, X
	ALIGN_3

.L137:
	testl $2, M
	je .L138
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	addl $4 * SIZE, X
	ALIGN_3

.L138:
	testl $1, M
	je .L999
	movsd -32 * SIZE(X), %xmm0
	PSHUFD2( $0xb1, %xmm0, %xmm5)
	mulps %xmm6, %xmm0
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	ALIGN_3
#endif

.L999:
	xorl %eax, %eax
	popl %ebp
	popl %ebx
	popl %esi
	popl %edi
	ret
	EPILOGUE