You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

zscal_sse.S 25 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define STACK 16
  41. #define ARGS 0
  42. #define STACK_M 4 + STACK + ARGS(%esp)
  43. #define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
  44. #define STACK_ALPHA_I 20 + STACK + ARGS(%esp)
  45. #define STACK_X 24 + STACK + ARGS(%esp)
  46. #define STACK_INCX 28 + STACK + ARGS(%esp)
  47. #define M %ebx
  48. #define X %ecx
  49. #define INCX %edx
  50. #define I %esi
  51. #define XX %edi
  52. #define FLAG %ebp
  53. #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE)
  54. #define USE_PSHUFD
  55. #else
  56. #define USE_PSHUFD_HALF
  57. #endif
  58. #include "l1param.h"
  59. PROLOGUE
  60. PROFCODE
  61. pushl %edi
  62. pushl %esi
  63. pushl %ebx
  64. pushl %ebp
  65. movl STACK_M, M
  66. movl STACK_X, X
  67. movl STACK_INCX, INCX
  68. movss STACK_ALPHA_R, %xmm0
  69. movss STACK_ALPHA_I, %xmm1
  70. sall $ZBASE_SHIFT, INCX
  71. xor FLAG, FLAG
  72. testl M, M
  73. jle .L999
  74. xorps %xmm7, %xmm7
  75. comiss %xmm0, %xmm7
  76. jne .L100 # Alpha_r != ZERO
  77. comiss %xmm1, %xmm7
  78. jne .L100 # Alpha_i != ZERO
  79. /* Alpha == ZERO */
  80. cmpl $2 * SIZE, INCX
  81. jne .L50
  82. /* INCX == 1 */
  83. cmpl $3, M
  84. jle .L13
  85. testl $4, X
  86. je .L05
  87. movss %xmm7, 0 * SIZE(X)
  88. addl $SIZE, X
  89. movl $1, FLAG
  90. decl M
  91. ALIGN_3
  92. .L05:
  93. testl $8, X
  94. je .L06
  95. movlps %xmm7, 0 * SIZE(X)
  96. addl $2 * SIZE, X
  97. subl $1, M
  98. ALIGN_3
  99. .L06:
  100. movl M, I # rcx = n
  101. sarl $3, I
  102. jle .L12
  103. ALIGN_4
  104. .L11:
  105. #ifdef PREFETCHW
  106. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  107. #endif
  108. movaps %xmm7, 0 * SIZE(X)
  109. movaps %xmm7, 4 * SIZE(X)
  110. movaps %xmm7, 8 * SIZE(X)
  111. movaps %xmm7, 12 * SIZE(X)
  112. addl $16 * SIZE, X
  113. decl I
  114. jg .L11
  115. ALIGN_4
  116. .L12:
  117. testl $7, M
  118. je .L19
  119. testl $4, M
  120. je .L13
  121. movaps %xmm7, 0 * SIZE(X)
  122. movaps %xmm7, 4 * SIZE(X)
  123. addl $8 * SIZE, X
  124. ALIGN_3
  125. .L13:
  126. testl $2, M
  127. je .L14
  128. movlps %xmm7, 0 * SIZE(X)
  129. movhps %xmm7, 2 * SIZE(X)
  130. addl $4 * SIZE, X
  131. ALIGN_3
  132. .L14:
  133. testl $1, M
  134. je .L19
  135. movlps %xmm7, 0 * SIZE(X)
  136. addl $2 * SIZE, X
  137. ALIGN_3
  138. .L19:
  139. testl $1, FLAG
  140. je .L999
  141. movss %xmm7, 0 * SIZE(X)
  142. jmp .L999
  143. ALIGN_4
  144. /* incx != 1 */
  145. .L50:
  146. movl M, I # rcx = n
  147. sarl $2, I
  148. jle .L52
  149. ALIGN_4
  150. .L51:
  151. #ifdef PREFETCHW
  152. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  153. #endif
  154. movsd %xmm7, 0 * SIZE(X)
  155. addl INCX, X
  156. movsd %xmm7, 0 * SIZE(X)
  157. addl INCX, X
  158. movsd %xmm7, 0 * SIZE(X)
  159. addl INCX, X
  160. movsd %xmm7, 0 * SIZE(X)
  161. addl INCX, X
  162. decl I
  163. jg .L51
  164. ALIGN_4
  165. .L52:
  166. testl $2, M
  167. je .L53
  168. movsd %xmm7, 0 * SIZE(X)
  169. addl INCX, X
  170. movsd %xmm7, 0 * SIZE(X)
  171. addl INCX, X
  172. ALIGN_3
  173. .L53:
  174. testl $1, M
  175. je .L999
  176. movsd %xmm7, 0 * SIZE(X)
  177. jmp .L999
  178. ALIGN_4
  179. /* Alpha != ZERO */
  180. .L100:
  181. testl $SIZE, X
  182. jne .L130
  183. cmpl $2 * SIZE, INCX
  184. jne .L120
  185. movaps %xmm0, %xmm6
  186. shufps $0, %xmm6, %xmm6
  187. shufps $0, %xmm1, %xmm1
  188. subps %xmm1, %xmm7
  189. unpcklps %xmm1, %xmm7
  190. subl $-32 * SIZE, X
  191. testl $2 * SIZE, X
  192. je .L105
  193. movsd -32 * SIZE(X), %xmm0
  194. PSHUFD2( $0xb1, %xmm0, %xmm5)
  195. mulps %xmm6, %xmm0
  196. mulps %xmm7, %xmm5
  197. addps %xmm5, %xmm0
  198. movlps %xmm0, -32 * SIZE(X)
  199. addl $2 * SIZE, X
  200. decl M
  201. jle .L999
  202. ALIGN_3
  203. .L105:
  204. movl M, I
  205. sarl $4, I
  206. jle .L115
  207. movaps -32 * SIZE(X), %xmm0
  208. movaps -28 * SIZE(X), %xmm1
  209. movaps -24 * SIZE(X), %xmm2
  210. movaps -20 * SIZE(X), %xmm3
  211. decl I
  212. jle .L112
  213. ALIGN_4
  214. .L111:
  215. #ifdef PREFETCHW
  216. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  217. #endif
  218. PSHUFD2( $0xb1, %xmm0, %xmm5)
  219. mulps %xmm6, %xmm0
  220. mulps %xmm7, %xmm5
  221. addps %xmm5, %xmm0
  222. movaps %xmm0, -32 * SIZE(X)
  223. movaps -16 * SIZE(X), %xmm0
  224. PSHUFD2( $0xb1, %xmm1, %xmm5)
  225. mulps %xmm6, %xmm1
  226. mulps %xmm7, %xmm5
  227. addps %xmm5, %xmm1
  228. movaps %xmm1, -28 * SIZE(X)
  229. movaps -12 * SIZE(X), %xmm1
  230. PSHUFD2( $0xb1, %xmm2, %xmm5)
  231. mulps %xmm6, %xmm2
  232. mulps %xmm7, %xmm5
  233. addps %xmm5, %xmm2
  234. movaps %xmm2, -24 * SIZE(X)
  235. movaps -8 * SIZE(X), %xmm2
  236. PSHUFD2( $0xb1, %xmm3, %xmm5)
  237. mulps %xmm6, %xmm3
  238. mulps %xmm7, %xmm5
  239. addps %xmm5, %xmm3
  240. movaps %xmm3, -20 * SIZE(X)
  241. movaps -4 * SIZE(X), %xmm3
  242. #ifdef PREFETCHW
  243. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  244. #endif
  245. PSHUFD2( $0xb1, %xmm0, %xmm5)
  246. mulps %xmm6, %xmm0
  247. mulps %xmm7, %xmm5
  248. addps %xmm5, %xmm0
  249. movaps %xmm0, -16 * SIZE(X)
  250. movaps 0 * SIZE(X), %xmm0
  251. PSHUFD2( $0xb1, %xmm1, %xmm5)
  252. mulps %xmm6, %xmm1
  253. mulps %xmm7, %xmm5
  254. addps %xmm5, %xmm1
  255. movaps %xmm1, -12 * SIZE(X)
  256. movaps 4 * SIZE(X), %xmm1
  257. PSHUFD2( $0xb1, %xmm2, %xmm5)
  258. mulps %xmm6, %xmm2
  259. mulps %xmm7, %xmm5
  260. addps %xmm5, %xmm2
  261. movaps %xmm2, -8 * SIZE(X)
  262. movaps 8 * SIZE(X), %xmm2
  263. PSHUFD2( $0xb1, %xmm3, %xmm5)
  264. mulps %xmm6, %xmm3
  265. mulps %xmm7, %xmm5
  266. addps %xmm5, %xmm3
  267. movaps %xmm3, -4 * SIZE(X)
  268. movaps 12 * SIZE(X), %xmm3
  269. subl $-32 * SIZE, X
  270. decl I
  271. jg .L111
  272. ALIGN_4
  273. .L112:
  274. PSHUFD2( $0xb1, %xmm0, %xmm5)
  275. mulps %xmm6, %xmm0
  276. mulps %xmm7, %xmm5
  277. addps %xmm5, %xmm0
  278. movaps %xmm0, -32 * SIZE(X)
  279. movaps -16 * SIZE(X), %xmm0
  280. PSHUFD2( $0xb1, %xmm1, %xmm5)
  281. mulps %xmm6, %xmm1
  282. mulps %xmm7, %xmm5
  283. addps %xmm5, %xmm1
  284. movaps %xmm1, -28 * SIZE(X)
  285. movaps -12 * SIZE(X), %xmm1
  286. PSHUFD2( $0xb1, %xmm2, %xmm5)
  287. mulps %xmm6, %xmm2
  288. mulps %xmm7, %xmm5
  289. addps %xmm5, %xmm2
  290. movaps %xmm2, -24 * SIZE(X)
  291. movaps -8 * SIZE(X), %xmm2
  292. PSHUFD2( $0xb1, %xmm3, %xmm5)
  293. mulps %xmm6, %xmm3
  294. mulps %xmm7, %xmm5
  295. addps %xmm5, %xmm3
  296. movaps %xmm3, -20 * SIZE(X)
  297. movaps -4 * SIZE(X), %xmm3
  298. PSHUFD2( $0xb1, %xmm0, %xmm5)
  299. mulps %xmm6, %xmm0
  300. mulps %xmm7, %xmm5
  301. addps %xmm5, %xmm0
  302. movaps %xmm0, -16 * SIZE(X)
  303. PSHUFD2( $0xb1, %xmm1, %xmm5)
  304. mulps %xmm6, %xmm1
  305. mulps %xmm7, %xmm5
  306. addps %xmm5, %xmm1
  307. movaps %xmm1, -12 * SIZE(X)
  308. PSHUFD2( $0xb1, %xmm2, %xmm5)
  309. mulps %xmm6, %xmm2
  310. mulps %xmm7, %xmm5
  311. addps %xmm5, %xmm2
  312. movaps %xmm2, -8 * SIZE(X)
  313. PSHUFD2( $0xb1, %xmm3, %xmm5)
  314. mulps %xmm6, %xmm3
  315. mulps %xmm7, %xmm5
  316. addps %xmm5, %xmm3
  317. movaps %xmm3, -4 * SIZE(X)
  318. subl $-32 * SIZE, X
  319. ALIGN_4
  320. .L115:
  321. testl $8, M
  322. je .L116
  323. movaps -32 * SIZE(X), %xmm0
  324. movaps -28 * SIZE(X), %xmm1
  325. PSHUFD2( $0xb1, %xmm0, %xmm5)
  326. mulps %xmm6, %xmm0
  327. mulps %xmm7, %xmm5
  328. addps %xmm5, %xmm0
  329. movaps %xmm0, -32 * SIZE(X)
  330. PSHUFD2( $0xb1, %xmm1, %xmm5)
  331. mulps %xmm6, %xmm1
  332. mulps %xmm7, %xmm5
  333. addps %xmm5, %xmm1
  334. movaps %xmm1, -28 * SIZE(X)
  335. movaps -24 * SIZE(X), %xmm2
  336. movaps -20 * SIZE(X), %xmm3
  337. PSHUFD2( $0xb1, %xmm2, %xmm5)
  338. mulps %xmm6, %xmm2
  339. mulps %xmm7, %xmm5
  340. addps %xmm5, %xmm2
  341. movaps %xmm2, -24 * SIZE(X)
  342. PSHUFD2( $0xb1, %xmm3, %xmm5)
  343. mulps %xmm6, %xmm3
  344. mulps %xmm7, %xmm5
  345. addps %xmm5, %xmm3
  346. movaps %xmm3, -20 * SIZE(X)
  347. addl $16 * SIZE, X
  348. ALIGN_3
  349. .L116:
  350. testl $4, M
  351. je .L117
  352. movaps -32 * SIZE(X), %xmm0
  353. movaps -28 * SIZE(X), %xmm1
  354. PSHUFD2( $0xb1, %xmm0, %xmm5)
  355. mulps %xmm6, %xmm0
  356. mulps %xmm7, %xmm5
  357. addps %xmm5, %xmm0
  358. movaps %xmm0, -32 * SIZE(X)
  359. PSHUFD2( $0xb1, %xmm1, %xmm5)
  360. mulps %xmm6, %xmm1
  361. mulps %xmm7, %xmm5
  362. addps %xmm5, %xmm1
  363. movaps %xmm1, -28 * SIZE(X)
  364. addl $8 * SIZE, X
  365. ALIGN_3
  366. .L117:
  367. testl $2, M
  368. je .L118
  369. movaps -32 * SIZE(X), %xmm0
  370. PSHUFD2( $0xb1, %xmm0, %xmm5)
  371. mulps %xmm6, %xmm0
  372. mulps %xmm7, %xmm5
  373. addps %xmm5, %xmm0
  374. movaps %xmm0, -32 * SIZE(X)
  375. addl $4 * SIZE, X
  376. ALIGN_3
  377. .L118:
  378. testl $1, M
  379. je .L999
  380. movsd -32 * SIZE(X), %xmm0
  381. PSHUFD2( $0xb1, %xmm0, %xmm5)
  382. mulps %xmm6, %xmm0
  383. mulps %xmm7, %xmm5
  384. addps %xmm5, %xmm0
  385. movlps %xmm0, -32 * SIZE(X)
  386. jmp .L999
  387. ALIGN_3
  388. .L120:
  389. PSHUFD2($0, %xmm0, %xmm6)
  390. PSHUFD2($0, %xmm1, %xmm1)
  391. subps %xmm1, %xmm7
  392. unpcklps %xmm1, %xmm7
  393. movl X, XX
  394. movl M, I
  395. sarl $3, I
  396. jle .L125
  397. movsd (X), %xmm0
  398. addl INCX, X
  399. movhps (X), %xmm0
  400. addl INCX, X
  401. movsd (X), %xmm1
  402. addl INCX, X
  403. movhps (X), %xmm1
  404. addl INCX, X
  405. movsd (X), %xmm2
  406. addl INCX, X
  407. movhps (X), %xmm2
  408. addl INCX, X
  409. movsd (X), %xmm3
  410. addl INCX, X
  411. movhps (X), %xmm3
  412. addl INCX, X
  413. decl I
  414. jle .L122
  415. ALIGN_4
  416. .L121:
  417. #ifdef PREFETCHW
  418. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  419. #endif
  420. PSHUFD2( $0xb1, %xmm0, %xmm5)
  421. mulps %xmm6, %xmm0
  422. mulps %xmm7, %xmm5
  423. addps %xmm5, %xmm0
  424. movlps %xmm0, (XX)
  425. addl INCX, XX
  426. movhps %xmm0, (XX)
  427. addl INCX, XX
  428. movsd (X), %xmm0
  429. addl INCX, X
  430. movhps (X), %xmm0
  431. addl INCX, X
  432. PSHUFD2( $0xb1, %xmm1, %xmm5)
  433. mulps %xmm6, %xmm1
  434. mulps %xmm7, %xmm5
  435. addps %xmm5, %xmm1
  436. movlps %xmm1, (XX)
  437. addl INCX, XX
  438. movhps %xmm1, (XX)
  439. addl INCX, XX
  440. movsd (X), %xmm1
  441. addl INCX, X
  442. movhps (X), %xmm1
  443. addl INCX, X
  444. PSHUFD2( $0xb1, %xmm2, %xmm5)
  445. mulps %xmm6, %xmm2
  446. mulps %xmm7, %xmm5
  447. addps %xmm5, %xmm2
  448. movlps %xmm2, (XX)
  449. addl INCX, XX
  450. movhps %xmm2, (XX)
  451. addl INCX, XX
  452. movsd (X), %xmm2
  453. addl INCX, X
  454. movhps (X), %xmm2
  455. addl INCX, X
  456. PSHUFD2( $0xb1, %xmm3, %xmm5)
  457. mulps %xmm6, %xmm3
  458. mulps %xmm7, %xmm5
  459. addps %xmm5, %xmm3
  460. movlps %xmm3, (XX)
  461. addl INCX, XX
  462. movhps %xmm3, (XX)
  463. addl INCX, XX
  464. movsd (X), %xmm3
  465. addl INCX, X
  466. movhps (X), %xmm3
  467. addl INCX, X
  468. decl I
  469. jg .L121
  470. ALIGN_4
  471. .L122:
  472. PSHUFD2( $0xb1, %xmm0, %xmm5)
  473. mulps %xmm6, %xmm0
  474. mulps %xmm7, %xmm5
  475. addps %xmm5, %xmm0
  476. movlps %xmm0, (XX)
  477. addl INCX, XX
  478. movhps %xmm0, (XX)
  479. addl INCX, XX
  480. PSHUFD2( $0xb1, %xmm1, %xmm5)
  481. mulps %xmm6, %xmm1
  482. mulps %xmm7, %xmm5
  483. addps %xmm5, %xmm1
  484. movlps %xmm1, (XX)
  485. addl INCX, XX
  486. movhps %xmm1, (XX)
  487. addl INCX, XX
  488. PSHUFD2( $0xb1, %xmm2, %xmm5)
  489. mulps %xmm6, %xmm2
  490. mulps %xmm7, %xmm5
  491. addps %xmm5, %xmm2
  492. movlps %xmm2, (XX)
  493. addl INCX, XX
  494. movhps %xmm2, (XX)
  495. addl INCX, XX
  496. PSHUFD2( $0xb1, %xmm3, %xmm5)
  497. mulps %xmm6, %xmm3
  498. mulps %xmm7, %xmm5
  499. addps %xmm5, %xmm3
  500. movlps %xmm3, (XX)
  501. addl INCX, XX
  502. movhps %xmm3, (XX)
  503. addl INCX, XX
  504. ALIGN_4
  505. .L125:
  506. testl $4, M
  507. je .L127
  508. movsd (X), %xmm0
  509. addl INCX, X
  510. movhps (X), %xmm0
  511. addl INCX, X
  512. PSHUFD2( $0xb1, %xmm0, %xmm5)
  513. mulps %xmm6, %xmm0
  514. mulps %xmm7, %xmm5
  515. addps %xmm5, %xmm0
  516. movlps %xmm0, (XX)
  517. addl INCX, XX
  518. movhps %xmm0, (XX)
  519. addl INCX, XX
  520. movsd (X), %xmm1
  521. addl INCX, X
  522. movhps (X), %xmm1
  523. addl INCX, X
  524. PSHUFD2( $0xb1, %xmm1, %xmm5)
  525. mulps %xmm6, %xmm1
  526. mulps %xmm7, %xmm5
  527. addps %xmm5, %xmm1
  528. movlps %xmm1, (XX)
  529. addl INCX, XX
  530. movhps %xmm1, (XX)
  531. addl INCX, XX
  532. ALIGN_3
  533. .L127:
  534. testl $2, M
  535. je .L128
  536. movsd (X), %xmm0
  537. addl INCX, X
  538. movhps (X), %xmm0
  539. addl INCX, X
  540. PSHUFD2( $0xb1, %xmm0, %xmm5)
  541. mulps %xmm6, %xmm0
  542. mulps %xmm7, %xmm5
  543. addps %xmm5, %xmm0
  544. movlps %xmm0, (XX)
  545. addl INCX, XX
  546. movhps %xmm0, (XX)
  547. addl INCX, XX
  548. ALIGN_3
  549. .L128:
  550. testl $1, M
  551. je .L999
  552. movsd (X), %xmm0
  553. PSHUFD2( $0xb1, %xmm0, %xmm5)
  554. mulps %xmm6, %xmm0
  555. mulps %xmm7, %xmm5
  556. addps %xmm5, %xmm0
  557. movlps %xmm0, (XX)
  558. jmp .L999
  559. ALIGN_3
  560. .L130:
  561. cmpl $2 * SIZE, INCX
  562. jne .L120
  563. #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
  564. PSHUFD2($0, %xmm0, %xmm6)
  565. PSHUFD2($0, %xmm1, %xmm1)
  566. subps %xmm1, %xmm7
  567. unpcklps %xmm1, %xmm7
  568. subl $-31 * SIZE, X
  569. testl $2 * SIZE, X
  570. je .L130x
  571. movsd -31 * SIZE(X), %xmm0
  572. PSHUFD2( $0xb1, %xmm0, %xmm5)
  573. mulps %xmm6, %xmm0
  574. mulps %xmm7, %xmm5
  575. addps %xmm5, %xmm0
  576. movlps %xmm0, -31 * SIZE(X)
  577. addl $2 * SIZE, X
  578. decl M
  579. jle .L999
  580. ALIGN_3
  581. .L130x:
  582. shufps $0xb1, %xmm7, %xmm7
  583. movaps -32 * SIZE(X), %xmm0
  584. movaps %xmm0, %xmm4
  585. movl M, I
  586. sarl $4, I
  587. jle .L135
  588. movaps -28 * SIZE(X), %xmm1
  589. decl I
  590. jle .L132
  591. ALIGN_4
  592. .L131:
  593. #ifdef PREFETCHW
  594. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  595. #endif
  596. movss %xmm1, %xmm0
  597. PSHUFD2($0x1b, %xmm0, %xmm5)
  598. mulps %xmm6, %xmm0
  599. mulps %xmm7, %xmm5
  600. addps %xmm5, %xmm0
  601. movaps %xmm0, %xmm2
  602. movss %xmm4, %xmm0
  603. movaps %xmm0, -32 * SIZE(X)
  604. movaps -24 * SIZE(X), %xmm0
  605. movss %xmm0, %xmm1
  606. PSHUFD2($0x1b, %xmm1, %xmm5)
  607. mulps %xmm6, %xmm1
  608. mulps %xmm7, %xmm5
  609. addps %xmm5, %xmm1
  610. movaps %xmm1, %xmm4
  611. movss %xmm2, %xmm1
  612. movaps %xmm1, -28 * SIZE(X)
  613. movaps -20 * SIZE(X), %xmm1
  614. movss %xmm1, %xmm0
  615. PSHUFD2($0x1b, %xmm0, %xmm5)
  616. mulps %xmm6, %xmm0
  617. mulps %xmm7, %xmm5
  618. addps %xmm5, %xmm0
  619. movaps %xmm0, %xmm2
  620. movss %xmm4, %xmm0
  621. movaps %xmm0, -24 * SIZE(X)
  622. movaps -16 * SIZE(X), %xmm0
  623. movss %xmm0, %xmm1
  624. PSHUFD2($0x1b, %xmm1, %xmm5)
  625. mulps %xmm6, %xmm1
  626. mulps %xmm7, %xmm5
  627. addps %xmm5, %xmm1
  628. movaps %xmm1, %xmm4
  629. movss %xmm2, %xmm1
  630. movaps %xmm1, -20 * SIZE(X)
  631. movaps -12 * SIZE(X), %xmm1
  632. #ifdef PREFETCHW
  633. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  634. #endif
  635. movss %xmm1, %xmm0
  636. PSHUFD2($0x1b, %xmm0, %xmm5)
  637. mulps %xmm6, %xmm0
  638. mulps %xmm7, %xmm5
  639. addps %xmm5, %xmm0
  640. movaps %xmm0, %xmm2
  641. movss %xmm4, %xmm0
  642. movaps %xmm0, -16 * SIZE(X)
  643. movaps -8 * SIZE(X), %xmm0
  644. movss %xmm0, %xmm1
  645. PSHUFD2($0x1b, %xmm1, %xmm5)
  646. mulps %xmm6, %xmm1
  647. mulps %xmm7, %xmm5
  648. addps %xmm5, %xmm1
  649. movaps %xmm1, %xmm4
  650. movss %xmm2, %xmm1
  651. movaps %xmm1, -12 * SIZE(X)
  652. movaps -4 * SIZE(X), %xmm1
  653. movss %xmm1, %xmm0
  654. PSHUFD2($0x1b, %xmm0, %xmm5)
  655. mulps %xmm6, %xmm0
  656. mulps %xmm7, %xmm5
  657. addps %xmm5, %xmm0
  658. movaps %xmm0, %xmm2
  659. movss %xmm4, %xmm0
  660. movaps %xmm0, -8 * SIZE(X)
  661. movaps 0 * SIZE(X), %xmm0
  662. movss %xmm0, %xmm1
  663. PSHUFD2($0x1b, %xmm1, %xmm5)
  664. mulps %xmm6, %xmm1
  665. mulps %xmm7, %xmm5
  666. addps %xmm5, %xmm1
  667. movaps %xmm1, %xmm4
  668. movss %xmm2, %xmm1
  669. movaps %xmm1, -4 * SIZE(X)
  670. movaps 4 * SIZE(X), %xmm1
  671. subl $-32 * SIZE, X
  672. decl I
  673. jg .L131
  674. ALIGN_4
  675. .L132:
  676. movss %xmm1, %xmm0
  677. PSHUFD2($0x1b, %xmm0, %xmm5)
  678. mulps %xmm6, %xmm0
  679. mulps %xmm7, %xmm5
  680. addps %xmm5, %xmm0
  681. movaps %xmm0, %xmm2
  682. movss %xmm4, %xmm0
  683. movaps %xmm0, -32 * SIZE(X)
  684. movaps -24 * SIZE(X), %xmm0
  685. movss %xmm0, %xmm1
  686. PSHUFD2($0x1b, %xmm1, %xmm5)
  687. mulps %xmm6, %xmm1
  688. mulps %xmm7, %xmm5
  689. addps %xmm5, %xmm1
  690. movaps %xmm1, %xmm4
  691. movss %xmm2, %xmm1
  692. movaps %xmm1, -28 * SIZE(X)
  693. movaps -20 * SIZE(X), %xmm1
  694. movss %xmm1, %xmm0
  695. PSHUFD2($0x1b, %xmm0, %xmm5)
  696. mulps %xmm6, %xmm0
  697. mulps %xmm7, %xmm5
  698. addps %xmm5, %xmm0
  699. movaps %xmm0, %xmm2
  700. movss %xmm4, %xmm0
  701. movaps %xmm0, -24 * SIZE(X)
  702. movaps -16 * SIZE(X), %xmm0
  703. movss %xmm0, %xmm1
  704. PSHUFD2($0x1b, %xmm1, %xmm5)
  705. mulps %xmm6, %xmm1
  706. mulps %xmm7, %xmm5
  707. addps %xmm5, %xmm1
  708. movaps %xmm1, %xmm4
  709. movss %xmm2, %xmm1
  710. movaps %xmm1, -20 * SIZE(X)
  711. movaps -12 * SIZE(X), %xmm1
  712. movss %xmm1, %xmm0
  713. PSHUFD2($0x1b, %xmm0, %xmm5)
  714. mulps %xmm6, %xmm0
  715. mulps %xmm7, %xmm5
  716. addps %xmm5, %xmm0
  717. movaps %xmm0, %xmm2
  718. movss %xmm4, %xmm0
  719. movaps %xmm0, -16 * SIZE(X)
  720. movaps -8 * SIZE(X), %xmm0
  721. movss %xmm0, %xmm1
  722. PSHUFD2($0x1b, %xmm1, %xmm5)
  723. mulps %xmm6, %xmm1
  724. mulps %xmm7, %xmm5
  725. addps %xmm5, %xmm1
  726. movaps %xmm1, %xmm4
  727. movss %xmm2, %xmm1
  728. movaps %xmm1, -12 * SIZE(X)
  729. movaps -4 * SIZE(X), %xmm1
  730. movss %xmm1, %xmm0
  731. PSHUFD2($0x1b, %xmm0, %xmm5)
  732. mulps %xmm6, %xmm0
  733. mulps %xmm7, %xmm5
  734. addps %xmm5, %xmm0
  735. movaps %xmm0, %xmm2
  736. movss %xmm4, %xmm0
  737. movaps %xmm0, -8 * SIZE(X)
  738. movaps 0 * SIZE(X), %xmm0
  739. movss %xmm0, %xmm1
  740. PSHUFD2($0x1b, %xmm1, %xmm5)
  741. mulps %xmm6, %xmm1
  742. mulps %xmm7, %xmm5
  743. addps %xmm5, %xmm1
  744. movaps %xmm1, %xmm4
  745. movss %xmm2, %xmm1
  746. movaps %xmm1, -4 * SIZE(X)
  747. subl $-32 * SIZE, X
  748. ALIGN_4
  749. .L135:
  750. testl $8, M
  751. je .L136
  752. movaps -28 * SIZE(X), %xmm1
  753. movss %xmm1, %xmm0
  754. PSHUFD2($0x1b, %xmm0, %xmm5)
  755. mulps %xmm6, %xmm0
  756. mulps %xmm7, %xmm5
  757. addps %xmm5, %xmm0
  758. movaps %xmm0, %xmm2
  759. movss %xmm4, %xmm0
  760. movaps %xmm0, -32 * SIZE(X)
  761. movaps -24 * SIZE(X), %xmm0
  762. movss %xmm0, %xmm1
  763. PSHUFD2($0x1b, %xmm1, %xmm5)
  764. mulps %xmm6, %xmm1
  765. mulps %xmm7, %xmm5
  766. addps %xmm5, %xmm1
  767. movaps %xmm1, %xmm4
  768. movss %xmm2, %xmm1
  769. movaps %xmm1, -28 * SIZE(X)
  770. movaps -20 * SIZE(X), %xmm1
  771. movss %xmm1, %xmm0
  772. PSHUFD2($0x1b, %xmm0, %xmm5)
  773. mulps %xmm6, %xmm0
  774. mulps %xmm7, %xmm5
  775. addps %xmm5, %xmm0
  776. movaps %xmm0, %xmm2
  777. movss %xmm4, %xmm0
  778. movaps %xmm0, -24 * SIZE(X)
  779. movaps -16 * SIZE(X), %xmm0
  780. movss %xmm0, %xmm1
  781. PSHUFD2($0x1b, %xmm1, %xmm5)
  782. mulps %xmm6, %xmm1
  783. mulps %xmm7, %xmm5
  784. addps %xmm5, %xmm1
  785. movaps %xmm1, %xmm4
  786. movss %xmm2, %xmm1
  787. movaps %xmm1, -20 * SIZE(X)
  788. addl $16 * SIZE, X
  789. ALIGN_3
  790. .L136:
  791. testl $4, M
  792. je .L137
  793. movaps -28 * SIZE(X), %xmm1
  794. movss %xmm1, %xmm0
  795. PSHUFD2($0x1b, %xmm0, %xmm5)
  796. mulps %xmm6, %xmm0
  797. mulps %xmm7, %xmm5
  798. addps %xmm5, %xmm0
  799. movaps %xmm0, %xmm2
  800. movss %xmm4, %xmm0
  801. movaps %xmm0, -32 * SIZE(X)
  802. movaps -24 * SIZE(X), %xmm0
  803. movss %xmm0, %xmm1
  804. PSHUFD2($0x1b, %xmm1, %xmm5)
  805. mulps %xmm6, %xmm1
  806. mulps %xmm7, %xmm5
  807. addps %xmm5, %xmm1
  808. movaps %xmm1, %xmm4
  809. movss %xmm2, %xmm1
  810. movaps %xmm1, -28 * SIZE(X)
  811. addl $8 * SIZE, X
  812. ALIGN_3
  813. .L137:
  814. testl $2, M
  815. je .L138
  816. movaps -28 * SIZE(X), %xmm1
  817. movss %xmm1, %xmm0
  818. PSHUFD2($0x1b, %xmm0, %xmm5)
  819. mulps %xmm6, %xmm0
  820. mulps %xmm7, %xmm5
  821. addps %xmm5, %xmm0
  822. movaps %xmm0, %xmm2
  823. movss %xmm4, %xmm0
  824. movaps %xmm0, -32 * SIZE(X)
  825. movaps %xmm2, %xmm4
  826. movaps %xmm1, %xmm0
  827. addl $4 * SIZE, X
  828. ALIGN_3
  829. .L138:
  830. movss %xmm4, -32 * SIZE(X)
  831. testl $1, M
  832. je .L999
  833. PSHUFD2( $0x1b, %xmm0, %xmm5)
  834. mulps %xmm6, %xmm0
  835. mulps %xmm7, %xmm5
  836. addps %xmm5, %xmm0
  837. PSHUFD1( $0x39, %xmm0)
  838. movlps %xmm0, -31 * SIZE(X)
  839. jmp .L999
  840. ALIGN_3
  841. #else
  842. PSHUFD2($0, %xmm0, %xmm6)
  843. PSHUFD2($0, %xmm1, %xmm1)
  844. subps %xmm1, %xmm7
  845. unpcklps %xmm1, %xmm7
  846. subl $-32 * SIZE, X
  847. testl $2 * SIZE, X
  848. je .L130x
  849. #ifdef movsd
  850. xorps %xmm0, %xmm0
  851. #endif
  852. movsd -32 * SIZE(X), %xmm0
  853. PSHUFD2( $0xb1, %xmm0, %xmm5)
  854. mulps %xmm6, %xmm0
  855. mulps %xmm7, %xmm5
  856. addps %xmm5, %xmm0
  857. movlps %xmm0, -32 * SIZE(X)
  858. addl $2 * SIZE, X
  859. decl M
  860. jle .L999
  861. ALIGN_3
  862. .L130x:
  863. movl M, I
  864. sarl $4, I
  865. jle .L135
  866. movsd -32 * SIZE(X), %xmm0
  867. movhps -30 * SIZE(X), %xmm0
  868. movsd -28 * SIZE(X), %xmm1
  869. movhps -26 * SIZE(X), %xmm1
  870. movsd -24 * SIZE(X), %xmm2
  871. movhps -22 * SIZE(X), %xmm2
  872. movsd -20 * SIZE(X), %xmm3
  873. movhps -18 * SIZE(X), %xmm3
  874. decl I
  875. jle .L132
  876. ALIGN_4
  877. .L131:
  878. #ifdef PREFETCHW
  879. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  880. #endif
  881. PSHUFD2( $0xb1, %xmm0, %xmm5)
  882. mulps %xmm6, %xmm0
  883. mulps %xmm7, %xmm5
  884. addps %xmm5, %xmm0
  885. movlps %xmm0, -32 * SIZE(X)
  886. movhps %xmm0, -30 * SIZE(X)
  887. movsd -16 * SIZE(X), %xmm0
  888. movhps -14 * SIZE(X), %xmm0
  889. PSHUFD2( $0xb1, %xmm1, %xmm5)
  890. mulps %xmm6, %xmm1
  891. mulps %xmm7, %xmm5
  892. addps %xmm5, %xmm1
  893. movlps %xmm1, -28 * SIZE(X)
  894. movhps %xmm1, -26 * SIZE(X)
  895. movsd -12 * SIZE(X), %xmm1
  896. movhps -10 * SIZE(X), %xmm1
  897. PSHUFD2( $0xb1, %xmm2, %xmm5)
  898. mulps %xmm6, %xmm2
  899. mulps %xmm7, %xmm5
  900. addps %xmm5, %xmm2
  901. movlps %xmm2, -24 * SIZE(X)
  902. movhps %xmm2, -22 * SIZE(X)
  903. movsd -8 * SIZE(X), %xmm2
  904. movhps -6 * SIZE(X), %xmm2
  905. PSHUFD2( $0xb1, %xmm3, %xmm5)
  906. mulps %xmm6, %xmm3
  907. mulps %xmm7, %xmm5
  908. addps %xmm5, %xmm3
  909. movlps %xmm3, -20 * SIZE(X)
  910. movhps %xmm3, -18 * SIZE(X)
  911. movsd -4 * SIZE(X), %xmm3
  912. movhps -2 * SIZE(X), %xmm3
  913. #ifdef PREFETCHW
  914. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  915. #endif
  916. PSHUFD2( $0xb1, %xmm0, %xmm5)
  917. mulps %xmm6, %xmm0
  918. mulps %xmm7, %xmm5
  919. addps %xmm5, %xmm0
  920. movlps %xmm0, -16 * SIZE(X)
  921. movhps %xmm0, -14 * SIZE(X)
  922. movsd 0 * SIZE(X), %xmm0
  923. movhps 2 * SIZE(X), %xmm0
  924. PSHUFD2( $0xb1, %xmm1, %xmm5)
  925. mulps %xmm6, %xmm1
  926. mulps %xmm7, %xmm5
  927. addps %xmm5, %xmm1
  928. movlps %xmm1, -12 * SIZE(X)
  929. movhps %xmm1, -10 * SIZE(X)
  930. movsd 4 * SIZE(X), %xmm1
  931. movhps 6 * SIZE(X), %xmm1
  932. PSHUFD2( $0xb1, %xmm2, %xmm5)
  933. mulps %xmm6, %xmm2
  934. mulps %xmm7, %xmm5
  935. addps %xmm5, %xmm2
  936. movlps %xmm2, -8 * SIZE(X)
  937. movhps %xmm2, -6 * SIZE(X)
  938. movsd 8 * SIZE(X), %xmm2
  939. movhps 10 * SIZE(X), %xmm2
  940. PSHUFD2( $0xb1, %xmm3, %xmm5)
  941. mulps %xmm6, %xmm3
  942. mulps %xmm7, %xmm5
  943. addps %xmm5, %xmm3
  944. movlps %xmm3, -4 * SIZE(X)
  945. movhps %xmm3, -2 * SIZE(X)
  946. movsd 12 * SIZE(X), %xmm3
  947. movhps 14 * SIZE(X), %xmm3
  948. subl $-32 * SIZE, X
  949. decl I
  950. jg .L131
  951. ALIGN_4
  952. .L132:
  953. PSHUFD2( $0xb1, %xmm0, %xmm5)
  954. mulps %xmm6, %xmm0
  955. mulps %xmm7, %xmm5
  956. addps %xmm5, %xmm0
  957. movlps %xmm0, -32 * SIZE(X)
  958. movhps %xmm0, -30 * SIZE(X)
  959. movsd -16 * SIZE(X), %xmm0
  960. movhps -14 * SIZE(X), %xmm0
  961. PSHUFD2( $0xb1, %xmm1, %xmm5)
  962. mulps %xmm6, %xmm1
  963. mulps %xmm7, %xmm5
  964. addps %xmm5, %xmm1
  965. movlps %xmm1, -28 * SIZE(X)
  966. movhps %xmm1, -26 * SIZE(X)
  967. movsd -12 * SIZE(X), %xmm1
  968. movhps -10 * SIZE(X), %xmm1
  969. PSHUFD2( $0xb1, %xmm2, %xmm5)
  970. mulps %xmm6, %xmm2
  971. mulps %xmm7, %xmm5
  972. addps %xmm5, %xmm2
  973. movlps %xmm2, -24 * SIZE(X)
  974. movhps %xmm2, -22 * SIZE(X)
  975. movsd -8 * SIZE(X), %xmm2
  976. movhps -6 * SIZE(X), %xmm2
  977. PSHUFD2( $0xb1, %xmm3, %xmm5)
  978. mulps %xmm6, %xmm3
  979. mulps %xmm7, %xmm5
  980. addps %xmm5, %xmm3
  981. movlps %xmm3, -20 * SIZE(X)
  982. movhps %xmm3, -18 * SIZE(X)
  983. movsd -4 * SIZE(X), %xmm3
  984. movhps -2 * SIZE(X), %xmm3
  985. PSHUFD2( $0xb1, %xmm0, %xmm5)
  986. mulps %xmm6, %xmm0
  987. mulps %xmm7, %xmm5
  988. addps %xmm5, %xmm0
  989. movlps %xmm0, -16 * SIZE(X)
  990. movhps %xmm0, -14 * SIZE(X)
  991. PSHUFD2( $0xb1, %xmm1, %xmm5)
  992. mulps %xmm6, %xmm1
  993. mulps %xmm7, %xmm5
  994. addps %xmm5, %xmm1
  995. movlps %xmm1, -12 * SIZE(X)
  996. movhps %xmm1, -10 * SIZE(X)
  997. PSHUFD2( $0xb1, %xmm2, %xmm5)
  998. mulps %xmm6, %xmm2
  999. mulps %xmm7, %xmm5
  1000. addps %xmm5, %xmm2
  1001. movlps %xmm2, -8 * SIZE(X)
  1002. movhps %xmm2, -6 * SIZE(X)
  1003. PSHUFD2( $0xb1, %xmm3, %xmm5)
  1004. mulps %xmm6, %xmm3
  1005. mulps %xmm7, %xmm5
  1006. addps %xmm5, %xmm3
  1007. movlps %xmm3, -4 * SIZE(X)
  1008. movhps %xmm3, -2 * SIZE(X)
  1009. subl $-32 * SIZE, X
  1010. ALIGN_4
  1011. .L135:
  1012. testl $8, M
  1013. je .L136
  1014. movsd -32 * SIZE(X), %xmm0
  1015. movhps -30 * SIZE(X), %xmm0
  1016. PSHUFD2( $0xb1, %xmm0, %xmm5)
  1017. mulps %xmm6, %xmm0
  1018. mulps %xmm7, %xmm5
  1019. addps %xmm5, %xmm0
  1020. movlps %xmm0, -32 * SIZE(X)
  1021. movhps %xmm0, -30 * SIZE(X)
  1022. movsd -28 * SIZE(X), %xmm1
  1023. movhps -26 * SIZE(X), %xmm1
  1024. PSHUFD2( $0xb1, %xmm1, %xmm5)
  1025. mulps %xmm6, %xmm1
  1026. mulps %xmm7, %xmm5
  1027. addps %xmm5, %xmm1
  1028. movlps %xmm1, -28 * SIZE(X)
  1029. movhps %xmm1, -26 * SIZE(X)
  1030. movsd -24 * SIZE(X), %xmm2
  1031. movhps -22 * SIZE(X), %xmm2
  1032. PSHUFD2( $0xb1, %xmm2, %xmm5)
  1033. mulps %xmm6, %xmm2
  1034. mulps %xmm7, %xmm5
  1035. addps %xmm5, %xmm2
  1036. movlps %xmm2, -24 * SIZE(X)
  1037. movhps %xmm2, -22 * SIZE(X)
  1038. movsd -20 * SIZE(X), %xmm3
  1039. movhps -18 * SIZE(X), %xmm3
  1040. PSHUFD2( $0xb1, %xmm3, %xmm5)
  1041. mulps %xmm6, %xmm3
  1042. mulps %xmm7, %xmm5
  1043. addps %xmm5, %xmm3
  1044. movlps %xmm3, -20 * SIZE(X)
  1045. movhps %xmm3, -18 * SIZE(X)
  1046. addl $16 * SIZE, X
  1047. ALIGN_3
  1048. .L136:
  1049. testl $4, M
  1050. je .L137
  1051. movsd -32 * SIZE(X), %xmm0
  1052. movhps -30 * SIZE(X), %xmm0
  1053. movsd -28 * SIZE(X), %xmm1
  1054. movhps -26 * SIZE(X), %xmm1
  1055. PSHUFD2( $0xb1, %xmm0, %xmm5)
  1056. mulps %xmm6, %xmm0
  1057. mulps %xmm7, %xmm5
  1058. addps %xmm5, %xmm0
  1059. movlps %xmm0, -32 * SIZE(X)
  1060. movhps %xmm0, -30 * SIZE(X)
  1061. PSHUFD2( $0xb1, %xmm1, %xmm5)
  1062. mulps %xmm6, %xmm1
  1063. mulps %xmm7, %xmm5
  1064. addps %xmm5, %xmm1
  1065. movlps %xmm1, -28 * SIZE(X)
  1066. movhps %xmm1, -26 * SIZE(X)
  1067. addl $8 * SIZE, X
  1068. ALIGN_3
  1069. .L137:
  1070. testl $2, M
  1071. je .L138
  1072. movsd -32 * SIZE(X), %xmm0
  1073. movhps -30 * SIZE(X), %xmm0
  1074. PSHUFD2( $0xb1, %xmm0, %xmm5)
  1075. mulps %xmm6, %xmm0
  1076. mulps %xmm7, %xmm5
  1077. addps %xmm5, %xmm0
  1078. movlps %xmm0, -32 * SIZE(X)
  1079. movhps %xmm0, -30 * SIZE(X)
  1080. addl $4 * SIZE, X
  1081. ALIGN_3
  1082. .L138:
  1083. testl $1, M
  1084. je .L999
  1085. movsd -32 * SIZE(X), %xmm0
  1086. PSHUFD2( $0xb1, %xmm0, %xmm5)
  1087. mulps %xmm6, %xmm0
  1088. mulps %xmm7, %xmm5
  1089. addps %xmm5, %xmm0
  1090. movlps %xmm0, -32 * SIZE(X)
  1091. ALIGN_3
  1092. #endif
  1093. .L999:
  1094. xorl %eax, %eax
  1095. popl %ebp
  1096. popl %ebx
  1097. popl %esi
  1098. popl %edi
  1099. ret
  1100. EPILOGUE