
zscal_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M ARG1
#define X ARG4
#define INCX ARG5
#else
#define M ARG1
#define X ARG2
#define INCX ARG3
#endif

#define XX %r10
#define FLAG %r11
#define I %rax

#include "l1param.h"
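/*
 * SSE kernel for complex single-precision SCAL: x(i) := alpha * x(i)
 * for i = 0 .. M-1, where x is a vector of complex floats with stride
 * INCX (in elements, converted to a byte stride with ZBASE_SHIFT below)
 * and alpha = alpha_r + alpha_i*I ends up in %xmm0 / %xmm1.
 *
 * Register roles: I is the unrolled-loop counter, XX is the trailing
 * store pointer used by the strided path, and FLAG records that a
 * single leading float was cleared for alignment in the alpha == 0 path.
 */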
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
	movaps %xmm3, %xmm0
	movsd 40(%rsp), %xmm1
	movq 48(%rsp), X
	movq 56(%rsp), INCX
#endif
	SAVEREGISTERS
	salq $ZBASE_SHIFT, INCX
	xor FLAG, FLAG
	testq M, M
	jle .L999
	pxor %xmm15, %xmm15
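/*
 * %xmm15 = 0.  The clear-to-zero fast path is taken only when both
 * alpha_r and alpha_i compare equal to zero; an unordered compare
 * (alpha_r is NaN, PF set by comiss) goes to the general scaling path
 * so the NaN propagates into the result.
 */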
	comiss %xmm0, %xmm15
	jne .L100 # Alpha_r != ZERO
	jp .L100 # Alpha_r == NAN
	comiss %xmm1, %xmm15
	jne .L100 # Alpha_i != ZERO
/* Alpha == ZERO */
	cmpq $2 * SIZE, INCX
	jne .L50
/* INCX == 1 */
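/*
 * Contiguous clear: peel a leading float and/or complex element until X
 * is 16-byte aligned (FLAG remembers the odd leading float so the
 * matching trailing float is cleared at .L19), then store zeros
 * 16 floats (8 complex elements) per iteration with movaps and finish
 * the remaining elements below.
 */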
	cmpq $3, M
	jle .L13
	testq $4, X
	je .L05
	movss %xmm15, 0 * SIZE(X)
	addq $SIZE, X
	movq $1, FLAG
	decq M
	ALIGN_3
.L05:
	testq $8, X
	je .L06
	movlps %xmm15, 0 * SIZE(X)
	addq $2 * SIZE, X
	subq $1, M
	ALIGN_3
.L06:
	movq M, I # rcx = n
	sarq $3, I
	jle .L12
	ALIGN_4
.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movaps %xmm15, 0 * SIZE(X)
	movaps %xmm15, 4 * SIZE(X)
	movaps %xmm15, 8 * SIZE(X)
	movaps %xmm15, 12 * SIZE(X)
	addq $16 * SIZE, X
	decq I
	jg .L11
	ALIGN_4
.L12:
	testq $7, M
	je .L19
	testq $4, M
	je .L13
	movaps %xmm15, 0 * SIZE(X)
	movaps %xmm15, 4 * SIZE(X)
	addq $8 * SIZE, X
	ALIGN_3
.L13:
	testq $2, M
	je .L14
	movlps %xmm15, 0 * SIZE(X)
	movhps %xmm15, 2 * SIZE(X)
	addq $4 * SIZE, X
	ALIGN_3
.L14:
	testq $1, M
	je .L19
	movlps %xmm15, 0 * SIZE(X)
	addq $2 * SIZE, X
	ALIGN_3
.L19:
	testq $1, FLAG
	je .L999
	movss %xmm15, 0 * SIZE(X)
	jmp .L999
	ALIGN_4
/* incx != 1 */
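/*
 * Strided clear: store one zero complex element (8 bytes, movsd) per
 * step, unrolled four elements per iteration.
 */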
.L50:
	movq M, I # rcx = n
	sarq $2, I
	jle .L52
	ALIGN_4
.L51:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	decq I
	jg .L51
	ALIGN_4
.L52:
	testq $2, M
	je .L53
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	movsd %xmm15, 0 * SIZE(X)
	addq INCX, X
	ALIGN_3
.L53:
	testq $1, M
	je .L999
	movsd %xmm15, 0 * SIZE(X)
	jmp .L999
	ALIGN_4
/* Alpha != ZERO */
.L100:
	testq $SIZE, X
	jne .L130
	cmpq $2 * SIZE, INCX
	jne .L120
	pshufd $0, %xmm0, %xmm14
	pshufd $0, %xmm1, %xmm1
	subps %xmm1, %xmm15
	unpcklps %xmm1, %xmm15
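/*
 * Contiguous, 8-byte aligned scaling path.  After the setup above:
 *   %xmm14 = (alpha_r, alpha_r, alpha_r, alpha_r)
 *   %xmm15 = (-alpha_i, alpha_i, -alpha_i, alpha_i)
 * For each pair (xr, xi), pshufd $0xb1 swaps it to (xi, xr), and
 *   alpha_r * (xr, xi) + (-alpha_i, alpha_i) * (xi, xr)
 *     = (alpha_r*xr - alpha_i*xi, alpha_r*xi + alpha_i*xr),
 * i.e. the complex product alpha * x.  X is biased by 32 floats so the
 * unrolled loop can use fixed negative displacements, and one element
 * is peeled first if X is not yet 16-byte aligned.
 */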
	subq $-32 * SIZE, X
	testq $2 * SIZE, X
	je .L105
	movsd -32 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	addq $2 * SIZE, X
	decq M
	jle .L999
	ALIGN_3
.L105:
	movq M, I
	sarq $4, I
	jle .L115
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3
	movaps -16 * SIZE(X), %xmm4
	movaps -12 * SIZE(X), %xmm5
	movaps -8 * SIZE(X), %xmm6
	movaps -4 * SIZE(X), %xmm7
	decq I
	jle .L112
	ALIGN_4
.L111:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps 4 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movaps 8 * SIZE(X), %xmm2
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	movaps 12 * SIZE(X), %xmm3
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	pshufd $0xb1, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movaps %xmm4, -16 * SIZE(X)
	movaps 16 * SIZE(X), %xmm4
	pshufd $0xb1, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movaps %xmm5, -12 * SIZE(X)
	movaps 20 * SIZE(X), %xmm5
	pshufd $0xb1, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movaps %xmm6, -8 * SIZE(X)
	movaps 24 * SIZE(X), %xmm6
	pshufd $0xb1, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movaps %xmm7, -4 * SIZE(X)
	movaps 28 * SIZE(X), %xmm7
	subq $-32 * SIZE, X
	decq I
	jg .L111
	ALIGN_4
.L112:
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	pshufd $0xb1, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movaps %xmm4, -16 * SIZE(X)
	pshufd $0xb1, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movaps %xmm5, -12 * SIZE(X)
	pshufd $0xb1, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movaps %xmm6, -8 * SIZE(X)
	pshufd $0xb1, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movaps %xmm7, -4 * SIZE(X)
	subq $-32 * SIZE, X
	ALIGN_4
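/* Tail of the aligned contiguous path: the remaining M % 16 elements
   are handled in groups of 8, 4, 2 and 1. */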
.L115:
	testq $8, M
	je .L116
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	addq $16 * SIZE, X
	ALIGN_3
.L116:
	testq $4, M
	je .L117
	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	addq $8 * SIZE, X
	ALIGN_3
.L117:
	testq $2, M
	je .L118
	movaps -32 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	addq $4 * SIZE, X
	ALIGN_3
.L118:
	testq $1, M
	je .L999
	movsd -32 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	jmp .L999
	ALIGN_3
.L120:
	pshufd $0, %xmm0, %xmm14
	pshufd $0, %xmm1, %xmm1
	subps %xmm1, %xmm15
	unpcklps %xmm1, %xmm15
	movq X, XX
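/*
 * Strided scaling path: two complex elements are gathered into each
 * register with movsd/movhps, scaled, and scattered back through XX,
 * which trails X by the elements still in flight.
 */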
	movq M, I
	sarq $3, I
	jle .L125
	movsd (X), %xmm0
	addq INCX, X
	movhps (X), %xmm0
	addq INCX, X
	movsd (X), %xmm1
	addq INCX, X
	movhps (X), %xmm1
	addq INCX, X
	movsd (X), %xmm2
	addq INCX, X
	movhps (X), %xmm2
	addq INCX, X
	movsd (X), %xmm3
	addq INCX, X
	movhps (X), %xmm3
	addq INCX, X
	decq I
	jle .L122
	ALIGN_4
.L121:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, (XX)
	addq INCX, XX
	movhps %xmm0, (XX)
	addq INCX, XX
	movsd (X), %xmm0
	addq INCX, X
	movhps (X), %xmm0
	addq INCX, X
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, (XX)
	addq INCX, XX
	movhps %xmm1, (XX)
	addq INCX, XX
	movsd (X), %xmm1
	addq INCX, X
	movhps (X), %xmm1
	addq INCX, X
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movlps %xmm2, (XX)
	addq INCX, XX
	movhps %xmm2, (XX)
	addq INCX, XX
	movsd (X), %xmm2
	addq INCX, X
	movhps (X), %xmm2
	addq INCX, X
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movlps %xmm3, (XX)
	addq INCX, XX
	movhps %xmm3, (XX)
	addq INCX, XX
	movsd (X), %xmm3
	addq INCX, X
	movhps (X), %xmm3
	addq INCX, X
	decq I
	jg .L121
	ALIGN_4
.L122:
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, (XX)
	addq INCX, XX
	movhps %xmm0, (XX)
	addq INCX, XX
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, (XX)
	addq INCX, XX
	movhps %xmm1, (XX)
	addq INCX, XX
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movlps %xmm2, (XX)
	addq INCX, XX
	movhps %xmm2, (XX)
	addq INCX, XX
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movlps %xmm3, (XX)
	addq INCX, XX
	movhps %xmm3, (XX)
	addq INCX, XX
	ALIGN_4
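/* Tail of the strided path: the remaining M % 8 elements are handled
   in groups of 4, 2 and 1. */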
.L125:
	testq $4, M
	je .L127
	movsd (X), %xmm0
	addq INCX, X
	movhps (X), %xmm0
	addq INCX, X
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, (XX)
	addq INCX, XX
	movhps %xmm0, (XX)
	addq INCX, XX
	movsd (X), %xmm1
	addq INCX, X
	movhps (X), %xmm1
	addq INCX, X
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, (XX)
	addq INCX, XX
	movhps %xmm1, (XX)
	addq INCX, XX
	ALIGN_3
.L127:
	testq $2, M
	je .L128
	movsd (X), %xmm0
	addq INCX, X
	movhps (X), %xmm0
	addq INCX, X
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, (XX)
	addq INCX, XX
	movhps %xmm0, (XX)
	addq INCX, XX
	ALIGN_3
.L128:
	testq $1, M
	je .L999
	movsd (X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, (XX)
	jmp .L999
	ALIGN_3
.L130:
	cmpq $2 * SIZE, INCX
	jne .L120
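/*
 * X is 4-byte but not 8-byte aligned, so every complex element straddles
 * an 8-byte boundary.  With ALIGNED_ACCESS the loop still uses only
 * 16-byte aligned movaps: each aligned block is combined with the first
 * float of the next one (movss), the lane-reversing shuffle $0x1b pairs
 * real and imaginary parts for the multiply, and the float that belongs
 * to the following block is carried across iterations in %xmm9/%xmm10.
 * Otherwise (see the #else branch) unaligned movsd/movhps pairs are used.
 */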
#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
	pshufd $0, %xmm0, %xmm14
	pshufd $0, %xmm1, %xmm1
	subps %xmm1, %xmm15
	unpcklps %xmm1, %xmm15
	subq $-31 * SIZE, X
	testq $2 * SIZE, X
	je .L130x
	movsd -31 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -31 * SIZE(X)
	addq $2 * SIZE, X
	decq M
	jle .L999
	ALIGN_3
.L130x:
	shufps $0xb1, %xmm15, %xmm15
	movaps -32 * SIZE(X), %xmm0
	movaps %xmm0, %xmm9
	movq M, I
	sarq $4, I
	jle .L135
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3
	movaps -16 * SIZE(X), %xmm4
	movaps -12 * SIZE(X), %xmm5
	movaps -8 * SIZE(X), %xmm6
	movaps -4 * SIZE(X), %xmm7
	decq I
	jle .L132
	ALIGN_4
.L131:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm1, %xmm0
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, %xmm10
	movss %xmm9, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0
	movss %xmm2, %xmm1
	pshufd $0x1b, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, %xmm9
	movss %xmm10, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps 4 * SIZE(X), %xmm1
	movss %xmm3, %xmm2
	pshufd $0x1b, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, %xmm10
	movss %xmm9, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movaps 8 * SIZE(X), %xmm2
	movss %xmm4, %xmm3
	pshufd $0x1b, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, %xmm9
	movss %xmm10, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	movaps 12 * SIZE(X), %xmm3
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm5, %xmm4
	pshufd $0x1b, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movaps %xmm4, %xmm10
	movss %xmm9, %xmm4
	movaps %xmm4, -16 * SIZE(X)
	movaps 16 * SIZE(X), %xmm4
	movss %xmm6, %xmm5
	pshufd $0x1b, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movaps %xmm5, %xmm9
	movss %xmm10, %xmm5
	movaps %xmm5, -12 * SIZE(X)
	movaps 20 * SIZE(X), %xmm5
	movss %xmm7, %xmm6
	pshufd $0x1b, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movaps %xmm6, %xmm10
	movss %xmm9, %xmm6
	movaps %xmm6, -8 * SIZE(X)
	movaps 24 * SIZE(X), %xmm6
	movss %xmm0, %xmm7
	pshufd $0x1b, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movaps %xmm7, %xmm9
	movss %xmm10, %xmm7
	movaps %xmm7, -4 * SIZE(X)
	movaps 28 * SIZE(X), %xmm7
	subq $-32 * SIZE, X
	decq I
	jg .L131
	ALIGN_4
.L132:
	movss %xmm1, %xmm0
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, %xmm10
	movss %xmm9, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps 0 * SIZE(X), %xmm0
	movss %xmm2, %xmm1
	pshufd $0x1b, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, %xmm9
	movss %xmm10, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movss %xmm3, %xmm2
	pshufd $0x1b, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, %xmm10
	movss %xmm9, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movss %xmm4, %xmm3
	pshufd $0x1b, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, %xmm9
	movss %xmm10, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	movss %xmm5, %xmm4
	pshufd $0x1b, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movaps %xmm4, %xmm10
	movss %xmm9, %xmm4
	movaps %xmm4, -16 * SIZE(X)
	movss %xmm6, %xmm5
	pshufd $0x1b, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movaps %xmm5, %xmm9
	movss %xmm10, %xmm5
	movaps %xmm5, -12 * SIZE(X)
	movss %xmm7, %xmm6
	pshufd $0x1b, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movaps %xmm6, %xmm10
	movss %xmm9, %xmm6
	movaps %xmm6, -8 * SIZE(X)
	movss %xmm0, %xmm7
	pshufd $0x1b, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movaps %xmm7, %xmm9
	movss %xmm10, %xmm7
	movaps %xmm7, -4 * SIZE(X)
	subq $-32 * SIZE, X
	ALIGN_4
.L135:
	testq $8, M
	je .L136
	movaps -28 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, %xmm10
	movss %xmm9, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm2
	movss %xmm2, %xmm1
	pshufd $0x1b, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, %xmm9
	movss %xmm10, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps -20 * SIZE(X), %xmm3
	movss %xmm3, %xmm2
	pshufd $0x1b, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movaps %xmm2, %xmm10
	movss %xmm9, %xmm2
	movaps %xmm2, -24 * SIZE(X)
	movaps -16 * SIZE(X), %xmm0
	movss %xmm0, %xmm3
	pshufd $0x1b, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movaps %xmm3, %xmm9
	movss %xmm10, %xmm3
	movaps %xmm3, -20 * SIZE(X)
	addq $16 * SIZE, X
	ALIGN_3
.L136:
	testq $4, M
	je .L137
	movaps -28 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, %xmm10
	movss %xmm9, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps -24 * SIZE(X), %xmm2
	movss %xmm2, %xmm1
	pshufd $0x1b, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movaps %xmm1, %xmm9
	movss %xmm10, %xmm1
	movaps %xmm1, -28 * SIZE(X)
	movaps %xmm2, %xmm0
	addq $8 * SIZE, X
	ALIGN_3
.L137:
	testq $2, M
	je .L138
	movaps -28 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movaps %xmm0, %xmm10
	movss %xmm9, %xmm0
	movaps %xmm0, -32 * SIZE(X)
	movaps %xmm10, %xmm9
	movaps %xmm1, %xmm0
	addq $4 * SIZE, X
	ALIGN_3
.L138:
	movss %xmm9, -32 * SIZE(X)
	testq $1, M
	je .L999
	pshufd $0x1b, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	pshufd $0x39, %xmm0, %xmm0
	movlps %xmm0, -31 * SIZE(X)
	jmp .L999
	ALIGN_3
#else
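/*
 * Generic misaligned path (ALIGNED_ACCESS not defined, or on NEHALEM /
 * SANDYBRIDGE): load and store each vector as movsd/movhps halves
 * instead of keeping every access 16-byte aligned.
 */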
	pshufd $0, %xmm0, %xmm14
	pshufd $0, %xmm1, %xmm1
	subps %xmm1, %xmm15
	unpcklps %xmm1, %xmm15
	subq $-32 * SIZE, X
	testq $2 * SIZE, X
	je .L130x
	movsd -32 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	addq $2 * SIZE, X
	decq M
	jle .L999
	ALIGN_3
.L130x:
	movq M, I
	sarq $4, I
	jle .L135
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3
	movsd -16 * SIZE(X), %xmm4
	movhps -14 * SIZE(X), %xmm4
	movsd -12 * SIZE(X), %xmm5
	movhps -10 * SIZE(X), %xmm5
	movsd -8 * SIZE(X), %xmm6
	movhps -6 * SIZE(X), %xmm6
	movsd -4 * SIZE(X), %xmm7
	movhps -2 * SIZE(X), %xmm7
	decq I
	jle .L132
	ALIGN_4
.L131:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	movsd 0 * SIZE(X), %xmm0
	movhps 2 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	movsd 4 * SIZE(X), %xmm1
	movhps 6 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)
	movsd 8 * SIZE(X), %xmm2
	movhps 10 * SIZE(X), %xmm2
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)
	movsd 12 * SIZE(X), %xmm3
	movhps 14 * SIZE(X), %xmm3
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	pshufd $0xb1, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movlps %xmm4, -16 * SIZE(X)
	movhps %xmm4, -14 * SIZE(X)
	movsd 16 * SIZE(X), %xmm4
	movhps 18 * SIZE(X), %xmm4
	pshufd $0xb1, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movlps %xmm5, -12 * SIZE(X)
	movhps %xmm5, -10 * SIZE(X)
	movsd 20 * SIZE(X), %xmm5
	movhps 22 * SIZE(X), %xmm5
	pshufd $0xb1, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movlps %xmm6, -8 * SIZE(X)
	movhps %xmm6, -6 * SIZE(X)
	movsd 24 * SIZE(X), %xmm6
	movhps 26 * SIZE(X), %xmm6
	pshufd $0xb1, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movlps %xmm7, -4 * SIZE(X)
	movhps %xmm7, -2 * SIZE(X)
	movsd 28 * SIZE(X), %xmm7
	movhps 30 * SIZE(X), %xmm7
	subq $-32 * SIZE, X
	decq I
	jg .L131
	ALIGN_4
.L132:
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)
	pshufd $0xb1, %xmm4, %xmm8
	mulps %xmm14, %xmm4
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm4
	movlps %xmm4, -16 * SIZE(X)
	movhps %xmm4, -14 * SIZE(X)
	pshufd $0xb1, %xmm5, %xmm8
	mulps %xmm14, %xmm5
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm5
	movlps %xmm5, -12 * SIZE(X)
	movhps %xmm5, -10 * SIZE(X)
	pshufd $0xb1, %xmm6, %xmm8
	mulps %xmm14, %xmm6
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm6
	movlps %xmm6, -8 * SIZE(X)
	movhps %xmm6, -6 * SIZE(X)
	pshufd $0xb1, %xmm7, %xmm8
	mulps %xmm14, %xmm7
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm7
	movlps %xmm7, -4 * SIZE(X)
	movhps %xmm7, -2 * SIZE(X)
	subq $-32 * SIZE, X
	ALIGN_4
.L135:
	testq $8, M
	je .L136
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	pshufd $0xb1, %xmm2, %xmm8
	mulps %xmm14, %xmm2
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm2
	movlps %xmm2, -24 * SIZE(X)
	movhps %xmm2, -22 * SIZE(X)
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3
	pshufd $0xb1, %xmm3, %xmm8
	mulps %xmm14, %xmm3
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm3
	movlps %xmm3, -20 * SIZE(X)
	movhps %xmm3, -18 * SIZE(X)
	addq $16 * SIZE, X
	ALIGN_3
.L136:
	testq $4, M
	je .L137
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	pshufd $0xb1, %xmm1, %xmm8
	mulps %xmm14, %xmm1
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm1
	movlps %xmm1, -28 * SIZE(X)
	movhps %xmm1, -26 * SIZE(X)
	addq $8 * SIZE, X
	ALIGN_3
.L137:
	testq $2, M
	je .L138
	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	movhps %xmm0, -30 * SIZE(X)
	addq $4 * SIZE, X
	ALIGN_3
.L138:
	testq $1, M
	je .L999
	movsd -32 * SIZE(X), %xmm0
	pshufd $0xb1, %xmm0, %xmm8
	mulps %xmm14, %xmm0
	mulps %xmm15, %xmm8
	addps %xmm8, %xmm0
	movlps %xmm0, -32 * SIZE(X)
	ALIGN_3
#endif
.L999:
	xorq %rax, %rax
	RESTOREREGISTERS
	ret
	EPILOGUE