
zscal_sse2.S

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
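/* Added annotation (not part of the original source):
 * ZSCAL kernel: scales a double-precision complex vector in place,
 *     x[i] := alpha * x[i],   i = 0 .. M-1,   element stride INCX,
 * where alpha = ar + i*ai, so for each element (re, im):
 *     re' = ar*re - ai*im;   im' = ar*im + ai*re;
 * alpha == 0 takes a dedicated path that simply stores zeros.       */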
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #ifndef WINDOWS_ABI
  41. #define M ARG1
  42. #define X ARG4
  43. #define INCX ARG5
  44. #else
  45. #define M ARG1
  46. #define X ARG2
  47. #define INCX ARG3
  48. #endif
  49. #define XX %r10
  50. #define FLAG %r11
  51. #define I %rax
  52. #include "l1param.h"
  53. #if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE)
  54. #define USE_PSHUFD
  55. #else
  56. #define USE_PSHUFD_HALF
  57. #endif
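/* On the CPUs listed above every swapped {im, re} copy is built with
 * pshufd $0x4e in a register; on other CPUs (USE_PSHUFD_HALF) only
 * every other copy uses pshufd and the rest are reloaded from memory
 * with movsd/movhps, apparently to balance shuffle and load ports.  */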
  58. PROLOGUE
  59. PROFCODE
  60. #ifdef WINDOWS_ABI
  61. movaps %xmm3, %xmm0
  62. movsd 40(%rsp), %xmm1
  63. movq 48(%rsp), X
  64. movq 56(%rsp), INCX
  65. #endif
  66. SAVEREGISTERS
  67. salq $ZBASE_SHIFT, INCX
  68. xor FLAG, FLAG
  69. testq M, M
  70. jle .L999
  71. pxor %xmm15, %xmm15
  72. comisd %xmm0, %xmm15
  73. jne .L100
  74. comisd %xmm1, %xmm15
  75. jne .L100
  76. /* Alpha == ZERO */
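/* Nothing to multiply: overwrite the vector with zeros.  For unit
 * stride, aligned 16-byte stores are used; if X is only 8-byte
 * aligned, the leading real half is cleared here and FLAG makes the
 * matching trailing half get cleared at .L19.                       */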
  77. cmpq $2 * SIZE, INCX
  78. jne .L20
  79. /* INCX == 1 */
  80. testq $SIZE, X
  81. je .L05
  82. movsd %xmm15, 0 * SIZE(X)
  83. addq $SIZE, X
  84. movq $1, FLAG
  85. decq M
  86. jle .L19
  87. ALIGN_3
  88. .L05:
  89. movq M, I # I = M (loop counter)
  90. sarq $3, I
  91. jle .L12
  92. ALIGN_4
  93. .L11:
  94. #ifdef PREFETCHW
  95. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  96. #endif
  97. movaps %xmm15, 0 * SIZE(X)
  98. movaps %xmm15, 2 * SIZE(X)
  99. movaps %xmm15, 4 * SIZE(X)
  100. movaps %xmm15, 6 * SIZE(X)
  101. #ifdef PREFETCHW
  102. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  103. #endif
  104. movaps %xmm15, 8 * SIZE(X)
  105. movaps %xmm15, 10 * SIZE(X)
  106. movaps %xmm15, 12 * SIZE(X)
  107. movaps %xmm15, 14 * SIZE(X)
  108. addq $16 * SIZE, X
  109. decq I
  110. jg .L11
  111. ALIGN_4
  112. .L12:
  113. testq $4, M
  114. je .L13
  115. movaps %xmm15, 0 * SIZE(X)
  116. movaps %xmm15, 2 * SIZE(X)
  117. movaps %xmm15, 4 * SIZE(X)
  118. movaps %xmm15, 6 * SIZE(X)
  119. addq $8 * SIZE, X
  120. ALIGN_3
  121. .L13:
  122. testq $2, M
  123. je .L14
  124. movaps %xmm15, 0 * SIZE(X)
  125. movaps %xmm15, 2 * SIZE(X)
  126. addq $4 * SIZE, X
  127. ALIGN_3
  128. .L14:
  129. testq $1, M
  130. je .L19
  131. movaps %xmm15, 0 * SIZE(X)
  132. addq $2 * SIZE, X
  133. ALIGN_3
  134. .L19:
  135. testq $1, FLAG
  136. je .L999
  137. movsd %xmm15, 0 * SIZE(X)
  138. jmp .L999
  139. ALIGN_4
  140. /* incx != 1 */
  141. .L20:
  142. testq $SIZE, X
  143. jne .L30
  144. /* Aligned Mode */
  145. movq M, I # I = M (loop counter)
  146. sarq $2, I
  147. jle .L22
  148. ALIGN_4
  149. .L21:
  150. #ifdef PREFETCHW
  151. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  152. #endif
  153. movaps %xmm15, (X)
  154. addq INCX, X
  155. movaps %xmm15, (X)
  156. addq INCX, X
  157. movaps %xmm15, (X)
  158. addq INCX, X
  159. movaps %xmm15, (X)
  160. addq INCX, X
  161. decq I
  162. jg .L21
  163. ALIGN_4
  164. .L22:
  165. testq $3, M
  166. je .L999
  167. testq $2, M
  168. je .L23
  169. movaps %xmm15, (X)
  170. addq INCX, X
  171. movaps %xmm15, (X)
  172. addq INCX, X
  173. ALIGN_3
  174. .L23:
  175. testq $1, M
  176. je .L999
  177. movaps %xmm15, (X)
  178. jmp .L999
  179. ALIGN_4
  180. /* Unaligned Mode */
  181. .L30:
  182. movq M, I # I = M (loop counter)
  183. sarq $2, I
  184. jle .L32
  185. ALIGN_4
  186. .L31:
  187. #ifdef PREFETCHW
  188. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  189. #endif
  190. movlps %xmm15, 0 * SIZE(X)
  191. movlps %xmm15, 1 * SIZE(X)
  192. addq INCX, X
  193. movlps %xmm15, 0 * SIZE(X)
  194. movlps %xmm15, 1 * SIZE(X)
  195. addq INCX, X
  196. movlps %xmm15, 0 * SIZE(X)
  197. movlps %xmm15, 1 * SIZE(X)
  198. addq INCX, X
  199. movlps %xmm15, 0 * SIZE(X)
  200. movlps %xmm15, 1 * SIZE(X)
  201. addq INCX, X
  202. decq I
  203. jg .L31
  204. ALIGN_4
  205. .L32:
  206. testq $3, M
  207. je .L999
  208. testq $2, M
  209. je .L33
  210. movlps %xmm15, 0 * SIZE(X)
  211. movlps %xmm15, 1 * SIZE(X)
  212. addq INCX, X
  213. movlps %xmm15, 0 * SIZE(X)
  214. movlps %xmm15, 1 * SIZE(X)
  215. addq INCX, X
  216. ALIGN_3
  217. .L33:
  218. testq $1, M
  219. je .L999
  220. movlps %xmm15, 0 * SIZE(X)
  221. movlps %xmm15, 1 * SIZE(X)
  222. jmp .L999
  223. ALIGN_4
  224. /* Alpha != ZERO */
  225. .L100:
  226. testq $SIZE, X
  227. jne .L200
  228. #ifdef HAVE_SSE3
  229. movddup %xmm0, %xmm14
  230. #else
  231. pshufd $0x44, %xmm0, %xmm14
  232. #endif
  233. pxor %xmm15, %xmm15
  234. subsd %xmm1, %xmm15
  235. movlhps %xmm1, %xmm15
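/* xmm14 = { ar, ar },  xmm15 = { -ai, ai }.  Each element {re, im} is
 * scaled as ar*{re, im} + {-ai, ai}*{im, re}
 *         = { ar*re - ai*im, ar*im + ai*re }.                       */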
  236. cmpq $2 * SIZE, INCX
  237. jne .L120
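/* Bias X by +16*SIZE so one loop iteration (8 complex elements) can
 * address the current block with small negative displacements and
 * the next block with small positive ones.                          */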
  238. subq $-16 * SIZE, X
  239. movq M, I
  240. sarq $3, I
  241. jle .L115
  242. movaps -16 * SIZE(X), %xmm0
  243. movaps -14 * SIZE(X), %xmm1
  244. movaps -12 * SIZE(X), %xmm2
  245. movaps -10 * SIZE(X), %xmm3
  246. movaps -8 * SIZE(X), %xmm4
  247. movaps -6 * SIZE(X), %xmm5
  248. movaps -4 * SIZE(X), %xmm6
  249. movaps -2 * SIZE(X), %xmm7
  250. decq I
  251. jle .L112
  252. ALIGN_4
  253. .L111:
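/* Main unit-stride loop: 8 complex elements (16 doubles) per pass,
 * software-pipelined so the next block is loaded while the current
 * one is multiplied and stored.                                     */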
  254. #ifdef PREFETCHW
  255. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  256. #endif
  257. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  258. pshufd $0x4e, %xmm0, %xmm8
  259. #else
  260. movsd -15 * SIZE(X), %xmm8
  261. movhps -16 * SIZE(X), %xmm8
  262. #endif
  263. mulpd %xmm14, %xmm0
  264. mulpd %xmm15, %xmm8
  265. addpd %xmm8, %xmm0
  266. movaps %xmm0, -16 * SIZE(X)
  267. movaps 0 * SIZE(X), %xmm0
  268. #ifdef USE_PSHUFD
  269. pshufd $0x4e, %xmm1, %xmm8
  270. #else
  271. movsd -13 * SIZE(X), %xmm8
  272. movhps -14 * SIZE(X), %xmm8
  273. #endif
  274. mulpd %xmm14, %xmm1
  275. mulpd %xmm15, %xmm8
  276. addpd %xmm8, %xmm1
  277. movaps %xmm1, -14 * SIZE(X)
  278. movaps 2 * SIZE(X), %xmm1
  279. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  280. pshufd $0x4e, %xmm2, %xmm8
  281. #else
  282. movsd -11 * SIZE(X), %xmm8
  283. movhps -12 * SIZE(X), %xmm8
  284. #endif
  285. mulpd %xmm14, %xmm2
  286. mulpd %xmm15, %xmm8
  287. addpd %xmm8, %xmm2
  288. movaps %xmm2, -12 * SIZE(X)
  289. movaps 4 * SIZE(X), %xmm2
  290. #ifdef USE_PSHUFD
  291. pshufd $0x4e, %xmm3, %xmm8
  292. #else
  293. movsd -9 * SIZE(X), %xmm8
  294. movhps -10 * SIZE(X), %xmm8
  295. #endif
  296. mulpd %xmm14, %xmm3
  297. mulpd %xmm15, %xmm8
  298. addpd %xmm8, %xmm3
  299. movaps %xmm3, -10 * SIZE(X)
  300. movaps 6 * SIZE(X), %xmm3
  301. #ifdef PREFETCHW
  302. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  303. #endif
  304. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  305. pshufd $0x4e, %xmm4, %xmm8
  306. #else
  307. movsd -7 * SIZE(X), %xmm8
  308. movhps -8 * SIZE(X), %xmm8
  309. #endif
  310. mulpd %xmm14, %xmm4
  311. mulpd %xmm15, %xmm8
  312. addpd %xmm8, %xmm4
  313. movaps %xmm4, -8 * SIZE(X)
  314. movaps 8 * SIZE(X), %xmm4
  315. #ifdef USE_PSHUFD
  316. pshufd $0x4e, %xmm5, %xmm8
  317. #else
  318. movsd -5 * SIZE(X), %xmm8
  319. movhps -6 * SIZE(X), %xmm8
  320. #endif
  321. mulpd %xmm14, %xmm5
  322. mulpd %xmm15, %xmm8
  323. addpd %xmm8, %xmm5
  324. movaps %xmm5, -6 * SIZE(X)
  325. movaps 10 * SIZE(X), %xmm5
  326. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  327. pshufd $0x4e, %xmm6, %xmm8
  328. #else
  329. movsd -3 * SIZE(X), %xmm8
  330. movhps -4 * SIZE(X), %xmm8
  331. #endif
  332. mulpd %xmm14, %xmm6
  333. mulpd %xmm15, %xmm8
  334. addpd %xmm8, %xmm6
  335. movaps %xmm6, -4 * SIZE(X)
  336. movaps 12 * SIZE(X), %xmm6
  337. #ifdef USE_PSHUFD
  338. pshufd $0x4e, %xmm7, %xmm8
  339. #else
  340. movsd -1 * SIZE(X), %xmm8
  341. movhps -2 * SIZE(X), %xmm8
  342. #endif
  343. mulpd %xmm14, %xmm7
  344. mulpd %xmm15, %xmm8
  345. addpd %xmm8, %xmm7
  346. movaps %xmm7, -2 * SIZE(X)
  347. movaps 14 * SIZE(X), %xmm7
  348. subq $-16 * SIZE, X
  349. decq I
  350. jg .L111
  351. ALIGN_4
  352. .L112:
  353. pshufd $0x4e, %xmm0, %xmm8
  354. mulpd %xmm14, %xmm0
  355. mulpd %xmm15, %xmm8
  356. addpd %xmm8, %xmm0
  357. movaps %xmm0, -16 * SIZE(X)
  358. pshufd $0x4e, %xmm1, %xmm8
  359. mulpd %xmm14, %xmm1
  360. mulpd %xmm15, %xmm8
  361. addpd %xmm8, %xmm1
  362. movaps %xmm1, -14 * SIZE(X)
  363. pshufd $0x4e, %xmm2, %xmm8
  364. mulpd %xmm14, %xmm2
  365. mulpd %xmm15, %xmm8
  366. addpd %xmm8, %xmm2
  367. movaps %xmm2, -12 * SIZE(X)
  368. pshufd $0x4e, %xmm3, %xmm8
  369. mulpd %xmm14, %xmm3
  370. mulpd %xmm15, %xmm8
  371. addpd %xmm8, %xmm3
  372. movaps %xmm3, -10 * SIZE(X)
  373. pshufd $0x4e, %xmm4, %xmm8
  374. mulpd %xmm14, %xmm4
  375. mulpd %xmm15, %xmm8
  376. addpd %xmm8, %xmm4
  377. movaps %xmm4, -8 * SIZE(X)
  378. pshufd $0x4e, %xmm5, %xmm8
  379. mulpd %xmm14, %xmm5
  380. mulpd %xmm15, %xmm8
  381. addpd %xmm8, %xmm5
  382. movaps %xmm5, -6 * SIZE(X)
  383. pshufd $0x4e, %xmm6, %xmm8
  384. mulpd %xmm14, %xmm6
  385. mulpd %xmm15, %xmm8
  386. addpd %xmm8, %xmm6
  387. movaps %xmm6, -4 * SIZE(X)
  388. pshufd $0x4e, %xmm7, %xmm8
  389. mulpd %xmm14, %xmm7
  390. mulpd %xmm15, %xmm8
  391. addpd %xmm8, %xmm7
  392. movaps %xmm7, -2 * SIZE(X)
  393. subq $-16 * SIZE, X
  394. ALIGN_3
  395. .L115:
  396. testq $7, M
  397. je .L999
  398. testq $4, M
  399. je .L116
  400. movaps -16 * SIZE(X), %xmm0
  401. movaps -14 * SIZE(X), %xmm1
  402. pshufd $0x4e, %xmm0, %xmm8
  403. mulpd %xmm14, %xmm0
  404. mulpd %xmm15, %xmm8
  405. addpd %xmm8, %xmm0
  406. movaps %xmm0, -16 * SIZE(X)
  407. pshufd $0x4e, %xmm1, %xmm8
  408. mulpd %xmm14, %xmm1
  409. mulpd %xmm15, %xmm8
  410. addpd %xmm8, %xmm1
  411. movaps %xmm1, -14 * SIZE(X)
  412. movaps -12 * SIZE(X), %xmm2
  413. movaps -10 * SIZE(X), %xmm3
  414. pshufd $0x4e, %xmm2, %xmm8
  415. mulpd %xmm14, %xmm2
  416. mulpd %xmm15, %xmm8
  417. addpd %xmm8, %xmm2
  418. movaps %xmm2, -12 * SIZE(X)
  419. pshufd $0x4e, %xmm3, %xmm8
  420. mulpd %xmm14, %xmm3
  421. mulpd %xmm15, %xmm8
  422. addpd %xmm8, %xmm3
  423. movaps %xmm3, -10 * SIZE(X)
  424. addq $8 * SIZE, X
  425. ALIGN_3
  426. .L116:
  427. testq $2, M
  428. je .L117
  429. movaps -16 * SIZE(X), %xmm0
  430. movaps -14 * SIZE(X), %xmm1
  431. pshufd $0x4e, %xmm0, %xmm8
  432. mulpd %xmm14, %xmm0
  433. mulpd %xmm15, %xmm8
  434. addpd %xmm8, %xmm0
  435. movaps %xmm0, -16 * SIZE(X)
  436. pshufd $0x4e, %xmm1, %xmm8
  437. mulpd %xmm14, %xmm1
  438. mulpd %xmm15, %xmm8
  439. addpd %xmm8, %xmm1
  440. movaps %xmm1, -14 * SIZE(X)
  441. addq $4 * SIZE, X
  442. ALIGN_3
  443. .L117:
  444. testq $1, M
  445. je .L999
  446. movaps -16 * SIZE(X), %xmm0
  447. pshufd $0x4e, %xmm0, %xmm8
  448. mulpd %xmm14, %xmm0
  449. mulpd %xmm15, %xmm8
  450. addpd %xmm8, %xmm0
  451. movaps %xmm0, -16 * SIZE(X)
  452. jmp .L999
  453. ALIGN_3
  454. .L120:
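/* alpha != 0, non-unit stride, X 16-byte aligned.  XX trails X as the
 * store pointer; 8 elements per iteration, tails of 4/2/1 below.    */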
  455. movq X, XX
  456. movq M, I
  457. sarq $3, I
  458. jle .L125
  459. movaps (X), %xmm0
  460. addq INCX, X
  461. movaps (X), %xmm1
  462. addq INCX, X
  463. movaps (X), %xmm2
  464. addq INCX, X
  465. movaps (X), %xmm3
  466. addq INCX, X
  467. movaps (X), %xmm4
  468. addq INCX, X
  469. movaps (X), %xmm5
  470. addq INCX, X
  471. movaps (X), %xmm6
  472. addq INCX, X
  473. movaps (X), %xmm7
  474. addq INCX, X
  475. decq I
  476. jle .L122
  477. ALIGN_4
  478. .L121:
  479. #ifdef PREFETCHW
  480. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  481. #endif
  482. pshufd $0x4e, %xmm0, %xmm8
  483. mulpd %xmm14, %xmm0
  484. mulpd %xmm15, %xmm8
  485. addpd %xmm8, %xmm0
  486. movaps %xmm0, (XX)
  487. addq INCX, XX
  488. movaps (X), %xmm0
  489. addq INCX, X
  490. pshufd $0x4e, %xmm1, %xmm8
  491. mulpd %xmm14, %xmm1
  492. mulpd %xmm15, %xmm8
  493. addpd %xmm8, %xmm1
  494. movaps %xmm1, (XX)
  495. addq INCX, XX
  496. movaps (X), %xmm1
  497. addq INCX, X
  498. pshufd $0x4e, %xmm2, %xmm8
  499. mulpd %xmm14, %xmm2
  500. mulpd %xmm15, %xmm8
  501. addpd %xmm8, %xmm2
  502. movaps %xmm2, (XX)
  503. addq INCX, XX
  504. movaps (X), %xmm2
  505. addq INCX, X
  506. pshufd $0x4e, %xmm3, %xmm8
  507. mulpd %xmm14, %xmm3
  508. mulpd %xmm15, %xmm8
  509. addpd %xmm8, %xmm3
  510. movaps %xmm3, (XX)
  511. addq INCX, XX
  512. movaps (X), %xmm3
  513. addq INCX, X
  514. #ifdef PREFETCHW
  515. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  516. #endif
  517. pshufd $0x4e, %xmm4, %xmm8
  518. mulpd %xmm14, %xmm4
  519. mulpd %xmm15, %xmm8
  520. addpd %xmm8, %xmm4
  521. movaps %xmm4, (XX)
  522. addq INCX, XX
  523. movaps (X), %xmm4
  524. addq INCX, X
  525. pshufd $0x4e, %xmm5, %xmm8
  526. mulpd %xmm14, %xmm5
  527. mulpd %xmm15, %xmm8
  528. addpd %xmm8, %xmm5
  529. movaps %xmm5, (XX)
  530. addq INCX, XX
  531. movaps (X), %xmm5
  532. addq INCX, X
  533. pshufd $0x4e, %xmm6, %xmm8
  534. mulpd %xmm14, %xmm6
  535. mulpd %xmm15, %xmm8
  536. addpd %xmm8, %xmm6
  537. movaps %xmm6, (XX)
  538. addq INCX, XX
  539. movaps (X), %xmm6
  540. addq INCX, X
  541. pshufd $0x4e, %xmm7, %xmm8
  542. mulpd %xmm14, %xmm7
  543. mulpd %xmm15, %xmm8
  544. addpd %xmm8, %xmm7
  545. movaps %xmm7, (XX)
  546. addq INCX, XX
  547. movaps (X), %xmm7
  548. addq INCX, X
  549. decq I
  550. jg .L121
  551. ALIGN_4
  552. .L122:
  553. pshufd $0x4e, %xmm0, %xmm8
  554. mulpd %xmm14, %xmm0
  555. mulpd %xmm15, %xmm8
  556. addpd %xmm8, %xmm0
  557. movaps %xmm0, (XX)
  558. addq INCX, XX
  559. pshufd $0x4e, %xmm1, %xmm8
  560. mulpd %xmm14, %xmm1
  561. mulpd %xmm15, %xmm8
  562. addpd %xmm8, %xmm1
  563. movaps %xmm1, (XX)
  564. addq INCX, XX
  565. pshufd $0x4e, %xmm2, %xmm8
  566. mulpd %xmm14, %xmm2
  567. mulpd %xmm15, %xmm8
  568. addpd %xmm8, %xmm2
  569. movaps %xmm2, (XX)
  570. addq INCX, XX
  571. pshufd $0x4e, %xmm3, %xmm8
  572. mulpd %xmm14, %xmm3
  573. mulpd %xmm15, %xmm8
  574. addpd %xmm8, %xmm3
  575. movaps %xmm3, (XX)
  576. addq INCX, XX
  577. pshufd $0x4e, %xmm4, %xmm8
  578. mulpd %xmm14, %xmm4
  579. mulpd %xmm15, %xmm8
  580. addpd %xmm8, %xmm4
  581. movaps %xmm4, (XX)
  582. addq INCX, XX
  583. pshufd $0x4e, %xmm5, %xmm8
  584. mulpd %xmm14, %xmm5
  585. mulpd %xmm15, %xmm8
  586. addpd %xmm8, %xmm5
  587. movaps %xmm5, (XX)
  588. addq INCX, XX
  589. pshufd $0x4e, %xmm6, %xmm8
  590. mulpd %xmm14, %xmm6
  591. mulpd %xmm15, %xmm8
  592. addpd %xmm8, %xmm6
  593. movaps %xmm6, (XX)
  594. addq INCX, XX
  595. pshufd $0x4e, %xmm7, %xmm8
  596. mulpd %xmm14, %xmm7
  597. mulpd %xmm15, %xmm8
  598. addpd %xmm8, %xmm7
  599. movaps %xmm7, (XX)
  600. addq INCX, XX
  601. ALIGN_3
  602. .L125:
  603. testq $7, M
  604. je .L999
  605. testq $4, M
  606. je .L126
  607. movaps (X), %xmm0
  608. addq INCX, X
  609. movaps (X), %xmm1
  610. addq INCX, X
  611. pshufd $0x4e, %xmm0, %xmm8
  612. mulpd %xmm14, %xmm0
  613. mulpd %xmm15, %xmm8
  614. addpd %xmm8, %xmm0
  615. movaps %xmm0, (XX)
  616. addq INCX, XX
  617. pshufd $0x4e, %xmm1, %xmm8
  618. mulpd %xmm14, %xmm1
  619. mulpd %xmm15, %xmm8
  620. addpd %xmm8, %xmm1
  621. movaps %xmm1, (XX)
  622. addq INCX, XX
  623. movaps (X), %xmm2
  624. addq INCX, X
  625. movaps (X), %xmm3
  626. addq INCX, X
  627. pshufd $0x4e, %xmm2, %xmm8
  628. mulpd %xmm14, %xmm2
  629. mulpd %xmm15, %xmm8
  630. addpd %xmm8, %xmm2
  631. movaps %xmm2, (XX)
  632. addq INCX, XX
  633. pshufd $0x4e, %xmm3, %xmm8
  634. mulpd %xmm14, %xmm3
  635. mulpd %xmm15, %xmm8
  636. addpd %xmm8, %xmm3
  637. movaps %xmm3, (XX)
  638. addq INCX, XX
  639. ALIGN_3
  640. .L126:
  641. testq $2, M
  642. je .L127
  643. movaps (X), %xmm0
  644. addq INCX, X
  645. movaps (X), %xmm1
  646. addq INCX, X
  647. pshufd $0x4e, %xmm0, %xmm8
  648. mulpd %xmm14, %xmm0
  649. mulpd %xmm15, %xmm8
  650. addpd %xmm8, %xmm0
  651. movaps %xmm0, (XX)
  652. addq INCX, XX
  653. pshufd $0x4e, %xmm1, %xmm8
  654. mulpd %xmm14, %xmm1
  655. mulpd %xmm15, %xmm8
  656. addpd %xmm8, %xmm1
  657. movaps %xmm1, (XX)
  658. addq INCX, XX
  659. ALIGN_3
  660. .L127:
  661. testq $1, M
  662. je .L999
  663. movaps (X), %xmm0
  664. pshufd $0x4e, %xmm0, %xmm8
  665. mulpd %xmm14, %xmm0
  666. mulpd %xmm15, %xmm8
  667. addpd %xmm8, %xmm0
  668. movaps %xmm0, (XX)
  669. jmp .L999
  670. ALIGN_3
  671. .L200:
  672. cmpq $2 * SIZE, INCX
  673. jne .L220
  674. #if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE)
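/* X is only 8-byte aligned.  With ALIGNED_ACCESS, handle the leading
 * real half with scalar ops, then keep using aligned 16-byte
 * loads/stores that straddle element boundaries ({im_k, re_k+1}),
 * recombining neighbours with SHUFPD_1; the last half element is
 * finished at .L208.                                                */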
  675. movddup %xmm0, %xmm14
  676. pxor %xmm15, %xmm15
  677. subsd %xmm1, %xmm15
  678. movlhps %xmm1, %xmm15
  679. shufpd $1, %xmm15, %xmm15
  680. movhps 0 * SIZE(X), %xmm0
  681. movaps 1 * SIZE(X), %xmm1
  682. subq $-16 * SIZE, X
  683. unpckhpd %xmm0, %xmm0
  684. mulsd %xmm14, %xmm0
  685. movaps %xmm1, %xmm8
  686. mulsd %xmm15, %xmm8
  687. subsd %xmm8, %xmm0
  688. movlps %xmm0, -16 * SIZE(X)
  689. decq M
  690. movq M, I
  691. sarq $3, I
  692. jle .L205
  693. movaps -13 * SIZE(X), %xmm2
  694. movaps -11 * SIZE(X), %xmm3
  695. movaps -9 * SIZE(X), %xmm4
  696. decq I
  697. jle .L202
  698. ALIGN_4
  699. .L201:
  700. #ifdef PREFETCHW
  701. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  702. #endif
  703. movaps %xmm1, %xmm8
  704. SHUFPD_1 %xmm2, %xmm0
  705. mulpd %xmm14, %xmm8
  706. mulpd %xmm15, %xmm0
  707. addpd %xmm8, %xmm0
  708. movaps %xmm0, -15 * SIZE(X)
  709. movaps -7 * SIZE(X), %xmm5
  710. movaps %xmm2, %xmm8
  711. SHUFPD_1 %xmm3, %xmm1
  712. mulpd %xmm14, %xmm8
  713. mulpd %xmm15, %xmm1
  714. addpd %xmm8, %xmm1
  715. movaps %xmm1, -13 * SIZE(X)
  716. movaps -5 * SIZE(X), %xmm6
  717. movaps %xmm3, %xmm8
  718. SHUFPD_1 %xmm4, %xmm2
  719. mulpd %xmm14, %xmm8
  720. mulpd %xmm15, %xmm2
  721. addpd %xmm8, %xmm2
  722. movaps %xmm2, -11 * SIZE(X)
  723. movaps -3 * SIZE(X), %xmm7
  724. movaps %xmm4, %xmm8
  725. SHUFPD_1 %xmm5, %xmm3
  726. mulpd %xmm14, %xmm8
  727. mulpd %xmm15, %xmm3
  728. addpd %xmm8, %xmm3
  729. movaps %xmm3, -9 * SIZE(X)
  730. movaps -1 * SIZE(X), %xmm0
  731. #ifdef PREFETCHW
  732. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  733. #endif
  734. movaps %xmm5, %xmm8
  735. SHUFPD_1 %xmm6, %xmm4
  736. mulpd %xmm14, %xmm8
  737. mulpd %xmm15, %xmm4
  738. addpd %xmm8, %xmm4
  739. movaps %xmm4, -7 * SIZE(X)
  740. movaps 1 * SIZE(X), %xmm1
  741. movaps %xmm6, %xmm8
  742. SHUFPD_1 %xmm7, %xmm5
  743. mulpd %xmm14, %xmm8
  744. mulpd %xmm15, %xmm5
  745. addpd %xmm8, %xmm5
  746. movaps %xmm5, -5 * SIZE(X)
  747. movaps 3 * SIZE(X), %xmm2
  748. movaps %xmm7, %xmm8
  749. SHUFPD_1 %xmm0, %xmm6
  750. mulpd %xmm14, %xmm8
  751. mulpd %xmm15, %xmm6
  752. addpd %xmm8, %xmm6
  753. movaps %xmm6, -3 * SIZE(X)
  754. movaps 5 * SIZE(X), %xmm3
  755. movaps %xmm0, %xmm8
  756. SHUFPD_1 %xmm1, %xmm7
  757. mulpd %xmm14, %xmm8
  758. mulpd %xmm15, %xmm7
  759. addpd %xmm8, %xmm7
  760. movaps %xmm7, -1 * SIZE(X)
  761. movaps 7 * SIZE(X), %xmm4
  762. subq $-16 * SIZE, X
  763. decq I
  764. jg .L201
  765. ALIGN_4
  766. .L202:
  767. movaps %xmm1, %xmm8
  768. SHUFPD_1 %xmm2, %xmm0
  769. mulpd %xmm14, %xmm8
  770. mulpd %xmm15, %xmm0
  771. addpd %xmm8, %xmm0
  772. movaps %xmm0, -15 * SIZE(X)
  773. movaps -7 * SIZE(X), %xmm5
  774. movaps %xmm2, %xmm8
  775. SHUFPD_1 %xmm3, %xmm1
  776. mulpd %xmm14, %xmm8
  777. mulpd %xmm15, %xmm1
  778. addpd %xmm8, %xmm1
  779. movaps %xmm1, -13 * SIZE(X)
  780. movaps -5 * SIZE(X), %xmm6
  781. movaps %xmm3, %xmm8
  782. SHUFPD_1 %xmm4, %xmm2
  783. mulpd %xmm14, %xmm8
  784. mulpd %xmm15, %xmm2
  785. addpd %xmm8, %xmm2
  786. movaps %xmm2, -11 * SIZE(X)
  787. movaps -3 * SIZE(X), %xmm7
  788. movaps %xmm4, %xmm8
  789. SHUFPD_1 %xmm5, %xmm3
  790. mulpd %xmm14, %xmm8
  791. mulpd %xmm15, %xmm3
  792. addpd %xmm8, %xmm3
  793. movaps %xmm3, -9 * SIZE(X)
  794. movaps -1 * SIZE(X), %xmm0
  795. movaps %xmm5, %xmm8
  796. SHUFPD_1 %xmm6, %xmm4
  797. mulpd %xmm14, %xmm8
  798. mulpd %xmm15, %xmm4
  799. addpd %xmm8, %xmm4
  800. movaps %xmm4, -7 * SIZE(X)
  801. movaps 1 * SIZE(X), %xmm1
  802. movaps %xmm6, %xmm8
  803. SHUFPD_1 %xmm7, %xmm5
  804. mulpd %xmm14, %xmm8
  805. mulpd %xmm15, %xmm5
  806. addpd %xmm8, %xmm5
  807. movaps %xmm5, -5 * SIZE(X)
  808. movaps %xmm7, %xmm8
  809. SHUFPD_1 %xmm0, %xmm6
  810. mulpd %xmm14, %xmm8
  811. mulpd %xmm15, %xmm6
  812. addpd %xmm8, %xmm6
  813. movaps %xmm6, -3 * SIZE(X)
  814. movaps %xmm0, %xmm8
  815. SHUFPD_1 %xmm1, %xmm7
  816. mulpd %xmm14, %xmm8
  817. mulpd %xmm15, %xmm7
  818. addpd %xmm8, %xmm7
  819. movaps %xmm7, -1 * SIZE(X)
  820. subq $-16 * SIZE, X
  821. ALIGN_3
  822. .L205:
  823. testq $4, M
  824. je .L206
  825. movaps -13 * SIZE(X), %xmm2
  826. movaps %xmm1, %xmm8
  827. SHUFPD_1 %xmm2, %xmm0
  828. mulpd %xmm14, %xmm8
  829. mulpd %xmm15, %xmm0
  830. addpd %xmm8, %xmm0
  831. movaps %xmm0, -15 * SIZE(X)
  832. movaps -11 * SIZE(X), %xmm3
  833. movaps %xmm2, %xmm8
  834. SHUFPD_1 %xmm3, %xmm1
  835. mulpd %xmm14, %xmm8
  836. mulpd %xmm15, %xmm1
  837. addpd %xmm8, %xmm1
  838. movaps %xmm1, -13 * SIZE(X)
  839. movaps -9 * SIZE(X), %xmm0
  840. movaps %xmm3, %xmm8
  841. SHUFPD_1 %xmm0, %xmm2
  842. mulpd %xmm14, %xmm8
  843. mulpd %xmm15, %xmm2
  844. addpd %xmm8, %xmm2
  845. movaps %xmm2, -11 * SIZE(X)
  846. movaps -7 * SIZE(X), %xmm1
  847. movaps %xmm0, %xmm8
  848. SHUFPD_1 %xmm1, %xmm3
  849. mulpd %xmm14, %xmm8
  850. mulpd %xmm15, %xmm3
  851. addpd %xmm8, %xmm3
  852. movaps %xmm3, -9 * SIZE(X)
  853. addq $8 * SIZE, X
  854. ALIGN_3
  855. .L206:
  856. testq $2, M
  857. je .L207
  858. movaps -13 * SIZE(X), %xmm2
  859. movaps %xmm1, %xmm8
  860. SHUFPD_1 %xmm2, %xmm0
  861. mulpd %xmm14, %xmm8
  862. mulpd %xmm15, %xmm0
  863. addpd %xmm8, %xmm0
  864. movaps %xmm0, -15 * SIZE(X)
  865. movaps -11 * SIZE(X), %xmm3
  866. movaps %xmm2, %xmm8
  867. SHUFPD_1 %xmm3, %xmm1
  868. mulpd %xmm14, %xmm8
  869. mulpd %xmm15, %xmm1
  870. addpd %xmm8, %xmm1
  871. movaps %xmm1, -13 * SIZE(X)
  872. movaps %xmm2, %xmm0
  873. movaps %xmm3, %xmm1
  874. addq $4 * SIZE, X
  875. ALIGN_3
  876. .L207:
  877. testq $1, M
  878. je .L208
  879. movaps -13 * SIZE(X), %xmm2
  880. movaps %xmm1, %xmm8
  881. SHUFPD_1 %xmm2, %xmm0
  882. mulpd %xmm14, %xmm8
  883. mulpd %xmm15, %xmm0
  884. addpd %xmm8, %xmm0
  885. movaps %xmm0, -15 * SIZE(X)
  886. movaps %xmm1, %xmm0
  887. movaps %xmm2, %xmm1
  888. addq $2 * SIZE, X
  889. ALIGN_3
  890. .L208:
  891. unpckhpd %xmm0, %xmm0
  892. mulsd %xmm14, %xmm1
  893. mulsd %xmm15, %xmm0
  894. addsd %xmm1, %xmm0
  895. movlps %xmm0, -15 * SIZE(X)
  896. jmp .L999
  897. ALIGN_3
  898. #else
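/* Generic misaligned path: each element is loaded and stored as two
 * unaligned halves with movsd/movhps, otherwise identical to the
 * aligned unit-stride loop.                                         */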
  899. movddup %xmm0, %xmm14
  900. pxor %xmm15, %xmm15
  901. subsd %xmm1, %xmm15
  902. movlhps %xmm1, %xmm15
  903. subq $-16 * SIZE, X
  904. movq M, I
  905. sarq $3, I
  906. jle .L205
  907. movsd -16 * SIZE(X), %xmm0
  908. movhps -15 * SIZE(X), %xmm0
  909. movsd -14 * SIZE(X), %xmm1
  910. movhps -13 * SIZE(X), %xmm1
  911. movsd -12 * SIZE(X), %xmm2
  912. movhps -11 * SIZE(X), %xmm2
  913. movsd -10 * SIZE(X), %xmm3
  914. movhps -9 * SIZE(X), %xmm3
  915. movsd -8 * SIZE(X), %xmm4
  916. movhps -7 * SIZE(X), %xmm4
  917. movsd -6 * SIZE(X), %xmm5
  918. movhps -5 * SIZE(X), %xmm5
  919. movsd -4 * SIZE(X), %xmm6
  920. movhps -3 * SIZE(X), %xmm6
  921. movsd -2 * SIZE(X), %xmm7
  922. movhps -1 * SIZE(X), %xmm7
  923. decq I
  924. jle .L202
  925. ALIGN_4
  926. .L201:
  927. #ifdef PREFETCHW
  928. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  929. #endif
  930. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  931. pshufd $0x4e, %xmm0, %xmm8
  932. #else
  933. movsd -15 * SIZE(X), %xmm8
  934. movhps -16 * SIZE(X), %xmm8
  935. #endif
  936. mulpd %xmm14, %xmm0
  937. mulpd %xmm15, %xmm8
  938. addpd %xmm8, %xmm0
  939. movlps %xmm0, -16 * SIZE(X)
  940. movhps %xmm0, -15 * SIZE(X)
  941. movsd 0 * SIZE(X), %xmm0
  942. movhps 1 * SIZE(X), %xmm0
  943. #ifdef USE_PSHUFD
  944. pshufd $0x4e, %xmm1, %xmm8
  945. #else
  946. movsd -13 * SIZE(X), %xmm8
  947. movhps -14 * SIZE(X), %xmm8
  948. #endif
  949. mulpd %xmm14, %xmm1
  950. mulpd %xmm15, %xmm8
  951. addpd %xmm8, %xmm1
  952. movlps %xmm1, -14 * SIZE(X)
  953. movhps %xmm1, -13 * SIZE(X)
  954. movsd 2 * SIZE(X), %xmm1
  955. movhps 3 * SIZE(X), %xmm1
  956. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  957. pshufd $0x4e, %xmm2, %xmm8
  958. #else
  959. movsd -11 * SIZE(X), %xmm8
  960. movhps -12 * SIZE(X), %xmm8
  961. #endif
  962. mulpd %xmm14, %xmm2
  963. mulpd %xmm15, %xmm8
  964. addpd %xmm8, %xmm2
  965. movlps %xmm2, -12 * SIZE(X)
  966. movhps %xmm2, -11 * SIZE(X)
  967. movsd 4 * SIZE(X), %xmm2
  968. movhps 5 * SIZE(X), %xmm2
  969. #ifdef USE_PSHUFD
  970. pshufd $0x4e, %xmm3, %xmm8
  971. #else
  972. movsd -9 * SIZE(X), %xmm8
  973. movhps -10 * SIZE(X), %xmm8
  974. #endif
  975. mulpd %xmm14, %xmm3
  976. mulpd %xmm15, %xmm8
  977. addpd %xmm8, %xmm3
  978. movlps %xmm3, -10 * SIZE(X)
  979. movhps %xmm3, -9 * SIZE(X)
  980. movsd 6 * SIZE(X), %xmm3
  981. movhps 7 * SIZE(X), %xmm3
  982. #ifdef PREFETCHW
  983. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  984. #endif
  985. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  986. pshufd $0x4e, %xmm4, %xmm8
  987. #else
  988. movsd -7 * SIZE(X), %xmm8
  989. movhps -8 * SIZE(X), %xmm8
  990. #endif
  991. mulpd %xmm14, %xmm4
  992. mulpd %xmm15, %xmm8
  993. addpd %xmm8, %xmm4
  994. movlps %xmm4, -8 * SIZE(X)
  995. movhps %xmm4, -7 * SIZE(X)
  996. movsd 8 * SIZE(X), %xmm4
  997. movhps 9 * SIZE(X), %xmm4
  998. #ifdef USE_PSHUFD
  999. pshufd $0x4e, %xmm5, %xmm8
  1000. #else
  1001. movsd -5 * SIZE(X), %xmm8
  1002. movhps -6 * SIZE(X), %xmm8
  1003. #endif
  1004. mulpd %xmm14, %xmm5
  1005. mulpd %xmm15, %xmm8
  1006. addpd %xmm8, %xmm5
  1007. movlps %xmm5, -6 * SIZE(X)
  1008. movhps %xmm5, -5 * SIZE(X)
  1009. movsd 10 * SIZE(X), %xmm5
  1010. movhps 11 * SIZE(X), %xmm5
  1011. #if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF)
  1012. pshufd $0x4e, %xmm6, %xmm8
  1013. #else
  1014. movsd -3 * SIZE(X), %xmm8
  1015. movhps -4 * SIZE(X), %xmm8
  1016. #endif
  1017. mulpd %xmm14, %xmm6
  1018. mulpd %xmm15, %xmm8
  1019. addpd %xmm8, %xmm6
  1020. movlps %xmm6, -4 * SIZE(X)
  1021. movhps %xmm6, -3 * SIZE(X)
  1022. movsd 12 * SIZE(X), %xmm6
  1023. movhps 13 * SIZE(X), %xmm6
  1024. #ifdef USE_PSHUFD
  1025. pshufd $0x4e, %xmm7, %xmm8
  1026. #else
  1027. movsd -1 * SIZE(X), %xmm8
  1028. movhps -2 * SIZE(X), %xmm8
  1029. #endif
  1030. mulpd %xmm14, %xmm7
  1031. mulpd %xmm15, %xmm8
  1032. addpd %xmm8, %xmm7
  1033. movlps %xmm7, -2 * SIZE(X)
  1034. movhps %xmm7, -1 * SIZE(X)
  1035. movsd 14 * SIZE(X), %xmm7
  1036. movhps 15 * SIZE(X), %xmm7
  1037. subq $-16 * SIZE, X
  1038. decq I
  1039. jg .L201
  1040. ALIGN_4
  1041. .L202:
  1042. pshufd $0x4e, %xmm0, %xmm8
  1043. mulpd %xmm14, %xmm0
  1044. mulpd %xmm15, %xmm8
  1045. addpd %xmm8, %xmm0
  1046. movlps %xmm0, -16 * SIZE(X)
  1047. movhps %xmm0, -15 * SIZE(X)
  1048. pshufd $0x4e, %xmm1, %xmm8
  1049. mulpd %xmm14, %xmm1
  1050. mulpd %xmm15, %xmm8
  1051. addpd %xmm8, %xmm1
  1052. movlps %xmm1, -14 * SIZE(X)
  1053. movhps %xmm1, -13 * SIZE(X)
  1054. pshufd $0x4e, %xmm2, %xmm8
  1055. mulpd %xmm14, %xmm2
  1056. mulpd %xmm15, %xmm8
  1057. addpd %xmm8, %xmm2
  1058. movlps %xmm2, -12 * SIZE(X)
  1059. movhps %xmm2, -11 * SIZE(X)
  1060. pshufd $0x4e, %xmm3, %xmm8
  1061. mulpd %xmm14, %xmm3
  1062. mulpd %xmm15, %xmm8
  1063. addpd %xmm8, %xmm3
  1064. movlps %xmm3, -10 * SIZE(X)
  1065. movhps %xmm3, -9 * SIZE(X)
  1066. pshufd $0x4e, %xmm4, %xmm8
  1067. mulpd %xmm14, %xmm4
  1068. mulpd %xmm15, %xmm8
  1069. addpd %xmm8, %xmm4
  1070. movlps %xmm4, -8 * SIZE(X)
  1071. movhps %xmm4, -7 * SIZE(X)
  1072. pshufd $0x4e, %xmm5, %xmm8
  1073. mulpd %xmm14, %xmm5
  1074. mulpd %xmm15, %xmm8
  1075. addpd %xmm8, %xmm5
  1076. movlps %xmm5, -6 * SIZE(X)
  1077. movhps %xmm5, -5 * SIZE(X)
  1078. pshufd $0x4e, %xmm6, %xmm8
  1079. mulpd %xmm14, %xmm6
  1080. mulpd %xmm15, %xmm8
  1081. addpd %xmm8, %xmm6
  1082. movlps %xmm6, -4 * SIZE(X)
  1083. movhps %xmm6, -3 * SIZE(X)
  1084. pshufd $0x4e, %xmm7, %xmm8
  1085. mulpd %xmm14, %xmm7
  1086. mulpd %xmm15, %xmm8
  1087. addpd %xmm8, %xmm7
  1088. movlps %xmm7, -2 * SIZE(X)
  1089. movhps %xmm7, -1 * SIZE(X)
  1090. subq $-16 * SIZE, X
  1091. ALIGN_3
  1092. .L205:
  1093. testq $7, M
  1094. je .L999
  1095. testq $4, M
  1096. je .L206
  1097. movsd -16 * SIZE(X), %xmm0
  1098. movhps -15 * SIZE(X), %xmm0
  1099. movsd -14 * SIZE(X), %xmm1
  1100. movhps -13 * SIZE(X), %xmm1
  1101. pshufd $0x4e, %xmm0, %xmm8
  1102. mulpd %xmm14, %xmm0
  1103. mulpd %xmm15, %xmm8
  1104. addpd %xmm8, %xmm0
  1105. movlps %xmm0, -16 * SIZE(X)
  1106. movhps %xmm0, -15 * SIZE(X)
  1107. pshufd $0x4e, %xmm1, %xmm8
  1108. mulpd %xmm14, %xmm1
  1109. mulpd %xmm15, %xmm8
  1110. addpd %xmm8, %xmm1
  1111. movlps %xmm1, -14 * SIZE(X)
  1112. movhps %xmm1, -13 * SIZE(X)
  1113. movsd -12 * SIZE(X), %xmm2
  1114. movhps -11 * SIZE(X), %xmm2
  1115. movsd -10 * SIZE(X), %xmm3
  1116. movhps -9 * SIZE(X), %xmm3
  1117. pshufd $0x4e, %xmm2, %xmm8
  1118. mulpd %xmm14, %xmm2
  1119. mulpd %xmm15, %xmm8
  1120. addpd %xmm8, %xmm2
  1121. movlps %xmm2, -12 * SIZE(X)
  1122. movhps %xmm2, -11 * SIZE(X)
  1123. pshufd $0x4e, %xmm3, %xmm8
  1124. mulpd %xmm14, %xmm3
  1125. mulpd %xmm15, %xmm8
  1126. addpd %xmm8, %xmm3
  1127. movlps %xmm3, -10 * SIZE(X)
  1128. movhps %xmm3, -9 * SIZE(X)
  1129. addq $8 * SIZE, X
  1130. ALIGN_3
  1131. .L206:
  1132. testq $2, M
  1133. je .L207
  1134. movsd -16 * SIZE(X), %xmm0
  1135. movhps -15 * SIZE(X), %xmm0
  1136. pshufd $0x4e, %xmm0, %xmm8
  1137. mulpd %xmm14, %xmm0
  1138. mulpd %xmm15, %xmm8
  1139. addpd %xmm8, %xmm0
  1140. movlps %xmm0, -16 * SIZE(X)
  1141. movhps %xmm0, -15 * SIZE(X)
  1142. movsd -14 * SIZE(X), %xmm1
  1143. movhps -13 * SIZE(X), %xmm1
  1144. pshufd $0x4e, %xmm1, %xmm8
  1145. mulpd %xmm14, %xmm1
  1146. mulpd %xmm15, %xmm8
  1147. addpd %xmm8, %xmm1
  1148. movlps %xmm1, -14 * SIZE(X)
  1149. movhps %xmm1, -13 * SIZE(X)
  1150. addq $4 * SIZE, X
  1151. ALIGN_3
  1152. .L207:
  1153. testq $1, M
  1154. je .L999
  1155. movsd -16 * SIZE(X), %xmm0
  1156. movhps -15 * SIZE(X), %xmm0
  1157. pshufd $0x4e, %xmm0, %xmm8
  1158. mulpd %xmm14, %xmm0
  1159. mulpd %xmm15, %xmm8
  1160. addpd %xmm8, %xmm0
  1161. movlps %xmm0, -16 * SIZE(X)
  1162. movhps %xmm0, -15 * SIZE(X)
  1163. jmp .L999
  1164. ALIGN_3
  1165. #endif
  1166. .L220:
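/* alpha != 0, non-unit stride, X only 8-byte aligned: elements are
 * moved in halves with movsd/movhps; XX trails X as the store
 * pointer, 8 per iteration with 4/2/1 tails.                        */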
  1167. movddup %xmm0, %xmm14
  1168. pxor %xmm15, %xmm15
  1169. subsd %xmm1, %xmm15
  1170. movlhps %xmm1, %xmm15
  1171. movq X, XX
  1172. movq M, I
  1173. sarq $3, I
  1174. jle .L225
  1175. movsd 0 * SIZE(X), %xmm0
  1176. movhps 1 * SIZE(X), %xmm0
  1177. addq INCX, X
  1178. movsd 0 * SIZE(X), %xmm1
  1179. movhps 1 * SIZE(X), %xmm1
  1180. addq INCX, X
  1181. movsd 0 * SIZE(X), %xmm2
  1182. movhps 1 * SIZE(X), %xmm2
  1183. addq INCX, X
  1184. movsd 0 * SIZE(X), %xmm3
  1185. movhps 1 * SIZE(X), %xmm3
  1186. addq INCX, X
  1187. movsd 0 * SIZE(X), %xmm4
  1188. movhps 1 * SIZE(X), %xmm4
  1189. addq INCX, X
  1190. movsd 0 * SIZE(X), %xmm5
  1191. movhps 1 * SIZE(X), %xmm5
  1192. addq INCX, X
  1193. movsd 0 * SIZE(X), %xmm6
  1194. movhps 1 * SIZE(X), %xmm6
  1195. addq INCX, X
  1196. movsd 0 * SIZE(X), %xmm7
  1197. movhps 1 * SIZE(X), %xmm7
  1198. addq INCX, X
  1199. decq I
  1200. jle .L222
  1201. ALIGN_4
  1202. .L221:
  1203. #ifdef PREFETCHW
  1204. PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X)
  1205. #endif
  1206. pshufd $0x4e, %xmm0, %xmm8
  1207. mulpd %xmm14, %xmm0
  1208. mulpd %xmm15, %xmm8
  1209. addpd %xmm8, %xmm0
  1210. movlps %xmm0, 0 * SIZE(XX)
  1211. movhps %xmm0, 1 * SIZE(XX)
  1212. addq INCX, XX
  1213. movsd 0 * SIZE(X), %xmm0
  1214. movhps 1 * SIZE(X), %xmm0
  1215. addq INCX, X
  1216. pshufd $0x4e, %xmm1, %xmm8
  1217. mulpd %xmm14, %xmm1
  1218. mulpd %xmm15, %xmm8
  1219. addpd %xmm8, %xmm1
  1220. movlps %xmm1, 0 * SIZE(XX)
  1221. movhps %xmm1, 1 * SIZE(XX)
  1222. addq INCX, XX
  1223. movsd 0 * SIZE(X), %xmm1
  1224. movhps 1 * SIZE(X), %xmm1
  1225. addq INCX, X
  1226. pshufd $0x4e, %xmm2, %xmm8
  1227. mulpd %xmm14, %xmm2
  1228. mulpd %xmm15, %xmm8
  1229. addpd %xmm8, %xmm2
  1230. movlps %xmm2, 0 * SIZE(XX)
  1231. movhps %xmm2, 1 * SIZE(XX)
  1232. addq INCX, XX
  1233. movsd 0 * SIZE(X), %xmm2
  1234. movhps 1 * SIZE(X), %xmm2
  1235. addq INCX, X
  1236. pshufd $0x4e, %xmm3, %xmm8
  1237. mulpd %xmm14, %xmm3
  1238. mulpd %xmm15, %xmm8
  1239. addpd %xmm8, %xmm3
  1240. movlps %xmm3, 0 * SIZE(XX)
  1241. movhps %xmm3, 1 * SIZE(XX)
  1242. addq INCX, XX
  1243. movsd 0 * SIZE(X), %xmm3
  1244. movhps 1 * SIZE(X), %xmm3
  1245. addq INCX, X
  1246. #ifdef PREFETCHW
  1247. PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X)
  1248. #endif
  1249. pshufd $0x4e, %xmm4, %xmm8
  1250. mulpd %xmm14, %xmm4
  1251. mulpd %xmm15, %xmm8
  1252. addpd %xmm8, %xmm4
  1253. movlps %xmm4, 0 * SIZE(XX)
  1254. movhps %xmm4, 1 * SIZE(XX)
  1255. addq INCX, XX
  1256. movsd 0 * SIZE(X), %xmm4
  1257. movhps 1 * SIZE(X), %xmm4
  1258. addq INCX, X
  1259. pshufd $0x4e, %xmm5, %xmm8
  1260. mulpd %xmm14, %xmm5
  1261. mulpd %xmm15, %xmm8
  1262. addpd %xmm8, %xmm5
  1263. movlps %xmm5, 0 * SIZE(XX)
  1264. movhps %xmm5, 1 * SIZE(XX)
  1265. addq INCX, XX
  1266. movsd 0 * SIZE(X), %xmm5
  1267. movhps 1 * SIZE(X), %xmm5
  1268. addq INCX, X
  1269. pshufd $0x4e, %xmm6, %xmm8
  1270. mulpd %xmm14, %xmm6
  1271. mulpd %xmm15, %xmm8
  1272. addpd %xmm8, %xmm6
  1273. movlps %xmm6, 0 * SIZE(XX)
  1274. movhps %xmm6, 1 * SIZE(XX)
  1275. addq INCX, XX
  1276. movsd 0 * SIZE(X), %xmm6
  1277. movhps 1 * SIZE(X), %xmm6
  1278. addq INCX, X
  1279. pshufd $0x4e, %xmm7, %xmm8
  1280. mulpd %xmm14, %xmm7
  1281. mulpd %xmm15, %xmm8
  1282. addpd %xmm8, %xmm7
  1283. movlps %xmm7, 0 * SIZE(XX)
  1284. movhps %xmm7, 1 * SIZE(XX)
  1285. addq INCX, XX
  1286. movsd 0 * SIZE(X), %xmm7
  1287. movhps 1 * SIZE(X), %xmm7
  1288. addq INCX, X
  1289. decq I
  1290. jg .L221
  1291. ALIGN_4
  1292. .L222:
  1293. pshufd $0x4e, %xmm0, %xmm8
  1294. mulpd %xmm14, %xmm0
  1295. mulpd %xmm15, %xmm8
  1296. addpd %xmm8, %xmm0
  1297. movlps %xmm0, 0 * SIZE(XX)
  1298. movhps %xmm0, 1 * SIZE(XX)
  1299. addq INCX, XX
  1300. pshufd $0x4e, %xmm1, %xmm8
  1301. mulpd %xmm14, %xmm1
  1302. mulpd %xmm15, %xmm8
  1303. addpd %xmm8, %xmm1
  1304. movlps %xmm1, 0 * SIZE(XX)
  1305. movhps %xmm1, 1 * SIZE(XX)
  1306. addq INCX, XX
  1307. pshufd $0x4e, %xmm2, %xmm8
  1308. mulpd %xmm14, %xmm2
  1309. mulpd %xmm15, %xmm8
  1310. addpd %xmm8, %xmm2
  1311. movlps %xmm2, 0 * SIZE(XX)
  1312. movhps %xmm2, 1 * SIZE(XX)
  1313. addq INCX, XX
  1314. pshufd $0x4e, %xmm3, %xmm8
  1315. mulpd %xmm14, %xmm3
  1316. mulpd %xmm15, %xmm8
  1317. addpd %xmm8, %xmm3
  1318. movlps %xmm3, 0 * SIZE(XX)
  1319. movhps %xmm3, 1 * SIZE(XX)
  1320. addq INCX, XX
  1321. pshufd $0x4e, %xmm4, %xmm8
  1322. mulpd %xmm14, %xmm4
  1323. mulpd %xmm15, %xmm8
  1324. addpd %xmm8, %xmm4
  1325. movlps %xmm4, 0 * SIZE(XX)
  1326. movhps %xmm4, 1 * SIZE(XX)
  1327. addq INCX, XX
  1328. pshufd $0x4e, %xmm5, %xmm8
  1329. mulpd %xmm14, %xmm5
  1330. mulpd %xmm15, %xmm8
  1331. addpd %xmm8, %xmm5
  1332. movlps %xmm5, 0 * SIZE(XX)
  1333. movhps %xmm5, 1 * SIZE(XX)
  1334. addq INCX, XX
  1335. pshufd $0x4e, %xmm6, %xmm8
  1336. mulpd %xmm14, %xmm6
  1337. mulpd %xmm15, %xmm8
  1338. addpd %xmm8, %xmm6
  1339. movlps %xmm6, 0 * SIZE(XX)
  1340. movhps %xmm6, 1 * SIZE(XX)
  1341. addq INCX, XX
  1342. pshufd $0x4e, %xmm7, %xmm8
  1343. mulpd %xmm14, %xmm7
  1344. mulpd %xmm15, %xmm8
  1345. addpd %xmm8, %xmm7
  1346. movlps %xmm7, 0 * SIZE(XX)
  1347. movhps %xmm7, 1 * SIZE(XX)
  1348. addq INCX, XX
  1349. ALIGN_3
  1350. .L225:
  1351. testq $7, M
  1352. je .L999
  1353. testq $4, M
  1354. je .L226
  1355. movsd 0 * SIZE(X), %xmm0
  1356. movhps 1 * SIZE(X), %xmm0
  1357. addq INCX, X
  1358. pshufd $0x4e, %xmm0, %xmm8
  1359. mulpd %xmm14, %xmm0
  1360. mulpd %xmm15, %xmm8
  1361. addpd %xmm8, %xmm0
  1362. movlps %xmm0, 0 * SIZE(XX)
  1363. movhps %xmm0, 1 * SIZE(XX)
  1364. addq INCX, XX
  1365. movsd 0 * SIZE(X), %xmm1
  1366. movhps 1 * SIZE(X), %xmm1
  1367. addq INCX, X
  1368. pshufd $0x4e, %xmm1, %xmm8
  1369. mulpd %xmm14, %xmm1
  1370. mulpd %xmm15, %xmm8
  1371. addpd %xmm8, %xmm1
  1372. movlps %xmm1, 0 * SIZE(XX)
  1373. movhps %xmm1, 1 * SIZE(XX)
  1374. addq INCX, XX
  1375. movsd 0 * SIZE(X), %xmm2
  1376. movhps 1 * SIZE(X), %xmm2
  1377. addq INCX, X
  1378. pshufd $0x4e, %xmm2, %xmm8
  1379. mulpd %xmm14, %xmm2
  1380. mulpd %xmm15, %xmm8
  1381. addpd %xmm8, %xmm2
  1382. movlps %xmm2, 0 * SIZE(XX)
  1383. movhps %xmm2, 1 * SIZE(XX)
  1384. addq INCX, XX
  1385. movsd 0 * SIZE(X), %xmm3
  1386. movhps 1 * SIZE(X), %xmm3
  1387. addq INCX, X
  1388. pshufd $0x4e, %xmm3, %xmm8
  1389. mulpd %xmm14, %xmm3
  1390. mulpd %xmm15, %xmm8
  1391. addpd %xmm8, %xmm3
  1392. movlps %xmm3, 0 * SIZE(XX)
  1393. movhps %xmm3, 1 * SIZE(XX)
  1394. addq INCX, XX
  1395. ALIGN_3
  1396. .L226:
  1397. testq $2, M
  1398. je .L227
  1399. movsd 0 * SIZE(X), %xmm0
  1400. movhps 1 * SIZE(X), %xmm0
  1401. addq INCX, X
  1402. pshufd $0x4e, %xmm0, %xmm8
  1403. mulpd %xmm14, %xmm0
  1404. mulpd %xmm15, %xmm8
  1405. addpd %xmm8, %xmm0
  1406. movlps %xmm0, 0 * SIZE(XX)
  1407. movhps %xmm0, 1 * SIZE(XX)
  1408. addq INCX, XX
  1409. movsd 0 * SIZE(X), %xmm1
  1410. movhps 1 * SIZE(X), %xmm1
  1411. addq INCX, X
  1412. pshufd $0x4e, %xmm1, %xmm8
  1413. mulpd %xmm14, %xmm1
  1414. mulpd %xmm15, %xmm8
  1415. addpd %xmm8, %xmm1
  1416. movlps %xmm1, 0 * SIZE(XX)
  1417. movhps %xmm1, 1 * SIZE(XX)
  1418. addq INCX, XX
  1419. ALIGN_3
  1420. .L227:
  1421. testq $1, M
  1422. je .L999
  1423. movsd 0 * SIZE(X), %xmm0
  1424. movhps 1 * SIZE(X), %xmm0
  1425. pshufd $0x4e, %xmm0, %xmm8
  1426. mulpd %xmm14, %xmm0
  1427. mulpd %xmm15, %xmm8
  1428. addpd %xmm8, %xmm0
  1429. movlps %xmm0, 0 * SIZE(XX)
  1430. movhps %xmm0, 1 * SIZE(XX)
  1431. ALIGN_3
  1432. .L999:
  1433. xorq %rax, %rax
  1434. RESTOREREGISTERS
  1435. ret
  1436. EPILOGUE