
zdot_sse2.S 32 kB
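This kernel computes the BLAS double-precision complex dot product (ZDOT) with SSE2: by default the unconjugated sum of x(i)*y(i) (zdotu); when CONJ is defined, x is conjugated first (zdotc). The result is left in %xmm0/%xmm1, or stored through RESULT_ADDRESS on Windows. Below is a minimal C sketch of the scalar reference computation the kernel vectorizes; the function name and parameters are illustrative only and assume positive increments measured in complex elements.

    #include <stddef.h>

    typedef struct { double r, i; } zdouble;

    /* Scalar model of what zdot_sse2.S accumulates (illustrative, not the kernel's ABI).
       conjugate != 0 mirrors a CONJ build (zdotc); 0 mirrors the default build (zdotu). */
    static zdouble zdot_ref(size_t n, const zdouble *x, size_t incx,
                            const zdouble *y, size_t incy, int conjugate)
    {
        zdouble acc = { 0.0, 0.0 };
        for (size_t k = 0; k < n; k++) {
            zdouble a = x[k * incx];
            zdouble b = y[k * incy];
            if (conjugate) {            /* conj(a) * b */
                acc.r += a.r * b.r + a.i * b.i;
                acc.i += a.r * b.i - a.i * b.r;
            } else {                    /* a * b */
                acc.r += a.r * b.r - a.i * b.i;
                acc.i += a.r * b.i + a.i * b.r;
            }
        }
        return acc;
    }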

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define N ARG1      /* rdi */
#define X ARG2      /* rsi */
#define INCX ARG3   /* rdx */
#define Y ARG4      /* rcx */
#define INCY ARG5   /* r8  */
#else
#define RESULT_ADDRESS ARG1 /* rcx */
#define N ARG2      /* rdx */
#define X ARG3      /* r8  */
#define INCX ARG4   /* r9  */
#define Y %r10
#define INCY %r11
#endif

#include "l1param.h"

#undef movsd

#ifndef OPTERON
#define MOVLPS movsd
#else
#define MOVLPS movlps
#endif
	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq 40(%rsp), Y
	movq 48(%rsp), INCY
#endif

	SAVEREGISTERS

	/* Convert the increments from complex elements to byte strides. */
	salq $ZBASE_SHIFT, INCX
	salq $ZBASE_SHIFT, INCY

	/* Clear the four partial-sum accumulators. */
	xorps %xmm0, %xmm0
	xorps %xmm1, %xmm1
	xorps %xmm2, %xmm2
	xorps %xmm3, %xmm3

	cmpq $0, N
	jle .L999

	/* Non-unit strides are handled at .L50. */
	cmpq $2 * SIZE, INCX
	jne .L50
	cmpq $2 * SIZE, INCY
	jne .L50

	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y

	/* Dispatch on the 16-byte alignment of Y and X. */
	testq $SIZE, Y
	jne .L30
	testq $SIZE, X
	jne .L20

	movq N, %rax
	sarq $3, %rax
	jle .L15

	movaps -16 * SIZE(X), %xmm4
	movaps -14 * SIZE(X), %xmm5
	movaps -16 * SIZE(Y), %xmm8
	movaps -14 * SIZE(Y), %xmm9
	movaps -12 * SIZE(X), %xmm6
	movaps -10 * SIZE(X), %xmm7
	movaps -12 * SIZE(Y), %xmm10
	movaps -10 * SIZE(Y), %xmm11

	decq %rax
	jle .L12
	ALIGN_3
/* Both X and Y 16-byte aligned: main loop handles 8 complex elements per iteration. */
.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -8 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps -6 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps -4 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps -2 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps 0 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps 0 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps 2 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps 2 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps 4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps 4 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps 6 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps 6 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L11
	ALIGN_3

.L12:
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -8 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps -6 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps -4 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps -2 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

/* Remaining 4, 2 and 1 elements. */
.L15:
	testq $4, N
	jle .L16
	movaps -16 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	movaps -14 * SIZE(X), %xmm5
	movaps -14 * SIZE(Y), %xmm9
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	movaps -12 * SIZE(X), %xmm6
	movaps -12 * SIZE(Y), %xmm10
	movaps -10 * SIZE(X), %xmm7
	movaps -10 * SIZE(Y), %xmm11
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L16:
	testq $2, N
	jle .L17
	movaps -16 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	movaps -14 * SIZE(X), %xmm5
	movaps -14 * SIZE(Y), %xmm9
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L17:
	testq $1, N
	jle .L98
	movaps -16 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	jmp .L98
	ALIGN_3
/* X not 16-byte aligned, Y aligned: X is loaded with MOVLPS/movhps pairs. */
.L20:
	movq N, %rax
	sarq $3, %rax
	jle .L25

	MOVLPS -16 * SIZE(X), %xmm4
	movhps -15 * SIZE(X), %xmm4
	MOVLPS -14 * SIZE(X), %xmm5
	movhps -13 * SIZE(X), %xmm5
	movaps -16 * SIZE(Y), %xmm8
	movaps -14 * SIZE(Y), %xmm9
	MOVLPS -12 * SIZE(X), %xmm6
	movhps -11 * SIZE(X), %xmm6
	MOVLPS -10 * SIZE(X), %xmm7
	movhps -9 * SIZE(X), %xmm7
	movaps -12 * SIZE(Y), %xmm10
	movaps -10 * SIZE(Y), %xmm11

	decq %rax
	jle .L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS -8 * SIZE(X), %xmm4
	movhps -7 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS -6 * SIZE(X), %xmm5
	movhps -5 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS -4 * SIZE(X), %xmm6
	movhps -3 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS -2 * SIZE(X), %xmm7
	movhps -1 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps 0 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps 2 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS 2 * SIZE(X), %xmm5
	movhps 3 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps 4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS 4 * SIZE(X), %xmm6
	movhps 5 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps 6 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS 6 * SIZE(X), %xmm7
	movhps 7 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L21
	ALIGN_3

.L22:
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS -8 * SIZE(X), %xmm4
	movhps -7 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS -6 * SIZE(X), %xmm5
	movhps -5 * SIZE(X), %xmm5
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS -4 * SIZE(X), %xmm6
	movhps -3 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS -2 * SIZE(X), %xmm7
	movhps -1 * SIZE(X), %xmm7
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L25:
	testq $4, N
	jle .L26
	MOVLPS -16 * SIZE(X), %xmm4
	movhps -15 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -14 * SIZE(X), %xmm5
	movhps -13 * SIZE(X), %xmm5
	movaps -14 * SIZE(Y), %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	MOVLPS -12 * SIZE(X), %xmm6
	movhps -11 * SIZE(X), %xmm6
	movaps -12 * SIZE(Y), %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -10 * SIZE(X), %xmm7
	movhps -9 * SIZE(X), %xmm7
	movaps -10 * SIZE(Y), %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L26:
	testq $2, N
	jle .L27
	MOVLPS -16 * SIZE(X), %xmm4
	movhps -15 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -14 * SIZE(X), %xmm5
	movhps -13 * SIZE(X), %xmm5
	movaps -14 * SIZE(Y), %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L27:
	testq $1, N
	jle .L98
	MOVLPS -16 * SIZE(X), %xmm4
	movhps -15 * SIZE(X), %xmm4
	movaps -16 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	jmp .L98
	ALIGN_3
/* Y not 16-byte aligned; if X is unaligned as well, handle it at .L40. */
.L30:
	testq $SIZE, X
	jne .L40

	movq N, %rax
	sarq $3, %rax
	jle .L35

	MOVLPS -16 * SIZE(Y), %xmm4
	movhps -15 * SIZE(Y), %xmm4
	MOVLPS -14 * SIZE(Y), %xmm5
	movhps -13 * SIZE(Y), %xmm5
	movaps -16 * SIZE(X), %xmm8
	movaps -14 * SIZE(X), %xmm9
	MOVLPS -12 * SIZE(Y), %xmm6
	movhps -11 * SIZE(Y), %xmm6
	MOVLPS -10 * SIZE(Y), %xmm7
	movhps -9 * SIZE(Y), %xmm7
	movaps -12 * SIZE(X), %xmm10
	movaps -10 * SIZE(X), %xmm11

	decq %rax
	jle .L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(X), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS -8 * SIZE(Y), %xmm4
	movhps -7 * SIZE(Y), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(X), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS -6 * SIZE(Y), %xmm5
	movhps -5 * SIZE(Y), %xmm5
	addpd %xmm12, %xmm3
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(X), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS -4 * SIZE(Y), %xmm6
	movhps -3 * SIZE(Y), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(X), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS -2 * SIZE(Y), %xmm7
	movhps -1 * SIZE(Y), %xmm7
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps 0 * SIZE(X), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS 0 * SIZE(Y), %xmm4
	movhps 1 * SIZE(Y), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps 2 * SIZE(X), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS 2 * SIZE(Y), %xmm5
	movhps 3 * SIZE(Y), %xmm5
	addpd %xmm12, %xmm3
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps 4 * SIZE(X), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS 4 * SIZE(Y), %xmm6
	movhps 5 * SIZE(Y), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps 6 * SIZE(X), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS 6 * SIZE(Y), %xmm7
	movhps 7 * SIZE(Y), %xmm7
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L31
	ALIGN_3

.L32:
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -8 * SIZE(X), %xmm8
	mulpd %xmm4, %xmm12
	MOVLPS -8 * SIZE(Y), %xmm4
	movhps -7 * SIZE(Y), %xmm4
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	movaps -6 * SIZE(X), %xmm9
	mulpd %xmm5, %xmm12
	MOVLPS -6 * SIZE(Y), %xmm5
	movhps -5 * SIZE(Y), %xmm5
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -4 * SIZE(X), %xmm10
	mulpd %xmm6, %xmm12
	MOVLPS -4 * SIZE(Y), %xmm6
	movhps -3 * SIZE(Y), %xmm6
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	movaps -2 * SIZE(X), %xmm11
	mulpd %xmm7, %xmm12
	MOVLPS -2 * SIZE(Y), %xmm7
	movhps -1 * SIZE(Y), %xmm7
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L35:
	testq $4, N
	jle .L36
	MOVLPS -16 * SIZE(Y), %xmm4
	movhps -15 * SIZE(Y), %xmm4
	movaps -16 * SIZE(X), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -14 * SIZE(Y), %xmm5
	movhps -13 * SIZE(Y), %xmm5
	movaps -14 * SIZE(X), %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	MOVLPS -12 * SIZE(Y), %xmm6
	movhps -11 * SIZE(Y), %xmm6
	movaps -12 * SIZE(X), %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -10 * SIZE(Y), %xmm7
	movhps -9 * SIZE(Y), %xmm7
	movaps -10 * SIZE(X), %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L36:
	testq $2, N
	jle .L37
	MOVLPS -16 * SIZE(Y), %xmm4
	movhps -15 * SIZE(Y), %xmm4
	movaps -16 * SIZE(X), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS -14 * SIZE(Y), %xmm5
	movhps -13 * SIZE(Y), %xmm5
	movaps -14 * SIZE(X), %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L37:
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm3, %xmm3
	testq $1, N
	jle .L98
	MOVLPS -16 * SIZE(Y), %xmm4
	movhps -15 * SIZE(Y), %xmm4
	movaps -16 * SIZE(X), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	SHUFPD_1 %xmm12, %xmm12
	addpd %xmm12, %xmm1
	jmp .L98
	ALIGN_3
/* Both X and Y start one double off a 16-byte boundary: advance by one double
   and recombine neighbouring aligned vectors with movsd merges. */
.L40:
	movhps -16 * SIZE(X), %xmm4
	addq $SIZE, X
	movhps -16 * SIZE(Y), %xmm8
	addq $SIZE, Y

	movq N, %rax
	sarq $3, %rax
	jle .L45

	movaps -16 * SIZE(X), %xmm5
	movaps -16 * SIZE(Y), %xmm9
	movaps -14 * SIZE(X), %xmm6
	movaps -14 * SIZE(Y), %xmm10
	movaps -12 * SIZE(X), %xmm7
	movaps -12 * SIZE(Y), %xmm11

	decq %rax
	jle .L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -10 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -10 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	movaps -8 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps -8 * SIZE(X), %xmm5
	addpd %xmm12, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movsd %xmm11, %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	movsd %xmm7, %xmm6
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -6 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps -6 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	movsd %xmm8, %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	movsd %xmm4, %xmm7
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm0
	movaps -4 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps -4 * SIZE(X), %xmm7
	addpd %xmm12, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -2 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -2 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	movaps 0 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps 0 * SIZE(X), %xmm5
	addpd %xmm12, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movsd %xmm11, %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	movsd %xmm7, %xmm6
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps 2 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps 2 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	movsd %xmm8, %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	movsd %xmm4, %xmm7
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm0
	movaps 4 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps 4 * SIZE(X), %xmm7
	addpd %xmm12, %xmm1
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	decq %rax
	jg .L41
	ALIGN_3

.L42:
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -10 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -10 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	movaps -8 * SIZE(Y), %xmm9
	mulpd %xmm5, %xmm12
	movaps -8 * SIZE(X), %xmm5
	addpd %xmm12, %xmm1
	movsd %xmm11, %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	movsd %xmm7, %xmm6
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	movaps -6 * SIZE(Y), %xmm10
	mulpd %xmm6, %xmm12
	movaps -6 * SIZE(X), %xmm6
	addpd %xmm12, %xmm1
	movsd %xmm8, %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	movsd %xmm4, %xmm7
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm0
	movaps -4 * SIZE(Y), %xmm11
	mulpd %xmm7, %xmm12
	movaps -4 * SIZE(X), %xmm7
	addpd %xmm12, %xmm1
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	movaps -2 * SIZE(Y), %xmm8
	mulpd %xmm4, %xmm12
	movaps -2 * SIZE(X), %xmm4
	addpd %xmm12, %xmm1
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm1
	movsd %xmm11, %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	movsd %xmm7, %xmm6
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	movsd %xmm8, %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	movsd %xmm4, %xmm7
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm0
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm1
	subq $-16 * SIZE, X
	subq $-16 * SIZE, Y
	ALIGN_3

.L45:
	testq $4, N
	jle .L46
	movaps -16 * SIZE(X), %xmm5
	movaps -16 * SIZE(Y), %xmm9
	movaps -14 * SIZE(X), %xmm6
	movaps -14 * SIZE(Y), %xmm10
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	movaps -12 * SIZE(X), %xmm7
	movaps -12 * SIZE(Y), %xmm11
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm1
	movaps -10 * SIZE(X), %xmm4
	movaps -10 * SIZE(Y), %xmm8
	movsd %xmm11, %xmm10
	pshufd $0x4e, %xmm10, %xmm12
	movsd %xmm7, %xmm6
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	movsd %xmm8, %xmm11
	pshufd $0x4e, %xmm11, %xmm12
	movsd %xmm4, %xmm7
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm0
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm1
	addq $8 * SIZE, X
	addq $8 * SIZE, Y
	ALIGN_3

.L46:
	testq $2, N
	jle .L47
	movaps -16 * SIZE(X), %xmm5
	movaps -16 * SIZE(Y), %xmm9
	movsd %xmm9, %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	movsd %xmm5, %xmm4
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	movaps -14 * SIZE(X), %xmm6
	movaps -14 * SIZE(Y), %xmm10
	movsd %xmm10, %xmm9
	pshufd $0x4e, %xmm9, %xmm12
	movsd %xmm6, %xmm5
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm0
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm1
	movaps %xmm6, %xmm4
	movaps %xmm10, %xmm8
	addq $4 * SIZE, X
	addq $4 * SIZE, Y
	ALIGN_3

.L47:
	testq $1, N
	jle .L48
	movlps -16 * SIZE(X), %xmm4
	movlps -16 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	ALIGN_3

.L48:
	SHUFPD_1 %xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm2, %xmm2
	SHUFPD_1 %xmm3, %xmm3
	jmp .L98
	ALIGN_3
/* General strided case: arbitrary INCX and INCY, one complex element per load. */
.L50:
	movq N, %rax
	sarq $3, %rax
	jle .L55

	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	MOVLPS 0 * SIZE(X), %xmm6
	movhps 1 * SIZE(X), %xmm6
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm10
	movhps 1 * SIZE(Y), %xmm10
	addq INCY, Y
	MOVLPS 0 * SIZE(X), %xmm7
	movhps 1 * SIZE(X), %xmm7
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm11
	movhps 1 * SIZE(Y), %xmm11
	addq INCY, Y

	decq %rax
	jle .L54
	ALIGN_3

.L53:
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	mulpd %xmm4, %xmm12
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	mulpd %xmm5, %xmm12
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm10
	movhps 1 * SIZE(Y), %xmm10
	addq INCY, Y
	mulpd %xmm6, %xmm12
	MOVLPS 0 * SIZE(X), %xmm6
	movhps 1 * SIZE(X), %xmm6
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm11
	movhps 1 * SIZE(Y), %xmm11
	addq INCY, Y
	mulpd %xmm7, %xmm12
	MOVLPS 0 * SIZE(X), %xmm7
	movhps 1 * SIZE(X), %xmm7
	addq INCX, X
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	mulpd %xmm4, %xmm12
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	mulpd %xmm5, %xmm12
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm10
	movhps 1 * SIZE(Y), %xmm10
	addq INCY, Y
	mulpd %xmm6, %xmm12
	MOVLPS 0 * SIZE(X), %xmm6
	movhps 1 * SIZE(X), %xmm6
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm11
	movhps 1 * SIZE(Y), %xmm11
	addq INCY, Y
	mulpd %xmm7, %xmm12
	MOVLPS 0 * SIZE(X), %xmm7
	movhps 1 * SIZE(X), %xmm7
	addq INCX, X
	addpd %xmm12, %xmm3
	decq %rax
	jg .L53
	ALIGN_3

.L54:
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	mulpd %xmm4, %xmm12
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	mulpd %xmm5, %xmm12
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	MOVLPS 0 * SIZE(Y), %xmm10
	movhps 1 * SIZE(Y), %xmm10
	addq INCY, Y
	mulpd %xmm6, %xmm12
	MOVLPS 0 * SIZE(X), %xmm6
	movhps 1 * SIZE(X), %xmm6
	addq INCX, X
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	MOVLPS 0 * SIZE(Y), %xmm11
	movhps 1 * SIZE(Y), %xmm11
	addq INCY, Y
	mulpd %xmm7, %xmm12
	MOVLPS 0 * SIZE(X), %xmm7
	movhps 1 * SIZE(X), %xmm7
	addq INCX, X
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	ALIGN_3
.L55:
	testq $4, N
	jle .L56
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	MOVLPS 0 * SIZE(X), %xmm6
	movhps 1 * SIZE(X), %xmm6
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm10
	movhps 1 * SIZE(Y), %xmm10
	addq INCY, Y
	pshufd $0x4e, %xmm10, %xmm12
	mulpd %xmm6, %xmm10
	addpd %xmm10, %xmm0
	mulpd %xmm6, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS 0 * SIZE(X), %xmm7
	movhps 1 * SIZE(X), %xmm7
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm11
	movhps 1 * SIZE(Y), %xmm11
	addq INCY, Y
	pshufd $0x4e, %xmm11, %xmm12
	mulpd %xmm7, %xmm11
	addpd %xmm11, %xmm2
	mulpd %xmm7, %xmm12
	addpd %xmm12, %xmm3
	ALIGN_3

.L56:
	testq $2, N
	jle .L57
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	addq INCY, Y
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	MOVLPS 0 * SIZE(X), %xmm5
	movhps 1 * SIZE(X), %xmm5
	addq INCX, X
	MOVLPS 0 * SIZE(Y), %xmm9
	movhps 1 * SIZE(Y), %xmm9
	addq INCY, Y
	pshufd $0x4e, %xmm9, %xmm12
	mulpd %xmm5, %xmm9
	addpd %xmm9, %xmm2
	mulpd %xmm5, %xmm12
	addpd %xmm12, %xmm3
	ALIGN_3

.L57:
	testq $1, N
	jle .L98
	MOVLPS 0 * SIZE(X), %xmm4
	movhps 1 * SIZE(X), %xmm4
	MOVLPS 0 * SIZE(Y), %xmm8
	movhps 1 * SIZE(Y), %xmm8
	pshufd $0x4e, %xmm8, %xmm12
	mulpd %xmm4, %xmm8
	addpd %xmm8, %xmm0
	mulpd %xmm4, %xmm12
	addpd %xmm12, %xmm1
	ALIGN_3
/* Reduce the partial sums in %xmm0..%xmm3 to the final complex result. */
.L98:
	addpd %xmm2, %xmm0
	addpd %xmm3, %xmm1

	pshufd $0x4e, %xmm0, %xmm2
	pshufd $0x4e, %xmm1, %xmm3

.L999:
#ifndef CONJ
	/* default build (zdotu): result = sum x * y */
	subsd %xmm2, %xmm0
	addsd %xmm3, %xmm1
#else
	/* CONJ build (zdotc): result = sum conj(x) * y */
	addsd %xmm2, %xmm0
	subsd %xmm3, %xmm1
#endif

#ifdef WINDOWS_ABI
	/* The Windows ABI returns the complex result through a hidden pointer. */
	movq RESULT_ADDRESS, %rax
	movsd %xmm0, (%rax)
	movsd %xmm1, 8(%rax)
#endif

	RESTOREREGISTERS
	ret
	EPILOGUE