
zdot_sse2.S 31 kB
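This file is the 32-bit x86 SSE2 kernel for the double-precision complex dot product (ZDOT) from the OpenBLAS/GotoBLAS kernel tree. The arguments arrive on the stack (see the STACK_* defines below), the accumulation runs in packed SSE2 registers, and the complex result is written through a hidden pointer argument, which the epilogue pops with `ret $0x4` where the ABI requires it. As a rough caller-side sketch, assuming illustrative names (the symbol `zdotu_k` and the struct layout are not taken from this file):

    /* Hedged sketch of the caller-side view of this kernel;
       zdotu_k and double_complex are assumed names. */
    typedef struct { double real, imag; } double_complex;

    /* n complex elements; incx/incy are strides counted in complex
       elements.  The result comes back via the hidden RESULT pointer
       visible in the assembly below. */
    double_complex zdotu_k(int n, double *x, int incx,
                           double *y, int incy);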

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT      */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"

#define STACK	12
#define ARGS	0

#define RESULT		 4 + STACK + ARGS(%esp)
#define STACK_N		 8 + STACK + ARGS(%esp)
#define STACK_X		12 + STACK + ARGS(%esp)
#define STACK_INCX	16 + STACK + ARGS(%esp)
#define STACK_Y		20 + STACK + ARGS(%esp)
#define STACK_INCY	24 + STACK + ARGS(%esp)

#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx

#include "l1param.h"

#undef movsd

#ifndef OPTERON
#define MOVLPS	movsd
#else
#define MOVLPS	movlps
#endif
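
/* ZBASE_SHIFT (used in the prologue below) converts a stride counted
   in complex elements into a byte offset (one complex double occupies
   2 * SIZE bytes).  MOVLPS is movsd on most targets but movlps on
   Opteron, presumably because that load form is cheaper there; the
   rationale is not stated in this file. */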
	PROLOGUE
	PROFCODE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_N,    N
	movl	STACK_X,    X
	movl	STACK_INCX, INCX
	movl	STACK_Y,    Y
	movl	STACK_INCY, INCY

	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	cmpl	$0, N
	jle	.L999

	cmpl	$2 * SIZE, INCX
	jne	.L50
	cmpl	$2 * SIZE, INCY
	jne	.L50

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	testl	$SIZE, Y
	jne	.L30

	testl	$SIZE, X
	jne	.L20

	movl	N, %eax
	sarl	$3, %eax
	jle	.L15

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6
	movaps	-14 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L12
	ALIGN_3
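
/* Main loop for the fully aligned unit-stride case: 8 complex
   elements per iteration.  For each element pair, pshufd $0x4e swaps
   the real and imaginary halves of the Y operand, so that xmm0
   accumulates (xr*yr, xi*yi) and xmm1 accumulates (xr*yi, xi*yr);
   the accumulators are folded into the complex result at .L98. */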
.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-12 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-10 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -8 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -6 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -4 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -2 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  0 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	  0 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	  2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	  2 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L11
	ALIGN_3

.L12:
	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-12 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-10 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -8 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -6 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -4 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -2 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L15:
	testl	$4, N
	jle	.L16

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6
	movaps	-14 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-12 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-10 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L16:
	testl	$2, N
	jle	.L17

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6
	movaps	-14 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L17:
	testl	$1, N
	jle	.L98

	movaps	-16 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
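
/* Unit stride, Y 16-byte aligned but X misaligned: X elements are
   assembled from two 8-byte loads (MOVLPS + movhps) while Y is read
   with aligned movaps; the arithmetic is the same as in .L11. */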
.L20:
	movl	N, %eax
	sarl	$3, %eax
	jle	.L25

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6
	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(X), %xmm4
	movhps	-11 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(X), %xmm5
	movhps	 -9 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -4 * SIZE(X), %xmm4
	movhps	 -3 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -2 * SIZE(X), %xmm5
	movhps	 -1 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  0 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	  0 * SIZE(X), %xmm4
	movhps	  1 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	  2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	  2 * SIZE(X), %xmm5
	movhps	  3 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L21
	ALIGN_3

.L22:
	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(X), %xmm4
	movhps	-11 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(X), %xmm5
	movhps	 -9 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -8 * SIZE(X), %xmm4
	movhps	 -7 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -6 * SIZE(X), %xmm5
	movhps	 -5 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -4 * SIZE(X), %xmm4
	movhps	 -3 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -2 * SIZE(X), %xmm5
	movhps	 -1 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L25:
	testl	$4, N
	jle	.L26

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6
	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(X), %xmm4
	movhps	-11 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(X), %xmm5
	movhps	 -9 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L26:
	testl	$2, N
	jle	.L27

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	MOVLPS	-14 * SIZE(X), %xmm5
	movhps	-13 * SIZE(X), %xmm5
	movaps	-14 * SIZE(Y), %xmm7

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L27:
	testl	$1, N
	jle	.L98

	MOVLPS	-16 * SIZE(X), %xmm4
	movhps	-15 * SIZE(X), %xmm4
	movaps	-16 * SIZE(Y), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
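
/* Unit stride, Y misaligned: if X is misaligned as well, fall through
   to .L40; otherwise this is the mirror image of .L20, loading Y in
   two halves and X with aligned movaps. */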
.L30:
	testl	$SIZE, X
	jne	.L40

	movl	N, %eax
	sarl	$3, %eax
	jle	.L35

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm6
	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-14 * SIZE(X), %xmm7

	decl	%eax
	jle	.L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(Y), %xmm4
	movhps	-11 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(Y), %xmm5
	movhps	 -9 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -8 * SIZE(Y), %xmm4
	movhps	 -7 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -6 * SIZE(Y), %xmm5
	movhps	 -5 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -4 * SIZE(Y), %xmm4
	movhps	 -3 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -2 * SIZE(Y), %xmm5
	movhps	 -1 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	  0 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	  0 * SIZE(Y), %xmm4
	movhps	  1 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	  2 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	  2 * SIZE(Y), %xmm5
	movhps	  3 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L31
	ALIGN_3

.L32:
	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(Y), %xmm4
	movhps	-11 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(Y), %xmm5
	movhps	 -9 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -8 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -8 * SIZE(Y), %xmm4
	movhps	 -7 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -6 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -6 * SIZE(Y), %xmm5
	movhps	 -5 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -4 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	 -4 * SIZE(Y), %xmm4
	movhps	 -3 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -2 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	 -2 * SIZE(Y), %xmm5
	movhps	 -1 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L35:
	testl	$4, N
	jle	.L36

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm6
	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-14 * SIZE(X), %xmm7

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-12 * SIZE(X), %xmm6
	mulpd	%xmm4, %xmm3
	MOVLPS	-12 * SIZE(Y), %xmm4
	movhps	-11 * SIZE(Y), %xmm4
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-10 * SIZE(X), %xmm7
	mulpd	%xmm5, %xmm3
	MOVLPS	-10 * SIZE(Y), %xmm5
	movhps	 -9 * SIZE(Y), %xmm5
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L36:
	testl	$2, N
	jle	.L37

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	MOVLPS	-14 * SIZE(Y), %xmm5
	movhps	-13 * SIZE(Y), %xmm5
	movaps	-14 * SIZE(X), %xmm7

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L37:
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm3, %xmm3

	testl	$1, N
	jle	.L98

	MOVLPS	-16 * SIZE(Y), %xmm4
	movhps	-15 * SIZE(Y), %xmm4
	movaps	-16 * SIZE(X), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	SHUFPD_1 %xmm3, %xmm3
	addpd	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
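
/* Both pointers misaligned by one double: after bumping X and Y by
   SIZE, each aligned 16-byte load holds the second half of one
   complex element and the first half of the next, so consecutive
   loads are stitched together with register-to-register movsd.  The
   accumulator lanes end up swapped in this path, which .L48 undoes
   with SHUFPD_1 before the final reduction. */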
.L40:
	movhps	-16 * SIZE(X), %xmm4
	addl	$SIZE, X
	movhps	-16 * SIZE(Y), %xmm6
	addl	$SIZE, Y

	movl	N, %eax
	sarl	$3, %eax
	jle	.L45

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(Y)
#endif
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-14 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-14 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-12 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-10 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-10 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -8 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -8 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -6 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -6 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -4 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -4 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -2 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -2 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	  0 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	  0 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y

	decl	%eax
	jg	.L41
	ALIGN_3

.L42:
	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-14 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-14 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-12 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-10 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-10 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -8 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -8 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -6 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -6 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	 -4 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	 -4 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	 -2 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	 -2 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	subl	$-16 * SIZE, X
	subl	$-16 * SIZE, Y
	ALIGN_3

.L45:
	testl	$4, N
	jle	.L46

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm7

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-14 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-14 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	movaps	-12 * SIZE(Y), %xmm7
	mulpd	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addpd	%xmm3, %xmm1

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-10 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-10 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L46:
	testl	$2, N
	jle	.L47

	movaps	-16 * SIZE(X), %xmm5
	movaps	-16 * SIZE(Y), %xmm7

	movsd	%xmm7, %xmm6
	pshufd	$0x4e, %xmm6, %xmm3
	movsd	%xmm5, %xmm4
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	movaps	-14 * SIZE(Y), %xmm6
	mulpd	%xmm4, %xmm3
	movaps	-14 * SIZE(X), %xmm4
	addpd	%xmm3, %xmm1

	movsd	%xmm6, %xmm7
	pshufd	$0x4e, %xmm7, %xmm3
	movsd	%xmm4, %xmm5
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L47:
	testl	$1, N
	jle	.L48

	movlpd	-16 * SIZE(X), %xmm4
	movlpd	-16 * SIZE(Y), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1
	ALIGN_3

.L48:
	SHUFPD_1 %xmm0, %xmm0
	SHUFPD_1 %xmm1, %xmm1
	SHUFPD_1 %xmm2, %xmm2
	SHUFPD_1 %xmm3, %xmm3
	jmp	.L98
	ALIGN_3
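
/* General strided case (INCX or INCY is not one complex element
   apart): every element is assembled from two 8-byte loads and the
   pointers advance by the byte strides computed in the prologue. */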
.L50:
	movl	N, %eax
	sarl	$3, %eax
	jle	.L55

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y

	decl	%eax
	jle	.L54
	ALIGN_3

.L53:
	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	decl	%eax
	jg	.L53
	ALIGN_3

.L54:
	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1
	ALIGN_3

.L55:
	testl	$4, N
	jle	.L56

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y
	mulpd	%xmm4, %xmm3
	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y
	mulpd	%xmm5, %xmm3
	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1
	ALIGN_3

.L56:
	testl	$2, N
	jle	.L57

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6
	addl	INCY, Y

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1

	MOVLPS	0 * SIZE(X), %xmm5
	movhps	1 * SIZE(X), %xmm5
	addl	INCX, X
	MOVLPS	0 * SIZE(Y), %xmm7
	movhps	1 * SIZE(Y), %xmm7
	addl	INCY, Y

	pshufd	$0x4e, %xmm7, %xmm3
	mulpd	%xmm5, %xmm7
	addpd	%xmm7, %xmm0
	mulpd	%xmm5, %xmm3
	addpd	%xmm3, %xmm1
	ALIGN_3

.L57:
	testl	$1, N
	jle	.L98

	MOVLPS	0 * SIZE(X), %xmm4
	movhps	1 * SIZE(X), %xmm4
	MOVLPS	0 * SIZE(Y), %xmm6
	movhps	1 * SIZE(Y), %xmm6

	pshufd	$0x4e, %xmm6, %xmm3
	mulpd	%xmm4, %xmm6
	addpd	%xmm6, %xmm0
	mulpd	%xmm4, %xmm3
	addpd	%xmm3, %xmm1
	ALIGN_3
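
/* Final reduction.  Throughout, xmm0 accumulated (xr*yr, xi*yi) and
   xmm1 accumulated (xr*yi, xi*yr) for every element pair.  Swapping
   each accumulator with pshufd and combining the scalar low halves
   gives, without CONJ, real = sum(xr*yr - xi*yi) and
   imag = sum(xr*yi + xi*yr), i.e. the unconjugated dot product;
   with CONJ the signs flip, producing dot(conj(x), y). */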
.L98:
	pshufd	$0x4e, %xmm0, %xmm2
	pshufd	$0x4e, %xmm1, %xmm3

#ifndef CONJ
	subsd	%xmm2, %xmm0
	addsd	%xmm3, %xmm1
#else
	addsd	%xmm2, %xmm0
	subsd	%xmm3, %xmm1
#endif

.L999:
	movl	RESULT, %eax

	MOVLPS	%xmm0, 0 * SIZE(%eax)
	MOVLPS	%xmm1, 1 * SIZE(%eax)

	popl	%ebx
	popl	%esi
	popl	%edi

#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#ifdef MS_ABI
	/* For MinGW GCC >= 4.7, which is compatible with the MSVC ABI.
	   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */
	ret
#else
	/* Remove the hidden return value address from the stack (MinGW GCC < 4.7). */
	ret	$0x4
#endif
#else
	/* Remove the hidden return value address from the stack on Linux. */
	ret	$0x4
#endif
	EPILOGUE