
zdot_sse.S

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
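
/* Complex single-precision dot-product kernel for 32-bit x86 (SSE).
 * A scalar sketch of what the packed loops below accumulate, with
 * swap() exchanging the real and imaginary parts of a complex value
 * and the products taken lane by lane:
 *
 *     for (i = 0; i < n; i++) {
 *         dot0 += x[i] * y[i];        // elementwise products -> %xmm0
 *         dot1 += x[i] * swap(y[i]);  // cross products       -> %xmm1
 *     }
 *
 * The reduction at .L98 (outside this excerpt) combines the lanes of
 * %xmm0/%xmm1 into the real and imaginary parts of the result. */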
#define ASSEMBLER
#include "common.h"

#define STACK	12
#define ARGS	0

#define STACK_N		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)
#define STACK_Y		16 + STACK + ARGS(%esp)
#define STACK_INCY	20 + STACK + ARGS(%esp)

#define N	%ebx
#define X	%esi
#define INCX	%ecx
#define Y	%edi
#define INCY	%edx

#include "l1param.h"
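
/* Entry code: save the callee-saved registers, load the five arguments
 * from the stack, and turn both strides into byte strides (a left shift
 * by ZBASE_SHIFT, the log2 byte size of one complex element). %xmm0 and
 * %xmm1 are cleared as the two accumulators. Non-unit strides branch
 * off to .L200 (outside this excerpt). */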
	PROLOGUE
	PROFCODE

	pushl	%edi
	pushl	%esi
	pushl	%ebx

	movl	STACK_N, N
	movl	STACK_X, X
	movl	STACK_INCX, INCX
	movl	STACK_Y, Y
	movl	STACK_INCY, INCY

	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, INCY

	xorps	%xmm0, %xmm0
	xorps	%xmm1, %xmm1

	testl	N, N
	jle	.L999

	cmpl	$2 * SIZE, INCX
	jne	.L200
	cmpl	$2 * SIZE, INCY
	jne	.L200
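
/* Unit-stride fast path. Both pointers are biased by +32 * SIZE
 * (subtracting -32 * SIZE) so the unrolled loops can address all
 * operands with small negative displacements. */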
	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	testl	$SIZE, X
	jne	.L50
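
/* X is known to be 8-byte aligned here. If it is not 16-byte aligned,
 * peel off one complex element so the loops below can use movaps on X. */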
.L0x:
	testl	$2 * SIZE, X
	je	.L10

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
#ifdef movsd
	xorps	%xmm6, %xmm6
#endif
	movsd	-32 * SIZE(Y), %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm1)

	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	decl	N
	ALIGN_3
.L10:
	testl	$3 * SIZE, Y
	jne	.L20
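
/* X and Y both 16-byte aligned: the main loop handles 16 complex
 * elements per iteration. For each 16-byte vector (two complex values)
 * it computes %xmm0 += x * y and %xmm1 += x * swap(y), where
 * PSHUFD2($0xb1, ...) swaps the real and imaginary lanes. */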
	movl	N, %eax
	sarl	$4, %eax
	jle	.L15

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm6
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L12
	ALIGN_3

.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-24 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-20 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-16 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-12 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-8 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	 0 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	 0 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	 4 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	 4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L11
	ALIGN_3

.L12:
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-24 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-20 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-16 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-12 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-8 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-8 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-4 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3
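
/* Tail of the aligned path: mop up 8, 4, 2, then 1 remaining complex
 * element(s) before jumping to the reduction at .L98. */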
.L15:
	testl	$8, N
	jle	.L16

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm6
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm7

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movaps	-24 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movaps	-20 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L16:
	testl	$4, N
	jle	.L17

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm6
	movaps	-28 * SIZE(X), %xmm5
	movaps	-28 * SIZE(Y), %xmm7

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L17:
	testl	$2, N
	jle	.L18

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm6

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L18:
	testl	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
#ifdef movsd
	xorps	%xmm6, %xmm6
#endif
	movsd	-32 * SIZE(Y), %xmm6

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
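
/* Y is not 16-byte aligned. This block handles Y offset by one float
 * (4 mod 16): read aligned 16-byte vectors and realign them in
 * registers with movss plus shufps $0x39 (a one-lane rotation). %xmm1
 * is lane-swapped on entry and swapped back at .L29. */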
.L20:
#ifdef ALIGNED_ACCESS
	testl	$2 * SIZE, Y
	jne	.L30

	movaps	-33 * SIZE(Y), %xmm6
	addl	$3 * SIZE, Y

	shufps	$0xb1, %xmm1, %xmm1

	movl	N, %eax
	sarl	$4, %eax
	jle	.L25

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L22
	ALIGN_3

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	 0 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	 4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	 0 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L21
	ALIGN_3

.L22:
	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3

.L25:
	testl	$8, N
	jle	.L26

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L26:
	testl	$4, N
	jle	.L27

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L27:
	testl	$2, N
	jle	.L28

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1

	movaps	%xmm7, %xmm6
	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L28:
	testl	$1, N
	jle	.L29

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4

	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1
	ALIGN_3

.L29:
	shufps	$0xb1, %xmm1, %xmm1
	jmp	.L98
	ALIGN_3
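
/* Y offset by two floats (8 mod 16), or any misalignment when
 * ALIGNED_ACCESS is not defined: load Y in halves with movsd/movhps
 * instead of movaps. */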
.L30:
	testl	$SIZE, Y
	jne	.L40
#endif

	movl	N, %eax
	sarl	$4, %eax
	jle	.L35

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm6
	movhps	-30 * SIZE(Y), %xmm6
	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm7
	movhps	-26 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L32
	ALIGN_3

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-24 * SIZE(Y), %xmm6
	movhps	-22 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-20 * SIZE(Y), %xmm7
	movhps	-18 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-16 * SIZE(Y), %xmm6
	movhps	-14 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-12 * SIZE(Y), %xmm7
	movhps	-10 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-8 * SIZE(Y), %xmm6
	movhps	-6 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-8 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-4 * SIZE(Y), %xmm7
	movhps	-2 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	 0 * SIZE(Y), %xmm6
	movhps	 2 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	 0 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	 4 * SIZE(Y), %xmm7
	movhps	 6 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	 4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L31
	ALIGN_3

.L32:
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-24 * SIZE(Y), %xmm6
	movhps	-22 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-20 * SIZE(Y), %xmm7
	movhps	-18 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-16 * SIZE(Y), %xmm6
	movhps	-14 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-16 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-12 * SIZE(Y), %xmm7
	movhps	-10 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-12 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-8 * SIZE(Y), %xmm6
	movhps	-6 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-8 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-4 * SIZE(Y), %xmm7
	movhps	-2 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-4 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3

.L35:
	testl	$8, N
	jle	.L36

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm6
	movhps	-30 * SIZE(Y), %xmm6
	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm7
	movhps	-26 * SIZE(Y), %xmm7

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	movsd	-24 * SIZE(Y), %xmm6
	movhps	-22 * SIZE(Y), %xmm6
	mulps	%xmm4, %xmm3
	movaps	-24 * SIZE(X), %xmm4
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	movsd	-20 * SIZE(Y), %xmm7
	movhps	-18 * SIZE(Y), %xmm7
	mulps	%xmm5, %xmm3
	movaps	-20 * SIZE(X), %xmm5
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L36:
	testl	$4, N
	jle	.L37

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm6
	movhps	-30 * SIZE(Y), %xmm6

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	movaps	-28 * SIZE(X), %xmm5
	movsd	-28 * SIZE(Y), %xmm7
	movhps	-26 * SIZE(Y), %xmm7

	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps	%xmm5, %xmm7
	addps	%xmm7, %xmm0
	mulps	%xmm5, %xmm3
	addps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L37:
	testl	$2, N
	jle	.L38

	movaps	-32 * SIZE(X), %xmm4
	movsd	-32 * SIZE(Y), %xmm6
	movhps	-30 * SIZE(Y), %xmm6

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L38:
	testl	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
#ifdef movsd
	xorps	%xmm6, %xmm6
#endif
	movsd	-32 * SIZE(Y), %xmm6

	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps	%xmm4, %xmm6
	addps	%xmm6, %xmm0
	mulps	%xmm4, %xmm3
	addps	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
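
/* Y offset by three floats (12 mod 16): aligned loads realigned with
 * movss plus shufps $0x93, the one-lane rotation in the opposite
 * direction from the .L20 path. */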
#ifdef ALIGNED_ACCESS
.L40:
	movaps	-35 * SIZE(Y), %xmm6
	addl	$1 * SIZE, Y

	shufps	$0xb1, %xmm1, %xmm1

	movl	N, %eax
	sarl	$4, %eax
	jle	.L45

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	decl	%eax
	jle	.L42
	ALIGN_3

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	 0 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	 4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	 0 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L41
	ALIGN_3

.L42:
	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3

.L45:
	testl	$8, N
	jle	.L46

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(X), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(X), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(Y), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L46:
	testl	$4, N
	jle	.L47

	movaps	-32 * SIZE(X), %xmm4
	movaps	-28 * SIZE(X), %xmm5
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(Y), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x93, %xmm6, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L47:
	testl	$2, N
	jle	.L48

	movaps	-32 * SIZE(X), %xmm4
	movaps	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm7, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1

	movaps	%xmm7, %xmm6
	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L48:
	testl	$1, N
	jle	.L49

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(X), %xmm4
	movss	-32 * SIZE(Y), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x93, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1
	ALIGN_3

.L49:
	shufps	$0xb1, %xmm1, %xmm1
	jmp	.L98
	ALIGN_3
#endif
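
/* X is not 8-byte aligned. If Y is odd-float aligned too, branch to
 * .L70 (outside this excerpt); otherwise peel and realign so that the
 * paths below mirror .L20/.L40 with the roles of X and Y exchanged. */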
.L50:
	testl	$SIZE, Y
	jne	.L70

#ifdef ALIGNED_ACCESS
	testl	$2 * SIZE, Y
	je	.L50x

#ifdef movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	-32 * SIZE(X), %xmm0
#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4

	PSHUFD2($0xb1, %xmm0, %xmm1)
	mulps	%xmm4, %xmm0
	mulps	%xmm4, %xmm1
	addl	$2 * SIZE, X
	addl	$2 * SIZE, Y
	decl	N
	ALIGN_3

.L50x:
	testl	$2 * SIZE, X
	jne	.L60

	movaps	-33 * SIZE(X), %xmm6
	addl	$3 * SIZE, X

	shufps	$0xb1, %xmm1, %xmm1

	movl	N, %eax
	sarl	$4, %eax
	jle	.L55

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-32 * SIZE(X), %xmm7

	decl	%eax
	jle	.L52
	ALIGN_3

.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	 0 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	 4 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	 0 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L51
	ALIGN_3

.L52:
	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-16 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-12 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-16 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-8 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-12 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-4 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-8 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-4 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3

.L55:
	testl	$8, N
	jle	.L56

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-32 * SIZE(X), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	movaps	-24 * SIZE(Y), %xmm4
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	movaps	-20 * SIZE(Y), %xmm5
	mulps	%xmm7, %xmm3
	movaps	-24 * SIZE(X), %xmm7
	addps	%xmm3, %xmm1

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-20 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_3

.L56:
	testl	$4, N
	jle	.L57

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-28 * SIZE(Y), %xmm5
	movaps	-32 * SIZE(X), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	movaps	-28 * SIZE(X), %xmm6
	addps	%xmm3, %xmm1

	movss	%xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps	$0x39, %xmm7, %xmm7
	mulps	%xmm7, %xmm5
	addps	%xmm5, %xmm0
	mulps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_3

.L57:
	testl	$2, N
	jle	.L58

	movaps	-32 * SIZE(Y), %xmm4
	movaps	-32 * SIZE(X), %xmm7

	movss	%xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1

	movaps	%xmm7, %xmm6
	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_3

.L58:
	testl	$1, N
	jle	.L98

#ifdef movsd
	xorps	%xmm4, %xmm4
#endif
	movsd	-32 * SIZE(Y), %xmm4

	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps	$0x39, %xmm6, %xmm6
	mulps	%xmm6, %xmm4
	addps	%xmm4, %xmm0
	mulps	%xmm6, %xmm3
	addps	%xmm3, %xmm1
	jmp	.L98
	ALIGN_3
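
/* X offset by three floats (12 mod 16): the shufps $0x93 one-lane
 * rotation of .L40, applied to X instead of Y. */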
.L60:
	movaps -35 * SIZE(X), %xmm6
	addl $1 * SIZE, X
	shufps $0xb1, %xmm1, %xmm1
	movl N, %eax
	sarl $4, %eax
	jle .L65
	movaps -32 * SIZE(Y), %xmm4
	movaps -28 * SIZE(Y), %xmm5
	movaps -32 * SIZE(X), %xmm7
	decl %eax
	jle .L62
	ALIGN_3
.L61:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -28 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -24 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -16 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -20 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -12 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -16 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -8 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -12 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -8 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps 0 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -4 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps 4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps 0 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L61
	ALIGN_3
.L62:
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -28 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -24 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -16 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -20 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -12 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -16 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -8 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -12 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -8 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	movaps -4 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3
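/*
 * Tail handling: after the 16-element unrolled loop, the low
 * bits of N select progressively smaller blocks.  Roughly, in C
 * (function names are illustrative only):
 *
 *     if (n & 8) do_8_elements();
 *     if (n & 4) do_4_elements();
 *     if (n & 2) do_2_elements();
 *     if (n & 1) do_1_element();
 *
 * Each block repeats the splice/multiply/accumulate pattern of
 * the loop above, just without reloading ahead.
 */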
.L65:
	testl $8, N
	jle .L66
	movaps -32 * SIZE(Y), %xmm4
	movaps -28 * SIZE(Y), %xmm5
	movaps -32 * SIZE(X), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movaps -28 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movaps -24 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	movaps -20 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_3
.L66:
	testl $4, N
	jle .L67
	movaps -32 * SIZE(Y), %xmm4
	movaps -28 * SIZE(Y), %xmm5
	movaps -32 * SIZE(X), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	movaps -28 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	shufps $0x93, %xmm6, %xmm7
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_3
.L67:
	testl $2, N
	jle .L68
	movaps -32 * SIZE(Y), %xmm4
	movaps -32 * SIZE(X), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm7, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	movaps %xmm7, %xmm6
	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_3
.L68:
	testl $1, N
	jle .L98
#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(Y), %xmm4
	movss -32 * SIZE(X), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	shufps $0x93, %xmm6, %xmm6
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	jmp .L98
	ALIGN_3
#else
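/*
 * Alternate preprocessor branch of the same kernel: X is read
 * with alignment-free movlps/movhps pairs instead of spliced
 * movaps loads.  If Y is 8- but not 16-byte aligned, one complex
 * element is peeled first so the movaps loads from Y below stay
 * legal, and N is decremented accordingly.
 */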
	testl $2 * SIZE, Y
	je .L50x
#ifdef movsd
	xorps %xmm0, %xmm0
#endif
	movsd -32 * SIZE(Y), %xmm0
#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(X), %xmm4
	PSHUFD2($0xb1, %xmm0, %xmm1)
	mulps %xmm4, %xmm0
	mulps %xmm4, %xmm1
	addl $2 * SIZE, X
	addl $2 * SIZE, Y
	decl N
	ALIGN_3
.L50x:
	movl N, %eax
	sarl $4, %eax
	jle .L55
	movaps -32 * SIZE(Y), %xmm4
	movlps -32 * SIZE(X), %xmm6
	movhps -30 * SIZE(X), %xmm6
	movaps -28 * SIZE(Y), %xmm5
	movlps -28 * SIZE(X), %xmm7
	movhps -26 * SIZE(X), %xmm7
	decl %eax
	jle .L52
	ALIGN_3
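/*
 * Core complex multiply-accumulate, two complex elements per
 * vector.  With y = (y0r, y0i, y1r, y1i) and x laid out the
 * same way:
 *
 *     xmm0 += x * y          lanes: (xr*yr, xi*yi, ...)
 *     xmm1 += x * swap(y)    lanes: (xr*yi, xi*yr, ...)
 *
 * where swap() is PSHUFD2($0xb1, ...), exchanging real and
 * imaginary lanes.  .L98 reduces these lane sums into the real
 * and imaginary parts of the result.
 */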
.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -24 * SIZE(X), %xmm6
	movhps -22 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -20 * SIZE(X), %xmm7
	movhps -18 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -16 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -16 * SIZE(X), %xmm6
	movhps -14 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -12 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -12 * SIZE(X), %xmm7
	movhps -10 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -8 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -8 * SIZE(X), %xmm6
	movhps -6 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -4 * SIZE(X), %xmm7
	movhps -2 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps 0 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps 0 * SIZE(X), %xmm6
	movhps 2 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps 4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps 4 * SIZE(X), %xmm7
	movhps 6 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L51
	ALIGN_3
.L52:
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -24 * SIZE(X), %xmm6
	movhps -22 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -20 * SIZE(X), %xmm7
	movhps -18 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -16 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -16 * SIZE(X), %xmm6
	movhps -14 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -12 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -12 * SIZE(X), %xmm7
	movhps -10 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -8 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -8 * SIZE(X), %xmm6
	movhps -6 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -4 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -4 * SIZE(X), %xmm7
	movhps -2 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3
.L55:
	testl $8, N
	jle .L56
	movaps -32 * SIZE(Y), %xmm4
	movlps -32 * SIZE(X), %xmm6
	movhps -30 * SIZE(X), %xmm6
	movaps -28 * SIZE(Y), %xmm5
	movlps -28 * SIZE(X), %xmm7
	movhps -26 * SIZE(X), %xmm7
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	movaps -24 * SIZE(Y), %xmm4
	mulps %xmm6, %xmm3
	movlps -24 * SIZE(X), %xmm6
	movhps -22 * SIZE(X), %xmm6
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	movaps -20 * SIZE(Y), %xmm5
	mulps %xmm7, %xmm3
	movlps -20 * SIZE(X), %xmm7
	movhps -18 * SIZE(X), %xmm7
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_3
.L56:
	testl $4, N
	jle .L57
	movaps -32 * SIZE(Y), %xmm4
	movlps -32 * SIZE(X), %xmm6
	movhps -30 * SIZE(X), %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	movaps -28 * SIZE(Y), %xmm5
	movlps -28 * SIZE(X), %xmm7
	movhps -26 * SIZE(X), %xmm7
	PSHUFD2($0xb1, %xmm5, %xmm3)
	mulps %xmm7, %xmm5
	addps %xmm5, %xmm0
	mulps %xmm7, %xmm3
	addps %xmm3, %xmm1
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_3
.L57:
	testl $2, N
	jle .L58
	movaps -32 * SIZE(Y), %xmm4
	movlps -32 * SIZE(X), %xmm6
	movhps -30 * SIZE(X), %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_3
.L58:
	testl $1, N
	jle .L98
#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(Y), %xmm4
#ifdef movsd
	xorps %xmm6, %xmm6
#endif
	movsd -32 * SIZE(X), %xmm6
	PSHUFD2($0xb1, %xmm4, %xmm3)
	mulps %xmm6, %xmm4
	addps %xmm4, %xmm0
	mulps %xmm6, %xmm3
	addps %xmm3, %xmm1
	jmp .L98
	ALIGN_3
#endif
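/*
 * .L70: X and Y share no common 16-byte phase.  If Y is 8-byte
 * offset, one complex element is peeled here (with a $0xb1 lane
 * swap so its partial products land in the loop's lane layout);
 * the code then branches on whether X is 8-byte offset (.L80)
 * or one float off (fall-through to .L70x).
 */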
.L70:
	testl $2 * SIZE, Y
	je .L70x
#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd -32 * SIZE(X), %xmm4
	addl $2 * SIZE, X
#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd -32 * SIZE(Y), %xmm1
	addl $2 * SIZE, Y
	PSHUFD2($0xb1, %xmm1, %xmm0)
	shufps $0xb1, %xmm4, %xmm4
	mulps %xmm4, %xmm0
	mulps %xmm4, %xmm1
	decl N
	ALIGN_3
.L70x:
	testl $2 * SIZE, X
	jne .L80
	movaps -33 * SIZE(X), %xmm4
	addl $3 * SIZE, X
	movaps -33 * SIZE(Y), %xmm6
	addl $3 * SIZE, Y
	movl N, %eax
	sarl $4, %eax
	jle .L75
	movaps -32 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	decl %eax
	jle .L72
	ALIGN_3
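/*
 * Both streams are off by one float here, so each group splices
 * the previous and current vectors of X and Y with movss, and
 * PSHUFD2($0x1b, ...) reverses the spliced Y vector to produce
 * the swapped real/imaginary lanes; the accumulators are rotated
 * back into place at .L79.
 */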
.L71:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -28 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -24 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -20 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -16 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -16 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -12 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -12 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -8 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -8 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -4 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -4 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps 0 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps 0 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L71
	ALIGN_3
.L72:
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -28 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -24 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -20 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -16 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -16 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -12 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -12 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -8 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -8 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -4 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -4 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3
.L75:
	testl $8, N
	jle .L76
	movaps -32 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -28 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movaps -24 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -20 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_3
.L76:
	testl $4, N
	jle .L77
	movaps -32 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movaps -28 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_3
.L77:
	testl $2, N
	jle .L78
	movaps -32 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	movaps %xmm5, %xmm4
	movaps %xmm7, %xmm6
	ALIGN_3
.L78:
	testl $1, N
	jle .L79
	xorps %xmm5, %xmm5
	movss %xmm5, %xmm4
	movss %xmm5, %xmm6
	shufps $0x24, %xmm4, %xmm4
	PSHUFD2($0x18, %xmm6, %xmm3)
	shufps $0x24, %xmm6, %xmm6
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L79:
	shufps $0x39, %xmm0, %xmm0
	shufps $0x39, %xmm1, %xmm1
	jmp .L98
	ALIGN_3
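/*
 * .L80: X ends up on an 8-byte phase, so it is gathered with
 * alignment-free movsd/movhps pairs while Y keeps the movss
 * splice; otherwise this is the same multiply-accumulate
 * pattern as .L71.
 */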
.L80:
	movsd -33 * SIZE(X), %xmm4
	movhps -31 * SIZE(X), %xmm4
	addl $3 * SIZE, X
	movaps -33 * SIZE(Y), %xmm6
	addl $3 * SIZE, Y
	movl N, %eax
	sarl $4, %eax
	jle .L85
	movsd -32 * SIZE(X), %xmm5
	movhps -30 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	decl %eax
	jle .L82
	ALIGN_3
.L81:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -28 * SIZE(X), %xmm4
	movhps -26 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -24 * SIZE(X), %xmm5
	movhps -22 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -20 * SIZE(X), %xmm4
	movhps -18 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -16 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -16 * SIZE(X), %xmm5
	movhps -14 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -12 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -12 * SIZE(X), %xmm4
	movhps -10 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -8 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -8 * SIZE(X), %xmm5
	movhps -6 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -4 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -4 * SIZE(X), %xmm4
	movhps -2 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps 0 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd 0 * SIZE(X), %xmm5
	movhps 2 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L81
	ALIGN_3
.L82:
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -28 * SIZE(X), %xmm4
	movhps -26 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -24 * SIZE(X), %xmm5
	movhps -22 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -20 * SIZE(X), %xmm4
	movhps -18 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -16 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -16 * SIZE(X), %xmm5
	movhps -14 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -12 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -12 * SIZE(X), %xmm4
	movhps -10 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -8 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -8 * SIZE(X), %xmm5
	movhps -6 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -4 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -4 * SIZE(X), %xmm4
	movhps -2 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3
.L85:
	testl $8, N
	jle .L86
	movsd -32 * SIZE(X), %xmm5
	movhps -30 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -28 * SIZE(X), %xmm4
	movhps -26 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movaps -24 * SIZE(Y), %xmm7
	mulps %xmm5, %xmm3
	movsd -24 * SIZE(X), %xmm5
	movhps -22 * SIZE(X), %xmm5
	addps %xmm3, %xmm1
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -20 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -20 * SIZE(X), %xmm4
	movhps -18 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_3
.L86:
	testl $4, N
	jle .L87
	movsd -32 * SIZE(X), %xmm5
	movhps -30 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movaps -28 * SIZE(Y), %xmm6
	mulps %xmm4, %xmm3
	movsd -28 * SIZE(X), %xmm4
	movhps -26 * SIZE(X), %xmm4
	addps %xmm3, %xmm1
	movss %xmm6, %xmm7
	PSHUFD2($0x1b, %xmm7, %xmm3)
	movss %xmm4, %xmm5
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_3
.L87:
	testl $2, N
	jle .L88
	movsd -32 * SIZE(X), %xmm5
	movhps -30 * SIZE(X), %xmm5
	movaps -32 * SIZE(Y), %xmm7
	movss %xmm7, %xmm6
	PSHUFD2($0x1b, %xmm6, %xmm3)
	movss %xmm5, %xmm4
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	movaps %xmm5, %xmm4
	movaps %xmm7, %xmm6
	ALIGN_3
.L88:
	testl $1, N
	jle .L89
	xorps %xmm5, %xmm5
	movss %xmm5, %xmm4
	movss %xmm5, %xmm6
	shufps $0x24, %xmm4, %xmm4
	PSHUFD2($0x18, %xmm6, %xmm3)
	shufps $0x24, %xmm6, %xmm6
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L89:
	shufps $0x39, %xmm0, %xmm0
	shufps $0x39, %xmm1, %xmm1
	jmp .L98
	ALIGN_3
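/*
 * .L200: the strided path (general INCX/INCY).  Two complex
 * elements are gathered per register with movsd/movhps and fed
 * through the same vector multiply-accumulate.  A scalar C
 * sketch of the accumulation (variable names are illustrative
 * only; increments are in floats):
 *
 *     for (i = 0; i < n; i++) {
 *         float xr = x[0], xi = x[1];
 *         float yr = y[0], yi = y[1];
 *         acc_rr += xr * yr;  acc_ii += xi * yi;
 *         acc_ri += xr * yi;  acc_ir += xi * yr;
 *         x += incx;  y += incy;
 *     }
 */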
.L200:
	movl N, %eax
	sarl $4, %eax
	jle .L205
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	decl %eax
	jle .L204
	ALIGN_3
.L203:
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	decl %eax
	jg .L203
	ALIGN_3
.L204:
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L205:
	testl $8, N
	jle .L206
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	mulps %xmm4, %xmm3
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	mulps %xmm5, %xmm3
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L206:
	testl $4, N
	jle .L207
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	movsd (X), %xmm5
	addl INCX, X
	movhps (X), %xmm5
	addl INCX, X
	movsd (Y), %xmm7
	addl INCY, Y
	movhps (Y), %xmm7
	addl INCY, Y
	PSHUFD2($0xb1, %xmm7, %xmm3)
	mulps %xmm5, %xmm7
	addps %xmm7, %xmm0
	mulps %xmm5, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L207:
	testl $2, N
	jle .L208
	movsd (X), %xmm4
	addl INCX, X
	movhps (X), %xmm4
	addl INCX, X
	movsd (Y), %xmm6
	addl INCY, Y
	movhps (Y), %xmm6
	addl INCY, Y
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
.L208:
	testl $1, N
	jle .L98
#ifdef movsd
	xorps %xmm4, %xmm4
#endif
	movsd (X), %xmm4
#ifdef movsd
	xorps %xmm6, %xmm6
#endif
	movsd (Y), %xmm6
	PSHUFD2($0xb1, %xmm6, %xmm3)
	mulps %xmm4, %xmm6
	addps %xmm6, %xmm0
	mulps %xmm4, %xmm3
	addps %xmm3, %xmm1
	ALIGN_3
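/*
 * Final reduction.  xmm0 holds the (xr*yr, xi*yi) partial sums
 * and xmm1 the (xr*yi, xi*yr) partial sums.  After folding the
 * vector halves down to scalars:
 *
 *     Re = sum(xr*yr) - sum(xi*yi)     (CONJ: +)
 *     Im = sum(xr*yi) + sum(xi*yr)     (CONJ: -)
 *
 * i.e. dotu = sum(x[i] * y[i]) and, with CONJ defined,
 * dotc = sum(conj(x[i]) * y[i]).
 */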
.L98:
	movhlps %xmm0, %xmm2
	movhlps %xmm1, %xmm3
	addps %xmm2, %xmm0
	addps %xmm3, %xmm1
	PSHUFD2($1, %xmm0, %xmm2)
	PSHUFD2($1, %xmm1, %xmm3)
#ifndef CONJ
	subss %xmm2, %xmm0
	addss %xmm3, %xmm1
#else
	addss %xmm2, %xmm0
	subss %xmm3, %xmm1
#endif
	ALIGN_4
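/*
 * Return path: the two scalar results are bounced through the
 * stack into %eax/%edx, which appears to be the register pair
 * this 32-bit build uses to hand a complex float back to the
 * caller.
 */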
.L999:
	subl $2 * SIZE, %esp
	movss %xmm0, 0 * SIZE(%esp)
	movss %xmm1, 1 * SIZE(%esp)
	movl 0 * SIZE(%esp), %eax
	movl 1 * SIZE(%esp), %edx
	addl $2 * SIZE, %esp
	popl %ebx
	popl %esi
	popl %edi
	ret
	EPILOGUE