
zaxpy_sse.S 64 kB
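This is the 32-bit x86 SSE kernel for complex single-precision AXPY. As a reading aid, here is a minimal C sketch of the operation the kernel computes, assuming the usual OpenBLAS complex-AXPY semantics; the name caxpy_ref and its signature are illustrative, not part of the source.

#include <complex.h>

/* Reference model (hypothetical, for illustration only):
   y[i] += alpha * x[i], or y[i] += alpha * conj(x[i]) when CONJ
   is defined. The assembly below computes the same thing with SSE
   shuffles and packed multiplies. */
void caxpy_ref(int n, float alpha_r, float alpha_i,
               const float complex *x, int incx,
               float complex *y, int incy)
{
    float complex alpha = alpha_r + alpha_i * I;
    for (int i = 0; i < n; i++) {
#ifndef CONJ
        y[i * incy] += alpha * x[i * incx];
#else
        y[i * incy] += alpha * conjf(x[i * incx]);
#endif
    }
}

The unrolled paths below handle the contiguous case (incx == incy == 1); strided vectors branch to .L100.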

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK 16
#define ARGS 0

#define STACK_M 4 + STACK + ARGS(%esp)
#define STACK_ALPHA_R 16 + STACK + ARGS(%esp)
#define STACK_ALPHA_I 20 + STACK + ARGS(%esp)
#define STACK_X 24 + STACK + ARGS(%esp)
#define STACK_INCX 28 + STACK + ARGS(%esp)
#define STACK_Y 32 + STACK + ARGS(%esp)
#define STACK_INCY 36 + STACK + ARGS(%esp)

#define M %ebx
#define X %esi
#define INCX %ecx
#define Y %edi
#define INCY %edx
#define YY %ebp

#define ALPHA_R %xmm6
#define ALPHA_I %xmm7

#include "l1param.h"
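/* Annotation (not in the original source): the kernel reads its
   arguments off the stack, converts INCX/INCY from complex elements
   to bytes, prepares a sign mask and broadcast copies of alpha, and
   then dispatches on the 16-byte alignment of X and Y into one of
   several loops unrolled by 16 complex elements. */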
PROLOGUE
PROFCODE

	pushl %edi
	pushl %esi
	pushl %ebx
	pushl %ebp

	movl STACK_M, M
	movss STACK_ALPHA_R, ALPHA_R
	movss STACK_ALPHA_I, ALPHA_I
	movl STACK_X, X
	movl STACK_INCX, INCX
	movl STACK_Y, Y
	movl STACK_INCY, INCY

	sall $ZBASE_SHIFT, INCX
	sall $ZBASE_SHIFT, INCY

	testl M, M
	jle .L999

	cmpl $2 * SIZE, INCX
	jne .L100
	cmpl $2 * SIZE, INCY
	jne .L100

#ifdef HAVE_SSE2
	pcmpeqb %xmm5, %xmm5
	psllq $63, %xmm5
#else
	movl $0x80000000, STACK_M
	movss STACK_M, %xmm5
	shufps $0x11, %xmm5, %xmm5
#endif
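/* xmm5 is now a sign mask with the sign bit set in the imaginary
   (odd) lanes: built with pcmpeqb/psllq under SSE2, otherwise loaded
   as 0x80000000 through a stack slot and spread with shufps. */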
	shufps $0, ALPHA_R, ALPHA_R
	shufps $0, ALPHA_I, ALPHA_I

#ifndef CONJ
	shufps $0xb1, %xmm5, %xmm5
	xorps %xmm5, ALPHA_I
#else
	xorps %xmm5, ALPHA_R
#endif
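/* alpha is broadcast across all four lanes and the sign mask is
   folded into ALPHA_I (default) or ALPHA_R (CONJ), so the recurring
   PSHUFD2/mulps/mulps/addps pattern below computes y += alpha * x,
   or y += alpha * conj(x) when CONJ is defined. */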
	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y

	testl $2 * SIZE, Y
	je .L10
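/* Y starts on an odd complex element: peel one element so the
   stores to Y below can be 16-byte aligned. */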
#ifndef HAVE_SSE2
	xorps %xmm0, %xmm0
#endif
	movsd -32 * SIZE(X), %xmm0
#ifndef HAVE_SSE2
	xorps %xmm1, %xmm1
#endif
	movsd -32 * SIZE(Y), %xmm1

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	addps %xmm1, %xmm0
	movlps %xmm0, -32 * SIZE(Y)

	addl $2 * SIZE, X
	addl $2 * SIZE, Y
	decl M
	jle .L999
	ALIGN_2

.L10:
	testl $SIZE, Y
	jne .L50

	testl $3 * SIZE, X
	jne .L20
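/* Both X and Y are 16-byte aligned here: the main loop handles 16
   complex elements per iteration, followed by 8/4/2/1-element tails. */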
	movl M, %eax
	sarl $4, %eax
	jle .L15

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	decl %eax
	jle .L12
	ALIGN_3

.L11:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)
	movaps 4 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)
	movaps 8 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)
	movaps 12 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L11
	ALIGN_3

.L12:
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3

.L15:
	testl $8, M
	jle .L16

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)

	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_2

.L16:
	testl $4, M
	jle .L17

	movaps -32 * SIZE(X), %xmm0
	movaps -28 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_2

.L17:
	testl $2, M
	jle .L18

	movaps -32 * SIZE(X), %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_2

.L18:
	testl $1, M
	jle .L999

#ifndef HAVE_SSE2
	xorps %xmm0, %xmm0
#endif
	movsd -32 * SIZE(X), %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
#ifndef HAVE_SSE2
	xorps %xmm1, %xmm1
	movlps -32 * SIZE(Y), %xmm1
#else
	movsd -32 * SIZE(Y), %xmm1
#endif
	addps %xmm1, %xmm0
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3
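/* .L20 (ALIGNED_ACCESS builds): Y is 16-byte aligned but X is not.
   The code below handles X offset by one float, realigning loads
   with movss + SHUFPS_39; offsets of two and three floats branch to
   .L30/.L40. */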
.L20:
#ifdef ALIGNED_ACCESS
	testl $2 * SIZE, X
	jne .L30

	subl $1 * SIZE, X
	movaps -32 * SIZE(X), %xmm0

	movl M, %eax
	sarl $4, %eax
	jle .L25

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	decl %eax
	jle .L22
	ALIGN_3

.L21:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)
	movaps 4 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)
	movaps 8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)
	movaps 12 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L21
	ALIGN_3

.L22:
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)

	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)

	movss %xmm0, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3

.L25:
	testl $8, M
	jle .L26

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2

	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps -20 * SIZE(X), %xmm3
	movaps -16 * SIZE(X), %xmm0

	movss %xmm3, %xmm2
	SHUFPS_39 %xmm2, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)

	movss %xmm0, %xmm3
	SHUFPS_39 %xmm3, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)

	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_2

.L26:
	testl $4, M
	jle .L27

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2

	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movss %xmm2, %xmm1
	SHUFPS_39 %xmm1, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps %xmm2, %xmm0
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_2

.L27:
	testl $2, M
	jle .L28

	movaps -28 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	SHUFPS_39 %xmm0, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movaps %xmm1, %xmm0
	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_2

.L28:
	testl $1, M
	jle .L999

	PSHUFD2($0x06, %xmm0, %xmm5)
	PSHUFD2($0x09, %xmm0, %xmm0)
	mulps ALPHA_I, %xmm5
	mulps ALPHA_R, %xmm0
#ifndef HAVE_SSE2
	xorps %xmm1, %xmm1
	movlps -32 * SIZE(Y), %xmm1
#else
	movsd -32 * SIZE(Y), %xmm1
#endif
	addps %xmm1, %xmm0
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3
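/* .L30 (ALIGNED_ACCESS): X is offset by two floats and falls through
   to the movsd/movhps loads below; without ALIGNED_ACCESS this same
   code serves as the generic unaligned-X path. */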
.L30:
	testl $1 * SIZE, X
	jne .L40
#endif

	movl M, %eax
	sarl $4, %eax
	jle .L35

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1
	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3

	decl %eax
	jle .L32
	ALIGN_3

.L31:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movsd -16 * SIZE(X), %xmm0
	movhps -14 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movsd -12 * SIZE(X), %xmm1
	movhps -10 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm2
	movhps -6 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movsd -4 * SIZE(X), %xmm3
	movhps -2 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movsd 0 * SIZE(X), %xmm0
	movhps 2 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)
	movsd 4 * SIZE(X), %xmm1
	movhps 6 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)
	movsd 8 * SIZE(X), %xmm2
	movhps 10 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)
	movsd 12 * SIZE(X), %xmm3
	movhps 14 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L31
	ALIGN_3

.L32:
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movsd -16 * SIZE(X), %xmm0
	movhps -14 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movsd -12 * SIZE(X), %xmm1
	movhps -10 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movsd -8 * SIZE(X), %xmm2
	movhps -6 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movsd -4 * SIZE(X), %xmm3
	movhps -2 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3

.L35:
	testl $8, M
	jle .L36

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movsd -24 * SIZE(X), %xmm2
	movhps -22 * SIZE(X), %xmm2
	movsd -20 * SIZE(X), %xmm3
	movhps -18 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)

	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_2

.L36:
	testl $4, M
	jle .L37

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	movsd -28 * SIZE(X), %xmm1
	movhps -26 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_2

.L37:
	testl $2, M
	jle .L38

	movsd -32 * SIZE(X), %xmm0
	movhps -30 * SIZE(X), %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_2

.L38:
	testl $1, M
	jle .L999

	movsd -32 * SIZE(X), %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movlps %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3
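/* .L40 (ALIGNED_ACCESS): X is offset by three floats; X is read one
   16-byte block early and the lanes are rotated with movss +
   shufps $0x93 before the multiply/add. */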
#ifdef ALIGNED_ACCESS
.L40:
	subl $3 * SIZE, X
	movaps -32 * SIZE(X), %xmm0

	movl M, %eax
	sarl $4, %eax
	jle .L45

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2
	movaps -20 * SIZE(X), %xmm3

	decl %eax
	jle .L42
	ALIGN_3

.L41:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)
	movaps 4 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)
	movaps 8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)
	movaps 12 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L41
	ALIGN_3

.L42:
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm1

	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm2

	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm3

	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -16 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm0

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -12 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -12 * SIZE(Y)

	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -8 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -8 * SIZE(Y)

	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -4 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -4 * SIZE(Y)

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3

.L45:
	testl $8, M
	jle .L46

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2

	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps -20 * SIZE(X), %xmm3
	movaps -16 * SIZE(X), %xmm0

	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps -24 * SIZE(Y), %xmm2
	addps %xmm5, %xmm2
	movaps %xmm2, -24 * SIZE(Y)

	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps -20 * SIZE(Y), %xmm3
	addps %xmm5, %xmm3
	movaps %xmm3, -20 * SIZE(Y)

	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_2

.L46:
	testl $4, M
	jle .L47

	movaps -28 * SIZE(X), %xmm1
	movaps -24 * SIZE(X), %xmm2

	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps -28 * SIZE(Y), %xmm1
	addps %xmm5, %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps %xmm2, %xmm0
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_2

.L47:
	testl $2, M
	jle .L48

	movaps -28 * SIZE(X), %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps -32 * SIZE(Y), %xmm0
	addps %xmm5, %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movaps %xmm1, %xmm0
	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_2

.L48:
	testl $1, M
	jle .L999

	movaps -28 * SIZE(X), %xmm1
	movsd -32 * SIZE(Y), %xmm2

	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	addps %xmm2, %xmm0
	movlps %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3
#endif
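/* .L50: Y is offset by one float. Products are computed as usual and
   then rotated with movss + shufps $0x93 so the stores to Y remain
   16-byte aligned; xmm0 carries the leftover lane from one block to
   the next, and .L59 flushes the final lane. */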
.L50:
	xorps %xmm0, %xmm0
	subl $1 * SIZE, Y

	testl $3 * SIZE, X
	jne .L60

	movl M, %eax
	sarl $4, %eax
	jle .L55

	movaps -32 * SIZE(X), %xmm1
	movaps -28 * SIZE(X), %xmm2
	movaps -24 * SIZE(X), %xmm3

	decl %eax
	jle .L52
	ALIGN_3

.L51:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -20 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm3
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -12 * SIZE(Y), %xmm1
	movaps %xmm1, -12 * SIZE(Y)
	movaps 0 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif
	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm3
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	addps -8 * SIZE(Y), %xmm2
	movaps %xmm2, -8 * SIZE(Y)
	movaps 4 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	addps -4 * SIZE(Y), %xmm3
	movaps %xmm3, -4 * SIZE(Y)
	movaps 8 * SIZE(X), %xmm3

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	decl %eax
	jg .L51
	ALIGN_3

.L52:
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	movaps -20 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)
	movaps -16 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm3
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)
	movaps -12 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)
	movaps -8 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -16 * SIZE(Y), %xmm0
	movaps %xmm0, -16 * SIZE(Y)
	movaps -4 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -12 * SIZE(Y), %xmm1
	movaps %xmm1, -12 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm3
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	addps -8 * SIZE(Y), %xmm2
	movaps %xmm2, -8 * SIZE(Y)

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	addps -4 * SIZE(Y), %xmm3
	movaps %xmm3, -4 * SIZE(Y)

	subl $-32 * SIZE, X
	subl $-32 * SIZE, Y
	ALIGN_3

.L55:
	testl $8, M
	jle .L56

	movaps -32 * SIZE(X), %xmm1
	movaps -28 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps -24 * SIZE(X), %xmm3
	movaps -20 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps ALPHA_R, %xmm3
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm3
	movss %xmm3, %xmm2
	shufps $0x93, %xmm3, %xmm2
	addps -24 * SIZE(Y), %xmm2
	movaps %xmm2, -24 * SIZE(Y)

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps ALPHA_R, %xmm0
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm0
	movss %xmm0, %xmm3
	shufps $0x93, %xmm0, %xmm3
	addps -20 * SIZE(Y), %xmm3
	movaps %xmm3, -20 * SIZE(Y)

	addl $16 * SIZE, X
	addl $16 * SIZE, Y
	ALIGN_2

.L56:
	testl $4, M
	jle .L57

	movaps -32 * SIZE(X), %xmm1
	movaps -28 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps ALPHA_R, %xmm2
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm2
	movss %xmm2, %xmm1
	shufps $0x93, %xmm2, %xmm1
	addps -28 * SIZE(Y), %xmm1
	movaps %xmm1, -28 * SIZE(Y)

	movaps %xmm2, %xmm0
	addl $8 * SIZE, X
	addl $8 * SIZE, Y
	ALIGN_2

.L57:
	testl $2, M
	jle .L58

	movaps -32 * SIZE(X), %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)

	movaps %xmm1, %xmm0
	addl $4 * SIZE, X
	addl $4 * SIZE, Y
	ALIGN_2

.L58:
	testl $1, M
	jle .L59

#ifdef movsd
	xorps %xmm1, %xmm1
#endif
	movsd -32 * SIZE(X), %xmm1
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps ALPHA_R, %xmm1
	mulps ALPHA_I, %xmm5
	addps %xmm5, %xmm1
	movss %xmm1, %xmm0
	shufps $0x93, %xmm1, %xmm0
	addps -32 * SIZE(Y), %xmm0
	movaps %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3

.L59:
	shufps $0x93, %xmm0, %xmm0
	addss -32 * SIZE(Y), %xmm0
	movss %xmm0, -32 * SIZE(Y)
	jmp .L999
	ALIGN_3
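/* .L60: both X and Y are misaligned. X is realigned on load with
   SHUFPS_39 and the products are rotated again (shufps $0x93) before
   the aligned stores to Y; other X offsets branch to .L70 in
   ALIGNED_ACCESS builds. */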
  1434. .L60:
  1435. #ifdef ALIGNED_ACCESS
  1436. testl $2 * SIZE, X
  1437. jne .L70
  1438. subl $1 * SIZE, X
  1439. movaps -32 * SIZE(X), %xmm1
  1440. movl M, %eax
  1441. sarl $4, %eax
  1442. jle .L65
  1443. movaps -28 * SIZE(X), %xmm2
  1444. movaps -24 * SIZE(X), %xmm3
  1445. decl %eax
  1446. jle .L62
  1447. ALIGN_3
.L61:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-20 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	-16 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm0, %xmm3
	SHUFPS_39	%xmm3, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	-12 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	SHUFPS_39	%xmm0, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	-8 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-4 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)
	movaps	0 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm0, %xmm3
	SHUFPS_39	%xmm3, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)
	movaps	4 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	SHUFPS_39	%xmm0, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)
	movaps	8 * SIZE(X), %xmm3

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L61
	ALIGN_3
.L62:
	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-20 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	-16 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	SHUFPS_39	%xmm3, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	-12 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	SHUFPS_39	%xmm0, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	-8 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-4 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)
	movaps	0 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	SHUFPS_39	%xmm3, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)

	movss	%xmm1, %xmm0
	SHUFPS_39	%xmm0, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3
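// .L65 through .L68: peel off 8, 4, 2, then 1 remaining elements with the
// same 1-float realignment pattern; .L69 stores the last carried float.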
.L65:
	testl	$8, M
	jle	.L66

	movaps	-28 * SIZE(X), %xmm2
	movaps	-24 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-20 * SIZE(X), %xmm0
	movaps	-16 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	SHUFPS_39	%xmm3, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm1, %xmm0
	SHUFPS_39	%xmm0, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_2

.L66:
	testl	$4, M
	jle	.L67

	movaps	-28 * SIZE(X), %xmm2
	movaps	-24 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm3, %xmm2
	SHUFPS_39	%xmm2, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_2

.L67:
	testl	$2, M
	jle	.L68

	movaps	-28 * SIZE(X), %xmm2

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movaps	%xmm1, %xmm0
	movaps	%xmm2, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_2

.L68:
	testl	$1, M
	jle	.L69

	movaps	-28 * SIZE(X), %xmm2

	movss	%xmm2, %xmm1
	SHUFPS_39	%xmm1, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movlps	%xmm0, -32 * SIZE(Y)
	movhlps	%xmm0, %xmm0
	movss	%xmm0, -30 * SIZE(Y)
	jmp	.L999

.L69:
	shufps	$0x93, %xmm0, %xmm0

	addss	-32 * SIZE(Y), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	jmp	.L999
	ALIGN_3
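// .L70: X is 8-byte but not 16-byte aligned (or ALIGNED_ACCESS is off); if
// X is additionally at an odd 4-byte offset, branch to .L80. Otherwise load
// X in two 8-byte halves (movsd/movhps), which needs no realignment.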
.L70:
	testl	$1 * SIZE, X
	jne	.L80
#endif

	movl	M, %eax
	sarl	$4, %eax
	jle	.L75

	movsd	-32 * SIZE(X), %xmm1
	movhps	-30 * SIZE(X), %xmm1
	movsd	-28 * SIZE(X), %xmm2
	movhps	-26 * SIZE(X), %xmm2
	movsd	-24 * SIZE(X), %xmm3
	movhps	-22 * SIZE(X), %xmm3

	decl	%eax
	jle	.L72
	ALIGN_3
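// .L71: unrolled loop, 16 elements per iteration; the movss/shufps $0x93
// carry keeps the stores to Y on 16-byte boundaries as before.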
.L71:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movsd	-20 * SIZE(X), %xmm0
	movhps	-18 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movsd	-16 * SIZE(X), %xmm1
	movhps	-14 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movsd	-12 * SIZE(X), %xmm2
	movhps	-10 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movsd	-8 * SIZE(X), %xmm3
	movhps	-6 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	-4 * SIZE(X), %xmm0
	movhps	-2 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)
	movsd	0 * SIZE(X), %xmm1
	movhps	2 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)
	movsd	4 * SIZE(X), %xmm2
	movhps	6 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)
	movsd	8 * SIZE(X), %xmm3
	movhps	10 * SIZE(X), %xmm3

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L71
	ALIGN_3
.L72:
	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movsd	-20 * SIZE(X), %xmm0
	movhps	-18 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movsd	-16 * SIZE(X), %xmm1
	movhps	-14 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movsd	-12 * SIZE(X), %xmm2
	movhps	-10 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movsd	-8 * SIZE(X), %xmm3
	movhps	-6 * SIZE(X), %xmm3

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movsd	-4 * SIZE(X), %xmm0
	movhps	-2 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3
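// .L75 through .L78: remainder of 8, 4, 2, then 1 elements for the
// unaligned-X path; .L79 stores the last carried float.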
.L75:
	testl	$8, M
	jle	.L76

	movsd	-32 * SIZE(X), %xmm1
	movhps	-30 * SIZE(X), %xmm1
	movsd	-28 * SIZE(X), %xmm2
	movhps	-26 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movsd	-24 * SIZE(X), %xmm3
	movhps	-22 * SIZE(X), %xmm3
	movsd	-20 * SIZE(X), %xmm0
	movhps	-18 * SIZE(X), %xmm0

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_2

.L76:
	testl	$4, M
	jle	.L77

	movsd	-32 * SIZE(X), %xmm1
	movhps	-30 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movsd	-28 * SIZE(X), %xmm2
	movhps	-26 * SIZE(X), %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_2

.L77:
	testl	$2, M
	jle	.L78

	movsd	-32 * SIZE(X), %xmm1
	movhps	-30 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movaps	%xmm1, %xmm0

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_2

.L78:
	testl	$1, M
	jle	.L79

#ifdef movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	-32 * SIZE(X), %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	jmp	.L999
	ALIGN_3

.L79:
	shufps	$0x93, %xmm0, %xmm0

	addss	-32 * SIZE(Y), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	jmp	.L999
	ALIGN_3
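// .L80: X sits at an odd 4-byte offset, three floats past a 16-byte
// boundary. Back X up by 3*SIZE so aligned loads are legal, then
// movss + shufps $0x93 appears to rotate each vector into X's true
// element order before the complex multiply.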
#ifdef ALIGNED_ACCESS
.L80:
	subl	$3 * SIZE, X

	movaps	-32 * SIZE(X), %xmm1

	movl	M, %eax
	sarl	$4, %eax
	jle	.L85

	movaps	-28 * SIZE(X), %xmm2
	movaps	-24 * SIZE(X), %xmm3

	decl	%eax
	jle	.L82
	ALIGN_3
.L81:
#ifdef PREFETCHW
	PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y)
#endif

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-20 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	-16 * SIZE(X), %xmm1

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X)
#endif

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	-12 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	-8 * SIZE(X), %xmm3

#if defined(PREFETCHW) && !defined(FETCH128)
	PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y)
#endif

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-4 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)
	movaps	0 * SIZE(X), %xmm1

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X)
#endif

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)
	movaps	4 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)
	movaps	8 * SIZE(X), %xmm3

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y

	decl	%eax
	jg	.L81
	ALIGN_3
.L82:
	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)
	movaps	-20 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)
	movaps	-16 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)
	movaps	-12 * SIZE(X), %xmm2

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)
	movaps	-8 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-16 * SIZE(Y), %xmm0
	movaps	%xmm0, -16 * SIZE(Y)
	movaps	-4 * SIZE(X), %xmm0

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-12 * SIZE(Y), %xmm1
	movaps	%xmm1, -12 * SIZE(Y)
	movaps	0 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-8 * SIZE(Y), %xmm2
	movaps	%xmm2, -8 * SIZE(Y)

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-4 * SIZE(Y), %xmm3
	movaps	%xmm3, -4 * SIZE(Y)

	subl	$-32 * SIZE, X
	subl	$-32 * SIZE, Y
	ALIGN_3
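// .L85 through .L88: remainder of 8, 4, 2, then 1 elements for this path;
// .L89 stores the last carried float.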
.L85:
	testl	$8, M
	jle	.L86

	movaps	-28 * SIZE(X), %xmm2
	movaps	-24 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	-20 * SIZE(X), %xmm0
	movaps	-16 * SIZE(X), %xmm1

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	PSHUFD2($0xb1, %xmm3, %xmm5)
	mulps	ALPHA_R, %xmm3
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm3

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	addps	-24 * SIZE(Y), %xmm2
	movaps	%xmm2, -24 * SIZE(Y)

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	PSHUFD2($0xb1, %xmm0, %xmm5)
	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm0

	movss	%xmm0, %xmm3
	shufps	$0x93, %xmm0, %xmm3

	addps	-20 * SIZE(Y), %xmm3
	movaps	%xmm3, -20 * SIZE(Y)

	addl	$16 * SIZE, X
	addl	$16 * SIZE, Y
	ALIGN_2

.L86:
	testl	$4, M
	jle	.L87

	movaps	-28 * SIZE(X), %xmm2
	movaps	-24 * SIZE(X), %xmm3

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movss	%xmm3, %xmm2
	shufps	$0x93, %xmm3, %xmm2

	PSHUFD2($0xb1, %xmm2, %xmm5)
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	addps	-28 * SIZE(Y), %xmm1
	movaps	%xmm1, -28 * SIZE(Y)

	movaps	%xmm2, %xmm0
	movaps	%xmm3, %xmm1

	addl	$8 * SIZE, X
	addl	$8 * SIZE, Y
	ALIGN_2

.L87:
	testl	$2, M
	jle	.L88

	movaps	-28 * SIZE(X), %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0
	movaps	%xmm0, -32 * SIZE(Y)

	movaps	%xmm1, %xmm0
	movaps	%xmm2, %xmm1

	addl	$4 * SIZE, X
	addl	$4 * SIZE, Y
	ALIGN_2

.L88:
	testl	$1, M
	jle	.L89

	movaps	-28 * SIZE(X), %xmm2

	movss	%xmm2, %xmm1
	shufps	$0x93, %xmm2, %xmm1

	PSHUFD2($0xb1, %xmm1, %xmm5)
	mulps	ALPHA_R, %xmm1
	mulps	ALPHA_I, %xmm5
	addps	%xmm5, %xmm1

	movss	%xmm1, %xmm0
	shufps	$0x93, %xmm1, %xmm0

	addps	-32 * SIZE(Y), %xmm0

	movlps	%xmm0, -32 * SIZE(Y)
	movhlps	%xmm0, %xmm0
	movss	%xmm0, -30 * SIZE(Y)
	jmp	.L999

.L89:
	shufps	$0x93, %xmm0, %xmm0

	addss	-32 * SIZE(Y), %xmm0
	movss	%xmm0, -32 * SIZE(Y)
	jmp	.L999
	ALIGN_3
#endif
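// .L100: general strided path (non-unit INCX/INCY, increments in bytes).
// alpha is broadcast into two interleaved vectors so that, for each complex
// element x = (xr, xi),
//   ALPHA_R * {xr, xr} + ALPHA_I * {xi, xi}
// gives {ar*xr - ai*xi, ai*xr + ar*xi} = alpha * x; with CONJ the sign
// placement changes so the result is alpha * conj(x) instead.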
.L100:
	shufps	$0, ALPHA_R, ALPHA_R
	shufps	$0, ALPHA_I, ALPHA_I

#ifndef CONJ
	xorps	%xmm5, %xmm5
	subps	ALPHA_I, %xmm5

	unpcklps	ALPHA_R, %xmm5
	unpcklps	ALPHA_I, ALPHA_R
	movaps	%xmm5, ALPHA_I
#else
	xorps	%xmm5, %xmm5
	subps	ALPHA_R, %xmm5

	unpcklps	ALPHA_I, ALPHA_R
	unpcklps	%xmm5, ALPHA_I
#endif

	// If incx == 0 or incy == 0, skip the unrolled loop and fall back to
	// the one-element-at-a-time loop at .L200.
	cmpl	$0, INCX
	je	.L200
	cmpl	$0, INCY
	je	.L200

	movl	Y, YY

	movl	M, %eax
	sarl	$3, %eax
	jle	.L105
	ALIGN_3
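// .L102: 8 complex elements per iteration, gathered and scattered through
// INCX/INCY. movshdup/movsldup (SSE3) duplicate the imaginary and real
// parts in-register; the pre-SSE3 fallback does the same with
// shufps $0xf5 / $0xa0. Stores go through YY, a trailing copy of the Y
// pointer, since the Y loads run ahead of the stores.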
.L102:
	movsd	(X), %xmm0
	addl	INCX, X
	movhps	(X), %xmm0
	addl	INCX, X
	movsd	(X), %xmm2
	addl	INCX, X
	movhps	(X), %xmm2
	addl	INCX, X

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
	movshdup	%xmm2, %xmm3
	movsldup	%xmm2, %xmm2
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
	movaps	%xmm2, %xmm3
	shufps	$0xa0, %xmm2, %xmm2
	shufps	$0xf5, %xmm3, %xmm3
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm3

	movsd	(Y), %xmm4
	addl	INCY, Y
	movhps	(Y), %xmm4
	addl	INCY, Y
	movsd	(Y), %xmm5
	addl	INCY, Y
	movhps	(Y), %xmm5
	addl	INCY, Y

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4
	addps	%xmm2, %xmm5
	addps	%xmm3, %xmm5

	movsd	%xmm4, (YY)
	addl	INCY, YY
	movhps	%xmm4, (YY)
	addl	INCY, YY
	movsd	%xmm5, (YY)
	addl	INCY, YY
	movhps	%xmm5, (YY)
	addl	INCY, YY

	movsd	(X), %xmm0
	addl	INCX, X
	movhps	(X), %xmm0
	addl	INCX, X
	movsd	(X), %xmm2
	addl	INCX, X
	movhps	(X), %xmm2
	addl	INCX, X

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
	movshdup	%xmm2, %xmm3
	movsldup	%xmm2, %xmm2
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
	movaps	%xmm2, %xmm3
	shufps	$0xa0, %xmm2, %xmm2
	shufps	$0xf5, %xmm3, %xmm3
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm3

	movsd	(Y), %xmm4
	addl	INCY, Y
	movhps	(Y), %xmm4
	addl	INCY, Y
	movsd	(Y), %xmm5
	addl	INCY, Y
	movhps	(Y), %xmm5
	addl	INCY, Y

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4
	addps	%xmm2, %xmm5
	addps	%xmm3, %xmm5

	movsd	%xmm4, (YY)
	addl	INCY, YY
	movhps	%xmm4, (YY)
	addl	INCY, YY
	movsd	%xmm5, (YY)
	addl	INCY, YY
	movhps	%xmm5, (YY)
	addl	INCY, YY

	decl	%eax
	jg	.L102
	ALIGN_3
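// .L105/.L106/.L107: strided remainder of 4, 2, then 1 complex elements.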
.L105:
	testl	$4, M
	jle	.L106

	movsd	(X), %xmm0
	addl	INCX, X
	movhps	(X), %xmm0
	addl	INCX, X
	movsd	(X), %xmm2
	addl	INCX, X
	movhps	(X), %xmm2
	addl	INCX, X

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
	movshdup	%xmm2, %xmm3
	movsldup	%xmm2, %xmm2
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
	movaps	%xmm2, %xmm3
	shufps	$0xa0, %xmm2, %xmm2
	shufps	$0xf5, %xmm3, %xmm3
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1
	mulps	ALPHA_R, %xmm2
	mulps	ALPHA_I, %xmm3

	movsd	(Y), %xmm4
	addl	INCY, Y
	movhps	(Y), %xmm4
	addl	INCY, Y
	movsd	(Y), %xmm5
	addl	INCY, Y
	movhps	(Y), %xmm5
	addl	INCY, Y

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4
	addps	%xmm2, %xmm5
	addps	%xmm3, %xmm5

	movsd	%xmm4, (YY)
	addl	INCY, YY
	movhps	%xmm4, (YY)
	addl	INCY, YY
	movsd	%xmm5, (YY)
	addl	INCY, YY
	movhps	%xmm5, (YY)
	addl	INCY, YY
	ALIGN_3

.L106:
	testl	$2, M
	jle	.L107

	movsd	(X), %xmm0
	addl	INCX, X
	movhps	(X), %xmm0
	addl	INCX, X

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1

	movsd	(Y), %xmm4
	addl	INCY, Y
	movhps	(Y), %xmm4
	addl	INCY, Y

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4

	movsd	%xmm4, (YY)
	addl	INCY, YY
	movhps	%xmm4, (YY)
	addl	INCY, YY
	ALIGN_3

.L107:
	testl	$1, M
	jle	.L999

	movsd	(X), %xmm0

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1

	movsd	(Y), %xmm4
	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4
	movsd	%xmm4, (Y)
	jmp	.L999
	ALIGN_3
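// .L200: INCX == 0 or INCY == 0. Process one element per iteration so that
// repeated reads and writes of the same location are observed in order.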
.L200:
	movl	M, %eax
	cmpl	$0, %eax
	jle	.L999
	ALIGN_3

.L201:
	movsd	(X), %xmm0

#ifdef HAVE_SSE3
	movshdup	%xmm0, %xmm1
	movsldup	%xmm0, %xmm0
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1

	movsd	(Y), %xmm4
	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm4
	movsd	%xmm4, (Y)

	// Advance by the increments (no-ops when an increment is zero), so the
	// mixed case where only one of INCX/INCY is zero still walks the
	// nonzero-stride vector element by element.
	addl	INCX, X
	addl	INCY, Y

	decl	%eax
	jg	.L201
	ALIGN_3
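// .L999: restore callee-saved registers and return.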
.L999:
	popl	%ebp
	popl	%ebx
	popl	%esi
	popl	%edi
	ret

	EPILOGUE