cgemv_n.S 81 kB

  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
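/* Overview (descriptive note): this kernel performs the non-transposed      */
/* single-precision complex GEMV update on x86-64 with SSE.  Each column of  */
/* A is scaled by alpha * x[j] and accumulated into an aligned bounce        */
/* buffer (BUFFER); the accumulated result is folded back into the strided   */
/* output vector Y later in the file (.L990 onward).                         */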
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #include "l2param.h"
  41. #if GEMV_UNROLL < 2
  42. #undef GEMV_UNROLL
  43. #define GEMV_UNROLL 2
  44. #endif
  45. #ifndef WINDOWS_ABI
  46. #define STACKSIZE 128
  47. #define OLD_INCX 8 + STACKSIZE(%rsp)
  48. #define OLD_Y 16 + STACKSIZE(%rsp)
  49. #define OLD_INCY 24 + STACKSIZE(%rsp)
  50. #define OLD_BUFFER 32 + STACKSIZE(%rsp)
  51. #define ALPHA 48 (%rsp)
  52. #define MMM 64(%rsp)
  53. #define NN 72(%rsp)
  54. #define AA 80(%rsp)
  55. #define XX 88(%rsp)
  56. #define LDAX 96(%rsp)
  57. #define ALPHAR 104(%rsp)
  58. #define ALPHAI 112(%rsp)
  59. #define M %rdi
  60. #define N %rsi
  61. #define A %rcx
  62. #define LDA %r8
  63. #define X %r9
  64. #define INCX %rdx
  65. #define Y %rbp
  66. #define INCY %r10
  67. #else
  68. #define STACKSIZE 288
  69. #define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
  70. #define OLD_A 48 + STACKSIZE(%rsp)
  71. #define OLD_LDA 56 + STACKSIZE(%rsp)
  72. #define OLD_X 64 + STACKSIZE(%rsp)
  73. #define OLD_INCX 72 + STACKSIZE(%rsp)
  74. #define OLD_Y 80 + STACKSIZE(%rsp)
  75. #define OLD_INCY 88 + STACKSIZE(%rsp)
  76. #define OLD_BUFFER 96 + STACKSIZE(%rsp)
  77. #define ALPHA 224 (%rsp)
  78. #define MMM 232(%rsp)
  79. #define NN 240(%rsp)
  80. #define AA 248(%rsp)
  81. #define XX 256(%rsp)
  82. #define LDAX 264(%rsp)
  83. #define ALPHAR 272(%rsp)
  84. #define ALPHAI 280(%rsp)
  85. #define M %rcx
  86. #define N %rdx
  87. #define A %r8
  88. #define LDA %r9
  89. #define X %rdi
  90. #define INCX %rsi
  91. #define Y %rbp
  92. #define INCY %r10
  93. #endif
  94. #define I %rax
  95. #define A1 %r11
  96. #define A2 %r12
  97. #define Y1 %r13
  98. #define BUFFER %r14
  99. #ifdef ALIGNED_ACCESS
  100. #define MM %r15
  101. #else
  102. #define MM M
  103. #endif
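/* MM is the element count actually used by the unrolled loops.  With        */
/* ALIGNED_ACCESS one leading complex element may be peeled off (see .L10    */
/* below) so that buffer stores stay 16-byte aligned, in which case MM=M-1.  */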
  104. #undef SUBPS
  105. #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
  106. #define SUBPS subps
  107. #else
  108. #define SUBPS addps
  109. #endif
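/* SUBPS resolves to subps when CONJ and XCONJ agree (neither or both        */
/* defined) and to addps otherwise, so the imaginary cross terms get the     */
/* correct sign for every conjugation variant.                               */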
  110. PROLOGUE
  111. PROFCODE
  112. subq $STACKSIZE, %rsp
  113. movq %rbx, 0(%rsp)
  114. movq %rbp, 8(%rsp)
  115. movq %r12, 16(%rsp)
  116. movq %r13, 24(%rsp)
  117. movq %r14, 32(%rsp)
  118. movq %r15, 40(%rsp)
  119. #ifdef WINDOWS_ABI
  120. movq %rdi, 48(%rsp)
  121. movq %rsi, 56(%rsp)
  122. movups %xmm6, 64(%rsp)
  123. movups %xmm7, 80(%rsp)
  124. movups %xmm8, 96(%rsp)
  125. movups %xmm9, 112(%rsp)
  126. movups %xmm10, 128(%rsp)
  127. movups %xmm11, 144(%rsp)
  128. movups %xmm12, 160(%rsp)
  129. movups %xmm13, 176(%rsp)
  130. movups %xmm14, 192(%rsp)
  131. movups %xmm15, 208(%rsp)
  132. movq OLD_A, A
  133. movq OLD_LDA, LDA
  134. movq OLD_X, X
  135. movaps %xmm3, %xmm0
  136. movss OLD_ALPHA_I, %xmm1
  137. #endif
  138. movq A, AA
  139. movq N, NN
  140. movq M, MMM
  141. movq LDA, LDAX
  142. movq X, XX
  143. movq OLD_Y, Y
  144. movss %xmm0,ALPHAR
  145. movss %xmm1,ALPHAI
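/* .L0t: outer blocking loop over M.  The column height is processed in      */
/* chunks of 2^20 complex elements (I = 1 << 20), presumably to keep the     */
/* bounce buffer at a bounded size; when MMM goes negative the remaining     */
/* rows are handled as the final, shorter block.                             */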
  146. .L0t:
  147. xorq I,I
  148. addq $1,I
  149. salq $20,I
  150. subq I,MMM
  151. movq I,M
  152. movss ALPHAR,%xmm0
  153. movss ALPHAI,%xmm1
  154. jge .L00t
  155. movq MMM,M
  156. addq I,M
  157. jle .L999x
  158. .L00t:
  159. movq AA, A
  160. movq NN, N
  161. movq LDAX, LDA
  162. movq XX, X
  163. movq OLD_INCX, INCX
  164. # movq OLD_Y, Y
  165. movq OLD_INCY, INCY
  166. movq OLD_BUFFER, BUFFER
  167. salq $ZBASE_SHIFT, LDA
  168. salq $ZBASE_SHIFT, INCX
  169. salq $ZBASE_SHIFT, INCY
  170. unpcklps %xmm1, %xmm0
  171. movlps %xmm0, ALPHA
  172. testq M, M
  173. jle .L999
  174. testq N, N
  175. jle .L999
  176. ALIGN_3
  177. subq $-32 * SIZE, A
  178. movq BUFFER, Y1
  179. pxor %xmm4, %xmm4
  180. movq M, %rax
  181. addq $8, %rax
  182. sarq $3, %rax
  183. ALIGN_3
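/* .L01: clear the bounce buffer, 16 floats (8 complex elements) per         */
/* iteration, for roughly (M + 8) / 8 iterations.                            */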
  184. .L01:
  185. movaps %xmm4, 0 * SIZE(Y1)
  186. movaps %xmm4, 4 * SIZE(Y1)
  187. movaps %xmm4, 8 * SIZE(Y1)
  188. movaps %xmm4, 12 * SIZE(Y1)
  189. subq $-16 * SIZE, Y1
  190. decq %rax
  191. jg .L01
  192. ALIGN_3
  193. .L10:
  194. #ifdef ALIGNED_ACCESS
  195. movq M, MM
  196. movq A, %rax
  197. andq $4 * SIZE - 1, %rax
  198. leaq 2 * SIZE(BUFFER), A1
  199. leaq -1(M), A2
  200. cmpq $2 * SIZE, %rax
  201. cmovge A1, BUFFER
  202. cmovge A2, MM
  203. testq $SIZE, A
  204. jne .L200
  205. testq $2 * SIZE, LDA
  206. jne .L100
  207. #endif
  208. #if GEMV_UNROLL >= 4
  209. cmpq $4, N
  210. jl .L20
  211. ALIGN_3
  212. .L11:
  213. subq $4, N
  214. leaq 32 * SIZE(BUFFER), Y1
  215. movq A, A1
  216. leaq (A, LDA, 2), A2
  217. leaq (A, LDA, 4), A
  218. movsd (X), %xmm9
  219. addq INCX, X
  220. movsd (X), %xmm11
  221. addq INCX, X
  222. movsd (X), %xmm13
  223. addq INCX, X
  224. movsd (X), %xmm15
  225. addq INCX, X
  226. #ifdef HAVE_SSE3
  227. movddup ALPHA, %xmm6
  228. #else
  229. movsd ALPHA, %xmm6
  230. unpcklpd %xmm6, %xmm6
  231. #endif
  232. pshufd $0xb1, %xmm6, %xmm5
  233. pcmpeqb %xmm7, %xmm7
  234. psllq $63, %xmm7
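/* xmm7 now holds 0x8000000000000000 in each quadword, i.e. the sign bit of  */
/* the odd (imaginary-position) float lanes; xorps with it flips the sign of */
/* those lanes only.                                                         */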
  235. pshufd $0x00, %xmm9, %xmm8
  236. pshufd $0x55, %xmm9, %xmm9
  237. pshufd $0x00, %xmm11, %xmm10
  238. pshufd $0x55, %xmm11, %xmm11
  239. pshufd $0x00, %xmm13, %xmm12
  240. pshufd $0x55, %xmm13, %xmm13
  241. pshufd $0x00, %xmm15, %xmm14
  242. pshufd $0x55, %xmm15, %xmm15
  243. #ifndef XCONJ
  244. xorps %xmm7, %xmm9
  245. xorps %xmm7, %xmm11
  246. xorps %xmm7, %xmm13
  247. xorps %xmm7, %xmm15
  248. #else
  249. xorps %xmm7, %xmm8
  250. xorps %xmm7, %xmm10
  251. xorps %xmm7, %xmm12
  252. xorps %xmm7, %xmm14
  253. #endif
  254. mulps %xmm6, %xmm8
  255. mulps %xmm5, %xmm9
  256. mulps %xmm6, %xmm10
  257. mulps %xmm5, %xmm11
  258. mulps %xmm6, %xmm12
  259. mulps %xmm5, %xmm13
  260. mulps %xmm6, %xmm14
  261. mulps %xmm5, %xmm15
  262. #ifndef XCONJ
  263. subps %xmm9, %xmm8
  264. subps %xmm11, %xmm10
  265. subps %xmm13, %xmm12
  266. subps %xmm15, %xmm14
  267. #else
  268. addps %xmm9, %xmm8
  269. addps %xmm11, %xmm10
  270. addps %xmm13, %xmm12
  271. addps %xmm15, %xmm14
  272. #endif
  273. pshufd $0x55, %xmm8, %xmm9
  274. pshufd $0x00, %xmm8, %xmm8
  275. pshufd $0x55, %xmm10, %xmm11
  276. pshufd $0x00, %xmm10, %xmm10
  277. pshufd $0x55, %xmm12, %xmm13
  278. pshufd $0x00, %xmm12, %xmm12
  279. pshufd $0x55, %xmm14, %xmm15
  280. pshufd $0x00, %xmm14, %xmm14
  281. #ifndef CONJ
  282. xorps %xmm7, %xmm9
  283. xorps %xmm7, %xmm11
  284. xorps %xmm7, %xmm13
  285. xorps %xmm7, %xmm15
  286. #else
  287. xorps %xmm7, %xmm8
  288. xorps %xmm7, %xmm10
  289. xorps %xmm7, %xmm12
  290. xorps %xmm7, %xmm14
  291. #endif
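/* At this point xmm8/xmm9 ... xmm14/xmm15 hold the real and imaginary parts */
/* of alpha * x[j] for the four current columns, each broadcast to all       */
/* lanes, with lane signs arranged so the mulps/SUBPS sequence below         */
/* performs the complex multiply for the CONJ/XCONJ variant in effect.       */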
  292. #ifdef ALIGNED_ACCESS
  293. cmpq M, MM
  294. je .L1X
  295. movsd -32 * SIZE(A1), %xmm4
  296. movsd -32 * SIZE(A1, LDA), %xmm6
  297. movsd -32 * SIZE(Y1), %xmm0
  298. pshufd $0xb1, %xmm4, %xmm5
  299. mulps %xmm8, %xmm4
  300. addps %xmm4, %xmm0
  301. movsd -32 * SIZE(A2), %xmm4
  302. pshufd $0xb1, %xmm6, %xmm7
  303. mulps %xmm9, %xmm5
  304. SUBPS %xmm5, %xmm0
  305. mulps %xmm10, %xmm6
  306. addps %xmm6, %xmm0
  307. movsd -32 * SIZE(A2, LDA), %xmm6
  308. mulps %xmm11, %xmm7
  309. SUBPS %xmm7, %xmm0
  310. pshufd $0xb1, %xmm4, %xmm5
  311. mulps %xmm12, %xmm4
  312. addps %xmm4, %xmm0
  313. pshufd $0xb1, %xmm6, %xmm7
  314. mulps %xmm13, %xmm5
  315. SUBPS %xmm5, %xmm0
  316. mulps %xmm14, %xmm6
  317. addps %xmm6, %xmm0
  318. mulps %xmm15, %xmm7
  319. SUBPS %xmm7, %xmm0
  320. movlps %xmm0, -32 * SIZE(Y1)
  321. addq $2 * SIZE, A1
  322. addq $2 * SIZE, A2
  323. addq $2 * SIZE, Y1
  324. ALIGN_3
  325. .L1X:
  326. #endif
  327. movaps -32 * SIZE(Y1), %xmm0
  328. movaps -28 * SIZE(Y1), %xmm1
  329. movaps -24 * SIZE(Y1), %xmm2
  330. movaps -20 * SIZE(Y1), %xmm3
  331. movq MM, I
  332. sarq $3, I
  333. jle .L15
  334. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  335. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  336. decq I
  337. jle .L14
  338. ALIGN_3
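/* .L13: main loop of the four-column block.  Each iteration reads 16 floats */
/* (8 complex elements) from each of A1, A1+LDA, A2 and A2+LDA and           */
/* accumulates them into the buffer at Y1.                                   */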
  339. .L13:
  340. #ifdef PREFETCH
  341. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  342. #endif
  343. pshufd $0xb1, %xmm4, %xmm5
  344. mulps %xmm8, %xmm4
  345. addps %xmm4, %xmm0
  346. MOVUPS_A1(-24 * SIZE, A1, %xmm4)
  347. pshufd $0xb1, %xmm6, %xmm7
  348. mulps %xmm8, %xmm6
  349. addps %xmm6, %xmm1
  350. MOVUPS_A1(-20 * SIZE, A1, %xmm6)
  351. mulps %xmm9, %xmm5
  352. SUBPS %xmm5, %xmm0
  353. mulps %xmm9, %xmm7
  354. SUBPS %xmm7, %xmm1
  355. pshufd $0xb1, %xmm4, %xmm5
  356. mulps %xmm8, %xmm4
  357. addps %xmm4, %xmm2
  358. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4)
  359. pshufd $0xb1, %xmm6, %xmm7
  360. mulps %xmm8, %xmm6
  361. addps %xmm6, %xmm3
  362. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6)
  363. mulps %xmm9, %xmm5
  364. SUBPS %xmm5, %xmm2
  365. mulps %xmm9, %xmm7
  366. SUBPS %xmm7, %xmm3
  367. #ifdef PREFETCH
  368. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  369. #endif
  370. pshufd $0xb1, %xmm4, %xmm5
  371. mulps %xmm10, %xmm4
  372. addps %xmm4, %xmm0
  373. MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4)
  374. pshufd $0xb1, %xmm6, %xmm7
  375. mulps %xmm10, %xmm6
  376. addps %xmm6, %xmm1
  377. MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6)
  378. mulps %xmm11, %xmm5
  379. SUBPS %xmm5, %xmm0
  380. mulps %xmm11, %xmm7
  381. SUBPS %xmm7, %xmm1
  382. pshufd $0xb1, %xmm4, %xmm5
  383. mulps %xmm10, %xmm4
  384. addps %xmm4, %xmm2
  385. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  386. pshufd $0xb1, %xmm6, %xmm7
  387. mulps %xmm10, %xmm6
  388. addps %xmm6, %xmm3
  389. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  390. mulps %xmm11, %xmm5
  391. SUBPS %xmm5, %xmm2
  392. mulps %xmm11, %xmm7
  393. SUBPS %xmm7, %xmm3
  394. #ifdef PREFETCH
  395. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  396. #endif
  397. pshufd $0xb1, %xmm4, %xmm5
  398. mulps %xmm12, %xmm4
  399. addps %xmm4, %xmm0
  400. MOVUPS_A1(-24 * SIZE, A2, %xmm4)
  401. pshufd $0xb1, %xmm6, %xmm7
  402. mulps %xmm12, %xmm6
  403. addps %xmm6, %xmm1
  404. MOVUPS_A1(-20 * SIZE, A2, %xmm6)
  405. mulps %xmm13, %xmm5
  406. SUBPS %xmm5, %xmm0
  407. mulps %xmm13, %xmm7
  408. SUBPS %xmm7, %xmm1
  409. pshufd $0xb1, %xmm4, %xmm5
  410. mulps %xmm12, %xmm4
  411. addps %xmm4, %xmm2
  412. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4)
  413. pshufd $0xb1, %xmm6, %xmm7
  414. mulps %xmm12, %xmm6
  415. addps %xmm6, %xmm3
  416. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6)
  417. mulps %xmm13, %xmm5
  418. SUBPS %xmm5, %xmm2
  419. mulps %xmm13, %xmm7
  420. SUBPS %xmm7, %xmm3
  421. #ifdef PREFETCH
  422. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  423. #endif
  424. pshufd $0xb1, %xmm4, %xmm5
  425. mulps %xmm14, %xmm4
  426. addps %xmm4, %xmm0
  427. MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4)
  428. pshufd $0xb1, %xmm6, %xmm7
  429. mulps %xmm14, %xmm6
  430. addps %xmm6, %xmm1
  431. MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6)
  432. mulps %xmm15, %xmm5
  433. SUBPS %xmm5, %xmm0
  434. mulps %xmm15, %xmm7
  435. SUBPS %xmm7, %xmm1
  436. pshufd $0xb1, %xmm4, %xmm5
  437. mulps %xmm14, %xmm4
  438. addps %xmm4, %xmm2
  439. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  440. pshufd $0xb1, %xmm6, %xmm7
  441. mulps %xmm14, %xmm6
  442. addps %xmm6, %xmm3
  443. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  444. mulps %xmm15, %xmm5
  445. SUBPS %xmm5, %xmm2
  446. mulps %xmm15, %xmm7
  447. SUBPS %xmm7, %xmm3
  448. #ifdef PREFETCHW
  449. PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
  450. #endif
  451. movaps %xmm0, -32 * SIZE(Y1)
  452. movaps %xmm1, -28 * SIZE(Y1)
  453. movaps %xmm2, -24 * SIZE(Y1)
  454. movaps %xmm3, -20 * SIZE(Y1)
  455. movaps -16 * SIZE(Y1), %xmm0
  456. movaps -12 * SIZE(Y1), %xmm1
  457. movaps -8 * SIZE(Y1), %xmm2
  458. movaps -4 * SIZE(Y1), %xmm3
  459. subq $-16 * SIZE, A1
  460. subq $-16 * SIZE, A2
  461. subq $-16 * SIZE, Y1
  462. subq $1, I
  463. BRANCH
  464. jg .L13
  465. ALIGN_3
  466. .L14:
  467. pshufd $0xb1, %xmm4, %xmm5
  468. mulps %xmm8, %xmm4
  469. addps %xmm4, %xmm0
  470. MOVUPS_A1(-24 * SIZE, A1, %xmm4)
  471. pshufd $0xb1, %xmm6, %xmm7
  472. mulps %xmm8, %xmm6
  473. addps %xmm6, %xmm1
  474. MOVUPS_A1(-20 * SIZE, A1, %xmm6)
  475. mulps %xmm9, %xmm5
  476. SUBPS %xmm5, %xmm0
  477. mulps %xmm9, %xmm7
  478. SUBPS %xmm7, %xmm1
  479. pshufd $0xb1, %xmm4, %xmm5
  480. mulps %xmm8, %xmm4
  481. addps %xmm4, %xmm2
  482. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4)
  483. pshufd $0xb1, %xmm6, %xmm7
  484. mulps %xmm8, %xmm6
  485. addps %xmm6, %xmm3
  486. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6)
  487. mulps %xmm9, %xmm5
  488. SUBPS %xmm5, %xmm2
  489. mulps %xmm9, %xmm7
  490. SUBPS %xmm7, %xmm3
  491. pshufd $0xb1, %xmm4, %xmm5
  492. mulps %xmm10, %xmm4
  493. addps %xmm4, %xmm0
  494. MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4)
  495. pshufd $0xb1, %xmm6, %xmm7
  496. mulps %xmm10, %xmm6
  497. addps %xmm6, %xmm1
  498. MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6)
  499. mulps %xmm11, %xmm5
  500. SUBPS %xmm5, %xmm0
  501. mulps %xmm11, %xmm7
  502. SUBPS %xmm7, %xmm1
  503. pshufd $0xb1, %xmm4, %xmm5
  504. mulps %xmm10, %xmm4
  505. addps %xmm4, %xmm2
  506. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  507. pshufd $0xb1, %xmm6, %xmm7
  508. mulps %xmm10, %xmm6
  509. addps %xmm6, %xmm3
  510. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  511. mulps %xmm11, %xmm5
  512. SUBPS %xmm5, %xmm2
  513. mulps %xmm11, %xmm7
  514. SUBPS %xmm7, %xmm3
  515. pshufd $0xb1, %xmm4, %xmm5
  516. mulps %xmm12, %xmm4
  517. addps %xmm4, %xmm0
  518. MOVUPS_A1(-24 * SIZE, A2, %xmm4)
  519. pshufd $0xb1, %xmm6, %xmm7
  520. mulps %xmm12, %xmm6
  521. addps %xmm6, %xmm1
  522. MOVUPS_A1(-20 * SIZE, A2, %xmm6)
  523. mulps %xmm13, %xmm5
  524. SUBPS %xmm5, %xmm0
  525. mulps %xmm13, %xmm7
  526. SUBPS %xmm7, %xmm1
  527. pshufd $0xb1, %xmm4, %xmm5
  528. mulps %xmm12, %xmm4
  529. addps %xmm4, %xmm2
  530. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4)
  531. pshufd $0xb1, %xmm6, %xmm7
  532. mulps %xmm12, %xmm6
  533. addps %xmm6, %xmm3
  534. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6)
  535. mulps %xmm13, %xmm5
  536. SUBPS %xmm5, %xmm2
  537. mulps %xmm13, %xmm7
  538. SUBPS %xmm7, %xmm3
  539. pshufd $0xb1, %xmm4, %xmm5
  540. mulps %xmm14, %xmm4
  541. addps %xmm4, %xmm0
  542. MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4)
  543. pshufd $0xb1, %xmm6, %xmm7
  544. mulps %xmm14, %xmm6
  545. addps %xmm6, %xmm1
  546. MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6)
  547. mulps %xmm15, %xmm5
  548. SUBPS %xmm5, %xmm0
  549. mulps %xmm15, %xmm7
  550. SUBPS %xmm7, %xmm1
  551. pshufd $0xb1, %xmm4, %xmm5
  552. mulps %xmm14, %xmm4
  553. addps %xmm4, %xmm2
  554. pshufd $0xb1, %xmm6, %xmm7
  555. mulps %xmm14, %xmm6
  556. addps %xmm6, %xmm3
  557. mulps %xmm15, %xmm5
  558. SUBPS %xmm5, %xmm2
  559. mulps %xmm15, %xmm7
  560. SUBPS %xmm7, %xmm3
  561. movaps %xmm0, -32 * SIZE(Y1)
  562. movaps %xmm1, -28 * SIZE(Y1)
  563. movaps %xmm2, -24 * SIZE(Y1)
  564. movaps %xmm3, -20 * SIZE(Y1)
  565. movaps -16 * SIZE(Y1), %xmm0
  566. movaps -12 * SIZE(Y1), %xmm1
  567. movaps -8 * SIZE(Y1), %xmm2
  568. movaps -4 * SIZE(Y1), %xmm3
  569. subq $-16 * SIZE, A1
  570. subq $-16 * SIZE, A2
  571. subq $-16 * SIZE, Y1
  572. ALIGN_3
  573. .L15:
  574. testq $4, MM
  575. je .L17
  576. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  577. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  578. pshufd $0xb1, %xmm4, %xmm5
  579. mulps %xmm8, %xmm4
  580. addps %xmm4, %xmm0
  581. pshufd $0xb1, %xmm6, %xmm7
  582. mulps %xmm8, %xmm6
  583. addps %xmm6, %xmm1
  584. mulps %xmm9, %xmm5
  585. SUBPS %xmm5, %xmm0
  586. mulps %xmm9, %xmm7
  587. SUBPS %xmm7, %xmm1
  588. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4)
  589. MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6)
  590. pshufd $0xb1, %xmm4, %xmm5
  591. mulps %xmm10, %xmm4
  592. addps %xmm4, %xmm0
  593. pshufd $0xb1, %xmm6, %xmm7
  594. mulps %xmm10, %xmm6
  595. addps %xmm6, %xmm1
  596. mulps %xmm11, %xmm5
  597. SUBPS %xmm5, %xmm0
  598. mulps %xmm11, %xmm7
  599. SUBPS %xmm7, %xmm1
  600. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  601. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  602. pshufd $0xb1, %xmm4, %xmm5
  603. mulps %xmm12, %xmm4
  604. addps %xmm4, %xmm0
  605. pshufd $0xb1, %xmm6, %xmm7
  606. mulps %xmm12, %xmm6
  607. addps %xmm6, %xmm1
  608. mulps %xmm13, %xmm5
  609. SUBPS %xmm5, %xmm0
  610. mulps %xmm13, %xmm7
  611. SUBPS %xmm7, %xmm1
  612. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4)
  613. MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6)
  614. pshufd $0xb1, %xmm4, %xmm5
  615. mulps %xmm14, %xmm4
  616. addps %xmm4, %xmm0
  617. pshufd $0xb1, %xmm6, %xmm7
  618. mulps %xmm14, %xmm6
  619. addps %xmm6, %xmm1
  620. mulps %xmm15, %xmm5
  621. SUBPS %xmm5, %xmm0
  622. mulps %xmm15, %xmm7
  623. SUBPS %xmm7, %xmm1
  624. movaps %xmm0, -32 * SIZE(Y1)
  625. movaps %xmm1, -28 * SIZE(Y1)
  626. movaps %xmm2, %xmm0
  627. movaps %xmm3, %xmm1
  628. addq $8 * SIZE, A1
  629. addq $8 * SIZE, A2
  630. addq $8 * SIZE, Y1
  631. ALIGN_3
  632. .L17:
  633. testq $2, MM
  634. je .L18
  635. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  636. MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm6)
  637. pshufd $0xb1, %xmm4, %xmm5
  638. mulps %xmm8, %xmm4
  639. addps %xmm4, %xmm0
  640. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  641. pshufd $0xb1, %xmm6, %xmm7
  642. mulps %xmm9, %xmm5
  643. SUBPS %xmm5, %xmm0
  644. mulps %xmm10, %xmm6
  645. addps %xmm6, %xmm0
  646. MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm6)
  647. mulps %xmm11, %xmm7
  648. SUBPS %xmm7, %xmm0
  649. pshufd $0xb1, %xmm4, %xmm5
  650. mulps %xmm12, %xmm4
  651. addps %xmm4, %xmm0
  652. pshufd $0xb1, %xmm6, %xmm7
  653. mulps %xmm13, %xmm5
  654. SUBPS %xmm5, %xmm0
  655. mulps %xmm14, %xmm6
  656. addps %xmm6, %xmm0
  657. mulps %xmm15, %xmm7
  658. SUBPS %xmm7, %xmm0
  659. movaps %xmm0, -32 * SIZE(Y1)
  660. movaps %xmm1, %xmm0
  661. addq $4 * SIZE, A1
  662. addq $4 * SIZE, A2
  663. addq $4 * SIZE, Y1
  664. ALIGN_3
  665. .L18:
  666. testq $1, MM
  667. je .L19
  668. movsd -32 * SIZE(A1), %xmm4
  669. movsd -32 * SIZE(A1, LDA), %xmm6
  670. pshufd $0xb1, %xmm4, %xmm5
  671. mulps %xmm8, %xmm4
  672. addps %xmm4, %xmm0
  673. movsd -32 * SIZE(A2), %xmm4
  674. pshufd $0xb1, %xmm6, %xmm7
  675. mulps %xmm9, %xmm5
  676. SUBPS %xmm5, %xmm0
  677. mulps %xmm10, %xmm6
  678. addps %xmm6, %xmm0
  679. movsd -32 * SIZE(A2, LDA), %xmm6
  680. mulps %xmm11, %xmm7
  681. SUBPS %xmm7, %xmm0
  682. pshufd $0xb1, %xmm4, %xmm5
  683. mulps %xmm12, %xmm4
  684. addps %xmm4, %xmm0
  685. pshufd $0xb1, %xmm6, %xmm7
  686. mulps %xmm13, %xmm5
  687. SUBPS %xmm5, %xmm0
  688. mulps %xmm14, %xmm6
  689. addps %xmm6, %xmm0
  690. mulps %xmm15, %xmm7
  691. SUBPS %xmm7, %xmm0
  692. movlps %xmm0, -32 * SIZE(Y1)
  693. ALIGN_3
  694. .L19:
  695. cmpq $4, N
  696. jge .L11
  697. ALIGN_3
  698. .L20:
  699. #endif
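/* Remaining columns: process two columns of A per pass (A1 = A, A2 = A+LDA).*/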
  700. cmpq $2, N
  701. jl .L30
  702. #if GEMV_UNROLL == 2
  703. ALIGN_3
  704. .L21:
  705. #endif
  706. subq $2, N
  707. leaq 32 * SIZE(BUFFER), Y1
  708. movq A, A1
  709. leaq (A, LDA, 1), A2
  710. leaq (A, LDA, 2), A
  711. movsd (X), %xmm13
  712. addq INCX, X
  713. movsd (X), %xmm15
  714. addq INCX, X
  715. #ifdef HAVE_SSE3
  716. movddup ALPHA, %xmm8
  717. #else
  718. movsd ALPHA, %xmm8
  719. unpcklpd %xmm8, %xmm8
  720. #endif
  721. pshufd $0xb1, %xmm8, %xmm9
  722. pcmpeqb %xmm11, %xmm11
  723. psllq $63, %xmm11
  724. pshufd $0x00, %xmm13, %xmm12
  725. pshufd $0x55, %xmm13, %xmm13
  726. pshufd $0x00, %xmm15, %xmm14
  727. pshufd $0x55, %xmm15, %xmm15
  728. #ifndef XCONJ
  729. xorps %xmm11, %xmm13
  730. xorps %xmm11, %xmm15
  731. #else
  732. xorps %xmm11, %xmm12
  733. xorps %xmm11, %xmm14
  734. #endif
  735. mulps %xmm8, %xmm12
  736. mulps %xmm9, %xmm13
  737. mulps %xmm8, %xmm14
  738. mulps %xmm9, %xmm15
  739. #ifndef XCONJ
  740. subps %xmm13, %xmm12
  741. subps %xmm15, %xmm14
  742. #else
  743. addps %xmm13, %xmm12
  744. addps %xmm15, %xmm14
  745. #endif
  746. pshufd $0x55, %xmm12, %xmm13
  747. pshufd $0x00, %xmm12, %xmm12
  748. pshufd $0x55, %xmm14, %xmm15
  749. pshufd $0x00, %xmm14, %xmm14
  750. #ifndef CONJ
  751. xorps %xmm11, %xmm13
  752. xorps %xmm11, %xmm15
  753. #else
  754. xorps %xmm11, %xmm12
  755. xorps %xmm11, %xmm14
  756. #endif
  757. #ifdef ALIGNED_ACCESS
  758. cmpq M, MM
  759. je .L2X
  760. movsd -32 * SIZE(A1), %xmm4
  761. movsd -32 * SIZE(A2), %xmm6
  762. movsd -32 * SIZE(Y1), %xmm0
  763. pshufd $0xb1, %xmm4, %xmm5
  764. pshufd $0xb1, %xmm6, %xmm7
  765. mulps %xmm12, %xmm4
  766. addps %xmm4, %xmm0
  767. mulps %xmm13, %xmm5
  768. SUBPS %xmm5, %xmm0
  769. mulps %xmm14, %xmm6
  770. addps %xmm6, %xmm0
  771. mulps %xmm15, %xmm7
  772. SUBPS %xmm7, %xmm0
  773. movlps %xmm0, -32 * SIZE(Y1)
  774. addq $2 * SIZE, A1
  775. addq $2 * SIZE, A2
  776. addq $2 * SIZE, Y1
  777. ALIGN_3
  778. .L2X:
  779. #endif
  780. movaps -32 * SIZE(Y1), %xmm0
  781. movaps -28 * SIZE(Y1), %xmm1
  782. movaps -24 * SIZE(Y1), %xmm2
  783. movaps -20 * SIZE(Y1), %xmm3
  784. ALIGN_3
  785. movq MM, I
  786. sarq $3, I
  787. jle .L25
  788. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  789. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  790. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  791. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  792. decq I
  793. jle .L24
  794. ALIGN_3
  795. .L23:
  796. #ifdef PREFETCH
  797. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  798. #endif
  799. pshufd $0xb1, %xmm4, %xmm5
  800. mulps %xmm12, %xmm4
  801. addps %xmm4, %xmm0
  802. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  803. pshufd $0xb1, %xmm6, %xmm7
  804. mulps %xmm12, %xmm6
  805. addps %xmm6, %xmm1
  806. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  807. pshufd $0xb1, %xmm8, %xmm9
  808. mulps %xmm12, %xmm8
  809. addps %xmm8, %xmm2
  810. MOVUPS_A1(-24 * SIZE, A2, %xmm8)
  811. pshufd $0xb1, %xmm10, %xmm11
  812. mulps %xmm12, %xmm10
  813. addps %xmm10, %xmm3
  814. MOVUPS_A1(-20 * SIZE, A2, %xmm10)
  815. mulps %xmm13, %xmm5
  816. SUBPS %xmm5, %xmm0
  817. mulps %xmm13, %xmm7
  818. SUBPS %xmm7, %xmm1
  819. mulps %xmm13, %xmm9
  820. SUBPS %xmm9, %xmm2
  821. mulps %xmm13, %xmm11
  822. SUBPS %xmm11, %xmm3
  823. #ifdef PREFETCH
  824. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  825. #endif
  826. pshufd $0xb1, %xmm4, %xmm5
  827. mulps %xmm14, %xmm4
  828. addps %xmm4, %xmm0
  829. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  830. pshufd $0xb1, %xmm6, %xmm7
  831. mulps %xmm14, %xmm6
  832. addps %xmm6, %xmm1
  833. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  834. pshufd $0xb1, %xmm8, %xmm9
  835. mulps %xmm14, %xmm8
  836. addps %xmm8, %xmm2
  837. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  838. pshufd $0xb1, %xmm10, %xmm11
  839. mulps %xmm14, %xmm10
  840. addps %xmm10, %xmm3
  841. MOVUPS_A1( -4 * SIZE, A1, %xmm10)
  842. mulps %xmm15, %xmm5
  843. SUBPS %xmm5, %xmm0
  844. mulps %xmm15, %xmm7
  845. SUBPS %xmm7, %xmm1
  846. mulps %xmm15, %xmm9
  847. SUBPS %xmm9, %xmm2
  848. mulps %xmm15, %xmm11
  849. SUBPS %xmm11, %xmm3
  850. #ifdef PREFETCHW
  851. PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
  852. #endif
  853. movaps %xmm0, -32 * SIZE(Y1)
  854. movaps %xmm1, -28 * SIZE(Y1)
  855. movaps %xmm2, -24 * SIZE(Y1)
  856. movaps %xmm3, -20 * SIZE(Y1)
  857. movaps -16 * SIZE(Y1), %xmm0
  858. movaps -12 * SIZE(Y1), %xmm1
  859. movaps -8 * SIZE(Y1), %xmm2
  860. movaps -4 * SIZE(Y1), %xmm3
  861. subq $-16 * SIZE, A1
  862. subq $-16 * SIZE, A2
  863. subq $-16 * SIZE, Y1
  864. subq $1, I
  865. BRANCH
  866. jg .L23
  867. ALIGN_3
  868. .L24:
  869. pshufd $0xb1, %xmm4, %xmm5
  870. mulps %xmm12, %xmm4
  871. addps %xmm4, %xmm0
  872. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  873. pshufd $0xb1, %xmm6, %xmm7
  874. mulps %xmm12, %xmm6
  875. addps %xmm6, %xmm1
  876. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  877. pshufd $0xb1, %xmm8, %xmm9
  878. mulps %xmm12, %xmm8
  879. addps %xmm8, %xmm2
  880. MOVUPS_A1(-24 * SIZE, A2, %xmm8)
  881. pshufd $0xb1, %xmm10, %xmm11
  882. mulps %xmm12, %xmm10
  883. addps %xmm10, %xmm3
  884. MOVUPS_A1(-20 * SIZE, A2, %xmm10)
  885. mulps %xmm13, %xmm5
  886. SUBPS %xmm5, %xmm0
  887. mulps %xmm13, %xmm7
  888. SUBPS %xmm7, %xmm1
  889. mulps %xmm13, %xmm9
  890. SUBPS %xmm9, %xmm2
  891. mulps %xmm13, %xmm11
  892. SUBPS %xmm11, %xmm3
  893. pshufd $0xb1, %xmm4, %xmm5
  894. mulps %xmm14, %xmm4
  895. addps %xmm4, %xmm0
  896. pshufd $0xb1, %xmm6, %xmm7
  897. mulps %xmm14, %xmm6
  898. addps %xmm6, %xmm1
  899. pshufd $0xb1, %xmm8, %xmm9
  900. mulps %xmm14, %xmm8
  901. addps %xmm8, %xmm2
  902. pshufd $0xb1, %xmm10, %xmm11
  903. mulps %xmm14, %xmm10
  904. addps %xmm10, %xmm3
  905. mulps %xmm15, %xmm5
  906. SUBPS %xmm5, %xmm0
  907. mulps %xmm15, %xmm7
  908. SUBPS %xmm7, %xmm1
  909. mulps %xmm15, %xmm9
  910. SUBPS %xmm9, %xmm2
  911. mulps %xmm15, %xmm11
  912. SUBPS %xmm11, %xmm3
  913. movaps %xmm0, -32 * SIZE(Y1)
  914. movaps %xmm1, -28 * SIZE(Y1)
  915. movaps %xmm2, -24 * SIZE(Y1)
  916. movaps %xmm3, -20 * SIZE(Y1)
  917. movaps -16 * SIZE(Y1), %xmm0
  918. movaps -12 * SIZE(Y1), %xmm1
  919. movaps -8 * SIZE(Y1), %xmm2
  920. movaps -4 * SIZE(Y1), %xmm3
  921. subq $-16 * SIZE, A1
  922. subq $-16 * SIZE, A2
  923. subq $-16 * SIZE, Y1
  924. ALIGN_3
  925. .L25:
  926. testq $4, MM
  927. je .L27
  928. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  929. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  930. MOVUPS_A1(-32 * SIZE, A2, %xmm8)
  931. MOVUPS_A1(-28 * SIZE, A2, %xmm10)
  932. pshufd $0xb1, %xmm4, %xmm5
  933. pshufd $0xb1, %xmm6, %xmm7
  934. pshufd $0xb1, %xmm8, %xmm9
  935. pshufd $0xb1, %xmm10, %xmm11
  936. mulps %xmm12, %xmm4
  937. addps %xmm4, %xmm0
  938. mulps %xmm12, %xmm6
  939. addps %xmm6, %xmm1
  940. mulps %xmm13, %xmm5
  941. SUBPS %xmm5, %xmm0
  942. mulps %xmm13, %xmm7
  943. SUBPS %xmm7, %xmm1
  944. mulps %xmm14, %xmm8
  945. addps %xmm8, %xmm0
  946. mulps %xmm14, %xmm10
  947. addps %xmm10, %xmm1
  948. mulps %xmm15, %xmm9
  949. SUBPS %xmm9, %xmm0
  950. mulps %xmm15, %xmm11
  951. SUBPS %xmm11, %xmm1
  952. movaps %xmm0, -32 * SIZE(Y1)
  953. movaps %xmm1, -28 * SIZE(Y1)
  954. movaps %xmm2, %xmm0
  955. movaps %xmm3, %xmm1
  956. addq $8 * SIZE, A1
  957. addq $8 * SIZE, A2
  958. addq $8 * SIZE, Y1
  959. ALIGN_3
  960. .L27:
  961. testq $2, MM
  962. je .L28
  963. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  964. MOVUPS_A1(-32 * SIZE, A2, %xmm6)
  965. pshufd $0xb1, %xmm4, %xmm5
  966. mulps %xmm12, %xmm4
  967. addps %xmm4, %xmm0
  968. pshufd $0xb1, %xmm6, %xmm7
  969. mulps %xmm13, %xmm5
  970. SUBPS %xmm5, %xmm0
  971. mulps %xmm14, %xmm6
  972. addps %xmm6, %xmm0
  973. mulps %xmm15, %xmm7
  974. SUBPS %xmm7, %xmm0
  975. movaps %xmm0, -32 * SIZE(Y1)
  976. movaps %xmm1, %xmm0
  977. addq $4 * SIZE, A1
  978. addq $4 * SIZE, A2
  979. addq $4 * SIZE, Y1
  980. ALIGN_3
  981. .L28:
  982. testq $1, MM
  983. #if GEMV_UNROLL == 2
  984. je .L29
  985. #else
  986. je .L30
  987. #endif
  988. movsd -32 * SIZE(A1), %xmm4
  989. movsd -32 * SIZE(A2), %xmm6
  990. pshufd $0xb1, %xmm4, %xmm5
  991. pshufd $0xb1, %xmm6, %xmm7
  992. mulps %xmm12, %xmm4
  993. addps %xmm4, %xmm0
  994. mulps %xmm13, %xmm5
  995. SUBPS %xmm5, %xmm0
  996. mulps %xmm14, %xmm6
  997. addps %xmm6, %xmm0
  998. mulps %xmm15, %xmm7
  999. SUBPS %xmm7, %xmm0
  1000. movlps %xmm0, -32 * SIZE(Y1)
  1001. #if GEMV_UNROLL == 2
  1002. ALIGN_3
  1003. .L29:
  1004. cmpq $2, N
  1005. jge .L21
  1006. #endif
  1007. ALIGN_3
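/* .L30: a single remaining column is processed on its own.                  */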
  1008. .L30:
  1009. cmpq $1, N
  1010. jl .L990
  1011. leaq 32 * SIZE(BUFFER), Y1
  1012. movq A, A1
  1013. movsd (X), %xmm13
  1014. addq INCX, X
  1015. #ifdef HAVE_SSE3
  1016. movddup ALPHA, %xmm8
  1017. #else
  1018. movsd ALPHA, %xmm8
  1019. unpcklpd %xmm8, %xmm8
  1020. #endif
  1021. pshufd $0xb1, %xmm8, %xmm9
  1022. pcmpeqb %xmm11, %xmm11
  1023. psllq $63, %xmm11
  1024. pshufd $0x00, %xmm13, %xmm12
  1025. pshufd $0x55, %xmm13, %xmm13
  1026. #ifndef XCONJ
  1027. xorps %xmm11, %xmm13
  1028. #else
  1029. xorps %xmm11, %xmm12
  1030. #endif
  1031. mulps %xmm8, %xmm12
  1032. mulps %xmm9, %xmm13
  1033. #ifndef XCONJ
  1034. subps %xmm13, %xmm12
  1035. #else
  1036. addps %xmm13, %xmm12
  1037. #endif
  1038. pshufd $0x55, %xmm12, %xmm13
  1039. pshufd $0x00, %xmm12, %xmm12
  1040. #ifndef CONJ
  1041. xorps %xmm11, %xmm13
  1042. #else
  1043. xorps %xmm11, %xmm12
  1044. #endif
  1045. #ifdef ALIGNED_ACCESS
  1046. cmpq M, MM
  1047. je .L3X
  1048. movsd -32 * SIZE(A1), %xmm4
  1049. movsd -32 * SIZE(Y1), %xmm0
  1050. pshufd $0xb1, %xmm4, %xmm5
  1051. mulps %xmm12, %xmm4
  1052. addps %xmm4, %xmm0
  1053. mulps %xmm13, %xmm5
  1054. SUBPS %xmm5, %xmm0
  1055. movlps %xmm0, -32 * SIZE(Y1)
  1056. addq $2 * SIZE, A1
  1057. addq $2 * SIZE, Y1
  1058. ALIGN_3
  1059. .L3X:
  1060. #endif
  1061. movaps -32 * SIZE(Y1), %xmm0
  1062. movaps -28 * SIZE(Y1), %xmm1
  1063. movaps -24 * SIZE(Y1), %xmm2
  1064. movaps -20 * SIZE(Y1), %xmm3
  1065. ALIGN_3
  1066. movq MM, I
  1067. sarq $3, I
  1068. jle .L35
  1069. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1070. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1071. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  1072. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  1073. decq I
  1074. jle .L34
  1075. ALIGN_3
  1076. .L33:
  1077. #ifdef PREFETCH
  1078. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  1079. #endif
  1080. pshufd $0xb1, %xmm4, %xmm5
  1081. mulps %xmm12, %xmm4
  1082. addps %xmm4, %xmm0
  1083. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1084. pshufd $0xb1, %xmm6, %xmm7
  1085. mulps %xmm12, %xmm6
  1086. addps %xmm6, %xmm1
  1087. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  1088. pshufd $0xb1, %xmm8, %xmm9
  1089. mulps %xmm12, %xmm8
  1090. addps %xmm8, %xmm2
  1091. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  1092. pshufd $0xb1, %xmm10, %xmm11
  1093. mulps %xmm12, %xmm10
  1094. addps %xmm10, %xmm3
  1095. MOVUPS_A1( -4 * SIZE, A1, %xmm10)
  1096. mulps %xmm13, %xmm5
  1097. SUBPS %xmm5, %xmm0
  1098. mulps %xmm13, %xmm7
  1099. SUBPS %xmm7, %xmm1
  1100. mulps %xmm13, %xmm9
  1101. SUBPS %xmm9, %xmm2
  1102. mulps %xmm13, %xmm11
  1103. SUBPS %xmm11, %xmm3
  1104. #ifdef PREFETCHW
  1105. PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
  1106. #endif
  1107. movaps %xmm0, -32 * SIZE(Y1)
  1108. movaps %xmm1, -28 * SIZE(Y1)
  1109. movaps %xmm2, -24 * SIZE(Y1)
  1110. movaps %xmm3, -20 * SIZE(Y1)
  1111. movaps -16 * SIZE(Y1), %xmm0
  1112. movaps -12 * SIZE(Y1), %xmm1
  1113. movaps -8 * SIZE(Y1), %xmm2
  1114. movaps -4 * SIZE(Y1), %xmm3
  1115. subq $-16 * SIZE, A1
  1116. subq $-16 * SIZE, A2
  1117. subq $-16 * SIZE, Y1
  1118. subq $1, I
  1119. BRANCH
  1120. jg .L33
  1121. ALIGN_3
  1122. .L34:
  1123. pshufd $0xb1, %xmm4, %xmm5
  1124. mulps %xmm12, %xmm4
  1125. addps %xmm4, %xmm0
  1126. pshufd $0xb1, %xmm6, %xmm7
  1127. mulps %xmm12, %xmm6
  1128. addps %xmm6, %xmm1
  1129. pshufd $0xb1, %xmm8, %xmm9
  1130. mulps %xmm12, %xmm8
  1131. addps %xmm8, %xmm2
  1132. pshufd $0xb1, %xmm10, %xmm11
  1133. mulps %xmm12, %xmm10
  1134. addps %xmm10, %xmm3
  1135. mulps %xmm13, %xmm5
  1136. SUBPS %xmm5, %xmm0
  1137. mulps %xmm13, %xmm7
  1138. SUBPS %xmm7, %xmm1
  1139. mulps %xmm13, %xmm9
  1140. SUBPS %xmm9, %xmm2
  1141. mulps %xmm13, %xmm11
  1142. SUBPS %xmm11, %xmm3
  1143. movaps %xmm0, -32 * SIZE(Y1)
  1144. movaps %xmm1, -28 * SIZE(Y1)
  1145. movaps %xmm2, -24 * SIZE(Y1)
  1146. movaps %xmm3, -20 * SIZE(Y1)
  1147. movaps -16 * SIZE(Y1), %xmm0
  1148. movaps -12 * SIZE(Y1), %xmm1
  1149. movaps -8 * SIZE(Y1), %xmm2
  1150. movaps -4 * SIZE(Y1), %xmm3
  1151. subq $-16 * SIZE, A1
  1152. subq $-16 * SIZE, Y1
  1153. ALIGN_3
  1154. .L35:
  1155. testq $4, MM
  1156. je .L37
  1157. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1158. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1159. pshufd $0xb1, %xmm4, %xmm5
  1160. mulps %xmm12, %xmm4
  1161. addps %xmm4, %xmm0
  1162. pshufd $0xb1, %xmm6, %xmm7
  1163. mulps %xmm12, %xmm6
  1164. addps %xmm6, %xmm1
  1165. mulps %xmm13, %xmm5
  1166. SUBPS %xmm5, %xmm0
  1167. mulps %xmm13, %xmm7
  1168. SUBPS %xmm7, %xmm1
  1169. movaps %xmm0, -32 * SIZE(Y1)
  1170. movaps %xmm1, -28 * SIZE(Y1)
  1171. movaps %xmm2, %xmm0
  1172. movaps %xmm3, %xmm1
  1173. addq $8 * SIZE, A1
  1174. addq $8 * SIZE, Y1
  1175. ALIGN_3
  1176. .L37:
  1177. testq $2, MM
  1178. je .L38
  1179. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1180. pshufd $0xb1, %xmm4, %xmm5
  1181. mulps %xmm12, %xmm4
  1182. addps %xmm4, %xmm0
  1183. mulps %xmm13, %xmm5
  1184. SUBPS %xmm5, %xmm0
  1185. movaps %xmm0, -32 * SIZE(Y1)
  1186. movaps %xmm1, %xmm0
  1187. addq $4 * SIZE, A1
  1188. addq $4 * SIZE, Y1
  1189. ALIGN_3
  1190. .L38:
  1191. testq $1, MM
  1192. je .L990
  1193. movsd -32 * SIZE(A1), %xmm4
  1194. pshufd $0xb1, %xmm4, %xmm5
  1195. mulps %xmm12, %xmm4
  1196. addps %xmm4, %xmm0
  1197. mulps %xmm13, %xmm5
  1198. SUBPS %xmm5, %xmm0
  1199. movlps %xmm0, -32 * SIZE(Y1)
  1200. #ifdef ALIGNED_ACCESS
  1201. jmp .L990
  1202. ALIGN_3
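/* .L100 (ALIGNED_ACCESS): taken when the byte leading dimension has its     */
/* 2*SIZE bit set (lda is an odd number of complex elements), so every other */
/* column is only 8-byte aligned; those columns are loaded with movsd/movhps */
/* pairs instead of aligned 16-byte loads.                                   */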
  1203. .L100:
  1204. #if GEMV_UNROLL >= 4
  1205. cmpq $4, N
  1206. jl .L110
  1207. ALIGN_3
  1208. .L101:
  1209. subq $4, N
  1210. leaq 32 * SIZE(BUFFER), Y1
  1211. movq A, A1
  1212. leaq (A, LDA, 2), A2
  1213. leaq (A, LDA, 4), A
  1214. movsd (X), %xmm9
  1215. addq INCX, X
  1216. movsd (X), %xmm11
  1217. addq INCX, X
  1218. movsd (X), %xmm13
  1219. addq INCX, X
  1220. movsd (X), %xmm15
  1221. addq INCX, X
  1222. #ifdef HAVE_SSE3
  1223. movddup ALPHA, %xmm6
  1224. #else
  1225. movsd ALPHA, %xmm6
  1226. unpcklpd %xmm6, %xmm6
  1227. #endif
  1228. pshufd $0xb1, %xmm6, %xmm5
  1229. pcmpeqb %xmm7, %xmm7
  1230. psllq $63, %xmm7
  1231. pshufd $0x00, %xmm9, %xmm8
  1232. pshufd $0x55, %xmm9, %xmm9
  1233. pshufd $0x00, %xmm11, %xmm10
  1234. pshufd $0x55, %xmm11, %xmm11
  1235. pshufd $0x00, %xmm13, %xmm12
  1236. pshufd $0x55, %xmm13, %xmm13
  1237. pshufd $0x00, %xmm15, %xmm14
  1238. pshufd $0x55, %xmm15, %xmm15
  1239. #ifndef XCONJ
  1240. xorps %xmm7, %xmm9
  1241. xorps %xmm7, %xmm11
  1242. xorps %xmm7, %xmm13
  1243. xorps %xmm7, %xmm15
  1244. #else
  1245. xorps %xmm7, %xmm8
  1246. xorps %xmm7, %xmm10
  1247. xorps %xmm7, %xmm12
  1248. xorps %xmm7, %xmm14
  1249. #endif
  1250. mulps %xmm6, %xmm8
  1251. mulps %xmm5, %xmm9
  1252. mulps %xmm6, %xmm10
  1253. mulps %xmm5, %xmm11
  1254. mulps %xmm6, %xmm12
  1255. mulps %xmm5, %xmm13
  1256. mulps %xmm6, %xmm14
  1257. mulps %xmm5, %xmm15
  1258. #ifndef XCONJ
  1259. subps %xmm9, %xmm8
  1260. subps %xmm11, %xmm10
  1261. subps %xmm13, %xmm12
  1262. subps %xmm15, %xmm14
  1263. #else
  1264. addps %xmm9, %xmm8
  1265. addps %xmm11, %xmm10
  1266. addps %xmm13, %xmm12
  1267. addps %xmm15, %xmm14
  1268. #endif
  1269. pshufd $0x55, %xmm8, %xmm9
  1270. pshufd $0x00, %xmm8, %xmm8
  1271. pshufd $0x55, %xmm10, %xmm11
  1272. pshufd $0x00, %xmm10, %xmm10
  1273. pshufd $0x55, %xmm12, %xmm13
  1274. pshufd $0x00, %xmm12, %xmm12
  1275. pshufd $0x55, %xmm14, %xmm15
  1276. pshufd $0x00, %xmm14, %xmm14
  1277. #ifndef CONJ
  1278. xorps %xmm7, %xmm9
  1279. xorps %xmm7, %xmm11
  1280. xorps %xmm7, %xmm13
  1281. xorps %xmm7, %xmm15
  1282. #else
  1283. xorps %xmm7, %xmm8
  1284. xorps %xmm7, %xmm10
  1285. xorps %xmm7, %xmm12
  1286. xorps %xmm7, %xmm14
  1287. #endif
  1288. #ifdef ALIGNED_ACCESS
  1289. cmpq M, MM
  1290. je .L10X
  1291. movsd -32 * SIZE(A1), %xmm4
  1292. movsd -32 * SIZE(A1, LDA), %xmm6
  1293. movsd -32 * SIZE(Y1), %xmm0
  1294. pshufd $0xb1, %xmm4, %xmm5
  1295. mulps %xmm8, %xmm4
  1296. addps %xmm4, %xmm0
  1297. movsd -32 * SIZE(A2), %xmm4
  1298. pshufd $0xb1, %xmm6, %xmm7
  1299. mulps %xmm9, %xmm5
  1300. SUBPS %xmm5, %xmm0
  1301. mulps %xmm10, %xmm6
  1302. addps %xmm6, %xmm0
  1303. movsd -32 * SIZE(A2, LDA), %xmm6
  1304. mulps %xmm11, %xmm7
  1305. SUBPS %xmm7, %xmm0
  1306. pshufd $0xb1, %xmm4, %xmm5
  1307. mulps %xmm12, %xmm4
  1308. addps %xmm4, %xmm0
  1309. pshufd $0xb1, %xmm6, %xmm7
  1310. mulps %xmm13, %xmm5
  1311. SUBPS %xmm5, %xmm0
  1312. mulps %xmm14, %xmm6
  1313. addps %xmm6, %xmm0
  1314. mulps %xmm15, %xmm7
  1315. SUBPS %xmm7, %xmm0
  1316. movlps %xmm0, -32 * SIZE(Y1)
  1317. addq $2 * SIZE, A1
  1318. addq $2 * SIZE, A2
  1319. addq $2 * SIZE, Y1
  1320. ALIGN_3
  1321. .L10X:
  1322. #endif
  1323. movaps -32 * SIZE(Y1), %xmm0
  1324. movaps -28 * SIZE(Y1), %xmm1
  1325. movaps -24 * SIZE(Y1), %xmm2
  1326. movaps -20 * SIZE(Y1), %xmm3
  1327. movq MM, I
  1328. sarq $3, I
  1329. jle .L105
  1330. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1331. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1332. decq I
  1333. jle .L104
  1334. ALIGN_3
  1335. .L103:
  1336. #ifdef PREFETCH
  1337. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1)
  1338. #endif
  1339. pshufd $0xb1, %xmm4, %xmm5
  1340. mulps %xmm8, %xmm4
  1341. addps %xmm4, %xmm0
  1342. MOVUPS_A1(-24 * SIZE, A1, %xmm4)
  1343. pshufd $0xb1, %xmm6, %xmm7
  1344. mulps %xmm8, %xmm6
  1345. addps %xmm6, %xmm1
  1346. MOVUPS_A1(-20 * SIZE, A1, %xmm6)
  1347. mulps %xmm9, %xmm5
  1348. SUBPS %xmm5, %xmm0
  1349. mulps %xmm9, %xmm7
  1350. SUBPS %xmm7, %xmm1
  1351. pshufd $0xb1, %xmm4, %xmm5
  1352. mulps %xmm8, %xmm4
  1353. addps %xmm4, %xmm2
  1354. movsd -32 * SIZE(A1, LDA), %xmm4
  1355. movhps -30 * SIZE(A1, LDA), %xmm4
  1356. pshufd $0xb1, %xmm6, %xmm7
  1357. mulps %xmm8, %xmm6
  1358. addps %xmm6, %xmm3
  1359. movsd -28 * SIZE(A1, LDA), %xmm6
  1360. movhps -26 * SIZE(A1, LDA), %xmm6
  1361. mulps %xmm9, %xmm5
  1362. SUBPS %xmm5, %xmm2
  1363. mulps %xmm9, %xmm7
  1364. SUBPS %xmm7, %xmm3
  1365. #ifdef PREFETCH
  1366. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA)
  1367. #endif
  1368. pshufd $0xb1, %xmm4, %xmm5
  1369. mulps %xmm10, %xmm4
  1370. addps %xmm4, %xmm0
  1371. movsd -24 * SIZE(A1, LDA), %xmm4
  1372. movhps -22 * SIZE(A1, LDA), %xmm4
  1373. pshufd $0xb1, %xmm6, %xmm7
  1374. mulps %xmm10, %xmm6
  1375. addps %xmm6, %xmm1
  1376. movsd -20 * SIZE(A1, LDA), %xmm6
  1377. movhps -18 * SIZE(A1, LDA), %xmm6
  1378. mulps %xmm11, %xmm5
  1379. SUBPS %xmm5, %xmm0
  1380. mulps %xmm11, %xmm7
  1381. SUBPS %xmm7, %xmm1
  1382. pshufd $0xb1, %xmm4, %xmm5
  1383. mulps %xmm10, %xmm4
  1384. addps %xmm4, %xmm2
  1385. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  1386. pshufd $0xb1, %xmm6, %xmm7
  1387. mulps %xmm10, %xmm6
  1388. addps %xmm6, %xmm3
  1389. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  1390. mulps %xmm11, %xmm5
  1391. SUBPS %xmm5, %xmm2
  1392. mulps %xmm11, %xmm7
  1393. SUBPS %xmm7, %xmm3
  1394. #ifdef PREFETCH
  1395. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2)
  1396. #endif
  1397. pshufd $0xb1, %xmm4, %xmm5
  1398. mulps %xmm12, %xmm4
  1399. addps %xmm4, %xmm0
  1400. MOVUPS_A1(-24 * SIZE, A2, %xmm4)
  1401. pshufd $0xb1, %xmm6, %xmm7
  1402. mulps %xmm12, %xmm6
  1403. addps %xmm6, %xmm1
  1404. MOVUPS_A1(-20 * SIZE, A2, %xmm6)
  1405. mulps %xmm13, %xmm5
  1406. SUBPS %xmm5, %xmm0
  1407. mulps %xmm13, %xmm7
  1408. SUBPS %xmm7, %xmm1
  1409. pshufd $0xb1, %xmm4, %xmm5
  1410. mulps %xmm12, %xmm4
  1411. addps %xmm4, %xmm2
  1412. movsd -32 * SIZE(A2, LDA), %xmm4
  1413. movhps -30 * SIZE(A2, LDA), %xmm4
  1414. pshufd $0xb1, %xmm6, %xmm7
  1415. mulps %xmm12, %xmm6
  1416. addps %xmm6, %xmm3
  1417. movsd -28 * SIZE(A2, LDA), %xmm6
  1418. movhps -26 * SIZE(A2, LDA), %xmm6
  1419. mulps %xmm13, %xmm5
  1420. SUBPS %xmm5, %xmm2
  1421. mulps %xmm13, %xmm7
  1422. SUBPS %xmm7, %xmm3
  1423. #ifdef PREFETCH
  1424. PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA)
  1425. #endif
  1426. pshufd $0xb1, %xmm4, %xmm5
  1427. mulps %xmm14, %xmm4
  1428. addps %xmm4, %xmm0
  1429. movsd -24 * SIZE(A2, LDA), %xmm4
  1430. movhps -22 * SIZE(A2, LDA), %xmm4
  1431. pshufd $0xb1, %xmm6, %xmm7
  1432. mulps %xmm14, %xmm6
  1433. addps %xmm6, %xmm1
  1434. movsd -20 * SIZE(A2, LDA), %xmm6
  1435. movhps -18 * SIZE(A2, LDA), %xmm6
  1436. mulps %xmm15, %xmm5
  1437. SUBPS %xmm5, %xmm0
  1438. mulps %xmm15, %xmm7
  1439. SUBPS %xmm7, %xmm1
  1440. pshufd $0xb1, %xmm4, %xmm5
  1441. mulps %xmm14, %xmm4
  1442. addps %xmm4, %xmm2
  1443. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1444. pshufd $0xb1, %xmm6, %xmm7
  1445. mulps %xmm14, %xmm6
  1446. addps %xmm6, %xmm3
  1447. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  1448. mulps %xmm15, %xmm5
  1449. SUBPS %xmm5, %xmm2
  1450. mulps %xmm15, %xmm7
  1451. SUBPS %xmm7, %xmm3
  1452. #ifdef PREFETCHW
  1453. PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1)
  1454. #endif
  1455. movaps %xmm0, -32 * SIZE(Y1)
  1456. movaps %xmm1, -28 * SIZE(Y1)
  1457. movaps %xmm2, -24 * SIZE(Y1)
  1458. movaps %xmm3, -20 * SIZE(Y1)
  1459. movaps -16 * SIZE(Y1), %xmm0
  1460. movaps -12 * SIZE(Y1), %xmm1
  1461. movaps -8 * SIZE(Y1), %xmm2
  1462. movaps -4 * SIZE(Y1), %xmm3
  1463. subq $-16 * SIZE, A1
  1464. subq $-16 * SIZE, A2
  1465. subq $-16 * SIZE, Y1
  1466. subq $1, I
  1467. BRANCH
  1468. jg .L103
  1469. ALIGN_3
  1470. .L104:
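/*
 * Peeled final iteration of the loop above: identical arithmetic, but no
 * prefetches and no preloading of the next block of A1.
 */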
  1471. pshufd $0xb1, %xmm4, %xmm5
  1472. mulps %xmm8, %xmm4
  1473. addps %xmm4, %xmm0
  1474. MOVUPS_A1(-24 * SIZE, A1, %xmm4)
  1475. pshufd $0xb1, %xmm6, %xmm7
  1476. mulps %xmm8, %xmm6
  1477. addps %xmm6, %xmm1
  1478. MOVUPS_A1(-20 * SIZE, A1, %xmm6)
  1479. mulps %xmm9, %xmm5
  1480. SUBPS %xmm5, %xmm0
  1481. mulps %xmm9, %xmm7
  1482. SUBPS %xmm7, %xmm1
  1483. pshufd $0xb1, %xmm4, %xmm5
  1484. mulps %xmm8, %xmm4
  1485. addps %xmm4, %xmm2
  1486. movsd -32 * SIZE(A1, LDA), %xmm4
  1487. movhps -30 * SIZE(A1, LDA), %xmm4
  1488. pshufd $0xb1, %xmm6, %xmm7
  1489. mulps %xmm8, %xmm6
  1490. addps %xmm6, %xmm3
  1491. movsd -28 * SIZE(A1, LDA), %xmm6
  1492. movhps -26 * SIZE(A1, LDA), %xmm6
  1493. mulps %xmm9, %xmm5
  1494. SUBPS %xmm5, %xmm2
  1495. mulps %xmm9, %xmm7
  1496. SUBPS %xmm7, %xmm3
  1497. pshufd $0xb1, %xmm4, %xmm5
  1498. mulps %xmm10, %xmm4
  1499. addps %xmm4, %xmm0
  1500. movsd -24 * SIZE(A1, LDA), %xmm4
  1501. movhps -22 * SIZE(A1, LDA), %xmm4
  1502. pshufd $0xb1, %xmm6, %xmm7
  1503. mulps %xmm10, %xmm6
  1504. addps %xmm6, %xmm1
  1505. movsd -20 * SIZE(A1, LDA), %xmm6
  1506. movhps -18 * SIZE(A1, LDA), %xmm6
  1507. mulps %xmm11, %xmm5
  1508. SUBPS %xmm5, %xmm0
  1509. mulps %xmm11, %xmm7
  1510. SUBPS %xmm7, %xmm1
  1511. pshufd $0xb1, %xmm4, %xmm5
  1512. mulps %xmm10, %xmm4
  1513. addps %xmm4, %xmm2
  1514. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  1515. pshufd $0xb1, %xmm6, %xmm7
  1516. mulps %xmm10, %xmm6
  1517. addps %xmm6, %xmm3
  1518. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  1519. mulps %xmm11, %xmm5
  1520. SUBPS %xmm5, %xmm2
  1521. mulps %xmm11, %xmm7
  1522. SUBPS %xmm7, %xmm3
  1523. pshufd $0xb1, %xmm4, %xmm5
  1524. mulps %xmm12, %xmm4
  1525. addps %xmm4, %xmm0
  1526. MOVUPS_A1(-24 * SIZE, A2, %xmm4)
  1527. pshufd $0xb1, %xmm6, %xmm7
  1528. mulps %xmm12, %xmm6
  1529. addps %xmm6, %xmm1
  1530. MOVUPS_A1(-20 * SIZE, A2, %xmm6)
  1531. mulps %xmm13, %xmm5
  1532. SUBPS %xmm5, %xmm0
  1533. mulps %xmm13, %xmm7
  1534. SUBPS %xmm7, %xmm1
  1535. pshufd $0xb1, %xmm4, %xmm5
  1536. mulps %xmm12, %xmm4
  1537. addps %xmm4, %xmm2
  1538. movsd -32 * SIZE(A2, LDA), %xmm4
  1539. movhps -30 * SIZE(A2, LDA), %xmm4
  1540. pshufd $0xb1, %xmm6, %xmm7
  1541. mulps %xmm12, %xmm6
  1542. addps %xmm6, %xmm3
  1543. movsd -28 * SIZE(A2, LDA), %xmm6
  1544. movhps -26 * SIZE(A2, LDA), %xmm6
  1545. mulps %xmm13, %xmm5
  1546. SUBPS %xmm5, %xmm2
  1547. mulps %xmm13, %xmm7
  1548. SUBPS %xmm7, %xmm3
  1549. pshufd $0xb1, %xmm4, %xmm5
  1550. mulps %xmm14, %xmm4
  1551. addps %xmm4, %xmm0
  1552. movsd -24 * SIZE(A2, LDA), %xmm4
  1553. movhps -22 * SIZE(A2, LDA), %xmm4
  1554. pshufd $0xb1, %xmm6, %xmm7
  1555. mulps %xmm14, %xmm6
  1556. addps %xmm6, %xmm1
  1557. movsd -20 * SIZE(A2, LDA), %xmm6
  1558. movhps -18 * SIZE(A2, LDA), %xmm6
  1559. mulps %xmm15, %xmm5
  1560. SUBPS %xmm5, %xmm0
  1561. mulps %xmm15, %xmm7
  1562. SUBPS %xmm7, %xmm1
  1563. pshufd $0xb1, %xmm4, %xmm5
  1564. mulps %xmm14, %xmm4
  1565. addps %xmm4, %xmm2
  1566. pshufd $0xb1, %xmm6, %xmm7
  1567. mulps %xmm14, %xmm6
  1568. addps %xmm6, %xmm3
  1569. mulps %xmm15, %xmm5
  1570. SUBPS %xmm5, %xmm2
  1571. mulps %xmm15, %xmm7
  1572. SUBPS %xmm7, %xmm3
  1573. movaps %xmm0, -32 * SIZE(Y1)
  1574. movaps %xmm1, -28 * SIZE(Y1)
  1575. movaps %xmm2, -24 * SIZE(Y1)
  1576. movaps %xmm3, -20 * SIZE(Y1)
  1577. movaps -16 * SIZE(Y1), %xmm0
  1578. movaps -12 * SIZE(Y1), %xmm1
  1579. movaps -8 * SIZE(Y1), %xmm2
  1580. movaps -4 * SIZE(Y1), %xmm3
  1581. subq $-16 * SIZE, A1
  1582. subq $-16 * SIZE, A2
  1583. subq $-16 * SIZE, Y1
  1584. ALIGN_3
  1585. .L105:
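/*
 * Remainder of the row loop: process the leftover MM % 8 elements in
 * blocks of four (.L105), two (.L107) and one (.L108) complex entries.
 */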
  1586. testq $4, MM
  1587. je .L107
  1588. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1589. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1590. pshufd $0xb1, %xmm4, %xmm5
  1591. mulps %xmm8, %xmm4
  1592. addps %xmm4, %xmm0
  1593. movsd -32 * SIZE(A1, LDA), %xmm4
  1594. movhps -30 * SIZE(A1, LDA), %xmm4
  1595. pshufd $0xb1, %xmm6, %xmm7
  1596. mulps %xmm8, %xmm6
  1597. addps %xmm6, %xmm1
  1598. movsd -28 * SIZE(A1, LDA), %xmm6
  1599. movhps -26 * SIZE(A1, LDA), %xmm6
  1600. mulps %xmm9, %xmm5
  1601. SUBPS %xmm5, %xmm0
  1602. mulps %xmm9, %xmm7
  1603. SUBPS %xmm7, %xmm1
  1604. pshufd $0xb1, %xmm4, %xmm5
  1605. mulps %xmm10, %xmm4
  1606. addps %xmm4, %xmm0
  1607. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  1608. pshufd $0xb1, %xmm6, %xmm7
  1609. mulps %xmm10, %xmm6
  1610. addps %xmm6, %xmm1
  1611. MOVUPS_A1(-28 * SIZE, A2, %xmm6)
  1612. mulps %xmm11, %xmm5
  1613. SUBPS %xmm5, %xmm0
  1614. mulps %xmm11, %xmm7
  1615. SUBPS %xmm7, %xmm1
  1616. pshufd $0xb1, %xmm4, %xmm5
  1617. mulps %xmm12, %xmm4
  1618. addps %xmm4, %xmm0
  1619. movsd -32 * SIZE(A2, LDA), %xmm4
  1620. movhps -30 * SIZE(A2, LDA), %xmm4
  1621. pshufd $0xb1, %xmm6, %xmm7
  1622. mulps %xmm12, %xmm6
  1623. addps %xmm6, %xmm1
  1624. movsd -28 * SIZE(A2, LDA), %xmm6
  1625. movhps -26 * SIZE(A2, LDA), %xmm6
  1626. mulps %xmm13, %xmm5
  1627. SUBPS %xmm5, %xmm0
  1628. mulps %xmm13, %xmm7
  1629. SUBPS %xmm7, %xmm1
  1630. pshufd $0xb1, %xmm4, %xmm5
  1631. mulps %xmm14, %xmm4
  1632. addps %xmm4, %xmm0
  1633. pshufd $0xb1, %xmm6, %xmm7
  1634. mulps %xmm14, %xmm6
  1635. addps %xmm6, %xmm1
  1636. mulps %xmm15, %xmm5
  1637. SUBPS %xmm5, %xmm0
  1638. mulps %xmm15, %xmm7
  1639. SUBPS %xmm7, %xmm1
  1640. movaps %xmm0, -32 * SIZE(Y1)
  1641. movaps %xmm1, -28 * SIZE(Y1)
  1642. movaps %xmm2, %xmm0
  1643. movaps %xmm3, %xmm1
  1644. addq $8 * SIZE, A1
  1645. addq $8 * SIZE, A2
  1646. addq $8 * SIZE, Y1
  1647. ALIGN_3
  1648. .L107:
  1649. testq $2, MM
  1650. je .L108
  1651. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1652. movsd -32 * SIZE(A1, LDA), %xmm6
  1653. movhps -30 * SIZE(A1, LDA), %xmm6
  1654. pshufd $0xb1, %xmm4, %xmm5
  1655. mulps %xmm8, %xmm4
  1656. addps %xmm4, %xmm0
  1657. MOVUPS_A1(-32 * SIZE, A2, %xmm4)
  1658. pshufd $0xb1, %xmm6, %xmm7
  1659. mulps %xmm9, %xmm5
  1660. SUBPS %xmm5, %xmm0
  1661. mulps %xmm10, %xmm6
  1662. addps %xmm6, %xmm0
  1663. movsd -32 * SIZE(A2, LDA), %xmm6
  1664. movhps -30 * SIZE(A2, LDA), %xmm6
  1665. mulps %xmm11, %xmm7
  1666. SUBPS %xmm7, %xmm0
  1667. pshufd $0xb1, %xmm4, %xmm5
  1668. mulps %xmm12, %xmm4
  1669. addps %xmm4, %xmm0
  1670. pshufd $0xb1, %xmm6, %xmm7
  1671. mulps %xmm13, %xmm5
  1672. SUBPS %xmm5, %xmm0
  1673. mulps %xmm14, %xmm6
  1674. addps %xmm6, %xmm0
  1675. mulps %xmm15, %xmm7
  1676. SUBPS %xmm7, %xmm0
  1677. movaps %xmm0, -32 * SIZE(Y1)
  1678. movaps %xmm1, %xmm0
  1679. addq $4 * SIZE, A1
  1680. addq $4 * SIZE, A2
  1681. addq $4 * SIZE, Y1
  1682. ALIGN_3
  1683. .L108:
  1684. testq $1, MM
  1685. je .L109
  1686. movsd -32 * SIZE(A1), %xmm4
  1687. movsd -32 * SIZE(A1, LDA), %xmm6
  1688. pshufd $0xb1, %xmm4, %xmm5
  1689. mulps %xmm8, %xmm4
  1690. addps %xmm4, %xmm0
  1691. movsd -32 * SIZE(A2), %xmm4
  1692. pshufd $0xb1, %xmm6, %xmm7
  1693. mulps %xmm9, %xmm5
  1694. SUBPS %xmm5, %xmm0
  1695. mulps %xmm10, %xmm6
  1696. addps %xmm6, %xmm0
  1697. movsd -32 * SIZE(A2, LDA), %xmm6
  1698. mulps %xmm11, %xmm7
  1699. SUBPS %xmm7, %xmm0
  1700. pshufd $0xb1, %xmm4, %xmm5
  1701. mulps %xmm12, %xmm4
  1702. addps %xmm4, %xmm0
  1703. pshufd $0xb1, %xmm6, %xmm7
  1704. mulps %xmm13, %xmm5
  1705. SUBPS %xmm5, %xmm0
  1706. mulps %xmm14, %xmm6
  1707. addps %xmm6, %xmm0
  1708. mulps %xmm15, %xmm7
  1709. SUBPS %xmm7, %xmm0
  1710. movlps %xmm0, -32 * SIZE(Y1)
  1711. ALIGN_3
  1712. .L109:
  1713. cmpq $4, N
  1714. jge .L101
  1715. ALIGN_3
  1716. .L110:
  1717. #endif
  1718. #if GEMV_UNROLL >= 2
  1719. cmpq $2, N
  1720. jl .L120
  1721. #if GEMV_UNROLL == 2
  1722. ALIGN_3
  1723. .L111:
  1724. #endif
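/*
 * Two columns at a time: A1 and A2 = A1 + LDA, with A advanced by 2*LDA
 * per pass.  The per-column scalars alpha*x[j] and alpha*x[j+1] are built
 * below in the same way as for the four-column case.
 */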
  1725. subq $2, N
  1726. leaq 32 * SIZE(BUFFER), Y1
  1727. movq A, A1
  1728. leaq (A, LDA, 1), A2
  1729. leaq (A, LDA, 2), A
  1730. movsd (X), %xmm13
  1731. addq INCX, X
  1732. movsd (X), %xmm15
  1733. addq INCX, X
  1734. #ifdef HAVE_SSE3
  1735. movddup ALPHA, %xmm8
  1736. #else
  1737. movsd ALPHA, %xmm8
  1738. unpcklpd %xmm8, %xmm8
  1739. #endif
  1740. pshufd $0xb1, %xmm8, %xmm9
  1741. pcmpeqb %xmm11, %xmm11
  1742. psllq $63, %xmm11
  1743. pshufd $0x00, %xmm13, %xmm12
  1744. pshufd $0x55, %xmm13, %xmm13
  1745. pshufd $0x00, %xmm15, %xmm14
  1746. pshufd $0x55, %xmm15, %xmm15
  1747. #ifndef XCONJ
  1748. xorps %xmm11, %xmm13
  1749. xorps %xmm11, %xmm15
  1750. #else
  1751. xorps %xmm11, %xmm12
  1752. xorps %xmm11, %xmm14
  1753. #endif
  1754. mulps %xmm8, %xmm12
  1755. mulps %xmm9, %xmm13
  1756. mulps %xmm8, %xmm14
  1757. mulps %xmm9, %xmm15
  1758. #ifndef XCONJ
  1759. subps %xmm13, %xmm12
  1760. subps %xmm15, %xmm14
  1761. #else
  1762. addps %xmm13, %xmm12
  1763. addps %xmm15, %xmm14
  1764. #endif
  1765. pshufd $0x55, %xmm12, %xmm13
  1766. pshufd $0x00, %xmm12, %xmm12
  1767. pshufd $0x55, %xmm14, %xmm15
  1768. pshufd $0x00, %xmm14, %xmm14
  1769. #ifndef CONJ
  1770. xorps %xmm11, %xmm13
  1771. xorps %xmm11, %xmm15
  1772. #else
  1773. xorps %xmm11, %xmm12
  1774. xorps %xmm11, %xmm14
  1775. #endif
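/*
 * From here on xmm12/xmm14 hold Re(alpha*x_j) broadcast to all four lanes
 * and xmm13/xmm15 hold Im(alpha*x_j); the mask in xmm11 (sign bit of every
 * other float) has been XORed into either the imaginary or the real
 * broadcasts depending on CONJ/XCONJ, so one addps plus one SUBPS per
 * vector yields the correctly signed cross terms.  SUBPS itself expands
 * to subps or addps according to the conjugation flags.
 */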
  1776. #ifdef ALIGNED_ACCESS
  1777. cmpq M, MM
  1778. je .L11X
  1779. movsd -32 * SIZE(A1), %xmm4
  1780. movsd -32 * SIZE(A2), %xmm6
  1781. movsd -32 * SIZE(Y1), %xmm0
  1782. pshufd $0xb1, %xmm4, %xmm5
  1783. pshufd $0xb1, %xmm6, %xmm7
  1784. mulps %xmm12, %xmm4
  1785. addps %xmm4, %xmm0
  1786. mulps %xmm13, %xmm5
  1787. SUBPS %xmm5, %xmm0
  1788. mulps %xmm14, %xmm6
  1789. addps %xmm6, %xmm0
  1790. mulps %xmm15, %xmm7
  1791. SUBPS %xmm7, %xmm0
  1792. movlps %xmm0, -32 * SIZE(Y1)
  1793. addq $2 * SIZE, A1
  1794. addq $2 * SIZE, A2
  1795. addq $2 * SIZE, Y1
  1796. ALIGN_3
  1797. .L11X:
  1798. #endif
  1799. movaps -32 * SIZE(Y1), %xmm0
  1800. movaps -28 * SIZE(Y1), %xmm1
  1801. movaps -24 * SIZE(Y1), %xmm2
  1802. movaps -20 * SIZE(Y1), %xmm3
  1803. ALIGN_3
  1804. movq MM, I
  1805. sarq $3, I
  1806. jle .L115
  1807. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1808. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1809. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  1810. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  1811. decq I
  1812. jle .L114
  1813. ALIGN_3
  1814. .L113:
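/*
 * Two-column inner loop, eight complex y elements per pass: A1 is streamed
 * through MOVUPS_A1 into xmm4/xmm6/xmm8/xmm10, while A2 is fetched with
 * movsd/movhps pairs since its alignment may differ from A1.
 */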
  1815. #ifdef PREFETCH
  1816. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  1817. #endif
  1818. pshufd $0xb1, %xmm4, %xmm5
  1819. mulps %xmm12, %xmm4
  1820. addps %xmm4, %xmm0
  1821. movsd -32 * SIZE(A2), %xmm4
  1822. movhps -30 * SIZE(A2), %xmm4
  1823. pshufd $0xb1, %xmm6, %xmm7
  1824. mulps %xmm12, %xmm6
  1825. addps %xmm6, %xmm1
  1826. movsd -28 * SIZE(A2), %xmm6
  1827. movhps -26 * SIZE(A2), %xmm6
  1828. pshufd $0xb1, %xmm8, %xmm9
  1829. mulps %xmm12, %xmm8
  1830. addps %xmm8, %xmm2
  1831. movsd -24 * SIZE(A2), %xmm8
  1832. movhps -22 * SIZE(A2), %xmm8
  1833. pshufd $0xb1, %xmm10, %xmm11
  1834. mulps %xmm12, %xmm10
  1835. addps %xmm10, %xmm3
  1836. movsd -20 * SIZE(A2), %xmm10
  1837. movhps -18 * SIZE(A2), %xmm10
  1838. mulps %xmm13, %xmm5
  1839. SUBPS %xmm5, %xmm0
  1840. mulps %xmm13, %xmm7
  1841. SUBPS %xmm7, %xmm1
  1842. mulps %xmm13, %xmm9
  1843. SUBPS %xmm9, %xmm2
  1844. mulps %xmm13, %xmm11
  1845. SUBPS %xmm11, %xmm3
  1846. #ifdef PREFETCH
  1847. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  1848. #endif
  1849. pshufd $0xb1, %xmm4, %xmm5
  1850. mulps %xmm14, %xmm4
  1851. addps %xmm4, %xmm0
  1852. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  1853. pshufd $0xb1, %xmm6, %xmm7
  1854. mulps %xmm14, %xmm6
  1855. addps %xmm6, %xmm1
  1856. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  1857. pshufd $0xb1, %xmm8, %xmm9
  1858. mulps %xmm14, %xmm8
  1859. addps %xmm8, %xmm2
  1860. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  1861. pshufd $0xb1, %xmm10, %xmm11
  1862. mulps %xmm14, %xmm10
  1863. addps %xmm10, %xmm3
  1864. MOVUPS_A1( -4 * SIZE, A1, %xmm10)
  1865. mulps %xmm15, %xmm5
  1866. SUBPS %xmm5, %xmm0
  1867. mulps %xmm15, %xmm7
  1868. SUBPS %xmm7, %xmm1
  1869. mulps %xmm15, %xmm9
  1870. SUBPS %xmm9, %xmm2
  1871. mulps %xmm15, %xmm11
  1872. SUBPS %xmm11, %xmm3
  1873. #ifdef PREFETCHW
  1874. PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
  1875. #endif
  1876. movaps %xmm0, -32 * SIZE(Y1)
  1877. movaps %xmm1, -28 * SIZE(Y1)
  1878. movaps %xmm2, -24 * SIZE(Y1)
  1879. movaps %xmm3, -20 * SIZE(Y1)
  1880. movaps -16 * SIZE(Y1), %xmm0
  1881. movaps -12 * SIZE(Y1), %xmm1
  1882. movaps -8 * SIZE(Y1), %xmm2
  1883. movaps -4 * SIZE(Y1), %xmm3
  1884. subq $-16 * SIZE, A1
  1885. subq $-16 * SIZE, A2
  1886. subq $-16 * SIZE, Y1
  1887. subq $1, I
  1888. BRANCH
  1889. jg .L113
  1890. ALIGN_3
  1891. .L114:
  1892. pshufd $0xb1, %xmm4, %xmm5
  1893. mulps %xmm12, %xmm4
  1894. addps %xmm4, %xmm0
  1895. movsd -32 * SIZE(A2), %xmm4
  1896. movhps -30 * SIZE(A2), %xmm4
  1897. pshufd $0xb1, %xmm6, %xmm7
  1898. mulps %xmm12, %xmm6
  1899. addps %xmm6, %xmm1
  1900. movsd -28 * SIZE(A2), %xmm6
  1901. movhps -26 * SIZE(A2), %xmm6
  1902. pshufd $0xb1, %xmm8, %xmm9
  1903. mulps %xmm12, %xmm8
  1904. addps %xmm8, %xmm2
  1905. movsd -24 * SIZE(A2), %xmm8
  1906. movhps -22 * SIZE(A2), %xmm8
  1907. pshufd $0xb1, %xmm10, %xmm11
  1908. mulps %xmm12, %xmm10
  1909. addps %xmm10, %xmm3
  1910. movsd -20 * SIZE(A2), %xmm10
  1911. movhps -18 * SIZE(A2), %xmm10
  1912. mulps %xmm13, %xmm5
  1913. SUBPS %xmm5, %xmm0
  1914. mulps %xmm13, %xmm7
  1915. SUBPS %xmm7, %xmm1
  1916. mulps %xmm13, %xmm9
  1917. SUBPS %xmm9, %xmm2
  1918. mulps %xmm13, %xmm11
  1919. SUBPS %xmm11, %xmm3
  1920. pshufd $0xb1, %xmm4, %xmm5
  1921. mulps %xmm14, %xmm4
  1922. addps %xmm4, %xmm0
  1923. pshufd $0xb1, %xmm6, %xmm7
  1924. mulps %xmm14, %xmm6
  1925. addps %xmm6, %xmm1
  1926. pshufd $0xb1, %xmm8, %xmm9
  1927. mulps %xmm14, %xmm8
  1928. addps %xmm8, %xmm2
  1929. pshufd $0xb1, %xmm10, %xmm11
  1930. mulps %xmm14, %xmm10
  1931. addps %xmm10, %xmm3
  1932. mulps %xmm15, %xmm5
  1933. SUBPS %xmm5, %xmm0
  1934. mulps %xmm15, %xmm7
  1935. SUBPS %xmm7, %xmm1
  1936. mulps %xmm15, %xmm9
  1937. SUBPS %xmm9, %xmm2
  1938. mulps %xmm15, %xmm11
  1939. SUBPS %xmm11, %xmm3
  1940. movaps %xmm0, -32 * SIZE(Y1)
  1941. movaps %xmm1, -28 * SIZE(Y1)
  1942. movaps %xmm2, -24 * SIZE(Y1)
  1943. movaps %xmm3, -20 * SIZE(Y1)
  1944. movaps -16 * SIZE(Y1), %xmm0
  1945. movaps -12 * SIZE(Y1), %xmm1
  1946. movaps -8 * SIZE(Y1), %xmm2
  1947. movaps -4 * SIZE(Y1), %xmm3
  1948. subq $-16 * SIZE, A1
  1949. subq $-16 * SIZE, A2
  1950. subq $-16 * SIZE, Y1
  1951. ALIGN_3
  1952. .L115:
  1953. testq $4, MM
  1954. je .L117
  1955. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1956. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  1957. movsd -32 * SIZE(A2), %xmm8
  1958. movhps -30 * SIZE(A2), %xmm8
  1959. movsd -28 * SIZE(A2), %xmm10
  1960. movhps -26 * SIZE(A2), %xmm10
  1961. pshufd $0xb1, %xmm4, %xmm5
  1962. pshufd $0xb1, %xmm6, %xmm7
  1963. pshufd $0xb1, %xmm8, %xmm9
  1964. pshufd $0xb1, %xmm10, %xmm11
  1965. mulps %xmm12, %xmm4
  1966. addps %xmm4, %xmm0
  1967. mulps %xmm12, %xmm6
  1968. addps %xmm6, %xmm1
  1969. mulps %xmm13, %xmm5
  1970. SUBPS %xmm5, %xmm0
  1971. mulps %xmm13, %xmm7
  1972. SUBPS %xmm7, %xmm1
  1973. mulps %xmm14, %xmm8
  1974. addps %xmm8, %xmm0
  1975. mulps %xmm14, %xmm10
  1976. addps %xmm10, %xmm1
  1977. mulps %xmm15, %xmm9
  1978. SUBPS %xmm9, %xmm0
  1979. mulps %xmm15, %xmm11
  1980. SUBPS %xmm11, %xmm1
  1981. movaps %xmm0, -32 * SIZE(Y1)
  1982. movaps %xmm1, -28 * SIZE(Y1)
  1983. movaps %xmm2, %xmm0
  1984. movaps %xmm3, %xmm1
  1985. addq $8 * SIZE, A1
  1986. addq $8 * SIZE, A2
  1987. addq $8 * SIZE, Y1
  1988. ALIGN_3
  1989. .L117:
  1990. testq $2, MM
  1991. je .L118
  1992. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  1993. movsd -32 * SIZE(A2), %xmm6
  1994. movhps -30 * SIZE(A2), %xmm6
  1995. pshufd $0xb1, %xmm4, %xmm5
  1996. mulps %xmm12, %xmm4
  1997. addps %xmm4, %xmm0
  1998. pshufd $0xb1, %xmm6, %xmm7
  1999. mulps %xmm13, %xmm5
  2000. SUBPS %xmm5, %xmm0
  2001. mulps %xmm14, %xmm6
  2002. addps %xmm6, %xmm0
  2003. mulps %xmm15, %xmm7
  2004. SUBPS %xmm7, %xmm0
  2005. movaps %xmm0, -32 * SIZE(Y1)
  2006. movaps %xmm1, %xmm0
  2007. addq $4 * SIZE, A1
  2008. addq $4 * SIZE, A2
  2009. addq $4 * SIZE, Y1
  2010. ALIGN_3
  2011. .L118:
  2012. testq $1, MM
  2013. #if GEMV_UNROLL == 2
  2014. je .L119
  2015. #else
  2016. je .L120
  2017. #endif
  2018. movsd -32 * SIZE(A1), %xmm4
  2019. movsd -32 * SIZE(A2), %xmm6
  2020. pshufd $0xb1, %xmm4, %xmm5
  2021. pshufd $0xb1, %xmm6, %xmm7
  2022. mulps %xmm12, %xmm4
  2023. addps %xmm4, %xmm0
  2024. mulps %xmm13, %xmm5
  2025. SUBPS %xmm5, %xmm0
  2026. mulps %xmm14, %xmm6
  2027. addps %xmm6, %xmm0
  2028. mulps %xmm15, %xmm7
  2029. SUBPS %xmm7, %xmm0
  2030. movlps %xmm0, -32 * SIZE(Y1)
  2031. #if GEMV_UNROLL == 2
  2032. ALIGN_3
  2033. .L119:
  2034. cmpq $2, N
  2035. jge .L111
  2036. #endif
  2037. ALIGN_3
  2038. .L120:
  2039. #endif
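/*
 * At most one column is left here; it is handled with the single scalar
 * alpha*x kept in xmm12 (real part) and xmm13 (imaginary part).
 */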
  2040. cmpq $1, N
  2041. jl .L990
  2042. leaq 32 * SIZE(BUFFER), Y1
  2043. movq A, A1
  2044. movsd (X), %xmm13
  2045. addq INCX, X
  2046. #ifdef HAVE_SSE3
  2047. movddup ALPHA, %xmm8
  2048. #else
  2049. movsd ALPHA, %xmm8
  2050. unpcklpd %xmm8, %xmm8
  2051. #endif
  2052. pshufd $0xb1, %xmm8, %xmm9
  2053. pcmpeqb %xmm11, %xmm11
  2054. psllq $63, %xmm11
  2055. pshufd $0x00, %xmm13, %xmm12
  2056. pshufd $0x55, %xmm13, %xmm13
  2057. #ifndef XCONJ
  2058. xorps %xmm11, %xmm13
  2059. #else
  2060. xorps %xmm11, %xmm12
  2061. #endif
  2062. mulps %xmm8, %xmm12
  2063. mulps %xmm9, %xmm13
  2064. #ifndef XCONJ
  2065. subps %xmm13, %xmm12
  2066. #else
  2067. addps %xmm13, %xmm12
  2068. #endif
  2069. pshufd $0x55, %xmm12, %xmm13
  2070. pshufd $0x00, %xmm12, %xmm12
  2071. #ifndef CONJ
  2072. xorps %xmm11, %xmm13
  2073. #else
  2074. xorps %xmm11, %xmm12
  2075. #endif
  2076. #ifdef ALIGNED_ACCESS
  2077. cmpq M, MM
  2078. je .L12X
  2079. movsd -32 * SIZE(A1), %xmm4
  2080. movsd -32 * SIZE(Y1), %xmm0
  2081. pshufd $0xb1, %xmm4, %xmm5
  2082. mulps %xmm12, %xmm4
  2083. addps %xmm4, %xmm0
  2084. mulps %xmm13, %xmm5
  2085. SUBPS %xmm5, %xmm0
  2086. movlps %xmm0, -32 * SIZE(Y1)
  2087. addq $2 * SIZE, A1
  2088. addq $2 * SIZE, Y1
  2089. ALIGN_3
  2090. .L12X:
  2091. #endif
  2092. movaps -32 * SIZE(Y1), %xmm0
  2093. movaps -28 * SIZE(Y1), %xmm1
  2094. movaps -24 * SIZE(Y1), %xmm2
  2095. movaps -20 * SIZE(Y1), %xmm3
  2096. ALIGN_3
  2097. movq MM, I
  2098. sarq $3, I
  2099. jle .L125
  2100. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  2101. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  2102. MOVUPS_A1(-24 * SIZE, A1, %xmm8)
  2103. MOVUPS_A1(-20 * SIZE, A1, %xmm10)
  2104. decq I
  2105. jle .L124
  2106. ALIGN_3
  2107. .L123:
  2108. #ifdef PREFETCH
  2109. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  2110. #endif
  2111. pshufd $0xb1, %xmm4, %xmm5
  2112. mulps %xmm12, %xmm4
  2113. addps %xmm4, %xmm0
  2114. MOVUPS_A1(-16 * SIZE, A1, %xmm4)
  2115. pshufd $0xb1, %xmm6, %xmm7
  2116. mulps %xmm12, %xmm6
  2117. addps %xmm6, %xmm1
  2118. MOVUPS_A1(-12 * SIZE, A1, %xmm6)
  2119. pshufd $0xb1, %xmm8, %xmm9
  2120. mulps %xmm12, %xmm8
  2121. addps %xmm8, %xmm2
  2122. MOVUPS_A1( -8 * SIZE, A1, %xmm8)
  2123. pshufd $0xb1, %xmm10, %xmm11
  2124. mulps %xmm12, %xmm10
  2125. addps %xmm10, %xmm3
  2126. MOVUPS_A1( -4 * SIZE, A1, %xmm10)
  2127. mulps %xmm13, %xmm5
  2128. SUBPS %xmm5, %xmm0
  2129. mulps %xmm13, %xmm7
  2130. SUBPS %xmm7, %xmm1
  2131. mulps %xmm13, %xmm9
  2132. SUBPS %xmm9, %xmm2
  2133. mulps %xmm13, %xmm11
  2134. SUBPS %xmm11, %xmm3
  2135. #ifdef PREFETCHW
  2136. PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
  2137. #endif
  2138. movaps %xmm0, -32 * SIZE(Y1)
  2139. movaps %xmm1, -28 * SIZE(Y1)
  2140. movaps %xmm2, -24 * SIZE(Y1)
  2141. movaps %xmm3, -20 * SIZE(Y1)
  2142. movaps -16 * SIZE(Y1), %xmm0
  2143. movaps -12 * SIZE(Y1), %xmm1
  2144. movaps -8 * SIZE(Y1), %xmm2
  2145. movaps -4 * SIZE(Y1), %xmm3
  2146. subq $-16 * SIZE, A1
  2147. subq $-16 * SIZE, A2
  2148. subq $-16 * SIZE, Y1
  2149. subq $1, I
  2150. BRANCH
  2151. jg .L123
  2152. ALIGN_3
  2153. .L124:
  2154. pshufd $0xb1, %xmm4, %xmm5
  2155. mulps %xmm12, %xmm4
  2156. addps %xmm4, %xmm0
  2157. pshufd $0xb1, %xmm6, %xmm7
  2158. mulps %xmm12, %xmm6
  2159. addps %xmm6, %xmm1
  2160. pshufd $0xb1, %xmm8, %xmm9
  2161. mulps %xmm12, %xmm8
  2162. addps %xmm8, %xmm2
  2163. pshufd $0xb1, %xmm10, %xmm11
  2164. mulps %xmm12, %xmm10
  2165. addps %xmm10, %xmm3
  2166. mulps %xmm13, %xmm5
  2167. SUBPS %xmm5, %xmm0
  2168. mulps %xmm13, %xmm7
  2169. SUBPS %xmm7, %xmm1
  2170. mulps %xmm13, %xmm9
  2171. SUBPS %xmm9, %xmm2
  2172. mulps %xmm13, %xmm11
  2173. SUBPS %xmm11, %xmm3
  2174. movaps %xmm0, -32 * SIZE(Y1)
  2175. movaps %xmm1, -28 * SIZE(Y1)
  2176. movaps %xmm2, -24 * SIZE(Y1)
  2177. movaps %xmm3, -20 * SIZE(Y1)
  2178. movaps -16 * SIZE(Y1), %xmm0
  2179. movaps -12 * SIZE(Y1), %xmm1
  2180. movaps -8 * SIZE(Y1), %xmm2
  2181. movaps -4 * SIZE(Y1), %xmm3
  2182. subq $-16 * SIZE, A1
  2183. subq $-16 * SIZE, Y1
  2184. ALIGN_3
  2185. .L125:
  2186. testq $4, MM
  2187. je .L127
  2188. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  2189. MOVUPS_A1(-28 * SIZE, A1, %xmm6)
  2190. pshufd $0xb1, %xmm4, %xmm5
  2191. mulps %xmm12, %xmm4
  2192. addps %xmm4, %xmm0
  2193. pshufd $0xb1, %xmm6, %xmm7
  2194. mulps %xmm12, %xmm6
  2195. addps %xmm6, %xmm1
  2196. mulps %xmm13, %xmm5
  2197. SUBPS %xmm5, %xmm0
  2198. mulps %xmm13, %xmm7
  2199. SUBPS %xmm7, %xmm1
  2200. movaps %xmm0, -32 * SIZE(Y1)
  2201. movaps %xmm1, -28 * SIZE(Y1)
  2202. movaps %xmm2, %xmm0
  2203. movaps %xmm3, %xmm1
  2204. addq $8 * SIZE, A1
  2205. addq $8 * SIZE, Y1
  2206. ALIGN_3
  2207. .L127:
  2208. testq $2, MM
  2209. je .L128
  2210. MOVUPS_A1(-32 * SIZE, A1, %xmm4)
  2211. pshufd $0xb1, %xmm4, %xmm5
  2212. mulps %xmm12, %xmm4
  2213. addps %xmm4, %xmm0
  2214. mulps %xmm13, %xmm5
  2215. SUBPS %xmm5, %xmm0
  2216. movaps %xmm0, -32 * SIZE(Y1)
  2217. movaps %xmm1, %xmm0
  2218. addq $4 * SIZE, A1
  2219. addq $4 * SIZE, Y1
  2220. ALIGN_3
  2221. .L128:
  2222. testq $1, MM
  2223. je .L990
  2224. movsd -32 * SIZE(A1), %xmm4
  2225. pshufd $0xb1, %xmm4, %xmm5
  2226. mulps %xmm12, %xmm4
  2227. addps %xmm4, %xmm0
  2228. mulps %xmm13, %xmm5
  2229. SUBPS %xmm5, %xmm0
  2230. movlps %xmm0, -32 * SIZE(Y1)
  2231. jmp .L990
  2232. ALIGN_3
  2233. .L200:
  2234. testq $2 * SIZE, LDA
  2235. jne .L300
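/*
 * Realignment path: the column data apparently starts one float past a
 * 16-byte boundary, so vectors are read with aligned movaps from the
 * preceding boundary (-33 * SIZE, ...) and stitched together with
 * movss + shufps $0x39, which rotates the four floats down by one.
 * The test above already diverted the case where LDA carries an extra
 * 8-byte offset to .L300, where A2 needs a different merge.
 */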
  2236. cmpq $2, N
  2237. jl .L210
  2238. ALIGN_3
  2239. .L201:
  2240. subq $2, N
  2241. leaq 32 * SIZE(BUFFER), Y1
  2242. movq A, A1
  2243. leaq (A, LDA, 1), A2
  2244. leaq (A, LDA, 2), A
  2245. movsd (X), %xmm13
  2246. addq INCX, X
  2247. movsd (X), %xmm15
  2248. addq INCX, X
  2249. #ifdef HAVE_SSE3
  2250. movddup ALPHA, %xmm8
  2251. #else
  2252. movsd ALPHA, %xmm8
  2253. unpcklpd %xmm8, %xmm8
  2254. #endif
  2255. pshufd $0xb1, %xmm8, %xmm9
  2256. pcmpeqb %xmm11, %xmm11
  2257. psllq $63, %xmm11
  2258. pshufd $0x00, %xmm13, %xmm12
  2259. pshufd $0x55, %xmm13, %xmm13
  2260. pshufd $0x00, %xmm15, %xmm14
  2261. pshufd $0x55, %xmm15, %xmm15
  2262. #ifndef XCONJ
  2263. xorps %xmm11, %xmm13
  2264. xorps %xmm11, %xmm15
  2265. #else
  2266. xorps %xmm11, %xmm12
  2267. xorps %xmm11, %xmm14
  2268. #endif
  2269. mulps %xmm8, %xmm12
  2270. mulps %xmm9, %xmm13
  2271. mulps %xmm8, %xmm14
  2272. mulps %xmm9, %xmm15
  2273. #ifndef XCONJ
  2274. subps %xmm13, %xmm12
  2275. subps %xmm15, %xmm14
  2276. #else
  2277. addps %xmm13, %xmm12
  2278. addps %xmm15, %xmm14
  2279. #endif
  2280. pshufd $0x55, %xmm12, %xmm13
  2281. pshufd $0x00, %xmm12, %xmm12
  2282. pshufd $0x55, %xmm14, %xmm15
  2283. pshufd $0x00, %xmm14, %xmm14
  2284. #ifndef CONJ
  2285. xorps %xmm11, %xmm13
  2286. xorps %xmm11, %xmm15
  2287. #else
  2288. xorps %xmm11, %xmm12
  2289. xorps %xmm11, %xmm14
  2290. #endif
  2291. #ifdef ALIGNED_ACCESS
  2292. cmpq M, MM
  2293. je .L20X
  2294. movsd -32 * SIZE(A1), %xmm4
  2295. movsd -32 * SIZE(A2), %xmm6
  2296. movsd -32 * SIZE(Y1), %xmm0
  2297. pshufd $0xb1, %xmm4, %xmm5
  2298. pshufd $0xb1, %xmm6, %xmm7
  2299. mulps %xmm12, %xmm4
  2300. addps %xmm4, %xmm0
  2301. mulps %xmm13, %xmm5
  2302. SUBPS %xmm5, %xmm0
  2303. mulps %xmm14, %xmm6
  2304. addps %xmm6, %xmm0
  2305. mulps %xmm15, %xmm7
  2306. SUBPS %xmm7, %xmm0
  2307. movlps %xmm0, -32 * SIZE(Y1)
  2308. addq $2 * SIZE, A1
  2309. addq $2 * SIZE, A2
  2310. addq $2 * SIZE, Y1
  2311. ALIGN_3
  2312. .L20X:
  2313. #endif
  2314. movaps -33 * SIZE(A1), %xmm4
  2315. movaps -33 * SIZE(A2), %xmm6
  2316. movaps -32 * SIZE(Y1), %xmm0
  2317. movaps -28 * SIZE(Y1), %xmm1
  2318. movaps -24 * SIZE(Y1), %xmm2
  2319. movaps -20 * SIZE(Y1), %xmm3
  2320. movq MM, I
  2321. sarq $3, I
  2322. jle .L205
  2323. movaps -29 * SIZE(A1), %xmm8
  2324. movaps -25 * SIZE(A1), %xmm9
  2325. movaps -21 * SIZE(A1), %xmm10
  2326. decq I
  2327. jle .L204
  2328. ALIGN_3
  2329. .L203:
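/*
 * Same complex multiply-accumulate as in the aligned loops; the extra
 * movss/shufps pairs only reassemble each unaligned A vector from two
 * consecutive aligned loads.
 */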
  2330. #ifdef PREFETCH
  2331. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  2332. #endif
  2333. movss %xmm8, %xmm4
  2334. shufps $0x39, %xmm4, %xmm4
  2335. pshufd $0xb1, %xmm4, %xmm5
  2336. mulps %xmm12, %xmm4
  2337. addps %xmm4, %xmm0
  2338. movaps -17 * SIZE(A1), %xmm4
  2339. movss %xmm9, %xmm8
  2340. shufps $0x39, %xmm8, %xmm8
  2341. pshufd $0xb1, %xmm8, %xmm7
  2342. mulps %xmm12, %xmm8
  2343. addps %xmm8, %xmm1
  2344. movaps -29 * SIZE(A2), %xmm8
  2345. mulps %xmm13, %xmm5
  2346. SUBPS %xmm5, %xmm0
  2347. mulps %xmm13, %xmm7
  2348. SUBPS %xmm7, %xmm1
  2349. movss %xmm10, %xmm9
  2350. shufps $0x39, %xmm9, %xmm9
  2351. pshufd $0xb1, %xmm9, %xmm5
  2352. mulps %xmm12, %xmm9
  2353. addps %xmm9, %xmm2
  2354. movaps -25 * SIZE(A2), %xmm9
  2355. movss %xmm4, %xmm10
  2356. shufps $0x39, %xmm10, %xmm10
  2357. pshufd $0xb1, %xmm10, %xmm7
  2358. mulps %xmm12, %xmm10
  2359. addps %xmm10, %xmm3
  2360. movaps -21 * SIZE(A2), %xmm10
  2361. mulps %xmm13, %xmm5
  2362. SUBPS %xmm5, %xmm2
  2363. mulps %xmm13, %xmm7
  2364. SUBPS %xmm7, %xmm3
  2365. #ifdef PREFETCH
  2366. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  2367. #endif
  2368. movss %xmm8, %xmm6
  2369. shufps $0x39, %xmm6, %xmm6
  2370. pshufd $0xb1, %xmm6, %xmm5
  2371. mulps %xmm14, %xmm6
  2372. addps %xmm6, %xmm0
  2373. movaps -17 * SIZE(A2), %xmm6
  2374. movss %xmm9, %xmm8
  2375. shufps $0x39, %xmm8, %xmm8
  2376. pshufd $0xb1, %xmm8, %xmm7
  2377. mulps %xmm14, %xmm8
  2378. addps %xmm8, %xmm1
  2379. movaps -13 * SIZE(A1), %xmm8
  2380. mulps %xmm15, %xmm5
  2381. SUBPS %xmm5, %xmm0
  2382. mulps %xmm15, %xmm7
  2383. SUBPS %xmm7, %xmm1
  2384. movss %xmm10, %xmm9
  2385. shufps $0x39, %xmm9, %xmm9
  2386. pshufd $0xb1, %xmm9, %xmm5
  2387. mulps %xmm14, %xmm9
  2388. addps %xmm9, %xmm2
  2389. movaps -9 * SIZE(A1), %xmm9
  2390. movss %xmm6, %xmm10
  2391. shufps $0x39, %xmm10, %xmm10
  2392. pshufd $0xb1, %xmm10, %xmm7
  2393. mulps %xmm14, %xmm10
  2394. addps %xmm10, %xmm3
  2395. movaps -5 * SIZE(A1), %xmm10
  2396. mulps %xmm15, %xmm5
  2397. SUBPS %xmm5, %xmm2
  2398. mulps %xmm15, %xmm7
  2399. SUBPS %xmm7, %xmm3
  2400. #ifdef PREFETCHW
  2401. PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
  2402. #endif
  2403. movaps %xmm0, -32 * SIZE(Y1)
  2404. movaps %xmm1, -28 * SIZE(Y1)
  2405. movaps %xmm2, -24 * SIZE(Y1)
  2406. movaps %xmm3, -20 * SIZE(Y1)
  2407. movaps -16 * SIZE(Y1), %xmm0
  2408. movaps -12 * SIZE(Y1), %xmm1
  2409. movaps -8 * SIZE(Y1), %xmm2
  2410. movaps -4 * SIZE(Y1), %xmm3
  2411. subq $-16 * SIZE, A1
  2412. subq $-16 * SIZE, A2
  2413. subq $-16 * SIZE, Y1
  2414. subq $1, I
  2415. BRANCH
  2416. jg .L203
  2417. ALIGN_3
  2418. .L204:
  2419. movss %xmm8, %xmm4
  2420. shufps $0x39, %xmm4, %xmm4
  2421. pshufd $0xb1, %xmm4, %xmm5
  2422. mulps %xmm12, %xmm4
  2423. addps %xmm4, %xmm0
  2424. movaps -17 * SIZE(A1), %xmm4
  2425. movss %xmm9, %xmm8
  2426. shufps $0x39, %xmm8, %xmm8
  2427. pshufd $0xb1, %xmm8, %xmm7
  2428. mulps %xmm12, %xmm8
  2429. addps %xmm8, %xmm1
  2430. movaps -29 * SIZE(A2), %xmm8
  2431. mulps %xmm13, %xmm5
  2432. SUBPS %xmm5, %xmm0
  2433. mulps %xmm13, %xmm7
  2434. SUBPS %xmm7, %xmm1
  2435. movss %xmm10, %xmm9
  2436. shufps $0x39, %xmm9, %xmm9
  2437. pshufd $0xb1, %xmm9, %xmm5
  2438. mulps %xmm12, %xmm9
  2439. addps %xmm9, %xmm2
  2440. movaps -25 * SIZE(A2), %xmm9
  2441. movss %xmm4, %xmm10
  2442. shufps $0x39, %xmm10, %xmm10
  2443. pshufd $0xb1, %xmm10, %xmm7
  2444. mulps %xmm12, %xmm10
  2445. addps %xmm10, %xmm3
  2446. movaps -21 * SIZE(A2), %xmm10
  2447. mulps %xmm13, %xmm5
  2448. SUBPS %xmm5, %xmm2
  2449. mulps %xmm13, %xmm7
  2450. SUBPS %xmm7, %xmm3
  2451. movss %xmm8, %xmm6
  2452. shufps $0x39, %xmm6, %xmm6
  2453. pshufd $0xb1, %xmm6, %xmm5
  2454. mulps %xmm14, %xmm6
  2455. addps %xmm6, %xmm0
  2456. movaps -17 * SIZE(A2), %xmm6
  2457. movss %xmm9, %xmm8
  2458. shufps $0x39, %xmm8, %xmm8
  2459. pshufd $0xb1, %xmm8, %xmm7
  2460. mulps %xmm14, %xmm8
  2461. addps %xmm8, %xmm1
  2462. mulps %xmm15, %xmm5
  2463. SUBPS %xmm5, %xmm0
  2464. mulps %xmm15, %xmm7
  2465. SUBPS %xmm7, %xmm1
  2466. movss %xmm10, %xmm9
  2467. shufps $0x39, %xmm9, %xmm9
  2468. pshufd $0xb1, %xmm9, %xmm5
  2469. mulps %xmm14, %xmm9
  2470. addps %xmm9, %xmm2
  2471. movss %xmm6, %xmm10
  2472. shufps $0x39, %xmm10, %xmm10
  2473. pshufd $0xb1, %xmm10, %xmm7
  2474. mulps %xmm14, %xmm10
  2475. addps %xmm10, %xmm3
  2476. mulps %xmm15, %xmm5
  2477. SUBPS %xmm5, %xmm2
  2478. mulps %xmm15, %xmm7
  2479. SUBPS %xmm7, %xmm3
  2480. movaps %xmm0, -32 * SIZE(Y1)
  2481. movaps %xmm1, -28 * SIZE(Y1)
  2482. movaps %xmm2, -24 * SIZE(Y1)
  2483. movaps %xmm3, -20 * SIZE(Y1)
  2484. movaps -16 * SIZE(Y1), %xmm0
  2485. movaps -12 * SIZE(Y1), %xmm1
  2486. movaps -8 * SIZE(Y1), %xmm2
  2487. movaps -4 * SIZE(Y1), %xmm3
  2488. subq $-16 * SIZE, A1
  2489. subq $-16 * SIZE, A2
  2490. subq $-16 * SIZE, Y1
  2491. ALIGN_3
  2492. .L205:
  2493. testq $4, MM
  2494. je .L207
  2495. movaps -29 * SIZE(A1), %xmm8
  2496. movaps -25 * SIZE(A1), %xmm9
  2497. movaps -29 * SIZE(A2), %xmm10
  2498. movaps -25 * SIZE(A2), %xmm11
  2499. movss %xmm8, %xmm4
  2500. shufps $0x39, %xmm4, %xmm4
  2501. pshufd $0xb1, %xmm4, %xmm5
  2502. mulps %xmm12, %xmm4
  2503. addps %xmm4, %xmm0
  2504. movss %xmm9, %xmm8
  2505. shufps $0x39, %xmm8, %xmm8
  2506. pshufd $0xb1, %xmm8, %xmm7
  2507. mulps %xmm12, %xmm8
  2508. addps %xmm8, %xmm1
  2509. mulps %xmm13, %xmm5
  2510. SUBPS %xmm5, %xmm0
  2511. mulps %xmm13, %xmm7
  2512. SUBPS %xmm7, %xmm1
  2513. movss %xmm10, %xmm6
  2514. shufps $0x39, %xmm6, %xmm6
  2515. pshufd $0xb1, %xmm6, %xmm5
  2516. mulps %xmm14, %xmm6
  2517. addps %xmm6, %xmm0
  2518. movss %xmm11, %xmm10
  2519. shufps $0x39, %xmm10, %xmm10
  2520. pshufd $0xb1, %xmm10, %xmm7
  2521. mulps %xmm14, %xmm10
  2522. addps %xmm10, %xmm1
  2523. mulps %xmm15, %xmm5
  2524. SUBPS %xmm5, %xmm0
  2525. mulps %xmm15, %xmm7
  2526. SUBPS %xmm7, %xmm1
  2527. movaps %xmm0, -32 * SIZE(Y1)
  2528. movaps %xmm1, -28 * SIZE(Y1)
  2529. movaps %xmm9, %xmm4
  2530. movaps %xmm11, %xmm6
  2531. movaps %xmm2, %xmm0
  2532. movaps %xmm3, %xmm1
  2533. addq $8 * SIZE, A1
  2534. addq $8 * SIZE, A2
  2535. addq $8 * SIZE, Y1
  2536. ALIGN_3
  2537. .L207:
  2538. testq $2, MM
  2539. je .L208
  2540. movaps -29 * SIZE(A1), %xmm8
  2541. movaps -29 * SIZE(A2), %xmm9
  2542. movss %xmm8, %xmm4
  2543. shufps $0x39, %xmm4, %xmm4
  2544. movss %xmm9, %xmm6
  2545. shufps $0x39, %xmm6, %xmm6
  2546. pshufd $0xb1, %xmm4, %xmm5
  2547. mulps %xmm12, %xmm4
  2548. addps %xmm4, %xmm0
  2549. pshufd $0xb1, %xmm6, %xmm7
  2550. mulps %xmm13, %xmm5
  2551. SUBPS %xmm5, %xmm0
  2552. mulps %xmm14, %xmm6
  2553. addps %xmm6, %xmm0
  2554. mulps %xmm15, %xmm7
  2555. SUBPS %xmm7, %xmm0
  2556. movaps %xmm0, -32 * SIZE(Y1)
  2557. movaps %xmm1, %xmm0
  2558. addq $4 * SIZE, A1
  2559. addq $4 * SIZE, A2
  2560. addq $4 * SIZE, Y1
  2561. ALIGN_3
  2562. .L208:
  2563. testq $1, MM
  2564. je .L209
  2565. movsd -32 * SIZE(A1), %xmm4
  2566. movsd -32 * SIZE(A2), %xmm6
  2567. pshufd $0xb1, %xmm4, %xmm5
  2568. pshufd $0xb1, %xmm6, %xmm7
  2569. mulps %xmm12, %xmm4
  2570. addps %xmm4, %xmm0
  2571. mulps %xmm13, %xmm5
  2572. SUBPS %xmm5, %xmm0
  2573. mulps %xmm14, %xmm6
  2574. addps %xmm6, %xmm0
  2575. mulps %xmm15, %xmm7
  2576. SUBPS %xmm7, %xmm0
  2577. movlps %xmm0, -32 * SIZE(Y1)
  2578. ALIGN_3
  2579. .L209:
  2580. cmpq $2, N
  2581. jge .L201
  2582. ALIGN_3
  2583. .L210:
  2584. cmpq $1, N
  2585. jl .L990
  2586. leaq 32 * SIZE(BUFFER), Y1
  2587. movq A, A1
  2588. movsd (X), %xmm13
  2589. addq INCX, X
  2590. #ifdef HAVE_SSE3
  2591. movddup ALPHA, %xmm8
  2592. #else
  2593. movsd ALPHA, %xmm8
  2594. unpcklpd %xmm8, %xmm8
  2595. #endif
  2596. pshufd $0xb1, %xmm8, %xmm9
  2597. pcmpeqb %xmm11, %xmm11
  2598. psllq $63, %xmm11
  2599. pshufd $0x00, %xmm13, %xmm12
  2600. pshufd $0x55, %xmm13, %xmm13
  2601. #ifndef XCONJ
  2602. xorps %xmm11, %xmm13
  2603. #else
  2604. xorps %xmm11, %xmm12
  2605. #endif
  2606. mulps %xmm8, %xmm12
  2607. mulps %xmm9, %xmm13
  2608. #ifndef XCONJ
  2609. subps %xmm13, %xmm12
  2610. #else
  2611. addps %xmm13, %xmm12
  2612. #endif
  2613. pshufd $0x55, %xmm12, %xmm13
  2614. pshufd $0x00, %xmm12, %xmm12
  2615. #ifndef CONJ
  2616. xorps %xmm11, %xmm13
  2617. #else
  2618. xorps %xmm11, %xmm12
  2619. #endif
  2620. #ifdef ALIGNED_ACCESS
  2621. cmpq M, MM
  2622. je .L21X
  2623. movsd -32 * SIZE(A1), %xmm4
  2624. movsd -32 * SIZE(Y1), %xmm0
  2625. pshufd $0xb1, %xmm4, %xmm5
  2626. mulps %xmm12, %xmm4
  2627. addps %xmm4, %xmm0
  2628. mulps %xmm13, %xmm5
  2629. SUBPS %xmm5, %xmm0
  2630. movlps %xmm0, -32 * SIZE(Y1)
  2631. addq $2 * SIZE, A1
  2632. addq $2 * SIZE, Y1
  2633. ALIGN_3
  2634. .L21X:
  2635. #endif
  2636. movaps -33 * SIZE(A1), %xmm4
  2637. movaps -32 * SIZE(Y1), %xmm0
  2638. movaps -28 * SIZE(Y1), %xmm1
  2639. movaps -24 * SIZE(Y1), %xmm2
  2640. movaps -20 * SIZE(Y1), %xmm3
  2641. movq MM, I
  2642. sarq $3, I
  2643. jle .L215
  2644. movaps -29 * SIZE(A1), %xmm6
  2645. movaps -25 * SIZE(A1), %xmm8
  2646. movaps -21 * SIZE(A1), %xmm10
  2647. decq I
  2648. jle .L214
  2649. ALIGN_3
  2650. .L213:
  2651. #ifdef PREFETCH
  2652. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  2653. #endif
  2654. movss %xmm6, %xmm4
  2655. shufps $0x39, %xmm4, %xmm4
  2656. pshufd $0xb1, %xmm4, %xmm5
  2657. mulps %xmm12, %xmm4
  2658. addps %xmm4, %xmm0
  2659. movaps -17 * SIZE(A1), %xmm4
  2660. movss %xmm8, %xmm6
  2661. shufps $0x39, %xmm6, %xmm6
  2662. pshufd $0xb1, %xmm6, %xmm7
  2663. mulps %xmm12, %xmm6
  2664. addps %xmm6, %xmm1
  2665. movaps -13 * SIZE(A1), %xmm6
  2666. movss %xmm10, %xmm8
  2667. shufps $0x39, %xmm8, %xmm8
  2668. pshufd $0xb1, %xmm8, %xmm9
  2669. mulps %xmm12, %xmm8
  2670. addps %xmm8, %xmm2
  2671. movaps -9 * SIZE(A1), %xmm8
  2672. movss %xmm4, %xmm10
  2673. shufps $0x39, %xmm10, %xmm10
  2674. pshufd $0xb1, %xmm10, %xmm11
  2675. mulps %xmm12, %xmm10
  2676. addps %xmm10, %xmm3
  2677. movaps -5 * SIZE(A1), %xmm10
  2678. mulps %xmm13, %xmm5
  2679. SUBPS %xmm5, %xmm0
  2680. mulps %xmm13, %xmm7
  2681. SUBPS %xmm7, %xmm1
  2682. mulps %xmm13, %xmm9
  2683. SUBPS %xmm9, %xmm2
  2684. mulps %xmm13, %xmm11
  2685. SUBPS %xmm11, %xmm3
  2686. #ifdef PREFETCHW
  2687. PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
  2688. #endif
  2689. movaps %xmm0, -32 * SIZE(Y1)
  2690. movaps %xmm1, -28 * SIZE(Y1)
  2691. movaps %xmm2, -24 * SIZE(Y1)
  2692. movaps %xmm3, -20 * SIZE(Y1)
  2693. movaps -16 * SIZE(Y1), %xmm0
  2694. movaps -12 * SIZE(Y1), %xmm1
  2695. movaps -8 * SIZE(Y1), %xmm2
  2696. movaps -4 * SIZE(Y1), %xmm3
  2697. subq $-16 * SIZE, A1
  2698. subq $-16 * SIZE, A2
  2699. subq $-16 * SIZE, Y1
  2700. subq $1, I
  2701. BRANCH
  2702. jg .L213
  2703. ALIGN_3
  2704. .L214:
  2705. movss %xmm6, %xmm4
  2706. shufps $0x39, %xmm4, %xmm4
  2707. pshufd $0xb1, %xmm4, %xmm5
  2708. mulps %xmm12, %xmm4
  2709. addps %xmm4, %xmm0
  2710. movaps -17 * SIZE(A1), %xmm4
  2711. movss %xmm8, %xmm6
  2712. shufps $0x39, %xmm6, %xmm6
  2713. pshufd $0xb1, %xmm6, %xmm7
  2714. mulps %xmm12, %xmm6
  2715. addps %xmm6, %xmm1
  2716. movss %xmm10, %xmm8
  2717. shufps $0x39, %xmm8, %xmm8
  2718. pshufd $0xb1, %xmm8, %xmm9
  2719. mulps %xmm12, %xmm8
  2720. addps %xmm8, %xmm2
  2721. movss %xmm4, %xmm10
  2722. shufps $0x39, %xmm10, %xmm10
  2723. pshufd $0xb1, %xmm10, %xmm11
  2724. mulps %xmm12, %xmm10
  2725. addps %xmm10, %xmm3
  2726. mulps %xmm13, %xmm5
  2727. SUBPS %xmm5, %xmm0
  2728. mulps %xmm13, %xmm7
  2729. SUBPS %xmm7, %xmm1
  2730. mulps %xmm13, %xmm9
  2731. SUBPS %xmm9, %xmm2
  2732. mulps %xmm13, %xmm11
  2733. SUBPS %xmm11, %xmm3
  2734. movaps %xmm0, -32 * SIZE(Y1)
  2735. movaps %xmm1, -28 * SIZE(Y1)
  2736. movaps %xmm2, -24 * SIZE(Y1)
  2737. movaps %xmm3, -20 * SIZE(Y1)
  2738. movaps -16 * SIZE(Y1), %xmm0
  2739. movaps -12 * SIZE(Y1), %xmm1
  2740. movaps -8 * SIZE(Y1), %xmm2
  2741. movaps -4 * SIZE(Y1), %xmm3
  2742. subq $-16 * SIZE, A1
  2743. subq $-16 * SIZE, Y1
  2744. ALIGN_3
  2745. .L215:
  2746. testq $4, MM
  2747. je .L217
  2748. movaps -29 * SIZE(A1), %xmm6
  2749. movaps -25 * SIZE(A1), %xmm8
  2750. movss %xmm6, %xmm4
  2751. shufps $0x39, %xmm4, %xmm4
  2752. pshufd $0xb1, %xmm4, %xmm5
  2753. mulps %xmm12, %xmm4
  2754. addps %xmm4, %xmm0
  2755. movss %xmm8, %xmm6
  2756. shufps $0x39, %xmm6, %xmm6
  2757. pshufd $0xb1, %xmm6, %xmm7
  2758. mulps %xmm12, %xmm6
  2759. addps %xmm6, %xmm1
  2760. mulps %xmm13, %xmm5
  2761. SUBPS %xmm5, %xmm0
  2762. mulps %xmm13, %xmm7
  2763. SUBPS %xmm7, %xmm1
  2764. movaps %xmm0, -32 * SIZE(Y1)
  2765. movaps %xmm1, -28 * SIZE(Y1)
  2766. movaps %xmm2, %xmm0
  2767. movaps %xmm3, %xmm1
  2768. movaps %xmm8, %xmm4
  2769. addq $8 * SIZE, A1
  2770. addq $8 * SIZE, Y1
  2771. ALIGN_3
  2772. .L217:
  2773. testq $2, MM
  2774. je .L218
  2775. movaps -29 * SIZE(A1), %xmm6
  2776. movss %xmm6, %xmm4
  2777. shufps $0x39, %xmm4, %xmm4
  2778. pshufd $0xb1, %xmm4, %xmm5
  2779. mulps %xmm12, %xmm4
  2780. addps %xmm4, %xmm0
  2781. mulps %xmm13, %xmm5
  2782. SUBPS %xmm5, %xmm0
  2783. movaps %xmm0, -32 * SIZE(Y1)
  2784. movaps %xmm1, %xmm0
  2785. addq $4 * SIZE, A1
  2786. addq $4 * SIZE, Y1
  2787. ALIGN_3
  2788. .L218:
  2789. testq $1, MM
  2790. je .L990
  2791. movsd -32 * SIZE(A1), %xmm4
  2792. pshufd $0xb1, %xmm4, %xmm5
  2793. mulps %xmm12, %xmm4
  2794. addps %xmm4, %xmm0
  2795. mulps %xmm13, %xmm5
  2796. SUBPS %xmm5, %xmm0
  2797. movlps %xmm0, -32 * SIZE(Y1)
  2798. jmp .L990
  2799. ALIGN_3
  2800. .L300:
  2801. cmpq $2, N
  2802. jl .L310
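/*
 * As in .L200, A1 appears to sit one float past a 16-byte boundary, but
 * here A2 = A1 + LDA is three floats past one, so its vectors are rebuilt
 * from movaps loads at -35 * SIZE, ... using movss + shufps $0x93 merges.
 */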
  2803. ALIGN_3
  2804. .L301:
  2805. subq $2, N
  2806. leaq 32 * SIZE(BUFFER), Y1
  2807. movq A, A1
  2808. leaq (A, LDA, 1), A2
  2809. leaq (A, LDA, 2), A
  2810. movsd (X), %xmm13
  2811. addq INCX, X
  2812. movsd (X), %xmm15
  2813. addq INCX, X
  2814. #ifdef HAVE_SSE3
  2815. movddup ALPHA, %xmm8
  2816. #else
  2817. movsd ALPHA, %xmm8
  2818. unpcklpd %xmm8, %xmm8
  2819. #endif
  2820. pshufd $0xb1, %xmm8, %xmm9
  2821. pcmpeqb %xmm11, %xmm11
  2822. psllq $63, %xmm11
  2823. pshufd $0x00, %xmm13, %xmm12
  2824. pshufd $0x55, %xmm13, %xmm13
  2825. pshufd $0x00, %xmm15, %xmm14
  2826. pshufd $0x55, %xmm15, %xmm15
  2827. #ifndef XCONJ
  2828. xorps %xmm11, %xmm13
  2829. xorps %xmm11, %xmm15
  2830. #else
  2831. xorps %xmm11, %xmm12
  2832. xorps %xmm11, %xmm14
  2833. #endif
  2834. mulps %xmm8, %xmm12
  2835. mulps %xmm9, %xmm13
  2836. mulps %xmm8, %xmm14
  2837. mulps %xmm9, %xmm15
  2838. #ifndef XCONJ
  2839. subps %xmm13, %xmm12
  2840. subps %xmm15, %xmm14
  2841. #else
  2842. addps %xmm13, %xmm12
  2843. addps %xmm15, %xmm14
  2844. #endif
  2845. pshufd $0x55, %xmm12, %xmm13
  2846. pshufd $0x00, %xmm12, %xmm12
  2847. pshufd $0x55, %xmm14, %xmm15
  2848. pshufd $0x00, %xmm14, %xmm14
  2849. #ifndef CONJ
  2850. xorps %xmm11, %xmm13
  2851. xorps %xmm11, %xmm15
  2852. #else
  2853. xorps %xmm11, %xmm12
  2854. xorps %xmm11, %xmm14
  2855. #endif
  2856. #ifdef ALIGNED_ACCESS
  2857. cmpq M, MM
  2858. je .L30X
  2859. movsd -32 * SIZE(A1), %xmm4
  2860. movsd -32 * SIZE(A2), %xmm6
  2861. movsd -32 * SIZE(Y1), %xmm0
  2862. pshufd $0xb1, %xmm4, %xmm5
  2863. pshufd $0xb1, %xmm6, %xmm7
  2864. mulps %xmm12, %xmm4
  2865. addps %xmm4, %xmm0
  2866. mulps %xmm13, %xmm5
  2867. SUBPS %xmm5, %xmm0
  2868. mulps %xmm14, %xmm6
  2869. addps %xmm6, %xmm0
  2870. mulps %xmm15, %xmm7
  2871. SUBPS %xmm7, %xmm0
  2872. movlps %xmm0, -32 * SIZE(Y1)
  2873. addq $2 * SIZE, A1
  2874. addq $2 * SIZE, A2
  2875. addq $2 * SIZE, Y1
  2876. ALIGN_3
  2877. .L30X:
  2878. #endif
  2879. movaps -33 * SIZE(A1), %xmm4
  2880. movaps -35 * SIZE(A2), %xmm6
  2881. movaps -32 * SIZE(Y1), %xmm0
  2882. movaps -28 * SIZE(Y1), %xmm1
  2883. movaps -24 * SIZE(Y1), %xmm2
  2884. movaps -20 * SIZE(Y1), %xmm3
  2885. movq MM, I
  2886. sarq $3, I
  2887. jle .L305
  2888. movaps -29 * SIZE(A1), %xmm8
  2889. movaps -25 * SIZE(A1), %xmm9
  2890. movaps -21 * SIZE(A1), %xmm10
  2891. decq I
  2892. jle .L304
  2893. ALIGN_3
  2894. .L303:
  2895. #ifdef PREFETCH
  2896. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2)
  2897. #endif
  2898. movss %xmm8, %xmm4
  2899. shufps $0x39, %xmm4, %xmm4
  2900. pshufd $0xb1, %xmm4, %xmm5
  2901. mulps %xmm12, %xmm4
  2902. addps %xmm4, %xmm0
  2903. movaps -17 * SIZE(A1), %xmm4
  2904. movss %xmm9, %xmm8
  2905. shufps $0x39, %xmm8, %xmm8
  2906. pshufd $0xb1, %xmm8, %xmm7
  2907. mulps %xmm12, %xmm8
  2908. addps %xmm8, %xmm1
  2909. movaps -31 * SIZE(A2), %xmm8
  2910. mulps %xmm13, %xmm5
  2911. SUBPS %xmm5, %xmm0
  2912. mulps %xmm13, %xmm7
  2913. SUBPS %xmm7, %xmm1
  2914. movss %xmm10, %xmm9
  2915. shufps $0x39, %xmm9, %xmm9
  2916. pshufd $0xb1, %xmm9, %xmm5
  2917. mulps %xmm12, %xmm9
  2918. addps %xmm9, %xmm2
  2919. movaps -27 * SIZE(A2), %xmm9
  2920. movss %xmm4, %xmm10
  2921. shufps $0x39, %xmm10, %xmm10
  2922. pshufd $0xb1, %xmm10, %xmm7
  2923. mulps %xmm12, %xmm10
  2924. addps %xmm10, %xmm3
  2925. movaps -23 * SIZE(A2), %xmm10
  2926. mulps %xmm13, %xmm5
  2927. SUBPS %xmm5, %xmm2
  2928. mulps %xmm13, %xmm7
  2929. SUBPS %xmm7, %xmm3
  2930. #ifdef PREFETCH
  2931. PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1)
  2932. #endif
  2933. movss %xmm8, %xmm6
  2934. shufps $0x93, %xmm8, %xmm6
  2935. pshufd $0xb1, %xmm6, %xmm5
  2936. mulps %xmm14, %xmm6
  2937. addps %xmm6, %xmm0
  2938. movaps -19 * SIZE(A2), %xmm6
  2939. movss %xmm9, %xmm8
  2940. shufps $0x93, %xmm9, %xmm8
  2941. pshufd $0xb1, %xmm8, %xmm7
  2942. mulps %xmm14, %xmm8
  2943. addps %xmm8, %xmm1
  2944. movaps -13 * SIZE(A1), %xmm8
  2945. mulps %xmm15, %xmm5
  2946. SUBPS %xmm5, %xmm0
  2947. mulps %xmm15, %xmm7
  2948. SUBPS %xmm7, %xmm1
  2949. movss %xmm10, %xmm9
  2950. shufps $0x93, %xmm10, %xmm9
  2951. pshufd $0xb1, %xmm9, %xmm5
  2952. mulps %xmm14, %xmm9
  2953. addps %xmm9, %xmm2
  2954. movaps -9 * SIZE(A1), %xmm9
  2955. movss %xmm6, %xmm10
  2956. shufps $0x93, %xmm6, %xmm10
  2957. pshufd $0xb1, %xmm10, %xmm7
  2958. mulps %xmm14, %xmm10
  2959. addps %xmm10, %xmm3
  2960. movaps -5 * SIZE(A1), %xmm10
  2961. mulps %xmm15, %xmm5
  2962. SUBPS %xmm5, %xmm2
  2963. mulps %xmm15, %xmm7
  2964. SUBPS %xmm7, %xmm3
  2965. #ifdef PREFETCHW
  2966. PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1)
  2967. #endif
  2968. movaps %xmm0, -32 * SIZE(Y1)
  2969. movaps %xmm1, -28 * SIZE(Y1)
  2970. movaps %xmm2, -24 * SIZE(Y1)
  2971. movaps %xmm3, -20 * SIZE(Y1)
  2972. movaps -16 * SIZE(Y1), %xmm0
  2973. movaps -12 * SIZE(Y1), %xmm1
  2974. movaps -8 * SIZE(Y1), %xmm2
  2975. movaps -4 * SIZE(Y1), %xmm3
  2976. subq $-16 * SIZE, A1
  2977. subq $-16 * SIZE, A2
  2978. subq $-16 * SIZE, Y1
  2979. subq $1, I
  2980. BRANCH
  2981. jg .L303
  2982. ALIGN_3
  2983. .L304:
  2984. movss %xmm8, %xmm4
  2985. shufps $0x39, %xmm4, %xmm4
  2986. pshufd $0xb1, %xmm4, %xmm5
  2987. mulps %xmm12, %xmm4
  2988. addps %xmm4, %xmm0
  2989. movaps -17 * SIZE(A1), %xmm4
  2990. movss %xmm9, %xmm8
  2991. shufps $0x39, %xmm8, %xmm8
  2992. pshufd $0xb1, %xmm8, %xmm7
  2993. mulps %xmm12, %xmm8
  2994. addps %xmm8, %xmm1
  2995. movaps -31 * SIZE(A2), %xmm8
  2996. mulps %xmm13, %xmm5
  2997. SUBPS %xmm5, %xmm0
  2998. mulps %xmm13, %xmm7
  2999. SUBPS %xmm7, %xmm1
  3000. movss %xmm10, %xmm9
  3001. shufps $0x39, %xmm9, %xmm9
  3002. pshufd $0xb1, %xmm9, %xmm5
  3003. mulps %xmm12, %xmm9
  3004. addps %xmm9, %xmm2
  3005. movaps -27 * SIZE(A2), %xmm9
  3006. movss %xmm4, %xmm10
  3007. shufps $0x39, %xmm10, %xmm10
  3008. pshufd $0xb1, %xmm10, %xmm7
  3009. mulps %xmm12, %xmm10
  3010. addps %xmm10, %xmm3
  3011. movaps -23 * SIZE(A2), %xmm10
  3012. mulps %xmm13, %xmm5
  3013. SUBPS %xmm5, %xmm2
  3014. mulps %xmm13, %xmm7
  3015. SUBPS %xmm7, %xmm3
  3016. movss %xmm8, %xmm6
  3017. shufps $0x93, %xmm8, %xmm6
  3018. pshufd $0xb1, %xmm6, %xmm5
  3019. mulps %xmm14, %xmm6
  3020. addps %xmm6, %xmm0
  3021. movaps -19 * SIZE(A2), %xmm6
  3022. movss %xmm9, %xmm8
  3023. shufps $0x93, %xmm9, %xmm8
  3024. pshufd $0xb1, %xmm8, %xmm7
  3025. mulps %xmm14, %xmm8
  3026. addps %xmm8, %xmm1
  3027. mulps %xmm15, %xmm5
  3028. SUBPS %xmm5, %xmm0
  3029. mulps %xmm15, %xmm7
  3030. SUBPS %xmm7, %xmm1
  3031. movss %xmm10, %xmm9
  3032. shufps $0x93, %xmm10, %xmm9
  3033. pshufd $0xb1, %xmm9, %xmm5
  3034. mulps %xmm14, %xmm9
  3035. addps %xmm9, %xmm2
  3036. movss %xmm6, %xmm10
  3037. shufps $0x93, %xmm6, %xmm10
  3038. pshufd $0xb1, %xmm10, %xmm7
  3039. mulps %xmm14, %xmm10
  3040. addps %xmm10, %xmm3
  3041. mulps %xmm15, %xmm5
  3042. SUBPS %xmm5, %xmm2
  3043. mulps %xmm15, %xmm7
  3044. SUBPS %xmm7, %xmm3
  3045. movaps %xmm0, -32 * SIZE(Y1)
  3046. movaps %xmm1, -28 * SIZE(Y1)
  3047. movaps %xmm2, -24 * SIZE(Y1)
  3048. movaps %xmm3, -20 * SIZE(Y1)
  3049. movaps -16 * SIZE(Y1), %xmm0
  3050. movaps -12 * SIZE(Y1), %xmm1
  3051. movaps -8 * SIZE(Y1), %xmm2
  3052. movaps -4 * SIZE(Y1), %xmm3
  3053. subq $-16 * SIZE, A1
  3054. subq $-16 * SIZE, A2
  3055. subq $-16 * SIZE, Y1
  3056. ALIGN_3
  3057. .L305:
  3058. testq $4, MM
  3059. je .L307
  3060. movaps -29 * SIZE(A1), %xmm8
  3061. movaps -25 * SIZE(A1), %xmm9
  3062. movaps -31 * SIZE(A2), %xmm10
  3063. movaps -27 * SIZE(A2), %xmm11
  3064. movss %xmm8, %xmm4
  3065. shufps $0x39, %xmm4, %xmm4
  3066. pshufd $0xb1, %xmm4, %xmm5
  3067. mulps %xmm12, %xmm4
  3068. addps %xmm4, %xmm0
  3069. movss %xmm9, %xmm8
  3070. shufps $0x39, %xmm8, %xmm8
  3071. pshufd $0xb1, %xmm8, %xmm7
  3072. mulps %xmm12, %xmm8
  3073. addps %xmm8, %xmm1
  3074. mulps %xmm13, %xmm5
  3075. SUBPS %xmm5, %xmm0
  3076. mulps %xmm13, %xmm7
  3077. SUBPS %xmm7, %xmm1
  3078. movss %xmm10, %xmm6
  3079. shufps $0x93, %xmm10, %xmm6
  3080. pshufd $0xb1, %xmm6, %xmm5
  3081. mulps %xmm14, %xmm6
  3082. addps %xmm6, %xmm0
  3083. movss %xmm11, %xmm10
  3084. shufps $0x93, %xmm11, %xmm10
  3085. pshufd $0xb1, %xmm10, %xmm7
  3086. mulps %xmm14, %xmm10
  3087. addps %xmm10, %xmm1
  3088. mulps %xmm15, %xmm5
  3089. SUBPS %xmm5, %xmm0
  3090. mulps %xmm15, %xmm7
  3091. SUBPS %xmm7, %xmm1
  3092. movaps %xmm0, -32 * SIZE(Y1)
  3093. movaps %xmm1, -28 * SIZE(Y1)
  3094. movaps %xmm9, %xmm4
  3095. movaps %xmm11, %xmm6
  3096. movaps %xmm2, %xmm0
  3097. movaps %xmm3, %xmm1
  3098. addq $8 * SIZE, A1
  3099. addq $8 * SIZE, A2
  3100. addq $8 * SIZE, Y1
  3101. ALIGN_3
  3102. .L307:
  3103. testq $2, MM
  3104. je .L308
  3105. movaps -29 * SIZE(A1), %xmm8
  3106. movaps -31 * SIZE(A2), %xmm9
  3107. movss %xmm8, %xmm4
  3108. shufps $0x39, %xmm4, %xmm4
  3109. pshufd $0xb1, %xmm4, %xmm5
  3110. mulps %xmm12, %xmm4
  3111. addps %xmm4, %xmm0
  3112. movss %xmm9, %xmm6
  3113. shufps $0x93, %xmm9, %xmm6
  3114. pshufd $0xb1, %xmm6, %xmm7
  3115. mulps %xmm13, %xmm5
  3116. SUBPS %xmm5, %xmm0
  3117. mulps %xmm14, %xmm6
  3118. addps %xmm6, %xmm0
  3119. mulps %xmm15, %xmm7
  3120. SUBPS %xmm7, %xmm0
  3121. movaps %xmm0, -32 * SIZE(Y1)
  3122. movaps %xmm1, %xmm0
  3123. addq $4 * SIZE, A1
  3124. addq $4 * SIZE, A2
  3125. addq $4 * SIZE, Y1
  3126. ALIGN_3
  3127. .L308:
  3128. testq $1, MM
  3129. je .L309
  3130. movsd -32 * SIZE(A1), %xmm4
  3131. movsd -32 * SIZE(A2), %xmm6
  3132. pshufd $0xb1, %xmm4, %xmm5
  3133. pshufd $0xb1, %xmm6, %xmm7
  3134. mulps %xmm12, %xmm4
  3135. addps %xmm4, %xmm0
  3136. mulps %xmm13, %xmm5
  3137. SUBPS %xmm5, %xmm0
  3138. mulps %xmm14, %xmm6
  3139. addps %xmm6, %xmm0
  3140. mulps %xmm15, %xmm7
  3141. SUBPS %xmm7, %xmm0
  3142. movlps %xmm0, -32 * SIZE(Y1)
  3143. ALIGN_3
  3144. .L309:
  3145. cmpq $2, N
  3146. jge .L301
  3147. ALIGN_3
  3148. .L310:
  3149. cmpq $1, N
  3150. jl .L990
  3151. leaq 32 * SIZE(BUFFER), Y1
  3152. movq A, A1
  3153. movsd (X), %xmm13
  3154. addq INCX, X
  3155. #ifdef HAVE_SSE3
  3156. movddup ALPHA, %xmm8
  3157. #else
  3158. movsd ALPHA, %xmm8
  3159. unpcklpd %xmm8, %xmm8
  3160. #endif
  3161. pshufd $0xb1, %xmm8, %xmm9
  3162. pcmpeqb %xmm11, %xmm11
  3163. psllq $63, %xmm11
  3164. pshufd $0x00, %xmm13, %xmm12
  3165. pshufd $0x55, %xmm13, %xmm13
  3166. #ifndef XCONJ
  3167. xorps %xmm11, %xmm13
  3168. #else
  3169. xorps %xmm11, %xmm12
  3170. #endif
  3171. mulps %xmm8, %xmm12
  3172. mulps %xmm9, %xmm13
  3173. #ifndef XCONJ
  3174. subps %xmm13, %xmm12
  3175. #else
  3176. addps %xmm13, %xmm12
  3177. #endif
  3178. pshufd $0x55, %xmm12, %xmm13
  3179. pshufd $0x00, %xmm12, %xmm12
  3180. #ifndef CONJ
  3181. xorps %xmm11, %xmm13
  3182. #else
  3183. xorps %xmm11, %xmm12
  3184. #endif
  3185. #ifdef ALIGNED_ACCESS
  3186. cmpq M, MM
  3187. je .L31X
  3188. movsd -32 * SIZE(A1), %xmm4
  3189. movsd -32 * SIZE(Y1), %xmm0
  3190. pshufd $0xb1, %xmm4, %xmm5
  3191. mulps %xmm12, %xmm4
  3192. addps %xmm4, %xmm0
  3193. mulps %xmm13, %xmm5
  3194. SUBPS %xmm5, %xmm0
  3195. movlps %xmm0, -32 * SIZE(Y1)
  3196. addq $2 * SIZE, A1
  3197. addq $2 * SIZE, Y1
  3198. ALIGN_3
  3199. .L31X:
  3200. #endif
  3201. movaps -33 * SIZE(A1), %xmm4
  3202. movaps -32 * SIZE(Y1), %xmm0
  3203. movaps -28 * SIZE(Y1), %xmm1
  3204. movaps -24 * SIZE(Y1), %xmm2
  3205. movaps -20 * SIZE(Y1), %xmm3
  3206. movq MM, I
  3207. sarq $3, I
  3208. jle .L315
  3209. movaps -29 * SIZE(A1), %xmm6
  3210. movaps -25 * SIZE(A1), %xmm8
  3211. movaps -21 * SIZE(A1), %xmm10
  3212. decq I
  3213. jle .L314
  3214. ALIGN_3
  3215. .L313:
  3216. #ifdef PREFETCH
  3217. PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1)
  3218. #endif
  3219. movss %xmm6, %xmm4
  3220. shufps $0x39, %xmm4, %xmm4
  3221. pshufd $0xb1, %xmm4, %xmm5
  3222. mulps %xmm12, %xmm4
  3223. addps %xmm4, %xmm0
  3224. movaps -17 * SIZE(A1), %xmm4
  3225. movss %xmm8, %xmm6
  3226. shufps $0x39, %xmm6, %xmm6
  3227. pshufd $0xb1, %xmm6, %xmm7
  3228. mulps %xmm12, %xmm6
  3229. addps %xmm6, %xmm1
  3230. movaps -13 * SIZE(A1), %xmm6
  3231. movss %xmm10, %xmm8
  3232. shufps $0x39, %xmm8, %xmm8
  3233. pshufd $0xb1, %xmm8, %xmm9
  3234. mulps %xmm12, %xmm8
  3235. addps %xmm8, %xmm2
  3236. movaps -9 * SIZE(A1), %xmm8
  3237. movss %xmm4, %xmm10
  3238. shufps $0x39, %xmm10, %xmm10
  3239. pshufd $0xb1, %xmm10, %xmm11
  3240. mulps %xmm12, %xmm10
  3241. addps %xmm10, %xmm3
  3242. movaps -5 * SIZE(A1), %xmm10
  3243. mulps %xmm13, %xmm5
  3244. SUBPS %xmm5, %xmm0
  3245. mulps %xmm13, %xmm7
  3246. SUBPS %xmm7, %xmm1
  3247. mulps %xmm13, %xmm9
  3248. SUBPS %xmm9, %xmm2
  3249. mulps %xmm13, %xmm11
  3250. SUBPS %xmm11, %xmm3
  3251. #ifdef PREFETCHW
  3252. PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1)
  3253. #endif
  3254. movaps %xmm0, -32 * SIZE(Y1)
  3255. movaps %xmm1, -28 * SIZE(Y1)
  3256. movaps %xmm2, -24 * SIZE(Y1)
  3257. movaps %xmm3, -20 * SIZE(Y1)
  3258. movaps -16 * SIZE(Y1), %xmm0
  3259. movaps -12 * SIZE(Y1), %xmm1
  3260. movaps -8 * SIZE(Y1), %xmm2
  3261. movaps -4 * SIZE(Y1), %xmm3
  3262. subq $-16 * SIZE, A1
  3263. subq $-16 * SIZE, A2
  3264. subq $-16 * SIZE, Y1
  3265. subq $1, I
  3266. BRANCH
  3267. jg .L313
  3268. ALIGN_3
  3269. .L314:
  3270. movss %xmm6, %xmm4
  3271. shufps $0x39, %xmm4, %xmm4
  3272. pshufd $0xb1, %xmm4, %xmm5
  3273. mulps %xmm12, %xmm4
  3274. addps %xmm4, %xmm0
  3275. movaps -17 * SIZE(A1), %xmm4
  3276. movss %xmm8, %xmm6
  3277. shufps $0x39, %xmm6, %xmm6
  3278. pshufd $0xb1, %xmm6, %xmm7
  3279. mulps %xmm12, %xmm6
  3280. addps %xmm6, %xmm1
  3281. movss %xmm10, %xmm8
  3282. shufps $0x39, %xmm8, %xmm8
  3283. pshufd $0xb1, %xmm8, %xmm9
  3284. mulps %xmm12, %xmm8
  3285. addps %xmm8, %xmm2
  3286. movss %xmm4, %xmm10
  3287. shufps $0x39, %xmm10, %xmm10
  3288. pshufd $0xb1, %xmm10, %xmm11
  3289. mulps %xmm12, %xmm10
  3290. addps %xmm10, %xmm3
  3291. mulps %xmm13, %xmm5
  3292. SUBPS %xmm5, %xmm0
  3293. mulps %xmm13, %xmm7
  3294. SUBPS %xmm7, %xmm1
  3295. mulps %xmm13, %xmm9
  3296. SUBPS %xmm9, %xmm2
  3297. mulps %xmm13, %xmm11
  3298. SUBPS %xmm11, %xmm3
  3299. movaps %xmm0, -32 * SIZE(Y1)
  3300. movaps %xmm1, -28 * SIZE(Y1)
  3301. movaps %xmm2, -24 * SIZE(Y1)
  3302. movaps %xmm3, -20 * SIZE(Y1)
  3303. movaps -16 * SIZE(Y1), %xmm0
  3304. movaps -12 * SIZE(Y1), %xmm1
  3305. movaps -8 * SIZE(Y1), %xmm2
  3306. movaps -4 * SIZE(Y1), %xmm3
  3307. subq $-16 * SIZE, A1
  3308. subq $-16 * SIZE, Y1
  3309. ALIGN_3
  3310. .L315:
  3311. testq $4, MM
  3312. je .L317
  3313. movaps -29 * SIZE(A1), %xmm6
  3314. movaps -25 * SIZE(A1), %xmm8
  3315. movss %xmm6, %xmm4
  3316. shufps $0x39, %xmm4, %xmm4
  3317. pshufd $0xb1, %xmm4, %xmm5
  3318. mulps %xmm12, %xmm4
  3319. addps %xmm4, %xmm0
  3320. movss %xmm8, %xmm6
  3321. shufps $0x39, %xmm6, %xmm6
  3322. pshufd $0xb1, %xmm6, %xmm7
  3323. mulps %xmm12, %xmm6
  3324. addps %xmm6, %xmm1
  3325. mulps %xmm13, %xmm5
  3326. SUBPS %xmm5, %xmm0
  3327. mulps %xmm13, %xmm7
  3328. SUBPS %xmm7, %xmm1
  3329. movaps %xmm0, -32 * SIZE(Y1)
  3330. movaps %xmm1, -28 * SIZE(Y1)
  3331. movaps %xmm2, %xmm0
  3332. movaps %xmm3, %xmm1
  3333. movaps %xmm8, %xmm4
  3334. addq $8 * SIZE, A1
  3335. addq $8 * SIZE, Y1
  3336. ALIGN_3
  3337. .L317:
  3338. testq $2, MM
  3339. je .L318
  3340. movaps -29 * SIZE(A1), %xmm6
  3341. movss %xmm6, %xmm4
  3342. shufps $0x39, %xmm4, %xmm4
  3343. pshufd $0xb1, %xmm4, %xmm5
  3344. mulps %xmm12, %xmm4
  3345. addps %xmm4, %xmm0
  3346. mulps %xmm13, %xmm5
  3347. SUBPS %xmm5, %xmm0
  3348. movaps %xmm0, -32 * SIZE(Y1)
  3349. movaps %xmm1, %xmm0
  3350. addq $4 * SIZE, A1
  3351. addq $4 * SIZE, Y1
  3352. ALIGN_3
  3353. .L318:
  3354. testq $1, MM
  3355. je .L990
  3356. movsd -32 * SIZE(A1), %xmm4
  3357. pshufd $0xb1, %xmm4, %xmm5
  3358. mulps %xmm12, %xmm4
  3359. addps %xmm4, %xmm0
  3360. mulps %xmm13, %xmm5
  3361. SUBPS %xmm5, %xmm0
  3362. movlps %xmm0, -32 * SIZE(Y1)
  3363. #endif
  3364. ALIGN_3
  3365. .L990:
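/*
 * Write-back: add the contiguously accumulated BUFFER to the user's y,
 * walking y with stride INCY.  A possible leading element (ALIGNED_ACCESS
 * adjustment) is handled first, then blocks of eight, four, two and one
 * complex elements.
 */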
  3366. movq Y, Y1
  3367. #ifdef ALIGNED_ACCESS
  3368. cmpq M, MM
  3369. je .L991
  3370. movsd (Y), %xmm0
  3371. addq INCY, Y
  3372. movsd (BUFFER), %xmm1
  3373. addq $2 * SIZE, BUFFER
  3374. addps %xmm1, %xmm0
  3375. movlps %xmm0, (Y1)
  3376. addq INCY, Y1
  3377. ALIGN_3
  3378. .L991:
  3379. #endif
  3380. movq MM, %rax
  3381. sarq $3, %rax
  3382. jle .L994
  3383. ALIGN_3
  3384. .L992:
  3385. movsd (Y), %xmm0
  3386. addq INCY, Y
  3387. movhps (Y), %xmm0
  3388. addq INCY, Y
  3389. movsd (Y), %xmm1
  3390. addq INCY, Y
  3391. movhps (Y), %xmm1
  3392. addq INCY, Y
  3393. movsd (Y), %xmm2
  3394. addq INCY, Y
  3395. movhps (Y), %xmm2
  3396. addq INCY, Y
  3397. movsd (Y), %xmm3
  3398. addq INCY, Y
  3399. movhps (Y), %xmm3
  3400. addq INCY, Y
  3401. addps 0 * SIZE(BUFFER), %xmm0
  3402. addps 4 * SIZE(BUFFER), %xmm1
  3403. addps 8 * SIZE(BUFFER), %xmm2
  3404. addps 12 * SIZE(BUFFER), %xmm3
  3405. movlps %xmm0, (Y1)
  3406. addq INCY, Y1
  3407. movhps %xmm0, (Y1)
  3408. addq INCY, Y1
  3409. movlps %xmm1, (Y1)
  3410. addq INCY, Y1
  3411. movhps %xmm1, (Y1)
  3412. addq INCY, Y1
  3413. movlps %xmm2, (Y1)
  3414. addq INCY, Y1
  3415. movhps %xmm2, (Y1)
  3416. addq INCY, Y1
  3417. movlps %xmm3, (Y1)
  3418. addq INCY, Y1
  3419. movhps %xmm3, (Y1)
  3420. addq INCY, Y1
  3421. addq $16 * SIZE, BUFFER
  3422. decq %rax
  3423. jg .L992
  3424. ALIGN_3
  3425. .L994:
  3426. testq $7, MM
  3427. jle .L999
  3428. testq $4, MM
  3429. jle .L995
  3430. movsd (Y), %xmm0
  3431. addq INCY, Y
  3432. movhps (Y), %xmm0
  3433. addq INCY, Y
  3434. movsd (Y), %xmm1
  3435. addq INCY, Y
  3436. movhps (Y), %xmm1
  3437. addq INCY, Y
  3438. addps 0 * SIZE(BUFFER), %xmm0
  3439. addps 4 * SIZE(BUFFER), %xmm1
  3440. movlps %xmm0, (Y1)
  3441. addq INCY, Y1
  3442. movhps %xmm0, (Y1)
  3443. addq INCY, Y1
  3444. movlps %xmm1, (Y1)
  3445. addq INCY, Y1
  3446. movhps %xmm1, (Y1)
  3447. addq INCY, Y1
  3448. addq $8 * SIZE, BUFFER
  3449. ALIGN_3
  3450. .L995:
  3451. testq $2, MM
  3452. jle .L996
  3453. movsd (Y), %xmm0
  3454. addq INCY, Y
  3455. movhps (Y), %xmm0
  3456. addq INCY, Y
  3457. addps 0 * SIZE(BUFFER), %xmm0
  3458. movlps %xmm0, (Y1)
  3459. addq INCY, Y1
  3460. movhps %xmm0, (Y1)
  3461. addq INCY, Y1
  3462. addq $4 * SIZE, BUFFER
  3463. ALIGN_3
  3464. .L996:
  3465. testq $1, MM
  3466. jle .L999
  3467. movsd (Y), %xmm0
  3468. addps 0 * SIZE(BUFFER), %xmm0
  3469. movlps %xmm0, (Y1)
  3470. ALIGN_3
  3471. .L999:
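/* Advance the saved matrix pointer AA by M complex elements and take
 * another pass through the kernel (back to .L0t). */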
  3472. movq M, I
  3473. salq $ZBASE_SHIFT,I
  3474. addq I,AA
  3475. jmp .L0t
  3476. .L999x:
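/*
 * Epilogue: restore the callee-saved general purpose registers (plus rdi,
 * rsi and xmm6-xmm15 under WINDOWS_ABI), release the stack frame and
 * return.
 */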
  3477. movq 0(%rsp), %rbx
  3478. movq 8(%rsp), %rbp
  3479. movq 16(%rsp), %r12
  3480. movq 24(%rsp), %r13
  3481. movq 32(%rsp), %r14
  3482. movq 40(%rsp), %r15
  3483. #ifdef WINDOWS_ABI
  3484. movq 48(%rsp), %rdi
  3485. movq 56(%rsp), %rsi
  3486. movups 64(%rsp), %xmm6
  3487. movups 80(%rsp), %xmm7
  3488. movups 96(%rsp), %xmm8
  3489. movups 112(%rsp), %xmm9
  3490. movups 128(%rsp), %xmm10
  3491. movups 144(%rsp), %xmm11
  3492. movups 160(%rsp), %xmm12
  3493. movups 176(%rsp), %xmm13
  3494. movups 192(%rsp), %xmm14
  3495. movups 208(%rsp), %xmm15
  3496. #endif
  3497. addq $STACKSIZE, %rsp
  3498. ret
  3499. EPILOGUE
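/*
 * For reference only, a scalar sketch of the operation implemented above:
 * y := y + alpha * A * x for single-precision complex, column by column.
 * The function name and argument layout below are illustrative and do not
 * match the real OpenBLAS entry point exactly; the CONJ/XCONJ build
 * variants would additionally conjugate a[] or x[], which the sketch
 * omits (plain non-conjugated case only).
 */
#if 0
#include <complex.h>

static void cgemv_n_sketch(long m, long n, float complex alpha,
                           const float complex *a, long lda,
                           const float complex *x, long incx,
                           float complex *y, long incy)
{
    for (long j = 0; j < n; j++) {
        /* per-column scalar, kept broadcast in xmm12/xmm13 above */
        float complex t = alpha * x[j * incx];
        for (long i = 0; i < m; i++)
            /* complex axpy into y, the buffered accumulation done above */
            y[i * incy] += t * a[i + j * lda];
    }
}
#endif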