You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

sgemm_kernel_16x4_skylakex.S 134 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811
  1. /*********************************************************************************
  2. Copyright (c) 2013, The OpenBLAS Project
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are
  6. met:
  7. 1. Redistributions of source code must retain the above copyright
  8. notice, this list of conditions and the following disclaimer.
  9. 2. Redistributions in binary form must reproduce the above copyright
  10. notice, this list of conditions and the following disclaimer in
  11. the documentation and/or other materials provided with the
  12. distribution.
  13. 3. Neither the name of the OpenBLAS project nor the names of
  14. its contributors may be used to endorse or promote products
  15. derived from this software without specific prior written permission.
  16. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19. ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
  20. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
  25. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. **********************************************************************************/
  27. /*********************************************************************
  28. * 2014/07/28 Saar
  29. * BLASTEST : OK
  30. * CTEST : OK
  31. * TEST : OK
  32. *
  33. * 2013/10/28 Saar
  34. * Parameter:
  35. * SGEMM_DEFAULT_UNROLL_N 4
  36. * SGEMM_DEFAULT_UNROLL_M 16
  37. * SGEMM_DEFAULT_P 768
  38. * SGEMM_DEFAULT_Q 384
  39. * A_PR1 512
  40. * B_PR1 512
  41. *
  42. *
  43. * 2014/07/28 Saar
  44. * Performance at 9216x9216x9216:
  45. * 1 thread: 102 GFLOPS (SANDYBRIDGE: 59) (MKL: 83)
  46. * 2 threads: 195 GFLOPS (SANDYBRIDGE: 116) (MKL: 155)
  47. * 3 threads: 281 GFLOPS (SANDYBRIDGE: 165) (MKL: 230)
  48. * 4 threads: 366 GFLOPS (SANDYBRIDGE: 223) (MKL: 267)
  49. *
  50. *********************************************************************/
  51. #define ASSEMBLER
  52. #include "common.h"
/* Register role map.  On SysV AMD64 the first integer args arrive in
   rdi, rsi, rdx, rcx, r8, r9 -- presumably m, n, k, a, b, c for sgemm;
   TODO confirm against the kernel prototype in common.h. */
  53. #define OLD_M %rdi
  54. #define OLD_N %rsi
  55. #define M %r13
  56. #define J %r14
  57. #define OLD_K %rdx
  58. #define A %rcx
  59. #define B %r8
  60. #define C %r9
  61. #define LDC %r10
  62. #define I %r11
/* AO/BO/BO1 reuse the argument registers rdi/rsi after the incoming
   values have been copied elsewhere.  Note the deliberate aliases:
   BI and BO2 are both %rbp, CO2 is %rdx (= OLD_K), BO1 is %rdi
   (= AO) -- aliased names must never be live simultaneously. */
  63. #define AO %rdi
  64. #define BO %rsi
  65. #define CO1 %r15
  66. #define K %r12
  67. #define BI %rbp
  68. #define BO2 %rbp
  69. #define SP %rbx
  70. #define BO1 %rdi
  71. #define CO2 %rdx
/* On Windows x64 only the first four args come in registers; the
   remaining args are read from the caller's stack at these offsets
   (past our STACKSIZE frame).  Unused on SysV. */
  72. #ifndef WINDOWS_ABI
  73. #define STACKSIZE 96
  74. #else
  75. #define STACKSIZE 256
  76. #define OLD_A 40 + STACKSIZE(%rsp)
  77. #define OLD_B 48 + STACKSIZE(%rsp)
  78. #define OLD_C 56 + STACKSIZE(%rsp)
  79. #define OLD_LDC 64 + STACKSIZE(%rsp)
  80. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  81. #endif
/* Size of the on-stack work buffer (presumably a repacked B panel --
   TODO confirm); kept smaller on Windows. */
  82. #if defined(OS_WINDOWS)
  83. #define L_BUFFER_SIZE 8192
  84. #else
  85. #define L_BUFFER_SIZE 12288
  86. #endif
/* Named scratch slots within the stack frame (rsp-relative). */
  87. #define Ndiv6 24(%rsp)
  88. #define Nmod6 32(%rsp)
  89. #define N 40(%rsp)
  90. #define ALPHA 48(%rsp)
  91. #define OFFSET 56(%rsp)
  92. #define KK 64(%rsp)
  93. #define KKK 72(%rsp)
  94. #define BUFFER1 128(%rsp)
/* Windows stack probing: each 4 KiB page of a large stack allocation
   must be touched in order before use, so emit one dummy store per
   page covered by L_BUFFER_SIZE.  Expands to nothing elsewhere. */
  95. #if defined(OS_WINDOWS)
  96. #if L_BUFFER_SIZE > 16384
  97. #define STACK_TOUCH \
  98. movl $0, 4096 * 4(%rsp);\
  99. movl $0, 4096 * 3(%rsp);\
  100. movl $0, 4096 * 2(%rsp);\
  101. movl $0, 4096 * 1(%rsp);
  102. #elif L_BUFFER_SIZE > 12288
  103. #define STACK_TOUCH \
  104. movl $0, 4096 * 3(%rsp);\
  105. movl $0, 4096 * 2(%rsp);\
  106. movl $0, 4096 * 1(%rsp);
  107. #elif L_BUFFER_SIZE > 8192
  108. #define STACK_TOUCH \
  109. movl $0, 4096 * 2(%rsp);\
  110. movl $0, 4096 * 1(%rsp);
  111. #elif L_BUFFER_SIZE > 4096
  112. #define STACK_TOUCH \
  113. movl $0, 4096 * 1(%rsp);
  114. #else
  115. #define STACK_TOUCH
  116. #endif
  117. #else
  118. #define STACK_TOUCH
  119. #endif
/* FMA wrappers: VFMADD231PS_(acc,b,a) accumulates acc += b*a.
   Bulldozer uses the 4-operand FMA4 encoding (vfmaddps); all other
   targets use the 3-operand FMA3 vfmadd231 form. */
  120. #if defined(BULLDOZER)
  121. #define VFMADD231PS_( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
  122. #define VFMADD231SS_( x0,x1,x2 ) vfmaddss x0,x1,x2,x0
  123. #else
  124. #define VFMADD231PS_( y0,y1,y2 ) vfmadd231ps y1,y2,y0
  125. #define VFMADD231SS_( x0,x1,x2 ) vfmadd231ss x1,x2,x0
  126. #endif
/* Prefetch distances in bytes for the A and B streams. */
  127. #define A_PR1 512
  128. #define B_PR1 512
  129. /*******************************************************************************************
  130. * 6 lines of N
  131. *******************************************************************************************/
  # One k-iteration of the 16x6 SGEMM micro-kernel.
  # Loads a 16-float column of A into %zmm0, broadcasts six consecutive
  # B values, and accumulates six rank-1 updates into the accumulators
  # %zmm4, %zmm6, %zmm8, %zmm10, %zmm12, %zmm14 (one per B column).
  # VFMADD231PS_(acc,b,a) is acc += b*a (FMA3/FMA4 form chosen by the
  # defines earlier in this file).  Memory offsets are in elements,
  # scaled by SIZE (element size, defined in common.h).
  # Side effects: BO += 6*SIZE, AO += 16*SIZE, %rax decremented
  # (the flags from decq drive the caller's loop branch).
  # Clobbers: %zmm0, %zmm2, %zmm3, flags.
  # NOTE(review): the filename and the header comment advertise a 16x4
  # kernel (SGEMM_DEFAULT_UNROLL_N 4), but this macro performs a 16x6
  # update -- confirm the intended unroll shape.
  132. .macro KERNEL16x6_SUB
  133. vmovups -16 * SIZE(AO), %zmm0          # a[0..15] (unaligned load)
  134. vbroadcastss -4 * SIZE(BO), %zmm2      # splat b[0]
  135. vbroadcastss -3 * SIZE(BO), %zmm3      # splat b[1]
  136. # prefetcht0 A_PR1(AO)
  137. VFMADD231PS_( %zmm4,%zmm2,%zmm0 )      # acc0 += b[0] * a
  138. VFMADD231PS_( %zmm6,%zmm3,%zmm0 )      # acc1 += b[1] * a
  139. vbroadcastss -2 * SIZE(BO), %zmm2      # splat b[2] (reuses %zmm2)
  140. vbroadcastss -1 * SIZE(BO), %zmm3      # splat b[3] (reuses %zmm3)
  141. VFMADD231PS_( %zmm8,%zmm2,%zmm0 )      # acc2 += b[2] * a
  142. VFMADD231PS_( %zmm10,%zmm3,%zmm0 )     # acc3 += b[3] * a
  143. vbroadcastss 0 * SIZE(BO), %zmm2       # splat b[4]
  144. vbroadcastss 1 * SIZE(BO), %zmm3       # splat b[5]
  145. VFMADD231PS_( %zmm12,%zmm2,%zmm0 )     # acc4 += b[4] * a
  146. VFMADD231PS_( %zmm14,%zmm3,%zmm0 )     # acc5 += b[5] * a
  147. addq $ 6*SIZE, BO                      # advance B by 6 elements
  148. addq $ 16*SIZE, AO                     # advance A by 16 elements
  149. decq %rax                              # one k-iteration consumed
  150. .endm
/* Four unrolled k-iterations of the 16x6 inner loop.  Same math as
   KERNEL16x6_SUB repeated four times, but each step broadcasts B into
   a distinct register set (zmm7..31, zmm1, zmm5) instead of reusing
   zmm2/zmm3.  %rax is decremented once per step; callers test the
   flags produced by the final decq. */
.macro KERNEL16x6_SUB4
// k-step 1
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm2
vbroadcastss -3 * SIZE(BO), %zmm3
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
vbroadcastss -2 * SIZE(BO), %zmm7
vbroadcastss -1 * SIZE(BO), %zmm9
VFMADD231PS_( %zmm8,%zmm7,%zmm0 )
VFMADD231PS_( %zmm10,%zmm9,%zmm0 )
vbroadcastss 0 * SIZE(BO), %zmm11
vbroadcastss 1 * SIZE(BO), %zmm13
VFMADD231PS_( %zmm12,%zmm11,%zmm0 )
VFMADD231PS_( %zmm14,%zmm13,%zmm0 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
// k-step 2
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm16
vbroadcastss -3 * SIZE(BO), %zmm17
VFMADD231PS_( %zmm4,%zmm16,%zmm0 )
VFMADD231PS_( %zmm6,%zmm17,%zmm0 )
vbroadcastss -2 * SIZE(BO), %zmm18
vbroadcastss -1 * SIZE(BO), %zmm19
VFMADD231PS_( %zmm8,%zmm18,%zmm0 )
VFMADD231PS_( %zmm10,%zmm19,%zmm0 )
vbroadcastss 0 * SIZE(BO), %zmm20
vbroadcastss 1 * SIZE(BO), %zmm21
VFMADD231PS_( %zmm12,%zmm20,%zmm0 )
VFMADD231PS_( %zmm14,%zmm21,%zmm0 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
// k-step 3
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm22
vbroadcastss -3 * SIZE(BO), %zmm23
VFMADD231PS_( %zmm4,%zmm22,%zmm0 )
VFMADD231PS_( %zmm6,%zmm23,%zmm0 )
vbroadcastss -2 * SIZE(BO), %zmm24
vbroadcastss -1 * SIZE(BO), %zmm25
VFMADD231PS_( %zmm8,%zmm24,%zmm0 )
VFMADD231PS_( %zmm10,%zmm25,%zmm0 )
vbroadcastss 0 * SIZE(BO), %zmm26
vbroadcastss 1 * SIZE(BO), %zmm27
VFMADD231PS_( %zmm12,%zmm26,%zmm0 )
VFMADD231PS_( %zmm14,%zmm27,%zmm0 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
// k-step 4
vmovups -16 * SIZE(AO), %zmm0
vbroadcastss -4 * SIZE(BO), %zmm28
vbroadcastss -3 * SIZE(BO), %zmm29
VFMADD231PS_( %zmm4,%zmm28,%zmm0 )
VFMADD231PS_( %zmm6,%zmm29,%zmm0 )
vbroadcastss -2 * SIZE(BO), %zmm30
vbroadcastss -1 * SIZE(BO), %zmm31
VFMADD231PS_( %zmm8,%zmm30,%zmm0 )
VFMADD231PS_( %zmm10,%zmm31,%zmm0 )
vbroadcastss 0 * SIZE(BO), %zmm1
vbroadcastss 1 * SIZE(BO), %zmm5
VFMADD231PS_( %zmm12,%zmm1,%zmm0 )
VFMADD231PS_( %zmm14,%zmm5,%zmm0 )
addq $ 6*SIZE, BO
addq $ 16*SIZE, AO
decq %rax
.endm
/* Store the 16x6 tile: C = alpha * acc, plus the old C values unless
   building the TRMM kernel.  CO1 addresses columns 0..2 and CO2
   columns 3..5, each stepped by LDC. */
.macro SAVE16x6
vbroadcastss ALPHA, %zmm0
vmulps %zmm0 , %zmm4 , %zmm4
vmulps %zmm0 , %zmm6 , %zmm6
vmulps %zmm0 , %zmm8 , %zmm8
vmulps %zmm0 , %zmm10, %zmm10
vmulps %zmm0 , %zmm12, %zmm12
vmulps %zmm0 , %zmm14, %zmm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %zmm4,%zmm4
vaddps (CO1, LDC), %zmm6,%zmm6
vaddps (CO1, LDC,2), %zmm8,%zmm8
vaddps (CO2), %zmm10,%zmm10
vaddps (CO2, LDC), %zmm12,%zmm12
vaddps (CO2, LDC,2), %zmm14,%zmm14
#endif
vmovups %zmm4 , (CO1)
vmovups %zmm6 , (CO1, LDC)
vmovups %zmm8 , (CO1, LDC,2)
vmovups %zmm10, (CO2)
vmovups %zmm12, (CO2, LDC)
vmovups %zmm14, (CO2, LDC,2)
.endm
  240. /*******************************************************************************************/
/* One k-iteration of the 8x6 inner loop (ymm variant of KERNEL16x6_SUB).
   ymm0 = 8 floats of A; six B scalars accumulate into ymm4/6/8/10/12/14.
   Advances BO by 6 and AO by 8 elements; decq %rax sets the flags the
   callers branch on. */
.macro KERNEL8x6_SUB
vmovups -16 * SIZE(AO), %ymm0
vbroadcastss -4 * SIZE(BO), %ymm2
vbroadcastss -3 * SIZE(BO), %ymm3
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
vbroadcastss -2 * SIZE(BO), %ymm2
vbroadcastss -1 * SIZE(BO), %ymm3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
vbroadcastss 0 * SIZE(BO), %ymm2
vbroadcastss 1 * SIZE(BO), %ymm3
VFMADD231PS_( %ymm12,%ymm2,%ymm0 )
VFMADD231PS_( %ymm14,%ymm3,%ymm0 )
addq $ 6*SIZE, BO
addq $ 8*SIZE, AO
decq %rax
.endm
/* Store the 8x6 tile: C = alpha * acc (+ old C for plain GEMM).
   CO1 = columns 0..2, CO2 = columns 3..5, stepped by LDC. */
.macro SAVE8x6
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm10, %ymm10
vmulps %ymm0 , %ymm12, %ymm12
vmulps %ymm0 , %ymm14, %ymm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps (CO1, LDC,2), %ymm8,%ymm8
vaddps (CO2), %ymm10,%ymm10
vaddps (CO2, LDC), %ymm12,%ymm12
vaddps (CO2, LDC,2), %ymm14,%ymm14
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm8 , (CO1, LDC,2)
vmovups %ymm10, (CO2)
vmovups %ymm12, (CO2, LDC)
vmovups %ymm14, (CO2, LDC,2)
.endm
  282. /*******************************************************************************************/
/* One k-iteration of the 4x6 inner loop (xmm variant).
   xmm0 = 4 floats of A; six B scalars accumulate into xmm4/6/8/10/12/14.
   Advances BO by 6 and AO by 4 elements; decq %rax drives the caller's
   loop branch. */
.macro KERNEL4x6_SUB
vmovups -16 * SIZE(AO), %xmm0
vbroadcastss -4 * SIZE(BO), %xmm2
vbroadcastss -3 * SIZE(BO), %xmm3
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
vbroadcastss -2 * SIZE(BO), %xmm2
vbroadcastss -1 * SIZE(BO), %xmm3
VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
vbroadcastss 0 * SIZE(BO), %xmm2
vbroadcastss 1 * SIZE(BO), %xmm3
VFMADD231PS_( %xmm12,%xmm2,%xmm0 )
VFMADD231PS_( %xmm14,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 4*SIZE, AO
decq %rax
.endm
/* Store the 4x6 tile: C = alpha * acc (+ old C for plain GEMM).
   CO1 = columns 0..2, CO2 = columns 3..5, stepped by LDC. */
.macro SAVE4x6
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
vmulps %xmm0 , %xmm8 , %xmm8
vmulps %xmm0 , %xmm10, %xmm10
vmulps %xmm0 , %xmm12, %xmm12
vmulps %xmm0 , %xmm14, %xmm14
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
vaddps (CO1, LDC,2), %xmm8,%xmm8
vaddps (CO2), %xmm10,%xmm10
vaddps (CO2, LDC), %xmm12,%xmm12
vaddps (CO2, LDC,2), %xmm14,%xmm14
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
vmovups %xmm8 , (CO1, LDC,2)
vmovups %xmm10, (CO2)
vmovups %xmm12, (CO2, LDC)
vmovups %xmm14, (CO2, LDC,2)
.endm
  324. /*******************************************************************************************/
/* One k-iteration of the 2x6 inner loop, done with scalar FMAs.
   xmm0/xmm1 hold the two A rows; each of the six B scalars feeds two
   accumulators, filling xmm4..xmm15 (row-interleaved per column).
   Advances BO by 6 and AO by 2 elements; decq %rax drives the caller's
   loop branch. */
.macro KERNEL2x6_SUB
vmovss -16 * SIZE(AO), %xmm0
vmovss -15 * SIZE(AO), %xmm1
vmovss -4 * SIZE(BO), %xmm2
vmovss -3 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
vmovss -2 * SIZE(BO), %xmm2
vmovss -1 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
vmovss 0 * SIZE(BO), %xmm2
vmovss 1 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm13,%xmm2,%xmm1 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
VFMADD231SS_( %xmm15,%xmm3,%xmm1 )
addq $ 6*SIZE, BO
addq $ 2*SIZE, AO
decq %rax
.endm
/* Store the 2x6 tile scalar-by-scalar: C = alpha * acc (+ old C for
   plain GEMM).  Even-numbered xmm regs hold row 0, odd row 1; CO1
   addresses columns 0..2 and CO2 columns 3..5, stepped by LDC. */
.macro SAVE2x6
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm9 , %xmm9
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm11, %xmm11
vmulss %xmm0 , %xmm12, %xmm12
vmulss %xmm0 , %xmm13, %xmm13
vmulss %xmm0 , %xmm14, %xmm14
vmulss %xmm0 , %xmm15, %xmm15
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
vaddss (CO1, LDC,2), %xmm8,%xmm8
vaddss 1 * SIZE(CO1, LDC,2), %xmm9,%xmm9
vaddss (CO2), %xmm10,%xmm10
vaddss 1 * SIZE(CO2), %xmm11,%xmm11
vaddss (CO2, LDC), %xmm12,%xmm12
vaddss 1 * SIZE(CO2, LDC), %xmm13,%xmm13
vaddss (CO2, LDC,2), %xmm14,%xmm14
vaddss 1 * SIZE(CO2, LDC,2), %xmm15,%xmm15
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
vmovss %xmm8 , (CO1, LDC,2)
vmovss %xmm9 , 1 * SIZE(CO1, LDC,2)
vmovss %xmm10, (CO2)
vmovss %xmm11, 1 * SIZE(CO2)
vmovss %xmm12, (CO2, LDC)
vmovss %xmm13, 1 * SIZE(CO2, LDC)
vmovss %xmm14, (CO2, LDC,2)
vmovss %xmm15, 1 * SIZE(CO2, LDC,2)
.endm
  391. /*******************************************************************************************/
/* One k-iteration of the 1x6 inner loop (single A row, scalar FMAs).
   Accumulates into xmm4/6/8/10/12/14; advances BO by 6 and AO by 1
   element; decq %rax drives the caller's loop branch. */
.macro KERNEL1x6_SUB
vmovss -16 * SIZE(AO), %xmm0
vmovss -4 * SIZE(BO), %xmm2
vmovss -3 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
vmovss -2 * SIZE(BO), %xmm2
vmovss -1 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
vmovss 0 * SIZE(BO), %xmm2
vmovss 1 * SIZE(BO), %xmm3
VFMADD231SS_( %xmm12,%xmm2,%xmm0 )
VFMADD231SS_( %xmm14,%xmm3,%xmm0 )
addq $ 6*SIZE, BO
addq $ 1*SIZE, AO
decq %rax
.endm
/* Store the 1x6 tile: C = alpha * acc (+ old C for plain GEMM).
   One scalar per column; CO1 = columns 0..2, CO2 = columns 3..5. */
.macro SAVE1x6
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm12, %xmm12
vmulss %xmm0 , %xmm14, %xmm14
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss (CO1, LDC,2), %xmm8,%xmm8
vaddss (CO2), %xmm10,%xmm10
vaddss (CO2, LDC), %xmm12,%xmm12
vaddss (CO2, LDC,2), %xmm14,%xmm14
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm8 , (CO1, LDC,2)
vmovss %xmm10, (CO2)
vmovss %xmm12, (CO2, LDC)
vmovss %xmm14, (CO2, LDC,2)
.endm
  433. /*******************************************************************************************/
  434. /*******************************************************************************************
  435. * 4 lines of N
  436. *******************************************************************************************/
/* One k-iteration of the 16x4 inner loop.  Unlike the x6 family this
   uses counter-based addressing (AO + %rax*SIZE, BO + BI*SIZE) and
   advances the BI/%rax counters instead of the pointers; loop-exit
   testing is done by the caller. */
.macro KERNEL16x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %zmm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %zmm3
VFMADD231PS_( %zmm8,%zmm2,%zmm0 )
VFMADD231PS_( %zmm10,%zmm3,%zmm0 )
addq $ 4 , BI
addq $ 16, %rax
.endm
/* Store the 16x4 tile: C = alpha * acc (+ old C for plain GEMM), then
   prefetch the upper half of each C line for the next tile. */
.macro SAVE16x4
vbroadcastss ALPHA, %zmm0
vmulps %zmm0 , %zmm4 , %zmm4
vmulps %zmm0 , %zmm6 , %zmm6
vmulps %zmm0 , %zmm8 , %zmm8
vmulps %zmm0 , %zmm10, %zmm10
#if !defined(TRMMKERNEL)
vaddps (CO1), %zmm4,%zmm4
vaddps (CO1, LDC), %zmm6,%zmm6
vaddps (CO2), %zmm8,%zmm8
vaddps (CO2, LDC), %zmm10,%zmm10
#endif
vmovups %zmm4 , (CO1)
vmovups %zmm6 , (CO1, LDC)
vmovups %zmm8 , (CO2)
vmovups %zmm10, (CO2, LDC)
prefetcht0 64(CO1)
prefetcht0 64(CO1, LDC)
prefetcht0 64(CO2)
prefetcht0 64(CO2, LDC)
.endm
  471. /*******************************************************************************************/
/* One k-iteration of the 8x4 inner loop (ymm, counter-based addressing).
   Accumulates into ymm4/6/8/10; advances BI by 4 and %rax by 8. */
.macro KERNEL8x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PS_( %ymm8,%ymm2,%ymm0 )
VFMADD231PS_( %ymm10,%ymm3,%ymm0 )
addq $ 4 , BI
addq $ 8 , %rax
.endm
/* Store the 8x4 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE8x4
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
vmulps %ymm0 , %ymm8 , %ymm8
vmulps %ymm0 , %ymm10, %ymm10
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
vaddps (CO2), %ymm8,%ymm8
vaddps (CO2, LDC), %ymm10,%ymm10
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
vmovups %ymm8 , (CO2)
vmovups %ymm10, (CO2, LDC)
.endm
  502. /*******************************************************************************************/
/* One k-iteration of the 4x4 inner loop (xmm, counter-based addressing).
   Accumulates into xmm4/6/8/10; advances BI by 4 and %rax by 4. */
.macro KERNEL4x4_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231PS_( %xmm8,%xmm2,%xmm0 )
VFMADD231PS_( %xmm10,%xmm3,%xmm0 )
addq $ 4 , BI
addq $ 4 , %rax
.endm
/* Store the 4x4 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE4x4
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
vmulps %xmm0 , %xmm8 , %xmm8
vmulps %xmm0 , %xmm10, %xmm10
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
vaddps (CO2), %xmm8,%xmm8
vaddps (CO2, LDC), %xmm10,%xmm10
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
vmovups %xmm8 , (CO2)
vmovups %xmm10, (CO2, LDC)
.endm
  533. /*******************************************************************************************/
/* One k-iteration of the 2x4 inner loop (two A rows, scalar FMAs,
   counter-based addressing).  Fills xmm4..xmm11, row-interleaved per
   column; advances BI by 4 and %rax by 2. */
.macro KERNEL2x4_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm9,%xmm2,%xmm1 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
VFMADD231SS_( %xmm11,%xmm3,%xmm1 )
addq $ 4 , BI
addq $ 2, %rax
.endm
/* Store the 2x4 tile scalar-by-scalar: C = alpha * acc (+ old C for
   plain GEMM).  Even xmm regs hold row 0, odd row 1. */
.macro SAVE2x4
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm9 , %xmm9
vmulss %xmm0 , %xmm10, %xmm10
vmulss %xmm0 , %xmm11, %xmm11
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
vaddss (CO2), %xmm8,%xmm8
vaddss 1 * SIZE(CO2), %xmm9,%xmm9
vaddss (CO2, LDC), %xmm10,%xmm10
vaddss 1 * SIZE(CO2, LDC), %xmm11,%xmm11
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
vmovss %xmm8 , (CO2)
vmovss %xmm9 , 1 * SIZE(CO2)
vmovss %xmm10, (CO2, LDC)
vmovss %xmm11, 1 * SIZE(CO2, LDC)
.endm
  581. /*******************************************************************************************/
/* One k-iteration of the 1x4 inner loop (single A row, scalar FMAs,
   counter-based addressing).  Accumulates into xmm4/6/8/10; advances
   BI by 4 and %rax by 1. */
.macro KERNEL1x4_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
vmovss -2 * SIZE(BO, BI, SIZE), %xmm2
vmovss -1 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm8,%xmm2,%xmm0 )
VFMADD231SS_( %xmm10,%xmm3,%xmm0 )
addq $ 4 , BI
addq $ 1, %rax
.endm
/* Store the 1x4 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE1x4
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm8 , %xmm8
vmulss %xmm0 , %xmm10, %xmm10
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss (CO2), %xmm8,%xmm8
vaddss (CO2, LDC), %xmm10,%xmm10
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm8 , (CO2)
vmovss %xmm10, (CO2, LDC)
.endm
  612. /*******************************************************************************************/
  613. /*******************************************************************************************
  614. * 2 lines of N
  615. *******************************************************************************************/
/* One k-iteration of the 16x2 inner loop (zmm, counter-based
   addressing).  Accumulates into zmm4/zmm6; advances BI by 2 and
   %rax by 16. */
.macro KERNEL16x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %zmm3
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
VFMADD231PS_( %zmm6,%zmm3,%zmm0 )
addq $ 2 , BI
addq $ 16, %rax
.endm
/* Store the 16x2 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE16x2
vbroadcastss ALPHA, %zmm0
vmulps %zmm0 , %zmm4 , %zmm4
vmulps %zmm0 , %zmm6 , %zmm6
#if !defined(TRMMKERNEL)
vaddps (CO1), %zmm4,%zmm4
vaddps (CO1, LDC), %zmm6,%zmm6
#endif
vmovups %zmm4 , (CO1)
vmovups %zmm6 , (CO1, LDC)
.endm
  636. /*******************************************************************************************/
/* One k-iteration of the 8x2 inner loop (ymm, counter-based
   addressing).  Accumulates into ymm4/ymm6; advances BI by 2 and
   %rax by 8. */
.macro KERNEL8x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm3
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
VFMADD231PS_( %ymm6,%ymm3,%ymm0 )
addq $ 2 , BI
addq $ 8 , %rax
.endm
/* Store the 8x2 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE8x2
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
vmulps %ymm0 , %ymm6 , %ymm6
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm4,%ymm4
vaddps (CO1, LDC), %ymm6,%ymm6
#endif
vmovups %ymm4 , (CO1)
vmovups %ymm6 , (CO1, LDC)
.endm
  657. /*******************************************************************************************/
/* One k-iteration of the 4x2 inner loop (xmm, counter-based
   addressing).  Accumulates into xmm4/xmm6; advances BI by 2 and
   %rax by 4. */
.macro KERNEL4x2_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
VFMADD231PS_( %xmm6,%xmm3,%xmm0 )
addq $ 2 , BI
addq $ 4 , %rax
.endm
/* Store the 4x2 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE4x2
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
vmulps %xmm0 , %xmm6 , %xmm6
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm4,%xmm4
vaddps (CO1, LDC), %xmm6,%xmm6
#endif
vmovups %xmm4 , (CO1)
vmovups %xmm6 , (CO1, LDC)
.endm
  678. /*******************************************************************************************/
/* One k-iteration of the 2x2 inner loop (two A rows, scalar FMAs,
   counter-based addressing).  Fills xmm4..xmm7; advances BI by 2 and
   %rax by 2. */
.macro KERNEL2x2_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
VFMADD231SS_( %xmm7,%xmm3,%xmm1 )
addq $ 2 , BI
addq $ 2, %rax
.endm
/* Store the 2x2 tile scalar-by-scalar: C = alpha * acc (+ old C for
   plain GEMM).  Even xmm regs hold row 0, odd row 1. */
.macro SAVE2x2
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
vmulss %xmm0 , %xmm6 , %xmm6
vmulss %xmm0 , %xmm7 , %xmm7
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
vaddss (CO1, LDC), %xmm6,%xmm6
vaddss 1 * SIZE(CO1, LDC), %xmm7,%xmm7
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
vmovss %xmm6 , (CO1, LDC)
vmovss %xmm7 , 1 * SIZE(CO1, LDC)
.endm
  708. /*******************************************************************************************/
/* One k-iteration of the 1x2 inner loop (single A row, scalar FMAs,
   counter-based addressing).  Accumulates into xmm4/xmm6; advances
   BI by 2 and %rax by 1. */
.macro KERNEL1x2_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
vmovss -3 * SIZE(BO, BI, SIZE), %xmm3
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm6,%xmm3,%xmm0 )
addq $ 2 , BI
addq $ 1, %rax
.endm
/* Store the 1x2 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE1x2
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm6 , %xmm6
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss (CO1, LDC), %xmm6,%xmm6
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm6 , (CO1, LDC)
.endm
  729. /*******************************************************************************************/
  730. /*******************************************************************************************
  731. * 1 line of N
  732. *******************************************************************************************/
/* One k-iteration of the 16x1 inner loop (zmm, counter-based
   addressing).  Single accumulator zmm4; advances BI by 1 and
   %rax by 16. */
.macro KERNEL16x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %zmm2
VFMADD231PS_( %zmm4,%zmm2,%zmm0 )
addq $ 1 , BI
addq $ 16, %rax
.endm
/* Store the 16x1 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE16x1
vbroadcastss ALPHA, %zmm0
vmulps %zmm0 , %zmm4 , %zmm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %zmm4,%zmm4
#endif
vmovups %zmm4 , (CO1)
.endm
  748. /*******************************************************************************************/
/* One k-iteration of the 8x1 inner loop (ymm, counter-based
   addressing).  Single accumulator ymm4; advances BI by 1 and
   %rax by 8. */
.macro KERNEL8x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm2
VFMADD231PS_( %ymm4,%ymm2,%ymm0 )
addq $ 1 , BI
addq $ 8 , %rax
.endm
/* Store the 8x1 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE8x1
vbroadcastss ALPHA, %ymm0
vmulps %ymm0 , %ymm4 , %ymm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %ymm4,%ymm4
#endif
vmovups %ymm4 , (CO1)
.endm
  764. /*******************************************************************************************/
/* One k-iteration of the 4x1 inner loop (xmm, counter-based
   addressing).  Single accumulator xmm4; advances BI by 1 and
   %rax by 4. */
.macro KERNEL4x1_SUB
vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0
vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231PS_( %xmm4,%xmm2,%xmm0 )
addq $ 1 , BI
addq $ 4 , %rax
.endm
/* Store the 4x1 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE4x1
vbroadcastss ALPHA, %xmm0
vmulps %xmm0 , %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddps (CO1), %xmm4,%xmm4
#endif
vmovups %xmm4 , (CO1)
.endm
  780. /*******************************************************************************************/
/* One k-iteration of the 2x1 inner loop (two A rows, scalar FMAs,
   counter-based addressing).  Accumulates into xmm4 (row 0) and xmm5
   (row 1); advances BI by 1 and %rax by 2. */
.macro KERNEL2x1_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -15 * SIZE(AO, %rax, SIZE), %xmm1
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
VFMADD231SS_( %xmm5,%xmm2,%xmm1 )
addq $ 1 , BI
addq $ 2 , %rax
.endm
/* Store the 2x1 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE2x1
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
vmulss %xmm0 , %xmm5 , %xmm5
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
vaddss 1 * SIZE(CO1), %xmm5,%xmm5
#endif
vmovss %xmm4 , (CO1)
vmovss %xmm5 , 1 * SIZE(CO1)
.endm
  801. /*******************************************************************************************/
/* One k-iteration of the 1x1 inner loop: a single scalar FMA with
   counter-based addressing.  Accumulates into xmm4; advances BI and
   %rax by 1 each. */
.macro KERNEL1x1_SUB
vmovss -16 * SIZE(AO, %rax, SIZE), %xmm0
vmovss -4 * SIZE(BO, BI, SIZE), %xmm2
VFMADD231SS_( %xmm4,%xmm2,%xmm0 )
addq $ 1 , BI
addq $ 1 , %rax
.endm
/* Store the 1x1 tile: C = alpha * acc (+ old C for plain GEMM). */
.macro SAVE1x1
vmovss ALPHA, %xmm0
vmulss %xmm0 , %xmm4 , %xmm4
#if !defined(TRMMKERNEL)
vaddss (CO1), %xmm4,%xmm4
#endif
vmovss %xmm4 , (CO1)
.endm
  817. /*******************************************************************************************/
  818. #if !defined(TRMMKERNEL)
  819. /*************************************************************************************
  820. * GEMM Kernel
  821. *************************************************************************************/
  822. PROLOGUE
  823. PROFCODE
  824. subq $STACKSIZE, %rsp
  825. movq %rbx, (%rsp)
  826. movq %rbp, 8(%rsp)
  827. movq %r12, 16(%rsp)
  828. movq %r13, 24(%rsp)
  829. movq %r14, 32(%rsp)
  830. movq %r15, 40(%rsp)
  831. vzeroupper
  832. #ifdef WINDOWS_ABI
  833. movq %rdi, 48(%rsp)
  834. movq %rsi, 56(%rsp)
  835. movups %xmm6, 64(%rsp)
  836. movups %xmm7, 80(%rsp)
  837. movups %xmm8, 96(%rsp)
  838. movups %xmm9, 112(%rsp)
  839. movups %xmm10, 128(%rsp)
  840. movups %xmm11, 144(%rsp)
  841. movups %xmm12, 160(%rsp)
  842. movups %xmm13, 176(%rsp)
  843. movups %xmm14, 192(%rsp)
  844. movups %xmm15, 208(%rsp)
  845. movq ARG1, OLD_M
  846. movq ARG2, OLD_N
  847. movq ARG3, OLD_K
  848. movq OLD_A, A
  849. movq OLD_B, B
  850. movq OLD_C, C
  851. movq OLD_LDC, LDC
  852. #ifdef TRMMKERNEL
  853. vmovsd OLD_OFFSET, %xmm12
  854. #endif
  855. vmovaps %xmm3, %xmm0
  856. #else
  857. movq STACKSIZE + 8(%rsp), LDC
  858. #ifdef TRMMKERNEL
  859. movsd STACKSIZE + 16(%rsp), %xmm12
  860. #endif
  861. #endif
  862. movq %rsp, SP # save old stack
  863. subq $128 + L_BUFFER_SIZE, %rsp
  864. andq $-4096, %rsp # align stack
  865. STACK_TOUCH
  866. cmpq $0, OLD_M
  867. je .L999
  868. cmpq $0, OLD_N
  869. je .L999
  870. cmpq $0, OLD_K
  871. je .L999
  872. movq OLD_M, M
  873. movq OLD_N, N
  874. movq OLD_K, K
  875. vmovss %xmm0, ALPHA
  876. salq $BASE_SHIFT, LDC
  877. movq N, %rax
  878. xorq %rdx, %rdx
  879. movq $12, %rdi
  880. divq %rdi // N / 12
  881. movq %rax, Ndiv6 // N / 12
  882. movq %rdx, Nmod6 // N % 12
  883. movq Ndiv6, J
  884. cmpq $0, J
  885. je .L4_00
  886. ALIGN_4
  887. /*******************************************************************************************/
  888. .L6_01:
  889. // copy to sub buffer
  890. movq B, BO1
  891. leaq BUFFER1, BO // first buffer to BO
  892. movq K, %rax
  893. salq $2, %rax // 4 values of B
  894. leaq (B, %rax,4), BO2
  895. movq BO2, B // next offset of B
  896. movq K, %rax
  897. ALIGN_4
  898. .L6_02c:
  899. vmovups (BO1), %xmm0
  900. vmovsd (BO2), %xmm1
  901. vmovups %xmm0, (BO)
  902. vmovsd %xmm1, 4*SIZE(BO)
  903. addq $ 4*SIZE,BO1
  904. addq $ 4*SIZE,BO2
  905. addq $ 6*SIZE,BO
  906. decq %rax
  907. jnz .L6_02c
  908. .L6_10:
  909. movq C, CO1
  910. leaq (C, LDC, 2), CO2
  911. leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc
  912. leaq (C, LDC, 4), C
  913. leaq (C, LDC, 2), C // c = c + 6 * ldc
  914. movq A, AO // aoffset = a
  915. addq $ 16 * SIZE, AO
  916. movq M, I
  917. sarq $4, I // i = (m >> 4)
  918. je .L6_20
  919. ALIGN_4
  920. .L6_11:
  921. leaq BUFFER1, BO // first buffer to BO
  922. addq $4 * SIZE, BO
  923. vzeroall
  924. movq K, %rax
  925. andq $-8, %rax // K = K - ( K % 8 )
  926. je .L6_16
  927. ALIGN_4
  928. .L6_12:
  929. KERNEL16x6_SUB4
  930. KERNEL16x6_SUB4
  931. je .L6_16
  932. KERNEL16x6_SUB4
  933. KERNEL16x6_SUB4
  934. je .L6_16
  935. jmp .L6_12
  936. ALIGN_4
  937. .L6_16:
  938. movq K, %rax
  939. andq $7, %rax # if (k & 1)
  940. je .L6_19
  941. ALIGN_4
  942. .L6_17:
  943. KERNEL16x6_SUB
  944. jnz .L6_17
  945. ALIGN_4
  946. .L6_19:
  947. SAVE16x6
  948. addq $16 * SIZE, CO1 # coffset += 16
  949. addq $16 * SIZE, CO2 # coffset += 16
  950. decq I # i --
  951. jg .L6_11
  952. ALIGN_4
  953. /**************************************************************************
  954. * Rest of M
  955. ***************************************************************************/
  956. .L6_20:
  957. // Test rest of M
  958. testq $15, M
  959. jz .L6_60 // to next 6 lines of N
  960. testq $8, M
  961. jz .L6_21pre
  962. ALIGN_4
  963. /**************************************************************************/
  964. .L6_20_1:
  965. leaq BUFFER1, BO // first buffer to BO
  966. addq $4 * SIZE, BO
  967. vzeroall
  968. movq K, %rax
  969. andq $-8, %rax
  970. je .L6_20_6
  971. ALIGN_4
  972. .L6_20_2:
  973. prefetcht0 A_PR1(AO)
  974. KERNEL8x6_SUB
  975. KERNEL8x6_SUB
  976. prefetcht0 A_PR1(AO)
  977. KERNEL8x6_SUB
  978. KERNEL8x6_SUB
  979. prefetcht0 A_PR1(AO)
  980. KERNEL8x6_SUB
  981. KERNEL8x6_SUB
  982. prefetcht0 A_PR1(AO)
  983. KERNEL8x6_SUB
  984. KERNEL8x6_SUB
  985. je .L6_20_6
  986. prefetcht0 A_PR1(AO)
  987. KERNEL8x6_SUB
  988. KERNEL8x6_SUB
  989. prefetcht0 A_PR1(AO)
  990. KERNEL8x6_SUB
  991. KERNEL8x6_SUB
  992. prefetcht0 A_PR1(AO)
  993. KERNEL8x6_SUB
  994. KERNEL8x6_SUB
  995. prefetcht0 A_PR1(AO)
  996. KERNEL8x6_SUB
  997. KERNEL8x6_SUB
  998. je .L6_20_6
  999. jmp .L6_20_2
  1000. ALIGN_4
  1001. .L6_20_6:
  1002. movq K, %rax
  1003. andq $7, %rax # if (k & 1)
  1004. je .L6_20_9
  1005. ALIGN_4
  1006. .L6_20_7:
  1007. KERNEL8x6_SUB
  1008. jnz .L6_20_7
  1009. ALIGN_4
  1010. .L6_20_9:
  1011. SAVE8x6
  1012. addq $8 * SIZE, CO1 # coffset += 8
  1013. addq $8 * SIZE, CO2 # coffset += 8
  1014. ALIGN_4
  1015. /**************************************************************************/
  1016. .L6_21pre:
  1017. testq $4, M
  1018. jz .L6_30
  1019. ALIGN_4
  1020. .L6_21:
  1021. leaq BUFFER1, BO // first buffer to BO
  1022. addq $4 * SIZE, BO
  1023. vzeroall
  1024. movq K, %rax
  1025. andq $-8, %rax
  1026. je .L6_26
  1027. ALIGN_4
  1028. .L6_22:
  1029. prefetcht0 A_PR1(AO)
  1030. KERNEL4x6_SUB
  1031. KERNEL4x6_SUB
  1032. KERNEL4x6_SUB
  1033. KERNEL4x6_SUB
  1034. prefetcht0 A_PR1(AO)
  1035. KERNEL4x6_SUB
  1036. KERNEL4x6_SUB
  1037. KERNEL4x6_SUB
  1038. KERNEL4x6_SUB
  1039. je .L6_26
  1040. prefetcht0 A_PR1(AO)
  1041. KERNEL4x6_SUB
  1042. KERNEL4x6_SUB
  1043. KERNEL4x6_SUB
  1044. KERNEL4x6_SUB
  1045. prefetcht0 A_PR1(AO)
  1046. KERNEL4x6_SUB
  1047. KERNEL4x6_SUB
  1048. KERNEL4x6_SUB
  1049. KERNEL4x6_SUB
  1050. je .L6_26
  1051. jmp .L6_22
  1052. ALIGN_4
// ---------------------------------------------------------------------------
// Tail handling for the first 6-column pass: K-remainder loop and store for
// the 4x6 micro-tile, then the M%4 cases (2x6 and 1x6 micro-tiles).
// NOTE(review): the jnz/je loop-back branches below consume flags set inside
// the KERNEL*x6_SUB macros (defined earlier in the file) -- presumably a
// decrement/compare of the K counter; confirm against the macro definitions.
// ---------------------------------------------------------------------------
1053. .L6_26:
// K % 8 leftover iterations of the 4x6 kernel.
1054. movq K, %rax
1055. andq $7, %rax # if (k & 1)
1056. je .L6_29
1057. ALIGN_4
1058. .L6_27:
1059. KERNEL4x6_SUB
1060. jnz .L6_27
1061. ALIGN_4
1062. .L6_29:
// Store the accumulated 4x6 tile and advance C pointers by 4 columns.
1063. SAVE4x6
1064. addq $4 * SIZE, CO1 # coffset += 4
1065. addq $4 * SIZE, CO2 # coffset += 4
1066. ALIGN_4
1067. .L6_30:
// M & 2: handle a 2-row strip with the 2x6 micro-kernel.
1068. testq $2, M
1069. jz .L6_40
1070. ALIGN_4
1071. .L6_31:
1072. leaq BUFFER1, BO // first buffer to BO
1073. addq $4 * SIZE, BO
// Clear all accumulator registers for the new tile.
1074. vzeroall
1075. movq K, %rax
1076. andq $-8, %rax
1077. je .L6_36
1078. ALIGN_4
1079. .L6_32:
// Unrolled by 16 (two groups of 8) with a mid-loop early exit.
1080. prefetcht0 A_PR1(AO)
1081. KERNEL2x6_SUB
1082. KERNEL2x6_SUB
1083. KERNEL2x6_SUB
1084. KERNEL2x6_SUB
1085. KERNEL2x6_SUB
1086. KERNEL2x6_SUB
1087. KERNEL2x6_SUB
1088. KERNEL2x6_SUB
1089. je .L6_36
1090. prefetcht0 A_PR1(AO)
1091. KERNEL2x6_SUB
1092. KERNEL2x6_SUB
1093. KERNEL2x6_SUB
1094. KERNEL2x6_SUB
1095. KERNEL2x6_SUB
1096. KERNEL2x6_SUB
1097. KERNEL2x6_SUB
1098. KERNEL2x6_SUB
1099. je .L6_36
1100. jmp .L6_32
1101. ALIGN_4
1102. .L6_36:
// K % 8 leftover iterations for the 2x6 tile.
1103. movq K, %rax
1104. andq $7, %rax # if (k & 1)
1105. je .L6_39
1106. ALIGN_4
1107. .L6_37:
1108. KERNEL2x6_SUB
1109. jnz .L6_37
1110. ALIGN_4
1111. .L6_39:
1112. SAVE2x6
1113. addq $2 * SIZE, CO1 # coffset += 2
1114. addq $2 * SIZE, CO2 # coffset += 2
1115. ALIGN_4
1116. .L6_40:
// M & 1: handle the final single row with the 1x6 micro-kernel.
1117. testq $1, M
1118. jz .L6_60 // to next 4 lines of N
1119. ALIGN_4
1120. .L6_41:
1121. leaq BUFFER1, BO // first buffer to BO
1122. addq $4 * SIZE, BO
1123. vzeroall
1124. movq K, %rax
1125. andq $-8, %rax
1126. je .L6_46
1127. ALIGN_4
1128. .L6_42:
1129. prefetcht0 A_PR1(AO)
1130. KERNEL1x6_SUB
1131. KERNEL1x6_SUB
1132. KERNEL1x6_SUB
1133. KERNEL1x6_SUB
1134. KERNEL1x6_SUB
1135. KERNEL1x6_SUB
1136. KERNEL1x6_SUB
1137. KERNEL1x6_SUB
1138. je .L6_46
1139. KERNEL1x6_SUB
1140. KERNEL1x6_SUB
1141. KERNEL1x6_SUB
1142. KERNEL1x6_SUB
1143. KERNEL1x6_SUB
1144. KERNEL1x6_SUB
1145. KERNEL1x6_SUB
1146. KERNEL1x6_SUB
1147. je .L6_46
1148. jmp .L6_42
1149. ALIGN_4
1150. .L6_46:
// K % 8 leftover iterations for the 1x6 tile.
1151. movq K, %rax
1152. andq $7, %rax # if (k & 1)
1153. je .L6_49
1154. ALIGN_4
1155. .L6_47:
1156. KERNEL1x6_SUB
1157. jnz .L6_47
1158. ALIGN_4
1159. .L6_49:
1160. SAVE1x6
1161. addq $1 * SIZE, CO1 # coffset += 1
1162. addq $1 * SIZE, CO2 # coffset += 1
1163. ALIGN_4
1164. .L6_60:
1165. /*******************************************************************************************/
// ---------------------------------------------------------------------------
// Second half of the 6-column pass (.L7_*): repack the next 6 columns of B
// into BUFFER1 (interleaving 2 values read at BO1+2*SIZE with 4 values read
// at BO2 into 6 contiguous values per K step), then run the 16x6 kernel over
// full 16-row strips of M.
// ---------------------------------------------------------------------------
1166. .L7_01:
1167. // copy to sub buffer
1168. movq B, BO1
1169. leaq BUFFER1, BO // first buffer to BO
1170. movq K, %rax
1171. salq $2, %rax // 4 values of B
// BO2 = B + K*4 values (scale 4 is the element size in bytes here).
1172. leaq (B, %rax,4), BO2
1173. movq K, %rax
1174. ALIGN_4
1175. .L7_02c:
// Gather 2 + 4 B values and store them as one packed group of 6.
1176. vmovsd 2*SIZE(BO1), %xmm0
1177. vmovups (BO2), %xmm1
1178. vmovsd %xmm0, (BO)
1179. vmovups %xmm1, 2*SIZE(BO)
1180. addq $ 4*SIZE,BO1
1181. addq $ 4*SIZE,BO2
1182. addq $ 6*SIZE,BO
1183. decq %rax
1184. jnz .L7_02c
1185. movq BO2, B // next offset of B
1186. .L7_10:
// Set up C pointers for 6 columns: CO1 = C, CO2 = C + 3*LDC, C += 6*LDC.
1187. movq C, CO1
1188. leaq (C, LDC, 2), CO2
1189. leaq (CO2, LDC, 1), CO2 // co2 = c + 3 * ldc
1190. leaq (C, LDC, 4), C
1191. leaq (C, LDC, 2), C // c = c + 6 * ldc
1192. movq A, AO // aoffset = a
1193. addq $ 16 * SIZE, AO
1194. movq M, I
1195. sarq $4, I // i = (m >> 4)
1196. je .L7_20
1197. ALIGN_4
1198. .L7_11:
// One 16x6 tile: accumulators cleared, then K iterated in chunks of 8
// (KERNEL16x6_SUB4 presumably performs 4 K-steps -- see macro definition).
1199. leaq BUFFER1, BO // first buffer to BO
1200. addq $4 * SIZE, BO
1201. vzeroall
1202. movq K, %rax
1203. andq $-8, %rax // K = K - ( K % 8 )
1204. je .L7_16
1205. ALIGN_4
1206. .L7_12:
1207. KERNEL16x6_SUB4
1208. KERNEL16x6_SUB4
1209. je .L7_16
1210. KERNEL16x6_SUB4
1211. KERNEL16x6_SUB4
1212. je .L7_16
1213. jmp .L7_12
1214. ALIGN_4
1215. .L7_16:
// K % 8 leftover iterations, one K-step at a time.
1216. movq K, %rax
1217. andq $7, %rax # if (k & 1)
1218. je .L7_19
1219. ALIGN_4
1220. .L7_17:
1221. KERNEL16x6_SUB
1222. jnz .L7_17
1223. ALIGN_4
1224. .L7_19:
// Store the 16x6 tile and advance to the next 16-row strip.
1225. SAVE16x6
1226. addq $16 * SIZE, CO1 # coffset += 16
1227. addq $16 * SIZE, CO2 # coffset += 16
1228. decq I # i --
1229. jg .L7_11
1230. ALIGN_4
1231. /**************************************************************************
1232. * Rest of M
1233. ***************************************************************************/
// ---------------------------------------------------------------------------
// M-remainder handling for the .L7 6-column pass: 8-row then 4-row strips.
// Flags for the je/jnz loop-backs come from inside the KERNEL*x6_SUB macros.
// ---------------------------------------------------------------------------
1234. .L7_20:
1235. // Test rest of M
1236. testq $15, M
1237. jz .L7_60 // to next 6 lines of N
1238. testq $8, M
1239. jz .L7_21pre
1240. ALIGN_4
1241. /**************************************************************************/
1242. .L7_20_1:
// 8x6 tile: restart BO at the packed buffer, clear accumulators.
1243. leaq BUFFER1, BO // first buffer to BO
1244. addq $4 * SIZE, BO
1245. vzeroall
1246. movq K, %rax
1247. andq $-8, %rax
1248. je .L7_20_6
1249. ALIGN_4
1250. .L7_20_2:
// Unrolled by 16 with prefetch of the next A lines every 2 steps.
1251. prefetcht0 A_PR1(AO)
1252. KERNEL8x6_SUB
1253. KERNEL8x6_SUB
1254. prefetcht0 A_PR1(AO)
1255. KERNEL8x6_SUB
1256. KERNEL8x6_SUB
1257. prefetcht0 A_PR1(AO)
1258. KERNEL8x6_SUB
1259. KERNEL8x6_SUB
1260. prefetcht0 A_PR1(AO)
1261. KERNEL8x6_SUB
1262. KERNEL8x6_SUB
1263. je .L7_20_6
1264. prefetcht0 A_PR1(AO)
1265. KERNEL8x6_SUB
1266. KERNEL8x6_SUB
1267. prefetcht0 A_PR1(AO)
1268. KERNEL8x6_SUB
1269. KERNEL8x6_SUB
1270. prefetcht0 A_PR1(AO)
1271. KERNEL8x6_SUB
1272. KERNEL8x6_SUB
1273. prefetcht0 A_PR1(AO)
1274. KERNEL8x6_SUB
1275. KERNEL8x6_SUB
1276. je .L7_20_6
1277. jmp .L7_20_2
1278. ALIGN_4
1279. .L7_20_6:
// K % 8 leftover iterations for the 8x6 tile.
1280. movq K, %rax
1281. andq $7, %rax # if (k & 1)
1282. je .L7_20_9
1283. ALIGN_4
1284. .L7_20_7:
1285. KERNEL8x6_SUB
1286. jnz .L7_20_7
1287. ALIGN_4
1288. .L7_20_9:
1289. SAVE8x6
1290. addq $8 * SIZE, CO1 # coffset += 8
1291. addq $8 * SIZE, CO2 # coffset += 8
1292. ALIGN_4
1293. /**************************************************************************/
1294. .L7_21pre:
// M & 4: 4x6 tile.
1295. testq $4, M
1296. jz .L7_30
1297. ALIGN_4
1298. .L7_21:
1299. leaq BUFFER1, BO // first buffer to BO
1300. addq $4 * SIZE, BO
1301. vzeroall
1302. movq K, %rax
1303. andq $-8, %rax
1304. je .L7_26
1305. ALIGN_4
1306. .L7_22:
// Unrolled by 16, prefetching A every 4 steps.
1307. prefetcht0 A_PR1(AO)
1308. KERNEL4x6_SUB
1309. KERNEL4x6_SUB
1310. KERNEL4x6_SUB
1311. KERNEL4x6_SUB
1312. prefetcht0 A_PR1(AO)
1313. KERNEL4x6_SUB
1314. KERNEL4x6_SUB
1315. KERNEL4x6_SUB
1316. KERNEL4x6_SUB
1317. je .L7_26
1318. prefetcht0 A_PR1(AO)
1319. KERNEL4x6_SUB
1320. KERNEL4x6_SUB
1321. KERNEL4x6_SUB
1322. KERNEL4x6_SUB
1323. prefetcht0 A_PR1(AO)
1324. KERNEL4x6_SUB
1325. KERNEL4x6_SUB
1326. KERNEL4x6_SUB
1327. KERNEL4x6_SUB
1328. je .L7_26
1329. jmp .L7_22
1330. ALIGN_4
1331. .L7_26:
// K % 8 leftover iterations for the 4x6 tile.
1332. movq K, %rax
1333. andq $7, %rax # if (k & 1)
1334. je .L7_29
1335. ALIGN_4
1336. .L7_27:
1337. KERNEL4x6_SUB
1338. jnz .L7_27
1339. ALIGN_4
1340. .L7_29:
1341. SAVE4x6
1342. addq $4 * SIZE, CO1 # coffset += 4
1343. addq $4 * SIZE, CO2 # coffset += 4
1344. ALIGN_4
// ---------------------------------------------------------------------------
// M-remainder continued for the .L7 6-column pass: 2-row and 1-row strips,
// then the bottom of the J loop over 12-column panels (back to .L6_01).
// ---------------------------------------------------------------------------
1345. .L7_30:
1346. testq $2, M
1347. jz .L7_40
1348. ALIGN_4
1349. .L7_31:
// 2x6 tile: restart BO at packed buffer, clear accumulators.
1350. leaq BUFFER1, BO // first buffer to BO
1351. addq $4 * SIZE, BO
1352. vzeroall
1353. movq K, %rax
1354. andq $-8, %rax
1355. je .L7_36
1356. ALIGN_4
1357. .L7_32:
1358. prefetcht0 A_PR1(AO)
1359. KERNEL2x6_SUB
1360. KERNEL2x6_SUB
1361. KERNEL2x6_SUB
1362. KERNEL2x6_SUB
1363. KERNEL2x6_SUB
1364. KERNEL2x6_SUB
1365. KERNEL2x6_SUB
1366. KERNEL2x6_SUB
1367. je .L7_36
1368. prefetcht0 A_PR1(AO)
1369. KERNEL2x6_SUB
1370. KERNEL2x6_SUB
1371. KERNEL2x6_SUB
1372. KERNEL2x6_SUB
1373. KERNEL2x6_SUB
1374. KERNEL2x6_SUB
1375. KERNEL2x6_SUB
1376. KERNEL2x6_SUB
1377. je .L7_36
1378. jmp .L7_32
1379. ALIGN_4
1380. .L7_36:
// K % 8 leftover iterations for the 2x6 tile.
1381. movq K, %rax
1382. andq $7, %rax # if (k & 1)
1383. je .L7_39
1384. ALIGN_4
1385. .L7_37:
1386. KERNEL2x6_SUB
1387. jnz .L7_37
1388. ALIGN_4
1389. .L7_39:
1390. SAVE2x6
1391. addq $2 * SIZE, CO1 # coffset += 2
1392. addq $2 * SIZE, CO2 # coffset += 2
1393. ALIGN_4
1394. .L7_40:
// M & 1: final single-row strip with the 1x6 kernel.
1395. testq $1, M
1396. jz .L7_60 // to next 4 lines of N
1397. ALIGN_4
1398. .L7_41:
1399. leaq BUFFER1, BO // first buffer to BO
1400. addq $4 * SIZE, BO
1401. vzeroall
1402. movq K, %rax
1403. andq $-8, %rax
1404. je .L7_46
1405. ALIGN_4
1406. .L7_42:
1407. prefetcht0 A_PR1(AO)
1408. KERNEL1x6_SUB
1409. KERNEL1x6_SUB
1410. KERNEL1x6_SUB
1411. KERNEL1x6_SUB
1412. KERNEL1x6_SUB
1413. KERNEL1x6_SUB
1414. KERNEL1x6_SUB
1415. KERNEL1x6_SUB
1416. je .L7_46
1417. KERNEL1x6_SUB
1418. KERNEL1x6_SUB
1419. KERNEL1x6_SUB
1420. KERNEL1x6_SUB
1421. KERNEL1x6_SUB
1422. KERNEL1x6_SUB
1423. KERNEL1x6_SUB
1424. KERNEL1x6_SUB
1425. je .L7_46
1426. jmp .L7_42
1427. ALIGN_4
1428. .L7_46:
// K % 8 leftover iterations for the 1x6 tile.
1429. movq K, %rax
1430. andq $7, %rax # if (k & 1)
1431. je .L7_49
1432. ALIGN_4
1433. .L7_47:
1434. KERNEL1x6_SUB
1435. jnz .L7_47
1436. ALIGN_4
1437. .L7_49:
1438. SAVE1x6
1439. addq $1 * SIZE, CO1 # coffset += 1
1440. addq $1 * SIZE, CO2 # coffset += 1
1441. ALIGN_4
1442. .L7_60:
// Bottom of the outer J loop: one more 12-column panel if J > 0.
1443. decq J // j --
1444. jg .L6_01 // next 12 lines of N
1445. /*******************************************************************************************/
// ---------------------------------------------------------------------------
// N-remainder pass over 4-column panels (.L4_*): J = Nmod6 / 4 panels.
// B is copied 1:1 into BUFFER1 (4 values per K step), 4-way unrolled with
// software prefetch, then C/A/loop counters are set up for 16-row strips.
// ---------------------------------------------------------------------------
1446. .L4_00:
1447. movq Nmod6, J
1448. sarq $2, J // j = j / 4
1449. cmpq $ 0, J
1450. je .L2_00
1451. ALIGN_4
1452. .L4_01:
1453. // copy to sub buffer
1454. movq B, BO1
1455. leaq BUFFER1, BO // first buffer to BO
1456. movq K, %rax
1457. sarq $2, %rax // K / 4
1458. jz .L4_01b
1459. ALIGN_4
1460. .L4_01a:
// Copy 16 values (4 K-steps) per iteration; prefetch ahead on both streams.
1461. prefetcht0 512(BO1)
1462. prefetchw 512(BO)
1463. vmovups (BO1), %xmm0
1464. vmovups 4*SIZE(BO1), %xmm1
1465. vmovups 8*SIZE(BO1), %xmm2
1466. vmovups 12*SIZE(BO1), %xmm3
1467. vmovups %xmm0, (BO)
1468. vmovups %xmm1, 4*SIZE(BO)
1469. vmovups %xmm2, 8*SIZE(BO)
1470. vmovups %xmm3,12*SIZE(BO)
1471. addq $ 16*SIZE,BO1
1472. addq $ 16*SIZE,BO
1473. decq %rax
1474. jnz .L4_01a
1475. .L4_01b:
// Copy the remaining K % 4 groups of 4 values.
1476. movq K, %rax
1477. andq $3, %rax // K % 4
1478. jz .L4_02d
1479. ALIGN_4
1480. .L4_02c:
1481. vmovups (BO1), %xmm0
1482. vmovups %xmm0, (BO)
1483. addq $ 4*SIZE,BO1
1484. addq $ 4*SIZE,BO
1485. decq %rax
1486. jnz .L4_02c
1487. .L4_02d:
1488. movq BO1, B // next offset of B
1489. .L4_10:
// C pointers for 4 columns: CO1 = C, CO2 = C + 2*LDC, C += 4*LDC.
1490. movq C, CO1
1491. leaq (C, LDC, 2), CO2
1492. leaq (C, LDC, 4), C // c += 4 * ldc
1493. #if defined(TRMMKERNEL) && defined(LEFT)
// TRMM: reset the KK diagonal offset for each panel.
1494. movq OFFSET, %rax
1495. movq %rax, KK
1496. #endif
1497. movq A, AO // aoffset = a
1498. addq $ 16 * SIZE, AO
1499. movq M, I
1500. sarq $4, I // i = (m >> 4)
1501. je .L4_20
1502. ALIGN_4
// ---------------------------------------------------------------------------
// 16x4 micro-tile loop with TRMM support. For TRMM, BO/AO are advanced by
// KK elements (B stride 4, A stride 16 per K step) and KKK holds the
// effective trip count. Both AO and BO are pre-advanced past the loop range
// and then indexed with negated counters (%rax for A, BI for B) so the
// KERNEL16x4_SUB macro can count up toward zero; the jl/je branches consume
// flags presumably produced inside that macro -- see its definition.
// ---------------------------------------------------------------------------
1503. .L4_11:
1504. #if !defined(TRMMKERNEL) || \
1505. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1506. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1507. leaq BUFFER1, BO // first buffer to BO
1508. addq $4 * SIZE, BO
1509. #else
// TRMM path: skip the first KK columns of B and KK*16 rows of A.
1510. movq KK, %rax
1511. leaq BUFFER1, BO // first buffer to BO
1512. addq $4 * SIZE, BO
1513. movq %rax, BI // Index for BO
1514. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1515. leaq (BO, BI, SIZE), BO
1516. salq $4, %rax // rax = rax * 16 ; number of values
1517. leaq (AO, %rax, SIZE), AO
1518. #endif
1519. vzeroall
1520. #ifndef TRMMKERNEL
1521. movq K, %rax
1522. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1523. movq K, %rax
1524. subq KK, %rax
1525. movq %rax, KKK
1526. #else
1527. movq KK, %rax
1528. #ifdef LEFT
1529. addq $16, %rax // number of values in AO
1530. #else
1531. addq $4, %rax // number of values in BO
1532. #endif
1533. movq %rax, KKK
1534. #endif
1535. andq $-8, %rax // K = K - ( K % 8 )
1536. je .L4_16
// Pre-advance AO/BO and negate the indices for count-up addressing.
1537. movq %rax, BI // Index for BO
1538. leaq (,BI,4) , BI // BI = BI * 4 ; number of values
1539. salq $4, %rax // rax = rax * 16 ; number of values
1540. leaq (AO, %rax, SIZE), AO
1541. leaq (BO, BI, SIZE), BO
1542. negq BI
1543. negq %rax
1544. ALIGN_4
1545. .L4_12:
// Main loop unrolled by 16, prefetching A every step and B every 4 steps.
1546. prefetcht0 A_PR1(AO, %rax, SIZE)
1547. prefetcht0 B_PR1(BO, BI , SIZE)
1548. KERNEL16x4_SUB
1549. prefetcht0 A_PR1(AO, %rax, SIZE)
1550. KERNEL16x4_SUB
1551. prefetcht0 A_PR1(AO, %rax, SIZE)
1552. KERNEL16x4_SUB
1553. prefetcht0 A_PR1(AO, %rax, SIZE)
1554. KERNEL16x4_SUB
1555. prefetcht0 A_PR1(AO, %rax, SIZE)
1556. prefetcht0 B_PR1(BO, BI , SIZE)
1557. KERNEL16x4_SUB
1558. prefetcht0 A_PR1(AO, %rax, SIZE)
1559. KERNEL16x4_SUB
1560. prefetcht0 A_PR1(AO, %rax, SIZE)
1561. KERNEL16x4_SUB
1562. prefetcht0 A_PR1(AO, %rax, SIZE)
1563. KERNEL16x4_SUB
1564. je .L4_16
1565. prefetcht0 A_PR1(AO, %rax, SIZE)
1566. prefetcht0 B_PR1(BO, BI , SIZE)
1567. KERNEL16x4_SUB
1568. prefetcht0 A_PR1(AO, %rax, SIZE)
1569. KERNEL16x4_SUB
1570. prefetcht0 A_PR1(AO, %rax, SIZE)
1571. KERNEL16x4_SUB
1572. prefetcht0 A_PR1(AO, %rax, SIZE)
1573. KERNEL16x4_SUB
1574. prefetcht0 A_PR1(AO, %rax, SIZE)
1575. prefetcht0 B_PR1(BO, BI , SIZE)
1576. KERNEL16x4_SUB
1577. prefetcht0 A_PR1(AO, %rax, SIZE)
1578. KERNEL16x4_SUB
1579. prefetcht0 A_PR1(AO, %rax, SIZE)
1580. KERNEL16x4_SUB
1581. prefetcht0 A_PR1(AO, %rax, SIZE)
1582. KERNEL16x4_SUB
1583. je .L4_16
1584. jmp .L4_12
1585. ALIGN_4
1586. .L4_16:
// K % 8 leftover iterations (KKK for TRMM, K otherwise).
1587. #ifndef TRMMKERNEL
1588. movq K, %rax
1589. #else
1590. movq KKK, %rax
1591. #endif
1592. andq $7, %rax # if (k & 1)
1593. je .L4_19
1594. movq %rax, BI // Index for BO
1595. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1596. salq $4, %rax // rax = rax * 16 ; number of values
1597. leaq (AO, %rax, SIZE), AO
1598. leaq (BO, BI, SIZE), BO
1599. negq BI
1600. negq %rax
1601. ALIGN_4
1602. .L4_17:
1603. KERNEL16x4_SUB
1604. jl .L4_17
1605. ALIGN_4
1606. .L4_19:
1607. SAVE16x4
1608. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1609. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the K - KKK elements not consumed by this tile.
1610. movq K, %rax
1611. subq KKK, %rax
1612. movq %rax, BI // Index for BO
1613. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1614. leaq (BO, BI, SIZE), BO
1615. salq $4, %rax // rax = rax * 16 ; number of values
1616. leaq (AO, %rax, SIZE), AO
1617. #endif
1618. #if defined(TRMMKERNEL) && defined(LEFT)
1619. addq $16, KK
1620. #endif
1621. addq $16 * SIZE, CO1 # coffset += 16
1622. addq $16 * SIZE, CO2 # coffset += 16
1623. decq I # i --
1624. jg .L4_11
1625. ALIGN_4
1626. /**************************************************************************
1627. * Rest of M
1628. ***************************************************************************/
// ---------------------------------------------------------------------------
// 8x4 micro-tile for the M & 8 remainder of the 4-column pass. Same TRMM
// bookkeeping as the 16x4 loop above, but A advances 8 values per K step
// (salq $3 instead of $4).
// ---------------------------------------------------------------------------
1629. .L4_20:
1630. // Test rest of M
1631. testq $15, M
1632. jz .L4_60 // to next 3 lines of N
1633. testq $8, M
1634. jz .L4_21pre
1635. ALIGN_4
1636. /**************************************************************************/
1637. .L4_20_1:
1638. #if !defined(TRMMKERNEL) || \
1639. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1640. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1641. leaq BUFFER1, BO // first buffer to BO
1642. addq $4 * SIZE, BO
1643. #else
// TRMM path: skip the first KK columns of B and KK*8 rows of A.
1644. movq KK, %rax
1645. leaq BUFFER1, BO // first buffer to BO
1646. addq $4 * SIZE, BO
1647. movq %rax, BI // Index for BO
1648. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1649. leaq (BO, BI, SIZE), BO
1650. salq $3, %rax // rax = rax * 8 ; number of values
1651. leaq (AO, %rax, SIZE), AO
1652. #endif
1653. vzeroall
1654. #ifndef TRMMKERNEL
1655. movq K, %rax
1656. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1657. movq K, %rax
1658. subq KK, %rax
1659. movq %rax, KKK
1660. #else
1661. movq KK, %rax
1662. #ifdef LEFT
1663. addq $8, %rax // number of values in A
1664. #else
1665. addq $4, %rax // number of values in BO
1666. #endif
1667. movq %rax, KKK
1668. #endif
1669. andq $-8, %rax
1670. je .L4_20_6
// Pre-advance and negate indices for count-up addressing inside the kernel.
1671. movq %rax, BI // Index for BO
1672. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1673. salq $3, %rax // rax = rax * 8 ; number of values
1674. leaq (AO, %rax, SIZE), AO
1675. leaq (BO, BI, SIZE), BO
1676. negq BI
1677. negq %rax
1678. ALIGN_4
1679. .L4_20_2:
1680. KERNEL8x4_SUB
1681. KERNEL8x4_SUB
1682. KERNEL8x4_SUB
1683. KERNEL8x4_SUB
1684. KERNEL8x4_SUB
1685. KERNEL8x4_SUB
1686. KERNEL8x4_SUB
1687. KERNEL8x4_SUB
1688. je .L4_20_6
1689. KERNEL8x4_SUB
1690. KERNEL8x4_SUB
1691. KERNEL8x4_SUB
1692. KERNEL8x4_SUB
1693. KERNEL8x4_SUB
1694. KERNEL8x4_SUB
1695. KERNEL8x4_SUB
1696. KERNEL8x4_SUB
1697. je .L4_20_6
1698. jmp .L4_20_2
1699. ALIGN_4
1700. .L4_20_6:
// K % 8 leftover iterations for the 8x4 tile.
1701. #ifndef TRMMKERNEL
1702. movq K, %rax
1703. #else
1704. movq KKK, %rax
1705. #endif
1706. andq $7, %rax # if (k & 1)
1707. je .L4_20_9
1708. movq %rax, BI // Index for BO
1709. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1710. salq $3, %rax // rax = rax * 8 ; number of values
1711. leaq (AO, %rax, SIZE), AO
1712. leaq (BO, BI, SIZE), BO
1713. negq BI
1714. negq %rax
1715. ALIGN_4
1716. .L4_20_7:
1717. KERNEL8x4_SUB
1718. jl .L4_20_7
1719. ALIGN_4
1720. .L4_20_9:
1721. SAVE8x4
1722. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1723. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
1724. movq K, %rax
1725. subq KKK, %rax
1726. movq %rax, BI // Index for BO
1727. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1728. leaq (BO, BI, SIZE), BO
1729. salq $3, %rax // rax = rax * 8 ; number of values
1730. leaq (AO, %rax, SIZE), AO
1731. #endif
1732. #if defined(TRMMKERNEL) && defined(LEFT)
1733. addq $8, KK
1734. #endif
1735. addq $8 * SIZE, CO1 # coffset += 8
1736. addq $8 * SIZE, CO2 # coffset += 8
1737. ALIGN_4
1738. /**************************************************************************/
// ---------------------------------------------------------------------------
// 4x4 micro-tile for the M & 4 remainder of the 4-column pass. Same TRMM
// bookkeeping; A advances 4 values per K step (salq $2).
// ---------------------------------------------------------------------------
1739. .L4_21pre:
1740. testq $4, M
1741. jz .L4_30
1742. ALIGN_4
1743. .L4_21:
1744. #if !defined(TRMMKERNEL) || \
1745. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1746. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1747. leaq BUFFER1, BO // first buffer to BO
1748. addq $4 * SIZE, BO
1749. #else
// TRMM path: skip the first KK columns of B and KK*4 rows of A.
1750. movq KK, %rax
1751. leaq BUFFER1, BO // first buffer to BO
1752. addq $4 * SIZE, BO
1753. movq %rax, BI // Index for BO
1754. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1755. leaq (BO, BI, SIZE), BO
1756. salq $2, %rax // rax = rax * 4 ; number of values
1757. leaq (AO, %rax, SIZE), AO
1758. #endif
1759. vzeroall
1760. #ifndef TRMMKERNEL
1761. movq K, %rax
1762. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1763. movq K, %rax
1764. subq KK, %rax
1765. movq %rax, KKK
1766. #else
1767. movq KK, %rax
1768. #ifdef LEFT
1769. addq $4, %rax // number of values in A
1770. #else
1771. addq $4, %rax // number of values in BO
1772. #endif
1773. movq %rax, KKK
1774. #endif
1775. andq $-8, %rax
1776. je .L4_26
// Pre-advance and negate indices for count-up addressing inside the kernel.
1777. movq %rax, BI // Index for BO
1778. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1779. salq $2, %rax // rax = rax * 4 ; number of values
1780. leaq (AO, %rax, SIZE), AO
1781. leaq (BO, BI, SIZE), BO
1782. negq BI
1783. negq %rax
1784. ALIGN_4
1785. .L4_22:
1786. KERNEL4x4_SUB
1787. KERNEL4x4_SUB
1788. KERNEL4x4_SUB
1789. KERNEL4x4_SUB
1790. KERNEL4x4_SUB
1791. KERNEL4x4_SUB
1792. KERNEL4x4_SUB
1793. KERNEL4x4_SUB
1794. je .L4_26
1795. KERNEL4x4_SUB
1796. KERNEL4x4_SUB
1797. KERNEL4x4_SUB
1798. KERNEL4x4_SUB
1799. KERNEL4x4_SUB
1800. KERNEL4x4_SUB
1801. KERNEL4x4_SUB
1802. KERNEL4x4_SUB
1803. je .L4_26
1804. jmp .L4_22
1805. ALIGN_4
1806. .L4_26:
// K % 8 leftover iterations for the 4x4 tile.
1807. #ifndef TRMMKERNEL
1808. movq K, %rax
1809. #else
1810. movq KKK, %rax
1811. #endif
1812. andq $7, %rax # if (k & 1)
1813. je .L4_29
1814. movq %rax, BI // Index for BO
1815. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1816. salq $2, %rax // rax = rax * 4 ; number of values
1817. leaq (AO, %rax, SIZE), AO
1818. leaq (BO, BI, SIZE), BO
1819. negq BI
1820. negq %rax
1821. ALIGN_4
1822. .L4_27:
1823. KERNEL4x4_SUB
1824. jl .L4_27
1825. ALIGN_4
1826. .L4_29:
1827. SAVE4x4
1828. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1829. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
1830. movq K, %rax
1831. subq KKK, %rax
1832. movq %rax, BI // Index for BO
1833. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1834. leaq (BO, BI, SIZE), BO
1835. salq $2, %rax // rax = rax * 4 ; number of values
1836. leaq (AO, %rax, SIZE), AO
1837. #endif
1838. #if defined(TRMMKERNEL) && defined(LEFT)
1839. addq $4, KK
1840. #endif
1841. addq $4 * SIZE, CO1 # coffset += 4
1842. addq $4 * SIZE, CO2 # coffset += 4
1843. ALIGN_4
// ---------------------------------------------------------------------------
// 2x4 micro-tile for the M & 2 remainder of the 4-column pass. Same TRMM
// bookkeeping; A advances 2 values per K step (salq $1).
// ---------------------------------------------------------------------------
1844. .L4_30:
1845. testq $2, M
1846. jz .L4_40
1847. ALIGN_4
1848. .L4_31:
1849. #if !defined(TRMMKERNEL) || \
1850. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1851. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1852. leaq BUFFER1, BO // first buffer to BO
1853. addq $4 * SIZE, BO
1854. #else
// TRMM path: skip the first KK columns of B and KK*2 rows of A.
1855. movq KK, %rax
1856. leaq BUFFER1, BO // first buffer to BO
1857. addq $4 * SIZE, BO
1858. movq %rax, BI // Index for BO
1859. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1860. leaq (BO, BI, SIZE), BO
1861. salq $1, %rax // rax = rax * 2 ; number of values
1862. leaq (AO, %rax, SIZE), AO
1863. #endif
1864. vzeroall
1865. #ifndef TRMMKERNEL
1866. movq K, %rax
1867. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1868. movq K, %rax
1869. subq KK, %rax
1870. movq %rax, KKK
1871. #else
1872. movq KK, %rax
1873. #ifdef LEFT
1874. addq $2, %rax // number of values in AO
1875. #else
1876. addq $4, %rax // number of values in BO
1877. #endif
1878. movq %rax, KKK
1879. #endif
1880. andq $-8, %rax
1881. je .L4_36
// Pre-advance and negate indices for count-up addressing inside the kernel.
1882. movq %rax, BI // Index for BO
1883. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1884. salq $1, %rax // rax = rax *2 ; number of values
1885. leaq (AO, %rax, SIZE), AO
1886. leaq (BO, BI, SIZE), BO
1887. negq BI
1888. negq %rax
1889. ALIGN_4
1890. .L4_32:
1891. KERNEL2x4_SUB
1892. KERNEL2x4_SUB
1893. KERNEL2x4_SUB
1894. KERNEL2x4_SUB
1895. KERNEL2x4_SUB
1896. KERNEL2x4_SUB
1897. KERNEL2x4_SUB
1898. KERNEL2x4_SUB
1899. je .L4_36
1900. KERNEL2x4_SUB
1901. KERNEL2x4_SUB
1902. KERNEL2x4_SUB
1903. KERNEL2x4_SUB
1904. KERNEL2x4_SUB
1905. KERNEL2x4_SUB
1906. KERNEL2x4_SUB
1907. KERNEL2x4_SUB
1908. je .L4_36
1909. jmp .L4_32
1910. ALIGN_4
1911. .L4_36:
// K % 8 leftover iterations for the 2x4 tile.
1912. #ifndef TRMMKERNEL
1913. movq K, %rax
1914. #else
1915. movq KKK, %rax
1916. #endif
1917. andq $7, %rax # if (k & 1)
1918. je .L4_39
1919. movq %rax, BI // Index for BO
1920. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1921. salq $1, %rax // rax = rax *2 ; number of values
1922. leaq (AO, %rax, SIZE), AO
1923. leaq (BO, BI, SIZE), BO
1924. negq BI
1925. negq %rax
1926. ALIGN_4
1927. .L4_37:
1928. KERNEL2x4_SUB
1929. jl .L4_37
1930. ALIGN_4
1931. .L4_39:
1932. SAVE2x4
1933. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1934. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
1935. movq K, %rax
1936. subq KKK, %rax
1937. movq %rax, BI // Index for BO
1938. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1939. leaq (BO, BI, SIZE), BO
1940. salq $1, %rax // rax = rax * 2 ; number of values
1941. leaq (AO, %rax, SIZE), AO
1942. #endif
1943. #if defined(TRMMKERNEL) && defined(LEFT)
1944. addq $2, KK
1945. #endif
1946. addq $2 * SIZE, CO1 # coffset += 2
1947. addq $2 * SIZE, CO2 # coffset += 2
1948. ALIGN_4
// ---------------------------------------------------------------------------
// 1x4 micro-tile for the final M & 1 row of the 4-column pass. Same TRMM
// bookkeeping; A advances 1 value per K step (no shift on %rax).
// ---------------------------------------------------------------------------
1949. .L4_40:
1950. testq $1, M
1951. jz .L4_60 // to next 4 lines of N
1952. ALIGN_4
1953. .L4_41:
1954. #if !defined(TRMMKERNEL) || \
1955. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
1956. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1957. leaq BUFFER1, BO // first buffer to BO
1958. addq $4 * SIZE, BO
1959. #else
// TRMM path: skip the first KK columns of B and KK rows of A.
1960. movq KK, %rax
1961. leaq BUFFER1, BO // first buffer to BO
1962. addq $4 * SIZE, BO
1963. movq %rax, BI // Index for BO
1964. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
1965. leaq (BO, BI, SIZE), BO
1966. leaq (AO, %rax, SIZE), AO
1967. #endif
1968. vzeroall
1969. #ifndef TRMMKERNEL
1970. movq K, %rax
1971. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1972. movq K, %rax
1973. subq KK, %rax
1974. movq %rax, KKK
1975. #else
1976. movq KK, %rax
1977. #ifdef LEFT
1978. addq $1, %rax // number of values in AO
1979. #else
1980. addq $4, %rax // number of values in BO
1981. #endif
1982. movq %rax, KKK
1983. #endif
1984. andq $-8, %rax
1985. je .L4_46
// Pre-advance and negate indices for count-up addressing inside the kernel.
1986. movq %rax, BI // Index for BO
1987. leaq (,BI,4), BI // BI = BI * 4 ; number of values
1988. leaq (AO, %rax, SIZE), AO
1989. leaq (BO, BI, SIZE), BO
1990. negq BI
1991. negq %rax
1992. ALIGN_4
1993. .L4_42:
1994. KERNEL1x4_SUB
1995. KERNEL1x4_SUB
1996. KERNEL1x4_SUB
1997. KERNEL1x4_SUB
1998. KERNEL1x4_SUB
1999. KERNEL1x4_SUB
2000. KERNEL1x4_SUB
2001. KERNEL1x4_SUB
2002. je .L4_46
2003. KERNEL1x4_SUB
2004. KERNEL1x4_SUB
2005. KERNEL1x4_SUB
2006. KERNEL1x4_SUB
2007. KERNEL1x4_SUB
2008. KERNEL1x4_SUB
2009. KERNEL1x4_SUB
2010. KERNEL1x4_SUB
2011. je .L4_46
2012. jmp .L4_42
2013. ALIGN_4
2014. .L4_46:
// K % 8 leftover iterations for the 1x4 tile.
2015. #ifndef TRMMKERNEL
2016. movq K, %rax
2017. #else
2018. movq KKK, %rax
2019. #endif
2020. andq $7, %rax # if (k & 1)
2021. je .L4_49
2022. movq %rax, BI // Index for BO
2023. leaq (,BI,4), BI // BI = BI * 4 ; number of values
2024. leaq (AO, %rax, SIZE), AO
2025. leaq (BO, BI, SIZE), BO
2026. negq BI
2027. negq %rax
2028. ALIGN_4
2029. .L4_47:
2030. KERNEL1x4_SUB
2031. jl .L4_47
2032. ALIGN_4
2033. .L4_49:
2034. SAVE1x4
2035. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
2036. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
2037. movq K, %rax
2038. subq KKK, %rax
2039. movq %rax, BI // Index for BO
2040. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
2041. leaq (BO, BI, SIZE), BO
2042. leaq (AO, %rax, SIZE), AO
2043. #endif
2044. #if defined(TRMMKERNEL) && defined(LEFT)
2045. addq $1, KK
2046. #endif
2047. addq $1 * SIZE, CO1 # coffset += 1
2048. addq $1 * SIZE, CO2 # coffset += 1
2049. ALIGN_4
// ---------------------------------------------------------------------------
// End of the 4-column pass (J loop bottom), then dispatch on the remaining
// N % 4 columns. The .L2_01 section copies a 2-column panel of B into
// BUFFER1, 4 K-steps per iteration plus a K % 4 cleanup.
// ---------------------------------------------------------------------------
2050. .L4_60:
2051. #if defined(TRMMKERNEL) && !defined(LEFT)
// TRMM (right side): advance the diagonal offset by the 4 columns consumed.
2052. addq $4, KK
2053. #endif
2054. decq J // j --
2055. jg .L4_01 // next 4 lines of N
2056. /*******************************************************************************************/
2057. .L2_00:
// Nmod6 & 3: nothing left -> done; Nmod6 & 2 -> 2-column pass, else 1-column.
2058. movq Nmod6, J
2059. andq $3, J // j % 4
2060. je .L999
2061. movq Nmod6, J
2062. andq $2, J // j % 4
2063. je .L1_0
2064. .L2_01:
2065. // copy to sub buffer
2066. movq B, BO1
2067. leaq BUFFER1, BO // first buffer to BO
2068. movq K, %rax
2069. sarq $2, %rax // K / 4
2070. jz .L2_01b
2071. ALIGN_4
2072. .L2_01a:
// Copy four groups of 2 B values (4 K-steps) per iteration.
2073. vmovsd (BO1), %xmm0
2074. vmovsd 2*SIZE(BO1), %xmm1
2075. vmovsd 4*SIZE(BO1), %xmm2
2076. vmovsd 6*SIZE(BO1), %xmm3
2077. vmovsd %xmm0, (BO)
2078. vmovsd %xmm1, 2*SIZE(BO)
2079. vmovsd %xmm2, 4*SIZE(BO)
2080. vmovsd %xmm3, 6*SIZE(BO)
2081. addq $8*SIZE,BO1
2082. addq $8*SIZE,BO
2083. decq %rax
2084. jnz .L2_01a
2085. .L2_01b:
// Copy the remaining K % 4 pairs.
2086. movq K, %rax
2087. andq $3, %rax // K % 4
2088. jz .L2_02d
2089. ALIGN_4
2090. .L2_02c:
2091. vmovsd (BO1), %xmm0
2092. vmovsd %xmm0, (BO)
2093. addq $2*SIZE,BO1
2094. addq $2*SIZE,BO
2095. decq %rax
2096. jnz .L2_02c
2097. .L2_02d:
2098. movq BO1, B // next offset of B
// ---------------------------------------------------------------------------
// 2-column pass: C setup, then the 16x2 micro-tile loop over full 16-row
// strips, with the same TRMM bookkeeping pattern as the 4-column pass.
// B advances 2 values per K step (BI = rax*2 via leaq (BI,BI,1)).
// ---------------------------------------------------------------------------
2099. .L2_10:
2100. movq C, CO1
2101. leaq (C, LDC, 2), C // c += 2 * ldc
2102. #if defined(TRMMKERNEL) && defined(LEFT)
// TRMM: reset the KK diagonal offset for this panel.
2103. movq OFFSET, %rax
2104. movq %rax, KK
2105. #endif
2106. movq A, AO // aoffset = a
2107. addq $16 * SIZE, AO
2108. movq M, I
2109. sarq $4, I // i = (m >> 4)
2110. je .L2_20
2111. ALIGN_4
2112. .L2_11:
2113. #if !defined(TRMMKERNEL) || \
2114. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
2115. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2116. leaq BUFFER1, BO // first buffer to BO
2117. addq $4 * SIZE, BO
2118. #else
// TRMM path: skip the first KK*2 values of B and KK*16 values of A.
2119. movq KK, %rax
2120. leaq BUFFER1, BO // first buffer to BO
2121. addq $4 * SIZE, BO
2122. movq %rax, BI // Index for BO
2123. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2124. leaq (BO, BI, SIZE), BO
2125. salq $4, %rax // rax = rax * 16 ; number of values
2126. leaq (AO, %rax, SIZE), AO
2127. #endif
2128. vzeroall
2129. #ifndef TRMMKERNEL
2130. movq K, %rax
2131. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2132. movq K, %rax
2133. subq KK, %rax
2134. movq %rax, KKK
2135. #else
2136. movq KK, %rax
2137. #ifdef LEFT
2138. addq $16, %rax // number of values in AO
2139. #else
2140. addq $2, %rax // number of values in BO
2141. #endif
2142. movq %rax, KKK
2143. #endif
2144. andq $-8, %rax // K = K - ( K % 8 )
2145. je .L2_16
// Pre-advance and negate indices for count-up addressing inside the kernel.
2146. movq %rax, BI // Index for BO
2147. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2148. salq $4, %rax // rax = rax * 16 ; number of values
2149. leaq (AO, %rax, SIZE), AO
2150. leaq (BO, BI, SIZE), BO
2151. negq BI
2152. negq %rax
2153. ALIGN_4
2154. .L2_12:
2155. KERNEL16x2_SUB
2156. KERNEL16x2_SUB
2157. KERNEL16x2_SUB
2158. KERNEL16x2_SUB
2159. KERNEL16x2_SUB
2160. KERNEL16x2_SUB
2161. KERNEL16x2_SUB
2162. KERNEL16x2_SUB
2163. je .L2_16
2164. KERNEL16x2_SUB
2165. KERNEL16x2_SUB
2166. KERNEL16x2_SUB
2167. KERNEL16x2_SUB
2168. KERNEL16x2_SUB
2169. KERNEL16x2_SUB
2170. KERNEL16x2_SUB
2171. KERNEL16x2_SUB
2172. je .L2_16
2173. jmp .L2_12
2174. ALIGN_4
2175. .L2_16:
// K % 8 leftover iterations for the 16x2 tile.
2176. #ifndef TRMMKERNEL
2177. movq K, %rax
2178. #else
2179. movq KKK, %rax
2180. #endif
2181. andq $7, %rax # if (k & 1)
2182. je .L2_19
2183. movq %rax, BI // Index for BO
2184. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2185. salq $4, %rax // rax = rax * 16 ; number of values
2186. leaq (AO, %rax, SIZE), AO
2187. leaq (BO, BI, SIZE), BO
2188. negq BI
2189. negq %rax
2190. ALIGN_4
2191. .L2_17:
2192. KERNEL16x2_SUB
2193. jl .L2_17
2194. ALIGN_4
2195. .L2_19:
2196. SAVE16x2
2197. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
2198. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
2199. movq K, %rax
2200. subq KKK, %rax
2201. movq %rax, BI // Index for BO
2202. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2203. leaq (BO, BI, SIZE), BO
2204. salq $4, %rax // rax = rax * 16 ; number of values
2205. leaq (AO, %rax, SIZE), AO
2206. #endif
2207. #if defined(TRMMKERNEL) && defined(LEFT)
2208. addq $16, KK
2209. #endif
2210. addq $16 * SIZE, CO1 # coffset += 16
2211. decq I # i --
2212. jg .L2_11
2213. ALIGN_4
2214. /**************************************************************************
2215. * Rest of M
2216. ***************************************************************************/
// ---------------------------------------------------------------------------
// M-remainder for the 2-column pass: the 8x2 micro-tile (M & 8), followed
// by the test for the 4-row strip (M & 4). Same TRMM bookkeeping; A
// advances 8 values per K step, B advances 2.
// ---------------------------------------------------------------------------
2217. .L2_20:
2218. // Test rest of M
2219. testq $15, M
2220. jz .L2_60 // to next 2 lines of N
2221. testq $8, M
2222. jz .L2_21pre
2223. ALIGN_4
2224. /**************************************************************************/
2225. .L2_20_1:
2226. #if !defined(TRMMKERNEL) || \
2227. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
2228. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2229. leaq BUFFER1, BO // first buffer to BO
2230. addq $4 * SIZE, BO
2231. #else
// TRMM path: skip the first KK*2 values of B and KK*8 values of A.
2232. movq KK, %rax
2233. leaq BUFFER1, BO // first buffer to BO
2234. addq $4 * SIZE, BO
2235. movq %rax, BI // Index for BO
2236. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2237. leaq (BO, BI, SIZE), BO
2238. salq $3, %rax // rax = rax * 8 ; number of values
2239. leaq (AO, %rax, SIZE), AO
2240. #endif
2241. vzeroall
2242. #ifndef TRMMKERNEL
2243. movq K, %rax
2244. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2245. movq K, %rax
2246. subq KK, %rax
2247. movq %rax, KKK
2248. #else
2249. movq KK, %rax
2250. #ifdef LEFT
2251. addq $8, %rax // number of values in A
2252. #else
2253. addq $2, %rax // number of values in BO
2254. #endif
2255. movq %rax, KKK
2256. #endif
2257. andq $-8, %rax
2258. je .L2_20_6
// Pre-advance and negate indices for count-up addressing inside the kernel.
2259. movq %rax, BI // Index for BO
2260. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2261. salq $3, %rax // rax = rax * 8 ; number of values
2262. leaq (AO, %rax, SIZE), AO
2263. leaq (BO, BI, SIZE), BO
2264. negq BI
2265. negq %rax
2266. ALIGN_4
2267. .L2_20_2:
2268. KERNEL8x2_SUB
2269. KERNEL8x2_SUB
2270. KERNEL8x2_SUB
2271. KERNEL8x2_SUB
2272. KERNEL8x2_SUB
2273. KERNEL8x2_SUB
2274. KERNEL8x2_SUB
2275. KERNEL8x2_SUB
2276. je .L2_20_6
2277. KERNEL8x2_SUB
2278. KERNEL8x2_SUB
2279. KERNEL8x2_SUB
2280. KERNEL8x2_SUB
2281. KERNEL8x2_SUB
2282. KERNEL8x2_SUB
2283. KERNEL8x2_SUB
2284. KERNEL8x2_SUB
2285. je .L2_20_6
2286. jmp .L2_20_2
2287. ALIGN_4
2288. .L2_20_6:
// K % 8 leftover iterations for the 8x2 tile.
2289. #ifndef TRMMKERNEL
2290. movq K, %rax
2291. #else
2292. movq KKK, %rax
2293. #endif
2294. andq $7, %rax # if (k & 1)
2295. je .L2_20_9
2296. movq %rax, BI // Index for BO
2297. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2298. salq $3, %rax // rax = rax * 8 ; number of values
2299. leaq (AO, %rax, SIZE), AO
2300. leaq (BO, BI, SIZE), BO
2301. negq BI
2302. negq %rax
2303. ALIGN_4
2304. .L2_20_7:
2305. KERNEL8x2_SUB
2306. jl .L2_20_7
2307. ALIGN_4
2308. .L2_20_9:
2309. SAVE8x2
2310. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
2311. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
// TRMM: skip the unused K - KKK elements of A and B.
2312. movq K, %rax
2313. subq KKK, %rax
2314. movq %rax, BI // Index for BO
2315. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
2316. leaq (BO, BI, SIZE), BO
2317. salq $3, %rax // rax = rax * 8 ; number of values
2318. leaq (AO, %rax, SIZE), AO
2319. #endif
2320. #if defined(TRMMKERNEL) && defined(LEFT)
2321. addq $8, KK
2322. #endif
2323. addq $8 * SIZE, CO1 # coffset += 8
2324. ALIGN_4
2325. /**************************************************************************/
2326. .L2_21pre:
// M & 4: fall through to the 4x2 tile (continues past this view).
2327. testq $4, M
2328. jz .L2_30
2329. ALIGN_4
  2330. .L2_21:
  2331. #if !defined(TRMMKERNEL) || \
  2332. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2333. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2334. leaq BUFFER1, BO // first buffer to BO
  2335. addq $4 * SIZE, BO
  2336. #else
  2337. movq KK, %rax
  2338. leaq BUFFER1, BO // first buffer to BO
  2339. addq $4 * SIZE, BO
  2340. movq %rax, BI // Index for BO
  2341. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2342. leaq (BO, BI, SIZE), BO
  2343. salq $2, %rax // rax = rax * 4 ; number of values
  2344. leaq (AO, %rax, SIZE), AO
  2345. #endif
  2346. vzeroall
  2347. #ifndef TRMMKERNEL
  2348. movq K, %rax
  2349. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2350. movq K, %rax
  2351. subq KK, %rax
  2352. movq %rax, KKK
  2353. #else
  2354. movq KK, %rax
  2355. #ifdef LEFT
  2356. addq $4, %rax // number of values in A
  2357. #else
  2358. addq $2, %rax // number of values in BO
  2359. #endif
  2360. movq %rax, KKK
  2361. #endif
  2362. andq $-8, %rax
  2363. je .L2_26
  2364. movq %rax, BI // Index for BO
  2365. leaq (BI,BI,1), BI // BI = BI * 1 ; number of values
  2366. salq $2, %rax // rax = rax * 4 ; number of values
  2367. leaq (AO, %rax, SIZE), AO
  2368. leaq (BO, BI, SIZE), BO
  2369. negq BI
  2370. negq %rax
  2371. ALIGN_4
  2372. .L2_22:
  2373. KERNEL4x2_SUB
  2374. KERNEL4x2_SUB
  2375. KERNEL4x2_SUB
  2376. KERNEL4x2_SUB
  2377. KERNEL4x2_SUB
  2378. KERNEL4x2_SUB
  2379. KERNEL4x2_SUB
  2380. KERNEL4x2_SUB
  2381. je .L2_26
  2382. KERNEL4x2_SUB
  2383. KERNEL4x2_SUB
  2384. KERNEL4x2_SUB
  2385. KERNEL4x2_SUB
  2386. KERNEL4x2_SUB
  2387. KERNEL4x2_SUB
  2388. KERNEL4x2_SUB
  2389. KERNEL4x2_SUB
  2390. je .L2_26
  2391. jmp .L2_22
  2392. ALIGN_4
  2393. .L2_26:
  2394. #ifndef TRMMKERNEL
  2395. movq K, %rax
  2396. #else
  2397. movq KKK, %rax
  2398. #endif
  2399. andq $7, %rax # if (k & 1)
  2400. je .L2_29
  2401. movq %rax, BI // Index for BO
  2402. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2403. salq $2, %rax // rax = rax * 4 ; number of values
  2404. leaq (AO, %rax, SIZE), AO
  2405. leaq (BO, BI, SIZE), BO
  2406. negq BI
  2407. negq %rax
  2408. ALIGN_4
  2409. .L2_27:
  2410. KERNEL4x2_SUB
  2411. jl .L2_27
  2412. ALIGN_4
  2413. .L2_29:
  2414. SAVE4x2
  2415. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2416. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2417. movq K, %rax
  2418. subq KKK, %rax
  2419. movq %rax, BI // Index for BO
  2420. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2421. leaq (BO, BI, SIZE), BO
  2422. salq $2, %rax // rax = rax * 4 ; number of values
  2423. leaq (AO, %rax, SIZE), AO
  2424. #endif
  2425. #if defined(TRMMKERNEL) && defined(LEFT)
  2426. addq $4, KK
  2427. #endif
  2428. addq $4 * SIZE, CO1 # coffset += 4
  2429. ALIGN_4
  2430. .L2_30:
  2431. testq $2, M
  2432. jz .L2_40
  2433. ALIGN_4
  2434. .L2_31:
  2435. #if !defined(TRMMKERNEL) || \
  2436. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2437. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2438. leaq BUFFER1, BO // first buffer to BO
  2439. addq $4 * SIZE, BO
  2440. #else
  2441. movq KK, %rax
  2442. leaq BUFFER1, BO // first buffer to BO
  2443. addq $4 * SIZE, BO
  2444. movq %rax, BI // Index for BO
  2445. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2446. leaq (BO, BI, SIZE), BO
  2447. salq $1, %rax // rax = rax * 2 ; number of values
  2448. leaq (AO, %rax, SIZE), AO
  2449. #endif
  2450. vzeroall
  2451. #ifndef TRMMKERNEL
  2452. movq K, %rax
  2453. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2454. movq K, %rax
  2455. subq KK, %rax
  2456. movq %rax, KKK
  2457. #else
  2458. movq KK, %rax
  2459. #ifdef LEFT
  2460. addq $2, %rax // number of values in AO
  2461. #else
  2462. addq $2, %rax // number of values in BO
  2463. #endif
  2464. movq %rax, KKK
  2465. #endif
  2466. andq $-8, %rax
  2467. je .L2_36
  2468. movq %rax, BI // Index for BO
  2469. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2470. salq $1, %rax // rax = rax *2 ; number of values
  2471. leaq (AO, %rax, SIZE), AO
  2472. leaq (BO, BI, SIZE), BO
  2473. negq BI
  2474. negq %rax
  2475. ALIGN_4
  2476. .L2_32:
  2477. KERNEL2x2_SUB
  2478. KERNEL2x2_SUB
  2479. KERNEL2x2_SUB
  2480. KERNEL2x2_SUB
  2481. KERNEL2x2_SUB
  2482. KERNEL2x2_SUB
  2483. KERNEL2x2_SUB
  2484. KERNEL2x2_SUB
  2485. je .L2_36
  2486. KERNEL2x2_SUB
  2487. KERNEL2x2_SUB
  2488. KERNEL2x2_SUB
  2489. KERNEL2x2_SUB
  2490. KERNEL2x2_SUB
  2491. KERNEL2x2_SUB
  2492. KERNEL2x2_SUB
  2493. KERNEL2x2_SUB
  2494. je .L2_36
  2495. jmp .L2_32
  2496. ALIGN_4
  2497. .L2_36:
  2498. #ifndef TRMMKERNEL
  2499. movq K, %rax
  2500. #else
  2501. movq KKK, %rax
  2502. #endif
  2503. andq $7, %rax # if (k & 1)
  2504. je .L2_39
  2505. movq %rax, BI // Index for BO
  2506. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2507. salq $1, %rax // rax = rax *2 ; number of values
  2508. leaq (AO, %rax, SIZE), AO
  2509. leaq (BO, BI, SIZE), BO
  2510. negq BI
  2511. negq %rax
  2512. ALIGN_4
  2513. .L2_37:
  2514. KERNEL2x2_SUB
  2515. jl .L2_37
  2516. ALIGN_4
  2517. .L2_39:
  2518. SAVE2x2
  2519. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2520. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2521. movq K, %rax
  2522. subq KKK, %rax
  2523. movq %rax, BI // Index for BO
  2524. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2525. leaq (BO, BI, SIZE), BO
  2526. salq $1, %rax // rax = rax * 2 ; number of values
  2527. leaq (AO, %rax, SIZE), AO
  2528. #endif
  2529. #if defined(TRMMKERNEL) && defined(LEFT)
  2530. addq $2, KK
  2531. #endif
  2532. addq $2 * SIZE, CO1 # coffset += 2
  2533. ALIGN_4
  2534. .L2_40:
  2535. testq $1, M
  2536. jz .L2_60 // to next 2 lines of N
  2537. ALIGN_4
  2538. .L2_41:
  2539. #if !defined(TRMMKERNEL) || \
  2540. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2541. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2542. leaq BUFFER1, BO // first buffer to BO
  2543. addq $4 * SIZE, BO
  2544. #else
  2545. movq KK, %rax
  2546. leaq BUFFER1, BO // first buffer to BO
  2547. addq $4 * SIZE, BO
  2548. movq %rax, BI // Index for BO
  2549. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2550. leaq (BO, BI, SIZE), BO
  2551. leaq (AO, %rax, SIZE), AO
  2552. #endif
  2553. vzeroall
  2554. #ifndef TRMMKERNEL
  2555. movq K, %rax
  2556. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2557. movq K, %rax
  2558. subq KK, %rax
  2559. movq %rax, KKK
  2560. #else
  2561. movq KK, %rax
  2562. #ifdef LEFT
  2563. addq $1, %rax // number of values in AO
  2564. #else
  2565. addq $2, %rax // number of values in BO
  2566. #endif
  2567. movq %rax, KKK
  2568. #endif
  2569. andq $-8, %rax
  2570. je .L2_46
  2571. movq %rax, BI // Index for BO
  2572. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2573. leaq (AO, %rax, SIZE), AO
  2574. leaq (BO, BI, SIZE), BO
  2575. negq BI
  2576. negq %rax
  2577. ALIGN_4
  2578. .L2_42:
  2579. KERNEL1x2_SUB
  2580. KERNEL1x2_SUB
  2581. KERNEL1x2_SUB
  2582. KERNEL1x2_SUB
  2583. KERNEL1x2_SUB
  2584. KERNEL1x2_SUB
  2585. KERNEL1x2_SUB
  2586. KERNEL1x2_SUB
  2587. je .L2_46
  2588. KERNEL1x2_SUB
  2589. KERNEL1x2_SUB
  2590. KERNEL1x2_SUB
  2591. KERNEL1x2_SUB
  2592. KERNEL1x2_SUB
  2593. KERNEL1x2_SUB
  2594. KERNEL1x2_SUB
  2595. KERNEL1x2_SUB
  2596. je .L2_46
  2597. jmp .L2_42
  2598. ALIGN_4
  2599. .L2_46:
  2600. #ifndef TRMMKERNEL
  2601. movq K, %rax
  2602. #else
  2603. movq KKK, %rax
  2604. #endif
  2605. andq $7, %rax # if (k & 1)
  2606. je .L2_49
  2607. movq %rax, BI // Index for BO
  2608. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2609. leaq (AO, %rax, SIZE), AO
  2610. leaq (BO, BI, SIZE), BO
  2611. negq BI
  2612. negq %rax
  2613. ALIGN_4
  2614. .L2_47:
  2615. KERNEL1x2_SUB
  2616. jl .L2_47
  2617. ALIGN_4
  2618. .L2_49:
  2619. SAVE1x2
  2620. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2621. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2622. movq K, %rax
  2623. subq KKK, %rax
  2624. movq %rax, BI // Index for BO
  2625. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2626. leaq (BO, BI, SIZE), BO
  2627. leaq (AO, %rax, SIZE), AO
  2628. #endif
  2629. #if defined(TRMMKERNEL) && defined(LEFT)
  2630. addq $1, KK
  2631. #endif
  2632. addq $1 * SIZE, CO1 # coffset += 1
  2633. ALIGN_4
  2634. .L2_60:
  2635. #if defined(TRMMKERNEL) && !defined(LEFT)
  2636. addq $2, KK
  2637. #endif
  2638. .L1_0:
  2639. /************************************************************************************************
  2640. * Loop for Nmod6 % 2 > 0
  2641. *************************************************************************************************/
  2642. movq Nmod6, J
  2643. andq $1, J // j % 2
  2644. je .L999
  2645. ALIGN_4
  2646. .L1_01:
  2647. // copy to sub buffer
  2648. movq B, BO1
  2649. leaq BUFFER1, BO // first buffer to BO
  2650. movq K, %rax
  2651. ALIGN_4
  2652. .L1_02b:
  2653. vmovss (BO1), %xmm0
  2654. vmovss %xmm0, (BO)
  2655. addq $1*SIZE,BO1
  2656. addq $1*SIZE,BO
  2657. decq %rax
  2658. jnz .L1_02b
  2659. .L1_02c:
  2660. movq BO1, B // next offset of B
  2661. .L1_10:
  2662. movq C, CO1
  2663. leaq (C, LDC, 1), C // c += 1 * ldc
  2664. #if defined(TRMMKERNEL) && defined(LEFT)
  2665. movq OFFSET, %rax
  2666. movq %rax, KK
  2667. #endif
  2668. movq A, AO // aoffset = a
  2669. addq $16 * SIZE, AO
  2670. movq M, I
  2671. sarq $4, I // i = (m >> 4)
  2672. je .L1_20
  2673. ALIGN_4
  2674. .L1_11:
  2675. #if !defined(TRMMKERNEL) || \
  2676. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2677. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2678. leaq BUFFER1, BO // first buffer to BO
  2679. addq $4 * SIZE, BO
  2680. #else
  2681. movq KK, %rax
  2682. leaq BUFFER1, BO // first buffer to BO
  2683. addq $4 * SIZE, BO
  2684. movq %rax, BI // Index for BO
  2685. leaq (BO, BI, SIZE), BO
  2686. salq $4, %rax // rax = rax * 16 ; number of values
  2687. leaq (AO, %rax, SIZE), AO
  2688. #endif
  2689. vzeroall
  2690. #ifndef TRMMKERNEL
  2691. movq K, %rax
  2692. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2693. movq K, %rax
  2694. subq KK, %rax
  2695. movq %rax, KKK
  2696. #else
  2697. movq KK, %rax
  2698. #ifdef LEFT
  2699. addq $16, %rax // number of values in AO
  2700. #else
  2701. addq $1, %rax // number of values in BO
  2702. #endif
  2703. movq %rax, KKK
  2704. #endif
  2705. andq $-8, %rax // K = K - ( K % 8 )
  2706. je .L1_16
  2707. movq %rax, BI // Index for BO
  2708. salq $4, %rax // rax = rax * 16 ; number of values
  2709. leaq (AO, %rax, SIZE), AO
  2710. leaq (BO, BI, SIZE), BO
  2711. negq BI
  2712. negq %rax
  2713. ALIGN_4
  2714. .L1_12:
  2715. KERNEL16x1_SUB
  2716. KERNEL16x1_SUB
  2717. KERNEL16x1_SUB
  2718. KERNEL16x1_SUB
  2719. KERNEL16x1_SUB
  2720. KERNEL16x1_SUB
  2721. KERNEL16x1_SUB
  2722. KERNEL16x1_SUB
  2723. je .L1_16
  2724. KERNEL16x1_SUB
  2725. KERNEL16x1_SUB
  2726. KERNEL16x1_SUB
  2727. KERNEL16x1_SUB
  2728. KERNEL16x1_SUB
  2729. KERNEL16x1_SUB
  2730. KERNEL16x1_SUB
  2731. KERNEL16x1_SUB
  2732. je .L1_16
  2733. jmp .L1_12
  2734. ALIGN_4
  2735. .L1_16:
  2736. #ifndef TRMMKERNEL
  2737. movq K, %rax
  2738. #else
  2739. movq KKK, %rax
  2740. #endif
  2741. andq $7, %rax # if (k & 1)
  2742. je .L1_19
  2743. movq %rax, BI // Index for BO
  2744. salq $4, %rax // rax = rax * 16 ; number of values
  2745. leaq (AO, %rax, SIZE), AO
  2746. leaq (BO, BI, SIZE), BO
  2747. negq BI
  2748. negq %rax
  2749. ALIGN_4
  2750. .L1_17:
  2751. KERNEL16x1_SUB
  2752. jl .L1_17
  2753. ALIGN_4
  2754. .L1_19:
  2755. SAVE16x1
  2756. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2757. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2758. movq K, %rax
  2759. subq KKK, %rax
  2760. movq %rax, BI // Index for BO
  2761. leaq (BO, BI, SIZE), BO
  2762. salq $4, %rax // rax = rax * 16 ; number of values
  2763. leaq (AO, %rax, SIZE), AO
  2764. #endif
  2765. #if defined(TRMMKERNEL) && defined(LEFT)
  2766. addq $16, KK
  2767. #endif
  2768. addq $16 * SIZE, CO1 # coffset += 16
  2769. decq I # i --
  2770. jg .L1_11
  2771. ALIGN_4
  2772. /**************************************************************************
  2773. * Rest of M
  2774. ***************************************************************************/
  2775. .L1_20:
  2776. // Test rest of M
  2777. testq $15, M
  2778. jz .L999
  2779. testq $8, M
  2780. jz .L1_21pre
  2781. ALIGN_4
  2782. /**************************************************************************/
  2783. .L1_20_1:
  2784. #if !defined(TRMMKERNEL) || \
  2785. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2786. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2787. leaq BUFFER1, BO // first buffer to BO
  2788. addq $4 * SIZE, BO
  2789. #else
  2790. movq KK, %rax
  2791. leaq BUFFER1, BO // first buffer to BO
  2792. addq $4 * SIZE, BO
  2793. movq %rax, BI // Index for BO
  2794. leaq (BO, BI, SIZE), BO
  2795. salq $3, %rax // rax = rax * 8 ; number of values
  2796. leaq (AO, %rax, SIZE), AO
  2797. #endif
  2798. vzeroall
  2799. #ifndef TRMMKERNEL
  2800. movq K, %rax
  2801. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2802. movq K, %rax
  2803. subq KK, %rax
  2804. movq %rax, KKK
  2805. #else
  2806. movq KK, %rax
  2807. #ifdef LEFT
  2808. addq $8, %rax // number of values in A
  2809. #else
  2810. addq $1, %rax // number of values in BO
  2811. #endif
  2812. movq %rax, KKK
  2813. #endif
  2814. andq $-8, %rax
  2815. je .L1_20_6
  2816. movq %rax, BI // Index for BO
  2817. salq $3, %rax // rax = rax * 8 ; number of values
  2818. leaq (AO, %rax, SIZE), AO
  2819. leaq (BO, BI, SIZE), BO
  2820. negq BI
  2821. negq %rax
  2822. ALIGN_4
  2823. .L1_20_2:
  2824. KERNEL8x1_SUB
  2825. KERNEL8x1_SUB
  2826. KERNEL8x1_SUB
  2827. KERNEL8x1_SUB
  2828. KERNEL8x1_SUB
  2829. KERNEL8x1_SUB
  2830. KERNEL8x1_SUB
  2831. KERNEL8x1_SUB
  2832. je .L1_20_6
  2833. KERNEL8x1_SUB
  2834. KERNEL8x1_SUB
  2835. KERNEL8x1_SUB
  2836. KERNEL8x1_SUB
  2837. KERNEL8x1_SUB
  2838. KERNEL8x1_SUB
  2839. KERNEL8x1_SUB
  2840. KERNEL8x1_SUB
  2841. je .L1_20_6
  2842. jmp .L1_20_2
  2843. ALIGN_4
  2844. .L1_20_6:
  2845. #ifndef TRMMKERNEL
  2846. movq K, %rax
  2847. #else
  2848. movq KKK, %rax
  2849. #endif
  2850. andq $7, %rax # if (k & 1)
  2851. je .L1_20_9
  2852. movq %rax, BI // Index for BO
  2853. salq $3, %rax // rax = rax * 8 ; number of values
  2854. leaq (AO, %rax, SIZE), AO
  2855. leaq (BO, BI, SIZE), BO
  2856. negq BI
  2857. negq %rax
  2858. ALIGN_4
  2859. .L1_20_7:
  2860. KERNEL8x1_SUB
  2861. jl .L1_20_7
  2862. ALIGN_4
  2863. .L1_20_9:
  2864. SAVE8x1
  2865. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2866. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2867. movq K, %rax
  2868. subq KKK, %rax
  2869. movq %rax, BI // Index for BO
  2870. leaq (BO, BI, SIZE), BO
  2871. salq $3, %rax // rax = rax * 8 ; number of values
  2872. leaq (AO, %rax, SIZE), AO
  2873. #endif
  2874. #if defined(TRMMKERNEL) && defined(LEFT)
  2875. addq $8, KK
  2876. #endif
  2877. addq $8 * SIZE, CO1 # coffset += 8
  2878. ALIGN_4
  2879. /**************************************************************************/
  2880. .L1_21pre:
  2881. testq $4, M
  2882. jz .L1_30
  2883. ALIGN_4
  2884. .L1_21:
  2885. #if !defined(TRMMKERNEL) || \
  2886. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2887. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2888. leaq BUFFER1, BO // first buffer to BO
  2889. addq $4 * SIZE, BO
  2890. #else
  2891. movq KK, %rax
  2892. leaq BUFFER1, BO // first buffer to BO
  2893. addq $4 * SIZE, BO
  2894. movq %rax, BI // Index for BO
  2895. leaq (BO, BI, SIZE), BO
  2896. salq $2, %rax // rax = rax * 4 ; number of values
  2897. leaq (AO, %rax, SIZE), AO
  2898. #endif
  2899. vzeroall
  2900. #ifndef TRMMKERNEL
  2901. movq K, %rax
  2902. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2903. movq K, %rax
  2904. subq KK, %rax
  2905. movq %rax, KKK
  2906. #else
  2907. movq KK, %rax
  2908. #ifdef LEFT
  2909. addq $4, %rax // number of values in A
  2910. #else
  2911. addq $1, %rax // number of values in BO
  2912. #endif
  2913. movq %rax, KKK
  2914. #endif
  2915. andq $-8, %rax
  2916. je .L1_26
  2917. movq %rax, BI // Index for BO
  2918. salq $2, %rax // rax = rax * 4 ; number of values
  2919. leaq (AO, %rax, SIZE), AO
  2920. leaq (BO, BI, SIZE), BO
  2921. negq BI
  2922. negq %rax
  2923. ALIGN_4
  2924. .L1_22:
  2925. KERNEL4x1_SUB
  2926. KERNEL4x1_SUB
  2927. KERNEL4x1_SUB
  2928. KERNEL4x1_SUB
  2929. KERNEL4x1_SUB
  2930. KERNEL4x1_SUB
  2931. KERNEL4x1_SUB
  2932. KERNEL4x1_SUB
  2933. je .L1_26
  2934. KERNEL4x1_SUB
  2935. KERNEL4x1_SUB
  2936. KERNEL4x1_SUB
  2937. KERNEL4x1_SUB
  2938. KERNEL4x1_SUB
  2939. KERNEL4x1_SUB
  2940. KERNEL4x1_SUB
  2941. KERNEL4x1_SUB
  2942. je .L1_26
  2943. jmp .L1_22
  2944. ALIGN_4
  2945. .L1_26:
  2946. #ifndef TRMMKERNEL
  2947. movq K, %rax
  2948. #else
  2949. movq KKK, %rax
  2950. #endif
  2951. andq $7, %rax # if (k & 1)
  2952. je .L1_29
  2953. movq %rax, BI // Index for BO
  2954. salq $2, %rax // rax = rax * 4 ; number of values
  2955. leaq (AO, %rax, SIZE), AO
  2956. leaq (BO, BI, SIZE), BO
  2957. negq BI
  2958. negq %rax
  2959. ALIGN_4
  2960. .L1_27:
  2961. KERNEL4x1_SUB
  2962. jl .L1_27
  2963. ALIGN_4
  2964. .L1_29:
  2965. SAVE4x1
  2966. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2967. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2968. movq K, %rax
  2969. subq KKK, %rax
  2970. movq %rax, BI // Index for BO
  2971. leaq (BO, BI, SIZE), BO
  2972. salq $2, %rax // rax = rax * 4 ; number of values
  2973. leaq (AO, %rax, SIZE), AO
  2974. #endif
  2975. #if defined(TRMMKERNEL) && defined(LEFT)
  2976. addq $4, KK
  2977. #endif
  2978. addq $4 * SIZE, CO1 # coffset += 4
  2979. ALIGN_4
  2980. .L1_30:
  2981. testq $2, M
  2982. jz .L1_40
  2983. ALIGN_4
  2984. .L1_31:
  2985. #if !defined(TRMMKERNEL) || \
  2986. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2987. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2988. leaq BUFFER1, BO // first buffer to BO
  2989. addq $4 * SIZE, BO
  2990. #else
  2991. movq KK, %rax
  2992. leaq BUFFER1, BO // first buffer to BO
  2993. addq $4 * SIZE, BO
  2994. movq %rax, BI // Index for BO
  2995. leaq (BO, BI, SIZE), BO
  2996. salq $1, %rax // rax = rax * 2 ; number of values
  2997. leaq (AO, %rax, SIZE), AO
  2998. #endif
  2999. vzeroall
  3000. #ifndef TRMMKERNEL
  3001. movq K, %rax
  3002. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3003. movq K, %rax
  3004. subq KK, %rax
  3005. movq %rax, KKK
  3006. #else
  3007. movq KK, %rax
  3008. #ifdef LEFT
  3009. addq $2, %rax // number of values in AO
  3010. #else
  3011. addq $1, %rax // number of values in BO
  3012. #endif
  3013. movq %rax, KKK
  3014. #endif
  3015. andq $-8, %rax
  3016. je .L1_36
  3017. movq %rax, BI // Index for BO
  3018. salq $1, %rax // rax = rax *2 ; number of values
  3019. leaq (AO, %rax, SIZE), AO
  3020. leaq (BO, BI, SIZE), BO
  3021. negq BI
  3022. negq %rax
  3023. ALIGN_4
  3024. .L1_32:
  3025. KERNEL2x1_SUB
  3026. KERNEL2x1_SUB
  3027. KERNEL2x1_SUB
  3028. KERNEL2x1_SUB
  3029. KERNEL2x1_SUB
  3030. KERNEL2x1_SUB
  3031. KERNEL2x1_SUB
  3032. KERNEL2x1_SUB
  3033. je .L1_36
  3034. KERNEL2x1_SUB
  3035. KERNEL2x1_SUB
  3036. KERNEL2x1_SUB
  3037. KERNEL2x1_SUB
  3038. KERNEL2x1_SUB
  3039. KERNEL2x1_SUB
  3040. KERNEL2x1_SUB
  3041. KERNEL2x1_SUB
  3042. je .L1_36
  3043. jmp .L1_32
  3044. ALIGN_4
  3045. .L1_36:
  3046. #ifndef TRMMKERNEL
  3047. movq K, %rax
  3048. #else
  3049. movq KKK, %rax
  3050. #endif
  3051. andq $7, %rax # if (k & 1)
  3052. je .L1_39
  3053. movq %rax, BI // Index for BO
  3054. salq $1, %rax // rax = rax *2 ; number of values
  3055. leaq (AO, %rax, SIZE), AO
  3056. leaq (BO, BI, SIZE), BO
  3057. negq BI
  3058. negq %rax
  3059. ALIGN_4
  3060. .L1_37:
  3061. KERNEL2x1_SUB
  3062. jl .L1_37
  3063. ALIGN_4
  3064. .L1_39:
  3065. SAVE2x1
  3066. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3067. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3068. movq K, %rax
  3069. subq KKK, %rax
  3070. movq %rax, BI // Index for BO
  3071. leaq (BO, BI, SIZE), BO
  3072. salq $1, %rax // rax = rax * 2 ; number of values
  3073. leaq (AO, %rax, SIZE), AO
  3074. #endif
  3075. #if defined(TRMMKERNEL) && defined(LEFT)
  3076. addq $2, KK
  3077. #endif
  3078. addq $2 * SIZE, CO1 # coffset += 2
  3079. ALIGN_4
  3080. .L1_40:
  3081. testq $1, M
  3082. jz .L999
  3083. ALIGN_4
  3084. .L1_41:
  3085. #if !defined(TRMMKERNEL) || \
  3086. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3087. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3088. leaq BUFFER1, BO // first buffer to BO
  3089. addq $4 * SIZE, BO
  3090. #else
  3091. movq KK, %rax
  3092. leaq BUFFER1, BO // first buffer to BO
  3093. addq $4 * SIZE, BO
  3094. movq %rax, BI // Index for BO
  3095. leaq (BO, BI, SIZE), BO
  3096. leaq (AO, %rax, SIZE), AO
  3097. #endif
  3098. vzeroall
  3099. #ifndef TRMMKERNEL
  3100. movq K, %rax
  3101. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3102. movq K, %rax
  3103. subq KK, %rax
  3104. movq %rax, KKK
  3105. #else
  3106. movq KK, %rax
  3107. #ifdef LEFT
  3108. addq $1, %rax // number of values in AO
  3109. #else
  3110. addq $1, %rax // number of values in BO
  3111. #endif
  3112. movq %rax, KKK
  3113. #endif
  3114. andq $-8, %rax
  3115. je .L1_46
  3116. movq %rax, BI // Index for BO
  3117. leaq (AO, %rax, SIZE), AO
  3118. leaq (BO, BI, SIZE), BO
  3119. negq BI
  3120. negq %rax
  3121. ALIGN_4
  3122. .L1_42:
  3123. KERNEL1x1_SUB
  3124. KERNEL1x1_SUB
  3125. KERNEL1x1_SUB
  3126. KERNEL1x1_SUB
  3127. KERNEL1x1_SUB
  3128. KERNEL1x1_SUB
  3129. KERNEL1x1_SUB
  3130. KERNEL1x1_SUB
  3131. je .L1_46
  3132. KERNEL1x1_SUB
  3133. KERNEL1x1_SUB
  3134. KERNEL1x1_SUB
  3135. KERNEL1x1_SUB
  3136. KERNEL1x1_SUB
  3137. KERNEL1x1_SUB
  3138. KERNEL1x1_SUB
  3139. KERNEL1x1_SUB
  3140. je .L1_46
  3141. jmp .L1_42
  3142. ALIGN_4
  3143. .L1_46:
  3144. #ifndef TRMMKERNEL
  3145. movq K, %rax
  3146. #else
  3147. movq KKK, %rax
  3148. #endif
  3149. andq $7, %rax # if (k & 1)
  3150. je .L1_49
  3151. movq %rax, BI // Index for BO
  3152. leaq (AO, %rax, SIZE), AO
  3153. leaq (BO, BI, SIZE), BO
  3154. negq BI
  3155. negq %rax
  3156. ALIGN_4
  3157. .L1_47:
  3158. KERNEL1x1_SUB
  3159. jl .L1_47
  3160. ALIGN_4
  3161. .L1_49:
  3162. SAVE1x1
  3163. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3164. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3165. movq K, %rax
  3166. subq KKK, %rax
  3167. movq %rax, BI // Index for BO
  3168. leaq (BO, BI, SIZE), BO
  3169. leaq (AO, %rax, SIZE), AO
  3170. #endif
  3171. #if defined(TRMMKERNEL) && defined(LEFT)
  3172. addq $1, KK
  3173. #endif
  3174. addq $1 * SIZE, CO1 # coffset += 1
  3175. ALIGN_4
  3176. .L999:
  3177. movq SP, %rsp
  3178. movq (%rsp), %rbx
  3179. movq 8(%rsp), %rbp
  3180. movq 16(%rsp), %r12
  3181. movq 24(%rsp), %r13
  3182. movq 32(%rsp), %r14
  3183. movq 40(%rsp), %r15
  3184. #ifdef WINDOWS_ABI
  3185. movq 48(%rsp), %rdi
  3186. movq 56(%rsp), %rsi
  3187. movups 64(%rsp), %xmm6
  3188. movups 80(%rsp), %xmm7
  3189. movups 96(%rsp), %xmm8
  3190. movups 112(%rsp), %xmm9
  3191. movups 128(%rsp), %xmm10
  3192. movups 144(%rsp), %xmm11
  3193. movups 160(%rsp), %xmm12
  3194. movups 176(%rsp), %xmm13
  3195. movups 192(%rsp), %xmm14
  3196. movups 208(%rsp), %xmm15
  3197. #endif
  3198. addq $STACKSIZE, %rsp
  3199. ret
  3200. EPILOGUE
  3201. #else
  3202. /*************************************************************************************
  3203. * TRMM Kernel
  3204. *************************************************************************************/
  3205. PROLOGUE
  3206. PROFCODE
  3207. subq $STACKSIZE, %rsp
  3208. movq %rbx, (%rsp)
  3209. movq %rbp, 8(%rsp)
  3210. movq %r12, 16(%rsp)
  3211. movq %r13, 24(%rsp)
  3212. movq %r14, 32(%rsp)
  3213. movq %r15, 40(%rsp)
  3214. vzeroupper
  3215. #ifdef WINDOWS_ABI
  3216. movq %rdi, 48(%rsp)
  3217. movq %rsi, 56(%rsp)
  3218. movups %xmm6, 64(%rsp)
  3219. movups %xmm7, 80(%rsp)
  3220. movups %xmm8, 96(%rsp)
  3221. movups %xmm9, 112(%rsp)
  3222. movups %xmm10, 128(%rsp)
  3223. movups %xmm11, 144(%rsp)
  3224. movups %xmm12, 160(%rsp)
  3225. movups %xmm13, 176(%rsp)
  3226. movups %xmm14, 192(%rsp)
  3227. movups %xmm15, 208(%rsp)
  3228. movq ARG1, OLD_M
  3229. movq ARG2, OLD_N
  3230. movq ARG3, OLD_K
  3231. movq OLD_A, A
  3232. movq OLD_B, B
  3233. movq OLD_C, C
  3234. movq OLD_LDC, LDC
  3235. #ifdef TRMMKERNEL
  3236. vmovsd OLD_OFFSET, %xmm12
  3237. #endif
  3238. vmovaps %xmm3, %xmm0
  3239. #else
  3240. movq STACKSIZE + 8(%rsp), LDC
  3241. #ifdef TRMMKERNEL
  3242. movsd STACKSIZE + 16(%rsp), %xmm12
  3243. #endif
  3244. #endif
  3245. movq %rsp, SP # save old stack
  3246. subq $128 + L_BUFFER_SIZE, %rsp
  3247. andq $-4096, %rsp # align stack
  3248. STACK_TOUCH
  3249. cmpq $0, OLD_M
  3250. je .L999
  3251. cmpq $0, OLD_N
  3252. je .L999
  3253. cmpq $0, OLD_K
  3254. je .L999
  3255. movq OLD_M, M
  3256. movq OLD_N, N
  3257. movq OLD_K, K
  3258. vmovss %xmm0, ALPHA
  3259. salq $BASE_SHIFT, LDC
  3260. movq N, %rax
  3261. xorq %rdx, %rdx
  3262. movq $4, %rdi
  3263. divq %rdi // N / 4
  3264. movq %rax, Ndiv6 // N / 4
  3265. movq %rdx, Nmod6 // N % 4
  3266. #ifdef TRMMKERNEL
  3267. vmovsd %xmm12, OFFSET
  3268. vmovsd %xmm12, KK
  3269. #ifndef LEFT
  3270. negq KK
  3271. #endif
  3272. #endif
  3273. movq Ndiv6, J
  3274. cmpq $0, J
  3275. je .L2_0
  3276. ALIGN_4
  3277. /*******************************************************************************************/
  3278. .L4_01:
  3279. // copy to sub buffer
  3280. movq B, BO1
  3281. leaq BUFFER1, BO // first buffer to BO
  3282. movq K, %rax
  3283. sarq $2, %rax // K / 4
  3284. jz .L4_01b
  3285. ALIGN_4
  3286. .L4_01a:
  3287. prefetcht0 512(BO1)
  3288. prefetchw 512(BO)
  3289. vmovups (BO1), %xmm0
  3290. vmovups 4*SIZE(BO1), %xmm1
  3291. vmovups 8*SIZE(BO1), %xmm2
  3292. vmovups 12*SIZE(BO1), %xmm3
  3293. vmovups %xmm0, (BO)
  3294. vmovups %xmm1, 4*SIZE(BO)
  3295. vmovups %xmm2, 8*SIZE(BO)
  3296. vmovups %xmm3,12*SIZE(BO)
  3297. addq $ 16*SIZE,BO1
  3298. addq $ 16*SIZE,BO
  3299. decq %rax
  3300. jnz .L4_01a
  3301. .L4_01b:
  3302. movq K, %rax
  3303. andq $3, %rax // K % 4
  3304. jz .L4_02d
  3305. ALIGN_4
  3306. .L4_02c:
  3307. vmovups (BO1), %xmm0
  3308. vmovups %xmm0, (BO)
  3309. addq $ 4*SIZE,BO1
  3310. addq $ 4*SIZE,BO
  3311. decq %rax
  3312. jnz .L4_02c
  3313. .L4_02d:
  3314. movq BO1, B // next offset of B
  3315. .L4_10:
  3316. movq C, CO1
  3317. leaq (C, LDC, 2), CO2
  3318. leaq (C, LDC, 4), C // c += 4 * ldc
  3319. #if defined(TRMMKERNEL) && defined(LEFT)
  3320. movq OFFSET, %rax
  3321. movq %rax, KK
  3322. #endif
  3323. movq A, AO // aoffset = a
  3324. addq $ 16 * SIZE, AO
  3325. movq M, I
  3326. sarq $4, I // i = (m >> 4)
  3327. je .L4_20
  3328. ALIGN_4
  3329. .L4_11:
  3330. #if !defined(TRMMKERNEL) || \
  3331. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3332. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3333. leaq BUFFER1, BO // first buffer to BO
  3334. addq $4 * SIZE, BO
  3335. #else
  3336. movq KK, %rax
  3337. leaq BUFFER1, BO // first buffer to BO
  3338. addq $4 * SIZE, BO
  3339. movq %rax, BI // Index for BO
  3340. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3341. leaq (BO, BI, SIZE), BO
  3342. salq $4, %rax // rax = rax * 16 ; number of values
  3343. leaq (AO, %rax, SIZE), AO
  3344. #endif
// ======================================================================
// 16x4 tile: compute a 16-row by 4-column block of C for the N%?==0,
// 4-column pass. Accumulators are cleared with vzeroall; the actual
// FMA work is in the KERNEL16x4_SUB macro (defined earlier in the file).
// ======================================================================
  3345. vzeroall
// Pick the K trip count into %rax:
//   plain GEMM       : rax = K
//   TRMM, one side   : rax = K - KK            (saved to KKK for the tail)
//   TRMM, other side : rax = KK + tile edge    (16 rows of A or 4 cols of B)
  3346. #ifndef TRMMKERNEL
  3347. movq K, %rax
  3348. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3349. movq K, %rax
  3350. subq KK, %rax
  3351. movq %rax, KKK
  3352. #else
  3353. movq KK, %rax
  3354. #ifdef LEFT
  3355. addq $16, %rax // number of values in AO
  3356. #else
  3357. addq $4, %rax // number of values in BO
  3358. #endif
  3359. movq %rax, KKK
  3360. #endif
// Unrolled-by-8 part of K; the K%8 remainder is handled at .L4_16.
  3361. andq $-8, %rax // K = K - ( K % 8 )
  3362. je .L4_16
// Advance AO/BO past the panel data, then negate the counters so the
// kernel macro indexes with negative offsets that count up toward zero;
// the conditional branches below test flags set by the counter updates
// inside KERNEL16x4_SUB (macro defined earlier) — TODO confirm.
  3363. movq %rax, BI // Index for BO
  3364. leaq (,BI,4) , BI // BI = BI * 4 ; number of values
  3365. salq $4, %rax // rax = rax * 16 ; number of values
  3366. leaq (AO, %rax, SIZE), AO
  3367. leaq (BO, BI, SIZE), BO
  3368. negq BI
  3369. negq %rax
  3370. ALIGN_4
// Main K loop: 16 kernel invocations per pass; A is prefetched every
// iteration, B every fourth, one cache line ahead (A_PR1/B_PR1 offsets).
  3371. .L4_12:
  3372. prefetcht0 A_PR1(AO, %rax, SIZE)
  3373. prefetcht0 B_PR1(BO, BI , SIZE)
  3374. KERNEL16x4_SUB
  3375. prefetcht0 A_PR1(AO, %rax, SIZE)
  3376. KERNEL16x4_SUB
  3377. prefetcht0 A_PR1(AO, %rax, SIZE)
  3378. KERNEL16x4_SUB
  3379. prefetcht0 A_PR1(AO, %rax, SIZE)
  3380. KERNEL16x4_SUB
  3381. prefetcht0 A_PR1(AO, %rax, SIZE)
  3382. prefetcht0 B_PR1(BO, BI , SIZE)
  3383. KERNEL16x4_SUB
  3384. prefetcht0 A_PR1(AO, %rax, SIZE)
  3385. KERNEL16x4_SUB
  3386. prefetcht0 A_PR1(AO, %rax, SIZE)
  3387. KERNEL16x4_SUB
  3388. prefetcht0 A_PR1(AO, %rax, SIZE)
  3389. KERNEL16x4_SUB
// Mid-loop exit check after 8 kernels (counter may have reached zero).
  3390. je .L4_16
  3391. prefetcht0 A_PR1(AO, %rax, SIZE)
  3392. prefetcht0 B_PR1(BO, BI , SIZE)
  3393. KERNEL16x4_SUB
  3394. prefetcht0 A_PR1(AO, %rax, SIZE)
  3395. KERNEL16x4_SUB
  3396. prefetcht0 A_PR1(AO, %rax, SIZE)
  3397. KERNEL16x4_SUB
  3398. prefetcht0 A_PR1(AO, %rax, SIZE)
  3399. KERNEL16x4_SUB
  3400. prefetcht0 A_PR1(AO, %rax, SIZE)
  3401. prefetcht0 B_PR1(BO, BI , SIZE)
  3402. KERNEL16x4_SUB
  3403. prefetcht0 A_PR1(AO, %rax, SIZE)
  3404. KERNEL16x4_SUB
  3405. prefetcht0 A_PR1(AO, %rax, SIZE)
  3406. KERNEL16x4_SUB
  3407. prefetcht0 A_PR1(AO, %rax, SIZE)
  3408. KERNEL16x4_SUB
  3409. je .L4_16
  3410. jmp .L4_12
  3411. ALIGN_4
// K%8 remainder: one kernel invocation per remaining k.
  3412. .L4_16:
  3413. #ifndef TRMMKERNEL
  3414. movq K, %rax
  3415. #else
  3416. movq KKK, %rax
  3417. #endif
  3418. andq $7, %rax # if (k & 1)
  3419. je .L4_19
  3420. movq %rax, BI // Index for BO
  3421. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3422. salq $4, %rax // rax = rax * 16 ; number of values
  3423. leaq (AO, %rax, SIZE), AO
  3424. leaq (BO, BI, SIZE), BO
  3425. negq BI
  3426. negq %rax
  3427. ALIGN_4
  3428. .L4_17:
  3429. KERNEL16x4_SUB
  3430. jl .L4_17
  3431. ALIGN_4
// Store the finished 16x4 tile. For TRMM, skip AO/BO over the K range
// this tile does not touch, and (LEFT case) advance the KK offset.
  3432. .L4_19:
  3433. SAVE16x4
  3434. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3435. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3436. movq K, %rax
  3437. subq KKK, %rax
  3438. movq %rax, BI // Index for BO
  3439. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3440. leaq (BO, BI, SIZE), BO
  3441. salq $4, %rax // rax = rax * 16 ; number of values
  3442. leaq (AO, %rax, SIZE), AO
  3443. #endif
  3444. #if defined(TRMMKERNEL) && defined(LEFT)
  3445. addq $16, KK
  3446. #endif
// Advance both C column pointers by 16 rows and iterate over M/16.
  3447. addq $16 * SIZE, CO1 # coffset += 16
  3448. addq $16 * SIZE, CO2 # coffset += 16
  3449. decq I # i --
  3450. jg .L4_11
  3451. ALIGN_4
  3452. /**************************************************************************
  3453. * Rest of M
  3454. ***************************************************************************/
// Handle the M%16 leftover rows for the 4-column pass, in descending
// power-of-two widths: 8, 4, 2, 1.
  3455. .L4_20:
  3456. // Test rest of M
  3457. testq $15, M
  3458. jz .L4_60 // to next 4 lines of N
  3459. testq $8, M
  3460. jz .L4_21pre
  3461. ALIGN_4
  3462. /**************************************************************************/
// ---------------------- 8x4 tile ----------------------
// For TRMM (the #else branch) AO/BO are first offset by KK values.
  3463. .L4_20_1:
  3464. #if !defined(TRMMKERNEL) || \
  3465. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3466. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3467. leaq BUFFER1, BO // first buffer to BO
  3468. addq $4 * SIZE, BO
  3469. #else
  3470. movq KK, %rax
  3471. leaq BUFFER1, BO // first buffer to BO
  3472. addq $4 * SIZE, BO
  3473. movq %rax, BI // Index for BO
  3474. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3475. leaq (BO, BI, SIZE), BO
  3476. salq $3, %rax // rax = rax * 8 ; number of values
  3477. leaq (AO, %rax, SIZE), AO
  3478. #endif
// Clear accumulators and select the K trip count (GEMM: K; TRMM: K-KK
// or KK+edge, stored in KKK) — same scheme as the 16x4 tile above.
  3479. vzeroall
  3480. #ifndef TRMMKERNEL
  3481. movq K, %rax
  3482. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3483. movq K, %rax
  3484. subq KK, %rax
  3485. movq %rax, KKK
  3486. #else
  3487. movq KK, %rax
  3488. #ifdef LEFT
  3489. addq $8, %rax // number of values in A
  3490. #else
  3491. addq $4, %rax // number of values in BO
  3492. #endif
  3493. movq %rax, KKK
  3494. #endif
  3495. andq $-8, %rax
  3496. je .L4_20_6
// Negative-index loop setup: point past the data, count up toward zero.
  3497. movq %rax, BI // Index for BO
  3498. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3499. salq $3, %rax // rax = rax * 8 ; number of values
  3500. leaq (AO, %rax, SIZE), AO
  3501. leaq (BO, BI, SIZE), BO
  3502. negq BI
  3503. negq %rax
  3504. ALIGN_4
// Unrolled K loop, 16 kernels per pass; exit flags come from the
// counter updates inside KERNEL8x4_SUB.
  3505. .L4_20_2:
  3506. KERNEL8x4_SUB
  3507. KERNEL8x4_SUB
  3508. KERNEL8x4_SUB
  3509. KERNEL8x4_SUB
  3510. KERNEL8x4_SUB
  3511. KERNEL8x4_SUB
  3512. KERNEL8x4_SUB
  3513. KERNEL8x4_SUB
  3514. je .L4_20_6
  3515. KERNEL8x4_SUB
  3516. KERNEL8x4_SUB
  3517. KERNEL8x4_SUB
  3518. KERNEL8x4_SUB
  3519. KERNEL8x4_SUB
  3520. KERNEL8x4_SUB
  3521. KERNEL8x4_SUB
  3522. KERNEL8x4_SUB
  3523. je .L4_20_6
  3524. jmp .L4_20_2
  3525. ALIGN_4
// K%8 remainder loop.
  3526. .L4_20_6:
  3527. #ifndef TRMMKERNEL
  3528. movq K, %rax
  3529. #else
  3530. movq KKK, %rax
  3531. #endif
  3532. andq $7, %rax # if (k & 1)
  3533. je .L4_20_9
  3534. movq %rax, BI // Index for BO
  3535. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3536. salq $3, %rax // rax = rax * 8 ; number of values
  3537. leaq (AO, %rax, SIZE), AO
  3538. leaq (BO, BI, SIZE), BO
  3539. negq BI
  3540. negq %rax
  3541. ALIGN_4
  3542. .L4_20_7:
  3543. KERNEL8x4_SUB
  3544. jl .L4_20_7
  3545. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointers by 8 rows.
  3546. .L4_20_9:
  3547. SAVE8x4
  3548. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3549. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3550. movq K, %rax
  3551. subq KKK, %rax
  3552. movq %rax, BI // Index for BO
  3553. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3554. leaq (BO, BI, SIZE), BO
  3555. salq $3, %rax // rax = rax * 8 ; number of values
  3556. leaq (AO, %rax, SIZE), AO
  3557. #endif
  3558. #if defined(TRMMKERNEL) && defined(LEFT)
  3559. addq $8, KK
  3560. #endif
  3561. addq $8 * SIZE, CO1 # coffset += 8
  3562. addq $8 * SIZE, CO2 # coffset += 8
  3563. ALIGN_4
  3564. /**************************************************************************/
// ---------------------- 4x4 tile ----------------------
// Same structure as the 8x4 tile: TRMM offset setup, vzeroall, K-loop
// unrolled by 8 (twice), K%8 tail, save, TRMM fixup.
  3565. .L4_21pre:
  3566. testq $4, M
  3567. jz .L4_30
  3568. ALIGN_4
  3569. .L4_21:
  3570. #if !defined(TRMMKERNEL) || \
  3571. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3572. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3573. leaq BUFFER1, BO // first buffer to BO
  3574. addq $4 * SIZE, BO
  3575. #else
  3576. movq KK, %rax
  3577. leaq BUFFER1, BO // first buffer to BO
  3578. addq $4 * SIZE, BO
  3579. movq %rax, BI // Index for BO
  3580. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3581. leaq (BO, BI, SIZE), BO
  3582. salq $2, %rax // rax = rax * 4 ; number of values
  3583. leaq (AO, %rax, SIZE), AO
  3584. #endif
  3585. vzeroall
  3586. #ifndef TRMMKERNEL
  3587. movq K, %rax
  3588. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3589. movq K, %rax
  3590. subq KK, %rax
  3591. movq %rax, KKK
  3592. #else
  3593. movq KK, %rax
  3594. #ifdef LEFT
  3595. addq $4, %rax // number of values in A
  3596. #else
  3597. addq $4, %rax // number of values in BO
  3598. #endif
  3599. movq %rax, KKK
  3600. #endif
  3601. andq $-8, %rax
  3602. je .L4_26
// Negative-index loop setup (A advances 4 values per k, B advances 4).
  3603. movq %rax, BI // Index for BO
  3604. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3605. salq $2, %rax // rax = rax * 4 ; number of values
  3606. leaq (AO, %rax, SIZE), AO
  3607. leaq (BO, BI, SIZE), BO
  3608. negq BI
  3609. negq %rax
  3610. ALIGN_4
  3611. .L4_22:
  3612. KERNEL4x4_SUB
  3613. KERNEL4x4_SUB
  3614. KERNEL4x4_SUB
  3615. KERNEL4x4_SUB
  3616. KERNEL4x4_SUB
  3617. KERNEL4x4_SUB
  3618. KERNEL4x4_SUB
  3619. KERNEL4x4_SUB
  3620. je .L4_26
  3621. KERNEL4x4_SUB
  3622. KERNEL4x4_SUB
  3623. KERNEL4x4_SUB
  3624. KERNEL4x4_SUB
  3625. KERNEL4x4_SUB
  3626. KERNEL4x4_SUB
  3627. KERNEL4x4_SUB
  3628. KERNEL4x4_SUB
  3629. je .L4_26
  3630. jmp .L4_22
  3631. ALIGN_4
// K%8 remainder loop.
  3632. .L4_26:
  3633. #ifndef TRMMKERNEL
  3634. movq K, %rax
  3635. #else
  3636. movq KKK, %rax
  3637. #endif
  3638. andq $7, %rax # if (k & 1)
  3639. je .L4_29
  3640. movq %rax, BI // Index for BO
  3641. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3642. salq $2, %rax // rax = rax * 4 ; number of values
  3643. leaq (AO, %rax, SIZE), AO
  3644. leaq (BO, BI, SIZE), BO
  3645. negq BI
  3646. negq %rax
  3647. ALIGN_4
  3648. .L4_27:
  3649. KERNEL4x4_SUB
  3650. jl .L4_27
  3651. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointers by 4 rows.
  3652. .L4_29:
  3653. SAVE4x4
  3654. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3655. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3656. movq K, %rax
  3657. subq KKK, %rax
  3658. movq %rax, BI // Index for BO
  3659. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3660. leaq (BO, BI, SIZE), BO
  3661. salq $2, %rax // rax = rax * 4 ; number of values
  3662. leaq (AO, %rax, SIZE), AO
  3663. #endif
  3664. #if defined(TRMMKERNEL) && defined(LEFT)
  3665. addq $4, KK
  3666. #endif
  3667. addq $4 * SIZE, CO1 # coffset += 4
  3668. addq $4 * SIZE, CO2 # coffset += 4
  3669. ALIGN_4
// ---------------------- 2x4 tile ----------------------
// Same structure as the wider tiles; A advances 2 values per k.
  3670. .L4_30:
  3671. testq $2, M
  3672. jz .L4_40
  3673. ALIGN_4
  3674. .L4_31:
  3675. #if !defined(TRMMKERNEL) || \
  3676. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3677. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3678. leaq BUFFER1, BO // first buffer to BO
  3679. addq $4 * SIZE, BO
  3680. #else
  3681. movq KK, %rax
  3682. leaq BUFFER1, BO // first buffer to BO
  3683. addq $4 * SIZE, BO
  3684. movq %rax, BI // Index for BO
  3685. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3686. leaq (BO, BI, SIZE), BO
  3687. salq $1, %rax // rax = rax * 2 ; number of values
  3688. leaq (AO, %rax, SIZE), AO
  3689. #endif
  3690. vzeroall
  3691. #ifndef TRMMKERNEL
  3692. movq K, %rax
  3693. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3694. movq K, %rax
  3695. subq KK, %rax
  3696. movq %rax, KKK
  3697. #else
  3698. movq KK, %rax
  3699. #ifdef LEFT
  3700. addq $2, %rax // number of values in AO
  3701. #else
  3702. addq $4, %rax // number of values in BO
  3703. #endif
  3704. movq %rax, KKK
  3705. #endif
  3706. andq $-8, %rax
  3707. je .L4_36
  3708. movq %rax, BI // Index for BO
  3709. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3710. salq $1, %rax // rax = rax * 2 ; number of values
  3711. leaq (AO, %rax, SIZE), AO
  3712. leaq (BO, BI, SIZE), BO
  3713. negq BI
  3714. negq %rax
  3715. ALIGN_4
  3716. .L4_32:
  3717. KERNEL2x4_SUB
  3718. KERNEL2x4_SUB
  3719. KERNEL2x4_SUB
  3720. KERNEL2x4_SUB
  3721. KERNEL2x4_SUB
  3722. KERNEL2x4_SUB
  3723. KERNEL2x4_SUB
  3724. KERNEL2x4_SUB
  3725. je .L4_36
  3726. KERNEL2x4_SUB
  3727. KERNEL2x4_SUB
  3728. KERNEL2x4_SUB
  3729. KERNEL2x4_SUB
  3730. KERNEL2x4_SUB
  3731. KERNEL2x4_SUB
  3732. KERNEL2x4_SUB
  3733. KERNEL2x4_SUB
  3734. je .L4_36
  3735. jmp .L4_32
  3736. ALIGN_4
// K%8 remainder loop.
  3737. .L4_36:
  3738. #ifndef TRMMKERNEL
  3739. movq K, %rax
  3740. #else
  3741. movq KKK, %rax
  3742. #endif
  3743. andq $7, %rax # if (k & 1)
  3744. je .L4_39
  3745. movq %rax, BI // Index for BO
  3746. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3747. salq $1, %rax // rax = rax * 2 ; number of values
  3748. leaq (AO, %rax, SIZE), AO
  3749. leaq (BO, BI, SIZE), BO
  3750. negq BI
  3751. negq %rax
  3752. ALIGN_4
  3753. .L4_37:
  3754. KERNEL2x4_SUB
  3755. jl .L4_37
  3756. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointers by 2 rows.
  3757. .L4_39:
  3758. SAVE2x4
  3759. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3760. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3761. movq K, %rax
  3762. subq KKK, %rax
  3763. movq %rax, BI // Index for BO
  3764. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3765. leaq (BO, BI, SIZE), BO
  3766. salq $1, %rax // rax = rax * 2 ; number of values
  3767. leaq (AO, %rax, SIZE), AO
  3768. #endif
  3769. #if defined(TRMMKERNEL) && defined(LEFT)
  3770. addq $2, KK
  3771. #endif
  3772. addq $2 * SIZE, CO1 # coffset += 2
  3773. addq $2 * SIZE, CO2 # coffset += 2
  3774. ALIGN_4
// ---------------------- 1x4 tile ----------------------
// Final leftover row for the 4-column pass; A advances 1 value per k,
// so %rax needs no scaling before the AO lea.
  3775. .L4_40:
  3776. testq $1, M
  3777. jz .L4_60 // to next 4 lines of N
  3778. ALIGN_4
  3779. .L4_41:
  3780. #if !defined(TRMMKERNEL) || \
  3781. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3782. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3783. leaq BUFFER1, BO // first buffer to BO
  3784. addq $4 * SIZE, BO
  3785. #else
  3786. movq KK, %rax
  3787. leaq BUFFER1, BO // first buffer to BO
  3788. addq $4 * SIZE, BO
  3789. movq %rax, BI // Index for BO
  3790. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3791. leaq (BO, BI, SIZE), BO
  3792. leaq (AO, %rax, SIZE), AO
  3793. #endif
  3794. vzeroall
  3795. #ifndef TRMMKERNEL
  3796. movq K, %rax
  3797. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3798. movq K, %rax
  3799. subq KK, %rax
  3800. movq %rax, KKK
  3801. #else
  3802. movq KK, %rax
  3803. #ifdef LEFT
  3804. addq $1, %rax // number of values in AO
  3805. #else
  3806. addq $4, %rax // number of values in BO
  3807. #endif
  3808. movq %rax, KKK
  3809. #endif
  3810. andq $-8, %rax
  3811. je .L4_46
  3812. movq %rax, BI // Index for BO
  3813. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3814. leaq (AO, %rax, SIZE), AO
  3815. leaq (BO, BI, SIZE), BO
  3816. negq BI
  3817. negq %rax
  3818. ALIGN_4
  3819. .L4_42:
  3820. KERNEL1x4_SUB
  3821. KERNEL1x4_SUB
  3822. KERNEL1x4_SUB
  3823. KERNEL1x4_SUB
  3824. KERNEL1x4_SUB
  3825. KERNEL1x4_SUB
  3826. KERNEL1x4_SUB
  3827. KERNEL1x4_SUB
  3828. je .L4_46
  3829. KERNEL1x4_SUB
  3830. KERNEL1x4_SUB
  3831. KERNEL1x4_SUB
  3832. KERNEL1x4_SUB
  3833. KERNEL1x4_SUB
  3834. KERNEL1x4_SUB
  3835. KERNEL1x4_SUB
  3836. KERNEL1x4_SUB
  3837. je .L4_46
  3838. jmp .L4_42
  3839. ALIGN_4
// K%8 remainder loop.
  3840. .L4_46:
  3841. #ifndef TRMMKERNEL
  3842. movq K, %rax
  3843. #else
  3844. movq KKK, %rax
  3845. #endif
  3846. andq $7, %rax # if (k & 1)
  3847. je .L4_49
  3848. movq %rax, BI // Index for BO
  3849. leaq (,BI,4), BI // BI = BI * 4 ; number of values
  3850. leaq (AO, %rax, SIZE), AO
  3851. leaq (BO, BI, SIZE), BO
  3852. negq BI
  3853. negq %rax
  3854. ALIGN_4
  3855. .L4_47:
  3856. KERNEL1x4_SUB
  3857. jl .L4_47
  3858. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointers by 1 row.
  3859. .L4_49:
  3860. SAVE1x4
  3861. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3862. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3863. movq K, %rax
  3864. subq KKK, %rax
  3865. movq %rax, BI // Index for BO
  3866. leaq (,BI, 4), BI // BI = BI * 4 ; number of values
  3867. leaq (BO, BI, SIZE), BO
  3868. leaq (AO, %rax, SIZE), AO
  3869. #endif
  3870. #if defined(TRMMKERNEL) && defined(LEFT)
  3871. addq $1, KK
  3872. #endif
  3873. addq $1 * SIZE, CO1 # coffset += 1
  3874. addq $1 * SIZE, CO2 # coffset += 1
  3875. ALIGN_4
// End of the 4-column pass: TRMM right-side offset bump, then loop
// over the remaining groups of 4 columns of N.
  3876. .L4_60:
  3877. #if defined(TRMMKERNEL) && !defined(LEFT)
  3878. addq $4, KK
  3879. #endif
  3880. decq J // j --
  3881. jg .L4_01 // next 4 lines of N
  3882. /*******************************************************************************************/
// ====================== N remainder: 2-column pass ======================
// Entered after all groups of 4 columns are done. If N%4 has bit 1 set,
// copy a 2-wide panel of B into BUFFER1 and run the 2-column kernels.
  3883. .L2_0:
  3884. movq Nmod6, J
  3885. andq $3, J // j % 4
  3886. je .L999
  3887. movq Nmod6, J
  3888. andq $2, J // j % 4
  3889. je .L1_0
// Copy B (2 values per k) into the contiguous sub-buffer, 4 k-steps at
// a time; each vmovsd moves one k-step's pair of floats (2*SIZE bytes).
  3890. .L2_01:
  3891. // copy to sub buffer
  3892. movq B, BO1
  3893. leaq BUFFER1, BO // first buffer to BO
  3894. movq K, %rax
  3895. sarq $2, %rax // K / 4
  3896. jz .L2_01b
  3897. ALIGN_4
  3898. .L2_01a:
  3899. vmovsd (BO1), %xmm0
  3900. vmovsd 2*SIZE(BO1), %xmm1
  3901. vmovsd 4*SIZE(BO1), %xmm2
  3902. vmovsd 6*SIZE(BO1), %xmm3
  3903. vmovsd %xmm0, (BO)
  3904. vmovsd %xmm1, 2*SIZE(BO)
  3905. vmovsd %xmm2, 4*SIZE(BO)
  3906. vmovsd %xmm3, 6*SIZE(BO)
  3907. addq $8*SIZE,BO1
  3908. addq $8*SIZE,BO
  3909. decq %rax
  3910. jnz .L2_01a
// K%4 leftover k-steps, one pair at a time.
  3911. .L2_01b:
  3912. movq K, %rax
  3913. andq $3, %rax // K % 4
  3914. jz .L2_02d
  3915. ALIGN_4
  3916. .L2_02c:
  3917. vmovsd (BO1), %xmm0
  3918. vmovsd %xmm0, (BO)
  3919. addq $2*SIZE,BO1
  3920. addq $2*SIZE,BO
  3921. decq %rax
  3922. jnz .L2_02c
  3923. .L2_02d:
  3924. movq BO1, B // next offset of B
// ---------------------- 16x2 tile ----------------------
// Column pointers: this pass covers 2 columns of C (CO1 plus LDC).
  3925. .L2_10:
  3926. movq C, CO1
  3927. leaq (C, LDC, 2), C // c += 2 * ldc
  3928. #if defined(TRMMKERNEL) && defined(LEFT)
  3929. movq OFFSET, %rax
  3930. movq %rax, KK
  3931. #endif
  3932. movq A, AO // aoffset = a
  3933. addq $16 * SIZE, AO
  3934. movq M, I
  3935. sarq $4, I // i = (m >> 4)
  3936. je .L2_20
  3937. ALIGN_4
  3938. .L2_11:
  3939. #if !defined(TRMMKERNEL) || \
  3940. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  3941. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  3942. leaq BUFFER1, BO // first buffer to BO
  3943. addq $4 * SIZE, BO
  3944. #else
  3945. movq KK, %rax
  3946. leaq BUFFER1, BO // first buffer to BO
  3947. addq $4 * SIZE, BO
  3948. movq %rax, BI // Index for BO
  3949. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3950. leaq (BO, BI, SIZE), BO
  3951. salq $4, %rax // rax = rax * 16 ; number of values
  3952. leaq (AO, %rax, SIZE), AO
  3953. #endif
// Clear accumulators; select K trip count (GEMM: K; TRMM: K-KK or
// KK+edge into KKK), as in the 4-column pass.
  3954. vzeroall
  3955. #ifndef TRMMKERNEL
  3956. movq K, %rax
  3957. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3958. movq K, %rax
  3959. subq KK, %rax
  3960. movq %rax, KKK
  3961. #else
  3962. movq KK, %rax
  3963. #ifdef LEFT
  3964. addq $16, %rax // number of values in AO
  3965. #else
  3966. addq $2, %rax // number of values in BO
  3967. #endif
  3968. movq %rax, KKK
  3969. #endif
  3970. andq $-8, %rax // K = K - ( K % 8 )
  3971. je .L2_16
// Negative-index loop setup: B advances 2 values per k, A advances 16.
  3972. movq %rax, BI // Index for BO
  3973. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  3974. salq $4, %rax // rax = rax * 16 ; number of values
  3975. leaq (AO, %rax, SIZE), AO
  3976. leaq (BO, BI, SIZE), BO
  3977. negq BI
  3978. negq %rax
  3979. ALIGN_4
  3980. .L2_12:
  3981. KERNEL16x2_SUB
  3982. KERNEL16x2_SUB
  3983. KERNEL16x2_SUB
  3984. KERNEL16x2_SUB
  3985. KERNEL16x2_SUB
  3986. KERNEL16x2_SUB
  3987. KERNEL16x2_SUB
  3988. KERNEL16x2_SUB
  3989. je .L2_16
  3990. KERNEL16x2_SUB
  3991. KERNEL16x2_SUB
  3992. KERNEL16x2_SUB
  3993. KERNEL16x2_SUB
  3994. KERNEL16x2_SUB
  3995. KERNEL16x2_SUB
  3996. KERNEL16x2_SUB
  3997. KERNEL16x2_SUB
  3998. je .L2_16
  3999. jmp .L2_12
  4000. ALIGN_4
// K%8 remainder loop.
  4001. .L2_16:
  4002. #ifndef TRMMKERNEL
  4003. movq K, %rax
  4004. #else
  4005. movq KKK, %rax
  4006. #endif
  4007. andq $7, %rax # if (k & 1)
  4008. je .L2_19
  4009. movq %rax, BI // Index for BO
  4010. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4011. salq $4, %rax // rax = rax * 16 ; number of values
  4012. leaq (AO, %rax, SIZE), AO
  4013. leaq (BO, BI, SIZE), BO
  4014. negq BI
  4015. negq %rax
  4016. ALIGN_4
  4017. .L2_17:
  4018. KERNEL16x2_SUB
  4019. jl .L2_17
  4020. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointer by 16 rows.
  4021. .L2_19:
  4022. SAVE16x2
  4023. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4024. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4025. movq K, %rax
  4026. subq KKK, %rax
  4027. movq %rax, BI // Index for BO
  4028. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4029. leaq (BO, BI, SIZE), BO
  4030. salq $4, %rax // rax = rax * 16 ; number of values
  4031. leaq (AO, %rax, SIZE), AO
  4032. #endif
  4033. #if defined(TRMMKERNEL) && defined(LEFT)
  4034. addq $16, KK
  4035. #endif
  4036. addq $16 * SIZE, CO1 # coffset += 16
  4037. decq I # i --
  4038. jg .L2_11
  4039. ALIGN_4
  4040. /**************************************************************************
  4041. * Rest of M
  4042. ***************************************************************************/
// M%16 leftover rows for the 2-column pass: widths 8, 4, 2, 1.
  4043. .L2_20:
  4044. // Test rest of M
  4045. testq $15, M
  4046. jz .L2_60 // to next 2 lines of N
  4047. testq $8, M
  4048. jz .L2_21pre
  4049. ALIGN_4
  4050. /**************************************************************************/
// ---------------------- 8x2 tile ----------------------
  4051. .L2_20_1:
  4052. #if !defined(TRMMKERNEL) || \
  4053. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4054. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4055. leaq BUFFER1, BO // first buffer to BO
  4056. addq $4 * SIZE, BO
  4057. #else
  4058. movq KK, %rax
  4059. leaq BUFFER1, BO // first buffer to BO
  4060. addq $4 * SIZE, BO
  4061. movq %rax, BI // Index for BO
  4062. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4063. leaq (BO, BI, SIZE), BO
  4064. salq $3, %rax // rax = rax * 8 ; number of values
  4065. leaq (AO, %rax, SIZE), AO
  4066. #endif
  4067. vzeroall
  4068. #ifndef TRMMKERNEL
  4069. movq K, %rax
  4070. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4071. movq K, %rax
  4072. subq KK, %rax
  4073. movq %rax, KKK
  4074. #else
  4075. movq KK, %rax
  4076. #ifdef LEFT
  4077. addq $8, %rax // number of values in A
  4078. #else
  4079. addq $2, %rax // number of values in BO
  4080. #endif
  4081. movq %rax, KKK
  4082. #endif
  4083. andq $-8, %rax
  4084. je .L2_20_6
  4085. movq %rax, BI // Index for BO
  4086. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4087. salq $3, %rax // rax = rax * 8 ; number of values
  4088. leaq (AO, %rax, SIZE), AO
  4089. leaq (BO, BI, SIZE), BO
  4090. negq BI
  4091. negq %rax
  4092. ALIGN_4
  4093. .L2_20_2:
  4094. KERNEL8x2_SUB
  4095. KERNEL8x2_SUB
  4096. KERNEL8x2_SUB
  4097. KERNEL8x2_SUB
  4098. KERNEL8x2_SUB
  4099. KERNEL8x2_SUB
  4100. KERNEL8x2_SUB
  4101. KERNEL8x2_SUB
  4102. je .L2_20_6
  4103. KERNEL8x2_SUB
  4104. KERNEL8x2_SUB
  4105. KERNEL8x2_SUB
  4106. KERNEL8x2_SUB
  4107. KERNEL8x2_SUB
  4108. KERNEL8x2_SUB
  4109. KERNEL8x2_SUB
  4110. KERNEL8x2_SUB
  4111. je .L2_20_6
  4112. jmp .L2_20_2
  4113. ALIGN_4
// K%8 remainder loop.
  4114. .L2_20_6:
  4115. #ifndef TRMMKERNEL
  4116. movq K, %rax
  4117. #else
  4118. movq KKK, %rax
  4119. #endif
  4120. andq $7, %rax # if (k & 1)
  4121. je .L2_20_9
  4122. movq %rax, BI // Index for BO
  4123. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4124. salq $3, %rax // rax = rax * 8 ; number of values
  4125. leaq (AO, %rax, SIZE), AO
  4126. leaq (BO, BI, SIZE), BO
  4127. negq BI
  4128. negq %rax
  4129. ALIGN_4
  4130. .L2_20_7:
  4131. KERNEL8x2_SUB
  4132. jl .L2_20_7
  4133. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointer by 8 rows.
  4134. .L2_20_9:
  4135. SAVE8x2
  4136. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4137. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4138. movq K, %rax
  4139. subq KKK, %rax
  4140. movq %rax, BI // Index for BO
  4141. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4142. leaq (BO, BI, SIZE), BO
  4143. salq $3, %rax // rax = rax * 8 ; number of values
  4144. leaq (AO, %rax, SIZE), AO
  4145. #endif
  4146. #if defined(TRMMKERNEL) && defined(LEFT)
  4147. addq $8, KK
  4148. #endif
  4149. addq $8 * SIZE, CO1 # coffset += 8
  4150. ALIGN_4
  4151. /**************************************************************************/
// ---------------------- 4x2 tile ----------------------
  4152. .L2_21pre:
  4153. testq $4, M
  4154. jz .L2_30
  4155. ALIGN_4
  4156. .L2_21:
  4157. #if !defined(TRMMKERNEL) || \
  4158. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4159. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4160. leaq BUFFER1, BO // first buffer to BO
  4161. addq $4 * SIZE, BO
  4162. #else
  4163. movq KK, %rax
  4164. leaq BUFFER1, BO // first buffer to BO
  4165. addq $4 * SIZE, BO
  4166. movq %rax, BI // Index for BO
  4167. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4168. leaq (BO, BI, SIZE), BO
  4169. salq $2, %rax // rax = rax * 4 ; number of values
  4170. leaq (AO, %rax, SIZE), AO
  4171. #endif
  4172. vzeroall
  4173. #ifndef TRMMKERNEL
  4174. movq K, %rax
  4175. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4176. movq K, %rax
  4177. subq KK, %rax
  4178. movq %rax, KKK
  4179. #else
  4180. movq KK, %rax
  4181. #ifdef LEFT
  4182. addq $4, %rax // number of values in A
  4183. #else
  4184. addq $2, %rax // number of values in BO
  4185. #endif
  4186. movq %rax, KKK
  4187. #endif
  4188. andq $-8, %rax
  4189. je .L2_26
  4190. movq %rax, BI // Index for BO
  4191. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4192. salq $2, %rax // rax = rax * 4 ; number of values
  4193. leaq (AO, %rax, SIZE), AO
  4194. leaq (BO, BI, SIZE), BO
  4195. negq BI
  4196. negq %rax
  4197. ALIGN_4
  4198. .L2_22:
  4199. KERNEL4x2_SUB
  4200. KERNEL4x2_SUB
  4201. KERNEL4x2_SUB
  4202. KERNEL4x2_SUB
  4203. KERNEL4x2_SUB
  4204. KERNEL4x2_SUB
  4205. KERNEL4x2_SUB
  4206. KERNEL4x2_SUB
  4207. je .L2_26
  4208. KERNEL4x2_SUB
  4209. KERNEL4x2_SUB
  4210. KERNEL4x2_SUB
  4211. KERNEL4x2_SUB
  4212. KERNEL4x2_SUB
  4213. KERNEL4x2_SUB
  4214. KERNEL4x2_SUB
  4215. KERNEL4x2_SUB
  4216. je .L2_26
  4217. jmp .L2_22
  4218. ALIGN_4
// K%8 remainder loop.
  4219. .L2_26:
  4220. #ifndef TRMMKERNEL
  4221. movq K, %rax
  4222. #else
  4223. movq KKK, %rax
  4224. #endif
  4225. andq $7, %rax # if (k & 1)
  4226. je .L2_29
  4227. movq %rax, BI // Index for BO
  4228. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4229. salq $2, %rax // rax = rax * 4 ; number of values
  4230. leaq (AO, %rax, SIZE), AO
  4231. leaq (BO, BI, SIZE), BO
  4232. negq BI
  4233. negq %rax
  4234. ALIGN_4
  4235. .L2_27:
  4236. KERNEL4x2_SUB
  4237. jl .L2_27
  4238. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointer by 4 rows.
  4239. .L2_29:
  4240. SAVE4x2
  4241. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4242. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4243. movq K, %rax
  4244. subq KKK, %rax
  4245. movq %rax, BI // Index for BO
  4246. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4247. leaq (BO, BI, SIZE), BO
  4248. salq $2, %rax // rax = rax * 4 ; number of values
  4249. leaq (AO, %rax, SIZE), AO
  4250. #endif
  4251. #if defined(TRMMKERNEL) && defined(LEFT)
  4252. addq $4, KK
  4253. #endif
  4254. addq $4 * SIZE, CO1 # coffset += 4
  4255. ALIGN_4
// ---------------------- 2x2 tile ----------------------
  4256. .L2_30:
  4257. testq $2, M
  4258. jz .L2_40
  4259. ALIGN_4
  4260. .L2_31:
  4261. #if !defined(TRMMKERNEL) || \
  4262. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4263. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4264. leaq BUFFER1, BO // first buffer to BO
  4265. addq $4 * SIZE, BO
  4266. #else
  4267. movq KK, %rax
  4268. leaq BUFFER1, BO // first buffer to BO
  4269. addq $4 * SIZE, BO
  4270. movq %rax, BI // Index for BO
  4271. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4272. leaq (BO, BI, SIZE), BO
  4273. salq $1, %rax // rax = rax * 2 ; number of values
  4274. leaq (AO, %rax, SIZE), AO
  4275. #endif
  4276. vzeroall
  4277. #ifndef TRMMKERNEL
  4278. movq K, %rax
  4279. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4280. movq K, %rax
  4281. subq KK, %rax
  4282. movq %rax, KKK
  4283. #else
  4284. movq KK, %rax
  4285. #ifdef LEFT
  4286. addq $2, %rax // number of values in AO
  4287. #else
  4288. addq $2, %rax // number of values in BO
  4289. #endif
  4290. movq %rax, KKK
  4291. #endif
  4292. andq $-8, %rax
  4293. je .L2_36
  4294. movq %rax, BI // Index for BO
  4295. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4296. salq $1, %rax // rax = rax * 2 ; number of values
  4297. leaq (AO, %rax, SIZE), AO
  4298. leaq (BO, BI, SIZE), BO
  4299. negq BI
  4300. negq %rax
  4301. ALIGN_4
  4302. .L2_32:
  4303. KERNEL2x2_SUB
  4304. KERNEL2x2_SUB
  4305. KERNEL2x2_SUB
  4306. KERNEL2x2_SUB
  4307. KERNEL2x2_SUB
  4308. KERNEL2x2_SUB
  4309. KERNEL2x2_SUB
  4310. KERNEL2x2_SUB
  4311. je .L2_36
  4312. KERNEL2x2_SUB
  4313. KERNEL2x2_SUB
  4314. KERNEL2x2_SUB
  4315. KERNEL2x2_SUB
  4316. KERNEL2x2_SUB
  4317. KERNEL2x2_SUB
  4318. KERNEL2x2_SUB
  4319. KERNEL2x2_SUB
  4320. je .L2_36
  4321. jmp .L2_32
  4322. ALIGN_4
// K%8 remainder loop.
  4323. .L2_36:
  4324. #ifndef TRMMKERNEL
  4325. movq K, %rax
  4326. #else
  4327. movq KKK, %rax
  4328. #endif
  4329. andq $7, %rax # if (k & 1)
  4330. je .L2_39
  4331. movq %rax, BI // Index for BO
  4332. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4333. salq $1, %rax // rax = rax * 2 ; number of values
  4334. leaq (AO, %rax, SIZE), AO
  4335. leaq (BO, BI, SIZE), BO
  4336. negq BI
  4337. negq %rax
  4338. ALIGN_4
  4339. .L2_37:
  4340. KERNEL2x2_SUB
  4341. jl .L2_37
  4342. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointer by 2 rows.
  4343. .L2_39:
  4344. SAVE2x2
  4345. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4346. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4347. movq K, %rax
  4348. subq KKK, %rax
  4349. movq %rax, BI // Index for BO
  4350. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4351. leaq (BO, BI, SIZE), BO
  4352. salq $1, %rax // rax = rax * 2 ; number of values
  4353. leaq (AO, %rax, SIZE), AO
  4354. #endif
  4355. #if defined(TRMMKERNEL) && defined(LEFT)
  4356. addq $2, KK
  4357. #endif
  4358. addq $2 * SIZE, CO1 # coffset += 2
  4359. ALIGN_4
// ---------------------- 1x2 tile ----------------------
// Final leftover row for the 2-column pass; A advances 1 value per k.
  4360. .L2_40:
  4361. testq $1, M
  4362. jz .L2_60 // to next 2 lines of N
  4363. ALIGN_4
  4364. .L2_41:
  4365. #if !defined(TRMMKERNEL) || \
  4366. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4367. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4368. leaq BUFFER1, BO // first buffer to BO
  4369. addq $4 * SIZE, BO
  4370. #else
  4371. movq KK, %rax
  4372. leaq BUFFER1, BO // first buffer to BO
  4373. addq $4 * SIZE, BO
  4374. movq %rax, BI // Index for BO
  4375. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4376. leaq (BO, BI, SIZE), BO
  4377. leaq (AO, %rax, SIZE), AO
  4378. #endif
  4379. vzeroall
  4380. #ifndef TRMMKERNEL
  4381. movq K, %rax
  4382. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4383. movq K, %rax
  4384. subq KK, %rax
  4385. movq %rax, KKK
  4386. #else
  4387. movq KK, %rax
  4388. #ifdef LEFT
  4389. addq $1, %rax // number of values in AO
  4390. #else
  4391. addq $2, %rax // number of values in BO
  4392. #endif
  4393. movq %rax, KKK
  4394. #endif
  4395. andq $-8, %rax
  4396. je .L2_46
  4397. movq %rax, BI // Index for BO
  4398. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4399. leaq (AO, %rax, SIZE), AO
  4400. leaq (BO, BI, SIZE), BO
  4401. negq BI
  4402. negq %rax
  4403. ALIGN_4
  4404. .L2_42:
  4405. KERNEL1x2_SUB
  4406. KERNEL1x2_SUB
  4407. KERNEL1x2_SUB
  4408. KERNEL1x2_SUB
  4409. KERNEL1x2_SUB
  4410. KERNEL1x2_SUB
  4411. KERNEL1x2_SUB
  4412. KERNEL1x2_SUB
  4413. je .L2_46
  4414. KERNEL1x2_SUB
  4415. KERNEL1x2_SUB
  4416. KERNEL1x2_SUB
  4417. KERNEL1x2_SUB
  4418. KERNEL1x2_SUB
  4419. KERNEL1x2_SUB
  4420. KERNEL1x2_SUB
  4421. KERNEL1x2_SUB
  4422. je .L2_46
  4423. jmp .L2_42
  4424. ALIGN_4
// K%8 remainder loop.
  4425. .L2_46:
  4426. #ifndef TRMMKERNEL
  4427. movq K, %rax
  4428. #else
  4429. movq KKK, %rax
  4430. #endif
  4431. andq $7, %rax # if (k & 1)
  4432. je .L2_49
  4433. movq %rax, BI // Index for BO
  4434. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4435. leaq (AO, %rax, SIZE), AO
  4436. leaq (BO, BI, SIZE), BO
  4437. negq BI
  4438. negq %rax
  4439. ALIGN_4
  4440. .L2_47:
  4441. KERNEL1x2_SUB
  4442. jl .L2_47
  4443. ALIGN_4
// Store tile; TRMM pointer/KK fixup; advance C pointer by 1 row.
  4444. .L2_49:
  4445. SAVE1x2
  4446. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4447. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4448. movq K, %rax
  4449. subq KKK, %rax
  4450. movq %rax, BI // Index for BO
  4451. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  4452. leaq (BO, BI, SIZE), BO
  4453. leaq (AO, %rax, SIZE), AO
  4454. #endif
  4455. #if defined(TRMMKERNEL) && defined(LEFT)
  4456. addq $1, KK
  4457. #endif
  4458. addq $1 * SIZE, CO1 # coffset += 1
  4459. ALIGN_4
// End of the 2-column pass: TRMM right-side offset bump.
  4460. .L2_60:
  4461. #if defined(TRMMKERNEL) && !defined(LEFT)
  4462. addq $2, KK
  4463. #endif
// ====================== N remainder: 1-column pass ======================
  4464. .L1_0:
  4465. /************************************************************************************************
  4466. * Loop for Nmod6 % 2 > 0
  4467. *************************************************************************************************/
  4468. movq Nmod6, J
  4469. andq $1, J // j % 2
  4470. je .L999
  4471. ALIGN_4
// Copy the last single column of B (one float per k) into BUFFER1.
  4472. .L1_01:
  4473. // copy to sub buffer
  4474. movq B, BO1
  4475. leaq BUFFER1, BO // first buffer to BO
  4476. movq K, %rax
  4477. ALIGN_4
  4478. .L1_02b:
  4479. vmovss (BO1), %xmm0
  4480. vmovss %xmm0, (BO)
  4481. addq $1*SIZE,BO1
  4482. addq $1*SIZE,BO
  4483. decq %rax
  4484. jnz .L1_02b
  4485. .L1_02c:
  4486. movq BO1, B // next offset of B
// ---------------------- 16x1 tile ----------------------
// Same layout as the 16x2 path but with a single C column; B advances
// 1 value per k, so BI needs no scaling.
  4487. .L1_10:
  4488. movq C, CO1
  4489. leaq (C, LDC, 1), C // c += 1 * ldc
  4490. #if defined(TRMMKERNEL) && defined(LEFT)
  4491. movq OFFSET, %rax
  4492. movq %rax, KK
  4493. #endif
  4494. movq A, AO // aoffset = a
  4495. addq $16 * SIZE, AO
  4496. movq M, I
  4497. sarq $4, I // i = (m >> 4)
  4498. je .L1_20
  4499. ALIGN_4
  4500. .L1_11:
  4501. #if !defined(TRMMKERNEL) || \
  4502. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4503. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4504. leaq BUFFER1, BO // first buffer to BO
  4505. addq $4 * SIZE, BO
  4506. #else
  4507. movq KK, %rax
  4508. leaq BUFFER1, BO // first buffer to BO
  4509. addq $4 * SIZE, BO
  4510. movq %rax, BI // Index for BO
  4511. leaq (BO, BI, SIZE), BO
  4512. salq $4, %rax // rax = rax * 16 ; number of values
  4513. leaq (AO, %rax, SIZE), AO
  4514. #endif
  4515. vzeroall
  4516. #ifndef TRMMKERNEL
  4517. movq K, %rax
  4518. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4519. movq K, %rax
  4520. subq KK, %rax
  4521. movq %rax, KKK
  4522. #else
  4523. movq KK, %rax
  4524. #ifdef LEFT
  4525. addq $16, %rax // number of values in AO
  4526. #else
  4527. addq $1, %rax // number of values in BO
  4528. #endif
  4529. movq %rax, KKK
  4530. #endif
  4531. andq $-8, %rax // K = K - ( K % 8 )
  4532. je .L1_16
  4533. movq %rax, BI // Index for BO
  4534. salq $4, %rax // rax = rax * 16 ; number of values
  4535. leaq (AO, %rax, SIZE), AO
  4536. leaq (BO, BI, SIZE), BO
  4537. negq BI
  4538. negq %rax
  4539. ALIGN_4
  4540. .L1_12:
  4541. KERNEL16x1_SUB
  4542. KERNEL16x1_SUB
  4543. KERNEL16x1_SUB
  4544. KERNEL16x1_SUB
  4545. KERNEL16x1_SUB
  4546. KERNEL16x1_SUB
  4547. KERNEL16x1_SUB
  4548. KERNEL16x1_SUB
  4549. je .L1_16
  4550. KERNEL16x1_SUB
  4551. KERNEL16x1_SUB
  4552. KERNEL16x1_SUB
  4553. KERNEL16x1_SUB
  4554. KERNEL16x1_SUB
  4555. KERNEL16x1_SUB
  4556. KERNEL16x1_SUB
  4557. KERNEL16x1_SUB
  4558. je .L1_16
  4559. jmp .L1_12
  4560. ALIGN_4
// K%8 remainder loop.
  4561. .L1_16:
  4562. #ifndef TRMMKERNEL
  4563. movq K, %rax
  4564. #else
  4565. movq KKK, %rax
  4566. #endif
  4567. andq $7, %rax # if (k & 1)
  4568. je .L1_19
  4569. movq %rax, BI // Index for BO
  4570. salq $4, %rax // rax = rax * 16 ; number of values
  4571. leaq (AO, %rax, SIZE), AO
  4572. leaq (BO, BI, SIZE), BO
  4573. negq BI
  4574. negq %rax
  4575. ALIGN_4
  4576. .L1_17:
  4577. KERNEL16x1_SUB
  4578. jl .L1_17
  4579. ALIGN_4
// Store the 16x1 tile; TRMM fixup follows (continues past this chunk).
  4580. .L1_19:
  4581. SAVE16x1
  4582. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4583. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4584. movq K, %rax
  4585. subq KKK, %rax
  4586. movq %rax, BI // Index for BO
  4587. leaq (BO, BI, SIZE), BO
  4588. salq $4, %rax // rax = rax * 16 ; number of values
  4589. leaq (AO, %rax, SIZE), AO
  4590. #endif
  4591. #if defined(TRMMKERNEL) && defined(LEFT)
  4592. addq $16, KK
  4593. #endif
  4594. addq $16 * SIZE, CO1 # coffset += 16
  4595. decq I # i --
  4596. jg .L1_11
  4597. ALIGN_4
  4598. /**************************************************************************
  4599. * Rest of M
  4600. ***************************************************************************/
  4601. .L1_20:
  4602. // Test rest of M
  4603. testq $15, M
  4604. jz .L999
  4605. testq $8, M
  4606. jz .L1_21pre
  4607. ALIGN_4
  4608. /**************************************************************************/
  4609. .L1_20_1:
  4610. #if !defined(TRMMKERNEL) || \
  4611. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4612. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4613. leaq BUFFER1, BO // first buffer to BO
  4614. addq $4 * SIZE, BO
  4615. #else
  4616. movq KK, %rax
  4617. leaq BUFFER1, BO // first buffer to BO
  4618. addq $4 * SIZE, BO
  4619. movq %rax, BI // Index for BO
  4620. leaq (BO, BI, SIZE), BO
  4621. salq $3, %rax // rax = rax * 8 ; number of values
  4622. leaq (AO, %rax, SIZE), AO
  4623. #endif
  4624. vzeroall
  4625. #ifndef TRMMKERNEL
  4626. movq K, %rax
  4627. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4628. movq K, %rax
  4629. subq KK, %rax
  4630. movq %rax, KKK
  4631. #else
  4632. movq KK, %rax
  4633. #ifdef LEFT
  4634. addq $8, %rax // number of values in A
  4635. #else
  4636. addq $1, %rax // number of values in BO
  4637. #endif
  4638. movq %rax, KKK
  4639. #endif
  4640. andq $-8, %rax
  4641. je .L1_20_6
  4642. movq %rax, BI // Index for BO
  4643. salq $3, %rax // rax = rax * 8 ; number of values
  4644. leaq (AO, %rax, SIZE), AO
  4645. leaq (BO, BI, SIZE), BO
  4646. negq BI
  4647. negq %rax
  4648. ALIGN_4
  4649. .L1_20_2:
  4650. KERNEL8x1_SUB
  4651. KERNEL8x1_SUB
  4652. KERNEL8x1_SUB
  4653. KERNEL8x1_SUB
  4654. KERNEL8x1_SUB
  4655. KERNEL8x1_SUB
  4656. KERNEL8x1_SUB
  4657. KERNEL8x1_SUB
  4658. je .L1_20_6
  4659. KERNEL8x1_SUB
  4660. KERNEL8x1_SUB
  4661. KERNEL8x1_SUB
  4662. KERNEL8x1_SUB
  4663. KERNEL8x1_SUB
  4664. KERNEL8x1_SUB
  4665. KERNEL8x1_SUB
  4666. KERNEL8x1_SUB
  4667. je .L1_20_6
  4668. jmp .L1_20_2
  4669. ALIGN_4
  4670. .L1_20_6:
  4671. #ifndef TRMMKERNEL
  4672. movq K, %rax
  4673. #else
  4674. movq KKK, %rax
  4675. #endif
  4676. andq $7, %rax # if (k & 1)
  4677. je .L1_20_9
  4678. movq %rax, BI // Index for BO
  4679. salq $3, %rax // rax = rax * 8 ; number of values
  4680. leaq (AO, %rax, SIZE), AO
  4681. leaq (BO, BI, SIZE), BO
  4682. negq BI
  4683. negq %rax
  4684. ALIGN_4
  4685. .L1_20_7:
  4686. KERNEL8x1_SUB
  4687. jl .L1_20_7
  4688. ALIGN_4
  4689. .L1_20_9:
  4690. SAVE8x1
  4691. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4692. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4693. movq K, %rax
  4694. subq KKK, %rax
  4695. movq %rax, BI // Index for BO
  4696. leaq (BO, BI, SIZE), BO
  4697. salq $3, %rax // rax = rax * 8 ; number of values
  4698. leaq (AO, %rax, SIZE), AO
  4699. #endif
  4700. #if defined(TRMMKERNEL) && defined(LEFT)
  4701. addq $8, KK
  4702. #endif
  4703. addq $8 * SIZE, CO1 # coffset += 8
  4704. ALIGN_4
  4705. /**************************************************************************/
  4706. .L1_21pre:
  4707. testq $4, M
  4708. jz .L1_30
  4709. ALIGN_4
  4710. .L1_21:
  4711. #if !defined(TRMMKERNEL) || \
  4712. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4713. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4714. leaq BUFFER1, BO // first buffer to BO
  4715. addq $4 * SIZE, BO
  4716. #else
  4717. movq KK, %rax
  4718. leaq BUFFER1, BO // first buffer to BO
  4719. addq $4 * SIZE, BO
  4720. movq %rax, BI // Index for BO
  4721. leaq (BO, BI, SIZE), BO
  4722. salq $2, %rax // rax = rax * 4 ; number of values
  4723. leaq (AO, %rax, SIZE), AO
  4724. #endif
  4725. vzeroall
  4726. #ifndef TRMMKERNEL
  4727. movq K, %rax
  4728. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4729. movq K, %rax
  4730. subq KK, %rax
  4731. movq %rax, KKK
  4732. #else
  4733. movq KK, %rax
  4734. #ifdef LEFT
  4735. addq $4, %rax // number of values in A
  4736. #else
  4737. addq $1, %rax // number of values in BO
  4738. #endif
  4739. movq %rax, KKK
  4740. #endif
  4741. andq $-8, %rax
  4742. je .L1_26
  4743. movq %rax, BI // Index for BO
  4744. salq $2, %rax // rax = rax * 4 ; number of values
  4745. leaq (AO, %rax, SIZE), AO
  4746. leaq (BO, BI, SIZE), BO
  4747. negq BI
  4748. negq %rax
  4749. ALIGN_4
  4750. .L1_22:
  4751. KERNEL4x1_SUB
  4752. KERNEL4x1_SUB
  4753. KERNEL4x1_SUB
  4754. KERNEL4x1_SUB
  4755. KERNEL4x1_SUB
  4756. KERNEL4x1_SUB
  4757. KERNEL4x1_SUB
  4758. KERNEL4x1_SUB
  4759. je .L1_26
  4760. KERNEL4x1_SUB
  4761. KERNEL4x1_SUB
  4762. KERNEL4x1_SUB
  4763. KERNEL4x1_SUB
  4764. KERNEL4x1_SUB
  4765. KERNEL4x1_SUB
  4766. KERNEL4x1_SUB
  4767. KERNEL4x1_SUB
  4768. je .L1_26
  4769. jmp .L1_22
  4770. ALIGN_4
  4771. .L1_26:
  4772. #ifndef TRMMKERNEL
  4773. movq K, %rax
  4774. #else
  4775. movq KKK, %rax
  4776. #endif
  4777. andq $7, %rax # if (k & 1)
  4778. je .L1_29
  4779. movq %rax, BI // Index for BO
  4780. salq $2, %rax // rax = rax * 4 ; number of values
  4781. leaq (AO, %rax, SIZE), AO
  4782. leaq (BO, BI, SIZE), BO
  4783. negq BI
  4784. negq %rax
  4785. ALIGN_4
  4786. .L1_27:
  4787. KERNEL4x1_SUB
  4788. jl .L1_27
  4789. ALIGN_4
  4790. .L1_29:
  4791. SAVE4x1
  4792. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4793. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4794. movq K, %rax
  4795. subq KKK, %rax
  4796. movq %rax, BI // Index for BO
  4797. leaq (BO, BI, SIZE), BO
  4798. salq $2, %rax // rax = rax * 4 ; number of values
  4799. leaq (AO, %rax, SIZE), AO
  4800. #endif
  4801. #if defined(TRMMKERNEL) && defined(LEFT)
  4802. addq $4, KK
  4803. #endif
  4804. addq $4 * SIZE, CO1 # coffset += 4
  4805. ALIGN_4
  4806. .L1_30:
  4807. testq $2, M
  4808. jz .L1_40
  4809. ALIGN_4
  4810. .L1_31:
  4811. #if !defined(TRMMKERNEL) || \
  4812. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4813. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4814. leaq BUFFER1, BO // first buffer to BO
  4815. addq $4 * SIZE, BO
  4816. #else
  4817. movq KK, %rax
  4818. leaq BUFFER1, BO // first buffer to BO
  4819. addq $4 * SIZE, BO
  4820. movq %rax, BI // Index for BO
  4821. leaq (BO, BI, SIZE), BO
  4822. salq $1, %rax // rax = rax * 2 ; number of values
  4823. leaq (AO, %rax, SIZE), AO
  4824. #endif
  4825. vzeroall
  4826. #ifndef TRMMKERNEL
  4827. movq K, %rax
  4828. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4829. movq K, %rax
  4830. subq KK, %rax
  4831. movq %rax, KKK
  4832. #else
  4833. movq KK, %rax
  4834. #ifdef LEFT
  4835. addq $2, %rax // number of values in AO
  4836. #else
  4837. addq $1, %rax // number of values in BO
  4838. #endif
  4839. movq %rax, KKK
  4840. #endif
  4841. andq $-8, %rax
  4842. je .L1_36
  4843. movq %rax, BI // Index for BO
  4844. salq $1, %rax // rax = rax *2 ; number of values
  4845. leaq (AO, %rax, SIZE), AO
  4846. leaq (BO, BI, SIZE), BO
  4847. negq BI
  4848. negq %rax
  4849. ALIGN_4
  4850. .L1_32:
  4851. KERNEL2x1_SUB
  4852. KERNEL2x1_SUB
  4853. KERNEL2x1_SUB
  4854. KERNEL2x1_SUB
  4855. KERNEL2x1_SUB
  4856. KERNEL2x1_SUB
  4857. KERNEL2x1_SUB
  4858. KERNEL2x1_SUB
  4859. je .L1_36
  4860. KERNEL2x1_SUB
  4861. KERNEL2x1_SUB
  4862. KERNEL2x1_SUB
  4863. KERNEL2x1_SUB
  4864. KERNEL2x1_SUB
  4865. KERNEL2x1_SUB
  4866. KERNEL2x1_SUB
  4867. KERNEL2x1_SUB
  4868. je .L1_36
  4869. jmp .L1_32
  4870. ALIGN_4
  4871. .L1_36:
  4872. #ifndef TRMMKERNEL
  4873. movq K, %rax
  4874. #else
  4875. movq KKK, %rax
  4876. #endif
  4877. andq $7, %rax # if (k & 1)
  4878. je .L1_39
  4879. movq %rax, BI // Index for BO
  4880. salq $1, %rax // rax = rax *2 ; number of values
  4881. leaq (AO, %rax, SIZE), AO
  4882. leaq (BO, BI, SIZE), BO
  4883. negq BI
  4884. negq %rax
  4885. ALIGN_4
  4886. .L1_37:
  4887. KERNEL2x1_SUB
  4888. jl .L1_37
  4889. ALIGN_4
  4890. .L1_39:
  4891. SAVE2x1
  4892. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4893. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4894. movq K, %rax
  4895. subq KKK, %rax
  4896. movq %rax, BI // Index for BO
  4897. leaq (BO, BI, SIZE), BO
  4898. salq $1, %rax // rax = rax * 2 ; number of values
  4899. leaq (AO, %rax, SIZE), AO
  4900. #endif
  4901. #if defined(TRMMKERNEL) && defined(LEFT)
  4902. addq $2, KK
  4903. #endif
  4904. addq $2 * SIZE, CO1 # coffset += 2
  4905. ALIGN_4
  4906. .L1_40:
  4907. testq $1, M
  4908. jz .L999
  4909. ALIGN_4
  4910. .L1_41:
  4911. #if !defined(TRMMKERNEL) || \
  4912. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4913. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4914. leaq BUFFER1, BO // first buffer to BO
  4915. addq $4 * SIZE, BO
  4916. #else
  4917. movq KK, %rax
  4918. leaq BUFFER1, BO // first buffer to BO
  4919. addq $4 * SIZE, BO
  4920. movq %rax, BI // Index for BO
  4921. leaq (BO, BI, SIZE), BO
  4922. leaq (AO, %rax, SIZE), AO
  4923. #endif
  4924. vzeroall
  4925. #ifndef TRMMKERNEL
  4926. movq K, %rax
  4927. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  4928. movq K, %rax
  4929. subq KK, %rax
  4930. movq %rax, KKK
  4931. #else
  4932. movq KK, %rax
  4933. #ifdef LEFT
  4934. addq $1, %rax // number of values in AO
  4935. #else
  4936. addq $1, %rax // number of values in BO
  4937. #endif
  4938. movq %rax, KKK
  4939. #endif
  4940. andq $-8, %rax
  4941. je .L1_46
  4942. movq %rax, BI // Index for BO
  4943. leaq (AO, %rax, SIZE), AO
  4944. leaq (BO, BI, SIZE), BO
  4945. negq BI
  4946. negq %rax
  4947. ALIGN_4
  4948. .L1_42:
  4949. KERNEL1x1_SUB
  4950. KERNEL1x1_SUB
  4951. KERNEL1x1_SUB
  4952. KERNEL1x1_SUB
  4953. KERNEL1x1_SUB
  4954. KERNEL1x1_SUB
  4955. KERNEL1x1_SUB
  4956. KERNEL1x1_SUB
  4957. je .L1_46
  4958. KERNEL1x1_SUB
  4959. KERNEL1x1_SUB
  4960. KERNEL1x1_SUB
  4961. KERNEL1x1_SUB
  4962. KERNEL1x1_SUB
  4963. KERNEL1x1_SUB
  4964. KERNEL1x1_SUB
  4965. KERNEL1x1_SUB
  4966. je .L1_46
  4967. jmp .L1_42
  4968. ALIGN_4
  4969. .L1_46:
  4970. #ifndef TRMMKERNEL
  4971. movq K, %rax
  4972. #else
  4973. movq KKK, %rax
  4974. #endif
  4975. andq $7, %rax # if (k & 1)
  4976. je .L1_49
  4977. movq %rax, BI // Index for BO
  4978. leaq (AO, %rax, SIZE), AO
  4979. leaq (BO, BI, SIZE), BO
  4980. negq BI
  4981. negq %rax
  4982. ALIGN_4
  4983. .L1_47:
  4984. KERNEL1x1_SUB
  4985. jl .L1_47
  4986. ALIGN_4
  4987. .L1_49:
  4988. SAVE1x1
  4989. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  4990. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  4991. movq K, %rax
  4992. subq KKK, %rax
  4993. movq %rax, BI // Index for BO
  4994. leaq (BO, BI, SIZE), BO
  4995. leaq (AO, %rax, SIZE), AO
  4996. #endif
  4997. #if defined(TRMMKERNEL) && defined(LEFT)
  4998. addq $1, KK
  4999. #endif
  5000. addq $1 * SIZE, CO1 # coffset += 1
  5001. ALIGN_4
  5002. .L999:
  5003. movq SP, %rsp
  5004. movq (%rsp), %rbx
  5005. movq 8(%rsp), %rbp
  5006. movq 16(%rsp), %r12
  5007. movq 24(%rsp), %r13
  5008. movq 32(%rsp), %r14
  5009. movq 40(%rsp), %r15
  5010. #ifdef WINDOWS_ABI
  5011. movq 48(%rsp), %rdi
  5012. movq 56(%rsp), %rsi
  5013. movups 64(%rsp), %xmm6
  5014. movups 80(%rsp), %xmm7
  5015. movups 96(%rsp), %xmm8
  5016. movups 112(%rsp), %xmm9
  5017. movups 128(%rsp), %xmm10
  5018. movups 144(%rsp), %xmm11
  5019. movups 160(%rsp), %xmm12
  5020. movups 176(%rsp), %xmm13
  5021. movups 192(%rsp), %xmm14
  5022. movups 208(%rsp), %xmm15
  5023. #endif
  5024. addq $STACKSIZE, %rsp
  5025. ret
  5026. EPILOGUE
  5027. #endif