- /***************************************************************************
- Copyright (c) 2013-2019, The OpenBLAS Project
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in
- the documentation and/or other materials provided with the
- distribution.
- 3. Neither the name of the OpenBLAS project nor the names of
- its contributors may be used to endorse or promote products
- derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *****************************************************************************/
-
- #define unit_size 4
- #define DISP64(ind,disp) (ind*unit_size*64+disp)
- #define DISP32(ind,disp) (ind*unit_size*32+disp)
- #define DISP16(ind,disp) (ind*unit_size*16+disp)
- #define DISP8(ind,disp) (ind*unit_size*8+disp)
- #define DISP4(ind,disp) (ind*unit_size*4+disp)
- #define DISP2(ind,disp) (ind*unit_size*2+disp)
- #define DISP1(ind,disp) (ind*unit_size+disp)
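-
- /* unit_size is the element size in bytes (4 for single precision).
-  * DISPn(ind,disp) expands to the byte displacement ind*unit_size*n + disp,
-  * i.e. the offset of unrolled iteration `ind` when n single-precision
-  * elements are consumed per iteration, plus a fixed adjustment disp. */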
-
- /**********************************************************************************************
- * Macros for N=8 and M=16
- **********************************************************************************************/
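-
- /* The 8x16 kernel accumulates a 16x8 block of C in vs32-vs63. Each step
-  * loads 16 elements of A into vs0-vs3 (vs4-vs7 for the second unrolled
-  * copy) and 8 elements of B into vs24/vs28 (vs8/vs12 for the second copy),
-  * while xxperm/xxpermdi build the rotated B copies vs25-vs27 and vs29-vs31
-  * used by the butterfly-style rank-1 update that SAVE8x16 later undoes. */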
-
-
-
- .macro KERNEL8x16_L1_L4 Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero8X16
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs54, vs54, vs54
- xxlxor vs55, vs55, vs55
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs58, vs58, vs58
- xxlxor vs59, vs59, vs59
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- xxlxor vs62, vs62, vs62
- xxlxor vs63, vs63, vs63
- .endm
-
- .macro LOAD8x16 OffsetA,OffsetB
-
- lxv vs24, (\OffsetB+0)(BO)
- lxv vs28, (\OffsetB+16)(BO)
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- lxv vs0, (\OffsetA+0)(AO)
- lxv vs1, (\OffsetA+16)(AO)
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- lxv vs2, (\OffsetA+32)(AO)
- lxv vs3, (\OffsetA+48)(AO)
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endm
-
- .macro END8x16_NORMAL
- END8x16 0, AO, BO, 64,32
- .endm
-
- .macro END8x16_WITHOUT_ADD
- END8x16 0, AO,BO,0,0
- .endm
-
- .macro END8x16 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
- xvmulsp vs50, vs2,vs28
- xvmulsp vs51, vs3,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
- xvmulsp vs54, vs2,vs29
- xvmulsp vs55, vs3,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
- xvmulsp vs58, vs2,vs30
- xvmulsp vs59, vs3,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
- xvmulsp vs62, vs2,vs31
- xvmulsp vs63, vs3,vs31
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
-
- .endif
- .endm
-
- .macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
- KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
-
- .endm
-
- .macro KERNEL8x16 First
-
- LOAD8x16 0,0
- END8x16 \First, AO, BO, 64,32
- .endm
-
- .macro LOAD8x16_2
- LOAD8x16_2O AO,BO, 0,0
- .endm
-
- .macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
- lxv vs8, (\OffsetB)(\BREG)
- lxv vs12, (16+\OffsetB)(\BREG)
- lxv vs24, (32+\OffsetB)(\BREG)
- lxv vs28, (32+16+\OffsetB)(\BREG)
- lxv vs4, (0+\OffsetA)(\AREG)
- lxv vs5, (16+\OffsetA)(\AREG)
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- lxv vs6, (32+\OffsetA)(\AREG)
- lxv vs7, (48+\OffsetA)(\AREG)
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- lxv vs0, (64+\OffsetA)(\AREG)
- lxv vs1, (64+16+\OffsetA)(\AREG)
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
- lxv vs2, (64+32+\OffsetA)(\AREG)
- lxv vs3, (64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
- .endm
-
- .macro END8x16_2
- /* for LOAD8x16_2 the offsets will be 128 (A) and 64 (B) */
- KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
- .endm
-
-
-
- .macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
- .endm
-
-
- .macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
- .endm
-
-
- .macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .if \Complete==0
- lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
- xvmaddasp vs50, vs6,vs12
- xvmaddasp vs51, vs7,vs12
- .if \Complete==0
- lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
- lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
- xvmaddasp vs58, vs6,vs14
- xvmaddasp vs59, vs7,vs14
- .if \Complete==0
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- .endif
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
- xvmaddasp vs54, vs6,vs13
- xvmaddasp vs55, vs7,vs13
- .if \Complete==0
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- .endif
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
- xvmaddasp vs62, vs6,vs15
- xvmaddasp vs63, vs7,vs15
- .if \Complete==0
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
- .endif
-
- .if \Complete==0
- lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
- .if \Complete==0
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- .endif
-
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
- xvmaddasp vs50, vs2,vs28
- xvmaddasp vs51, vs3,vs28
- .if \Complete==0
- lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
- xvmaddasp vs58, vs2,vs30
- xvmaddasp vs59, vs3,vs30
- .if \Complete==0
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- .endif
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs54, vs2,vs29
- xvmaddasp vs55, vs3,vs29
- .if \Complete==0
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- .endif
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
- xvmaddasp vs62, vs2,vs31
- xvmaddasp vs63, vs3,vs31
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
- .endif
- .if \Complete==0
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
-
- .endif
- .endif
-
-
- .endm
-
-
- .macro SAVE8x16
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
- add T4, T2, T10
- add T5, T3, T10
-
- add T6, T4, T10
- add T7, T5, T10
-
-
-
- /* permute to restore the butterfly rank-1 update to the normal promoted layout */
- /* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
- /* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
- /* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
- /* permute 16 vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
- #ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
- #endif
- xxmrglw vs16, vs34, vs46
- xxmrglw vs18, vs38, vs42
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxmrghw vs4, vs38, vs42
- xxmrghw vs5, vs34, vs46
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs35, vs47
- xxmrglw vs26, vs39, vs43
-
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
-
- xxmrghw vs30, vs39, vs43
- xxmrghw vs31, vs35, vs47
- #ifndef TRMMKERNEL
- lxv vs34, 32(CO)
- lxv vs35, 48(CO)
- #endif
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs36, 0(T1)
- lxv vs37, 16(T1)
- #endif
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- #ifndef TRMMKERNEL
- lxv vs38, 32(T1)
- lxv vs39, 48(T1)
- #endif
-
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
-
-
- #ifndef TRMMKERNEL
- lxv vs40, 0(T2)
- lxv vs41, 16(T2)
- #endif
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs42, 32(T2)
- lxv vs43, 48(T2)
- #endif
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
- #ifndef TRMMKERNEL
- lxv vs44, 0(T3)
- lxv vs45, 16(T3)
- #endif
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
- #ifndef TRMMKERNEL
- lxv vs46, 32(T3)
- lxv vs47, 48(T3)
- #endif
-
-
-
-
-
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- #endif
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
-
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- #else
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- #endif
-
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
-
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
- #ifdef TRMMKERNEL
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- #else
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- #endif
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- #ifdef TRMMKERNEL
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- #else
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- #endif
-
- stxv vs40, 0(T2)
- stxv vs41, 16(T2)
- #ifdef TRMMKERNEL
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- #else
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- #endif
- stxv vs42, 32(T2)
- stxv vs43, 48(T2)
- #ifdef TRMMKERNEL
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- #else
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- #endif
- stxv vs44, 0(T3)
- stxv vs45, 16(T3)
- #ifdef TRMMKERNEL
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
- #endif
- stxv vs46, 32(T3)
- stxv vs47, 48(T3)
-
- /***** the same for the second 8x8 block *****/
- #ifndef TRMMKERNEL
- lxv vs32, 0(T4)
- lxv vs33, 16(T4)
- #endif
- xxmrglw vs8, vs48, vs60
- xxmrglw vs10, vs52, vs56
- #ifndef TRMMKERNEL
- lxv vs34, 32(T4)
- lxv vs35, 48(T4)
- #endif
- xxmrghw vs1, vs48, vs60
- xxmrghw vs0, vs52, vs56
- #ifndef TRMMKERNEL
- lxv vs36, 0(T5)
- lxv vs37, 16(T5)
- #endif
- xxmrglw vs12, vs49, vs61
- xxmrglw vs14, vs53, vs57
- #ifndef TRMMKERNEL
- lxv vs38, 32(T5)
- lxv vs39, 48(T5)
- #endif
-
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
- #ifndef TRMMKERNEL
- lxv vs40, 0(T6)
- lxv vs41, 16(T6)
- #endif
- xxmrglw vs16, vs50, vs62
- xxmrglw vs18, vs54, vs58
- #ifndef TRMMKERNEL
- lxv vs42, 32(T6)
- lxv vs43, 48(T6)
- #endif
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
- xxmrghw vs4, vs54, vs58
- xxmrghw vs5, vs50, vs62
- #ifndef TRMMKERNEL
- lxv vs44, 0(T7)
- lxv vs45, 16(T7)
- #endif
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs51, vs63
- xxmrglw vs26, vs55, vs59
- #ifndef TRMMKERNEL
- lxv vs46, 32(T7)
- lxv vs47, 48(T7)
- #endif
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
- xxmrghw vs30, vs55, vs59
- xxmrghw vs31, vs51, vs63
-
-
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
-
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- #endif
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
- stxv vs32, 0(T4)
- stxv vs33, 16(T4)
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- #else
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- #endif
- stxv vs34, 32(T4)
- stxv vs35, 48(T4)
-
- #ifdef TRMMKERNEL
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- #else
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- #endif
- stxv vs36, 0(T5)
- stxv vs37, 16(T5)
-
- #ifdef TRMMKERNEL
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
-
-
-
-
- stxv vs38, 32(T5)
- stxv vs39, 48(T5)
-
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- #else
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- #endif
- stxv vs40, 0(T6)
- stxv vs41, 16(T6)
- #ifdef TRMMKERNEL
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- #else
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- #endif
- stxv vs42, 32(T6)
- stxv vs43, 48(T6)
- #ifdef TRMMKERNEL
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- #else
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- #endif
-
- stxv vs44, 0(T7)
- stxv vs45, 16(T7)
- #ifdef TRMMKERNEL
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
- #endif
-
- stxv vs46, 32(T7)
- stxv vs47, 48(T7)
-
-
- addi CO,CO,64
-
-
- .endm
-
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=8
- **********************************************************************************************/
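-
- /* The 8x8 kernel keeps the same N=8 B layout (vs24/vs28 plus the permuted
-  * copies vs25-vs27 and vs29-vs31) but loads only 8 elements of A (vs0,vs1),
-  * so it accumulates into the register pairs vs32-vs33, vs36-vs37, ...,
-  * vs60-vs61 rather than the full vs32-vs63 range. */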
-
- .macro LOAD8x8_1
- LOAD8x8 1
- .endm
-
- .macro LOAD8x8_0
- LOAD8x8 0
- .endm
-
- .macro KERNEL8x8_L1_L4 Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro END8x8_NORMAL
- END8x8 0, AO, BO, 32,32
- .endm
-
- .macro Zero8X8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
-
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
-
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
-
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
-
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
-
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
-
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
-
- .endm
-
- .macro LOAD8x8 Zero
-
- lxv vs24, 0(BO)
- lxv vs28, 16(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs52, vs52, vs52
- xxlxor vs53, vs53, vs53
- xxlxor vs56, vs56, vs56
- xxlxor vs57, vs57, vs57
- xxlxor vs60, vs60, vs60
- xxlxor vs61, vs61, vs61
- .endif
- .endm
-
-
- .macro END8x8 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- .endif
- .endm
-
- .macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
-
- lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
-
-
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
- lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
- lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
-
-
- lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
-
-
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
- .if \Complete==0
- lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
- .endif
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
- .if \Complete==0
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- .endif
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
-
- .if \Complete==0
- lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
- .endif
-
- .if \Complete==0
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP32(\Index,128)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .endm
-
- .macro KERNEL8x8 First
-
- LOAD8x8 0
- END8x8 \First, AO, BO, 32,32
- .endm
-
- .macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxperm vs14, vs12, permute_mask
- xxpermdi vs9, vs8, vs8,2
- xxpermdi vs13, vs12, vs12,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- .endif
-
- xxpermdi vs11, vs10, vs10,2
- xxpermdi vs15, vs14, vs14,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
- xvmulsp vs48, vs0,vs28
- xvmulsp vs49, vs1,vs28
-
- xvmulsp vs52, vs0,vs29
- xvmulsp vs53, vs1,vs29
-
- xvmulsp vs56, vs0,vs30
- xvmulsp vs57, vs1,vs30
-
- xvmulsp vs60, vs0,vs31
- xvmulsp vs61, vs1,vs31
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
- xvmaddasp vs48, vs0,vs28
- xvmaddasp vs49, vs1,vs28
-
- xvmaddasp vs52, vs0,vs29
- xvmaddasp vs53, vs1,vs29
-
- xvmaddasp vs56, vs0,vs30
- xvmaddasp vs57, vs1,vs30
-
- xvmaddasp vs60, vs0,vs31
- xvmaddasp vs61, vs1,vs31
-
- .endif
- .if \Complete==0
- lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
-
- lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxperm vs30, vs28, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs29, vs28, vs28,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
- addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
-
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
- xxpermdi vs31, vs30, vs30,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
-
- xvmulsp vs48, vs4,vs12
- xvmulsp vs49, vs5,vs12
-
- xvmulsp vs52, vs4,vs13
- xvmulsp vs53, vs5,vs13
-
- xvmulsp vs56, vs4,vs14
- xvmulsp vs57, vs5,vs14
-
- xvmulsp vs60, vs4,vs15
- xvmulsp vs61, vs5,vs15
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- xvmaddasp vs48, vs4,vs12
- xvmaddasp vs49, vs5,vs12
-
- xvmaddasp vs52, vs4,vs13
- xvmaddasp vs53, vs5,vs13
-
- xvmaddasp vs56, vs4,vs14
- xvmaddasp vs57, vs5,vs14
-
- xvmaddasp vs60, vs4,vs15
- xvmaddasp vs61, vs5,vs15
-
- .endif
-
- .endm
-
-
- .macro SAVE8x8
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
- add T4, T2, T10
- add T5, T3, T10
-
- add T6, T4, T10
- add T7, T5, T10
-
- #ifndef TRMMKERNEL
- lxv vs34, 0(CO)
- lxv vs35, 16(CO)
- lxv vs38, 0(T1)
- lxv vs39, 16(T1)
- lxv vs42, 0(T2)
- lxv vs43, 16(T2)
- lxv vs46, 0(T3)
- lxv vs47, 16(T3)
-
- lxv vs50, 0(T4)
- lxv vs51, 16(T4)
- lxv vs54, 0(T5)
- lxv vs55, 16(T5)
- lxv vs58, 0(T6)
- lxv vs59, 16(T6)
- lxv vs62, 0(T7)
- lxv vs63, 16(T7)
- #endif
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
-
- /* multiply-add in the normal way */
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs8, alpha_r
- xvmulsp vs35, vs12, alpha_r
- xvmulsp vs38, vs9, alpha_r
- xvmulsp vs39, vs13, alpha_r
- xvmulsp vs42, vs10, alpha_r
- xvmulsp vs43, vs14, alpha_r
- xvmulsp vs46, vs11, alpha_r
- xvmulsp vs47, vs15, alpha_r
- #else
- xvmaddasp vs34, vs8, alpha_r
- xvmaddasp vs35, vs12, alpha_r
- xvmaddasp vs38, vs9, alpha_r
- xvmaddasp vs39, vs13, alpha_r
- xvmaddasp vs42, vs10, alpha_r
- xvmaddasp vs43, vs14, alpha_r
- xvmaddasp vs46, vs11, alpha_r
- xvmaddasp vs47, vs15, alpha_r
- #endif
-
-
- xxmrglw vs8, vs48, vs60
- xxmrglw vs10, vs52, vs56
-
- xxmrghw vs1, vs48, vs60
- xxmrghw vs0, vs52, vs56
- stxv vs34, 0(CO)
- stxv vs35, 16(CO)
- xxmrglw vs12, vs49, vs61
- xxmrglw vs14, vs53, vs57
- stxv vs38, 0(T1)
- stxv vs39, 16(T1)
- xxmrghw vs2, vs53, vs57
- xxmrghw vs3, vs49, vs61
- stxv vs42, 0(T2)
- stxv vs43, 16(T2)
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
- stxv vs46, 0(T3)
- stxv vs47, 16(T3)
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
-
-
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
- #ifdef TRMMKERNEL
- xvmulsp vs50, vs8, alpha_r
- xvmulsp vs51, vs12, alpha_r
- xvmulsp vs54, vs9, alpha_r
- xvmulsp vs55, vs13, alpha_r
- xvmulsp vs58, vs10, alpha_r
- xvmulsp vs59, vs14, alpha_r
- xvmulsp vs62, vs11, alpha_r
- xvmulsp vs63, vs15, alpha_r
- #else
- xvmaddasp vs50, vs8, alpha_r
- xvmaddasp vs51, vs12, alpha_r
- xvmaddasp vs54, vs9, alpha_r
- xvmaddasp vs55, vs13, alpha_r
- xvmaddasp vs58, vs10, alpha_r
- xvmaddasp vs59, vs14, alpha_r
- xvmaddasp vs62, vs11, alpha_r
- xvmaddasp vs63, vs15, alpha_r
- #endif
-
- stxv vs50, 0(T4)
- stxv vs51, 16(T4)
- stxv vs54, 0(T5)
- stxv vs55, 16(T5)
- stxv vs58, 0(T6)
- stxv vs59, 16(T6)
- stxv vs62, 0(T7)
- stxv vs63, 16(T7)
-
- addi CO,CO,32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=4
- **********************************************************************************************/
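-
- /* For M=4 the roles are swapped: a single A vector (vs0) is permuted into
-  * vs1-vs3 while the two B vectors (vs24, vs25) stay unpermuted, and the
-  * results accumulate into vs32-vs35 and vs48-vs51. */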
-
- .macro LOAD8x4_1
- LOAD8x4 1
- .endm
-
- .macro LOAD8x4_0
- LOAD8x4 0
- .endm
-
- .macro KERNEL8x4_L1_L4 Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero8X4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
-
- .endm
-
- .macro LOAD8x4 Zero
-
- lxv vs0, 0(AO)
- lxv vs24, 0(BO)
- lxv vs25, 16(BO)
-
-
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- xxlxor vs48, vs48, vs48
- xxlxor vs49, vs49, vs49
- xxlxor vs50, vs50, vs50
- xxlxor vs51, vs51, vs51
- .endif
- .endm
-
- .macro END8x4_NORMAL
- END8x4 0, AO, BO, 16,32
- .endm
-
- .macro END8x4 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- xvmulsp vs48, vs25, vs0
- xvmulsp vs49, vs25, vs1
- xvmulsp vs50, vs25, vs2
- xvmulsp vs51, vs25, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- .endif
- .endm
-
- .macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG)
- lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
-
-
- lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
-
- .if \Complete==0
-
- lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
- lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG)
- lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
-
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
- addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP32(\Index,128)
-
- .endif
- .endif
-
-
- .endm
-
- .macro KERNEL8x4 First
- LOAD8x4 0
- END8x4 \First, AO, BO, 16,32
- .endm
-
- .macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- xvmulsp vs48, vs25, vs0
- xvmulsp vs49, vs25, vs1
- xvmulsp vs50, vs25, vs2
- xvmulsp vs51, vs25, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- xvmaddasp vs48, vs25, vs0
- xvmaddasp vs49, vs25, vs1
- xvmaddasp vs50, vs25, vs2
- xvmaddasp vs51, vs25, vs3
- .endif
-
- .if \Complete==0
-
- lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG)
- lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
-
- .if \First==1
- xvmulsp vs32, vs26, vs4
- xvmulsp vs33, vs26, vs5
- xvmulsp vs34, vs26, vs6
- xvmulsp vs35, vs26, vs7
-
- xvmulsp vs48, vs27, vs4
- xvmulsp vs49, vs27, vs5
- xvmulsp vs50, vs27, vs6
- xvmulsp vs51, vs27, vs7
-
-
- .else
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- xvmaddasp vs48, vs27, vs4
- xvmaddasp vs49, vs27, vs5
- xvmaddasp vs50, vs27, vs6
- xvmaddasp vs51, vs27, vs7
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP16(\Index,64)
-
- .endif
- .endif
-
-
- .endm
-
-
- .macro SAVE8x4
- slwi T10, LDC , 1
- add T1, CO, LDC
- #if !defined(TRMMKERNEL)
- lxv vs36, 0(CO)
- lxv vs37, 0(T1)
- #endif
- add T2, CO, T10
- add T3, T1, T10
- #if !defined(TRMMKERNEL)
- lxv vs38, 0(T2)
- lxv vs39, 0(T3)
- #endif
- add T4, T2, T10
- add T5, T3, T10
- #if !defined(TRMMKERNEL)
- lxv vs40, 0(T4)
- lxv vs41, 0(T5)
- #endif
- add T6, T4, T10
- add T7, T5, T10
- #if !defined(TRMMKERNEL)
- lxv vs42, 0(T6)
- lxv vs43, 0(T7)
- #endif
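- /* the merge-word/merge-doubleword sequence below regroups the accumulator lanes so that
-    vs24-vs31 each hold one 4-element column of the 4x8 tile, ready to be scaled by alpha
-    and stored through the eight column pointers CO,T1..T7 */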
- xxmrglw vs0, vs35,vs32
- xxmrglw vs1, vs34,vs33
- xxmrglw vs4, vs32,vs35
- xxmrglw vs5, vs33,vs34
-
-
- xxmrghw vs2, vs35,vs32
- xxmrghw vs3, vs34,vs33
- xxmrghw vs6, vs32,vs35
- xxmrghw vs7, vs33,vs34
-
- xxmrgld vs24, vs1, vs0
- xxmrghd vs25,vs5,vs4
-
- xxmrgld vs26, vs2, vs3
- xxmrghd vs27,vs6,vs7
-
-
- xxmrglw vs0, vs51,vs48
- xxmrglw vs1, vs50,vs49
- xxmrglw vs4, vs48,vs51
- xxmrglw vs5, vs49,vs50
-
- xxmrghw vs2, vs51,vs48
- xxmrghw vs3, vs50,vs49
- xxmrghw vs6, vs48,vs51
- xxmrghw vs7, vs49,vs50
-
- xxmrgld vs28, vs1, vs0
- xxmrghd vs29,vs5,vs4
-
- xxmrgld vs30, vs2, vs3
- xxmrghd vs31,vs6,vs7
- #if defined(TRMMKERNEL)
-
- xvmulsp vs36, vs24, alpha_r
- xvmulsp vs37, vs25, alpha_r
- xvmulsp vs38, vs26, alpha_r
- xvmulsp vs39, vs27, alpha_r
- xvmulsp vs40, vs28, alpha_r
- xvmulsp vs41, vs29, alpha_r
- xvmulsp vs42, vs30, alpha_r
- xvmulsp vs43, vs31, alpha_r
- #else
- xvmaddasp vs36, vs24, alpha_r
- xvmaddasp vs37, vs25, alpha_r
- xvmaddasp vs38, vs26, alpha_r
- xvmaddasp vs39, vs27, alpha_r
- xvmaddasp vs40, vs28, alpha_r
- xvmaddasp vs41, vs29, alpha_r
- xvmaddasp vs42, vs30, alpha_r
- xvmaddasp vs43, vs31, alpha_r
- #endif
-
- stxv vs36, 0(CO)
- stxv vs37, 0(T1)
- stxv vs38, 0(T2)
- stxv vs39, 0(T3)
- stxv vs40, 0(T4)
- stxv vs41, 0(T5)
- stxv vs42, 0(T6)
- stxv vs43, 0(T7)
-
-
- addi CO,CO,16
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=2
- **********************************************************************************************/
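- /* each k-iteration splats the two A values across full vectors (xxspltw) and multiplies them
-    against the eight B values in vs26/vs27; sums build up in vs0-vs3 and SAVE8x2 writes them
-    back one scalar at a time */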
-
-
- .macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
-
- .macro Zero8x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
-
- .endm
-
- .macro KERNEL8x2
- KERNEL8x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 0
- xxspltw vs9, vs36, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs26, vs9
- xvmulsp vs3, vs27, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs26, vs9
- xvmaddasp vs3, vs27, vs9
-
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP8(\Index,32)
-
- .endm
-
- .macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
-
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG)
- xxspltw vs8, vs4, 2
- xxspltw vs9, vs4, 3
- xxspltw vs10, vs4, 0
- xxspltw vs11, vs4, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs26, vs9
- xvmulsp vs3, vs27, vs9
-
- /* later k-iterations must accumulate on top of the first products */
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs1, vs29, vs10
- xvmaddasp vs2, vs28, vs11
- xvmaddasp vs3, vs29, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs26, vs9
- xvmaddasp vs3, vs27, vs9
-
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs1, vs29, vs10
- xvmaddasp vs2, vs28, vs11
- xvmaddasp vs3, vs29, vs11
- .endif
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE8x2
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- add T4, T2, T10
- add T5, T3, T10
- add T6, T4, T10
- add T7, T5, T10
- /* convert alpha_r to double precision for the scalar multiplies */
- xscvspdp vs4,alpha_r
- /* remember: v0 aliases vs32 (VRs v0-v31 overlap vs32-vs63) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v1,4(CO)
-
- lxssp v2,0(T1)
- lxssp v3,4(T1)
-
- lxssp v4,0(T2)
- lxssp v5,4(T2)
-
- lxssp v6,0(T3)
- lxssp v7,4(T3)
-
- lxssp v8,0(T4)
- lxssp v9,4(T4)
-
- lxssp v10,0(T5)
- lxssp v11,4(T5)
-
- lxssp v12,0(T6)
- lxssp v13,4(T6)
-
- lxssp v14,0(T7)
- lxssp v15,4(T7)
- #endif
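- /* alpha is applied in double precision: the accumulated lanes are extracted and converted
-    (xxspltw/xscvspdp), combined with the converted alpha via xsmuldp/xsmaddadp, and stored
-    back as singles with stxssp */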
- xscvspdp vs5, vs2
- xxspltw vs6, vs2, 1
- xxspltw vs7, vs2, 2
- xxspltw vs8, vs2, 3
- xscvspdp vs6,vs6
- xscvspdp vs7,vs7
- xscvspdp vs8,vs8
-
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
- xscvspdp vs9, vs3
- xxspltw vs10, vs3, 1
- xxspltw vs11, vs3, 2
- xxspltw vs12, vs3, 3
- xscvspdp vs10,vs10
- xscvspdp vs11,vs11
- xscvspdp vs12,vs12
-
- xscvspdp vs28, vs1
- xxspltw vs29, vs1, 1
- xxspltw vs30, vs1, 2
- xxspltw vs31, vs1, 3
- xscvspdp vs29,vs29
- xscvspdp vs30,vs30
- xscvspdp vs31,vs31
-
-
-
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs8, vs4
- xsmuldp vs33,vs27, vs4
-
- xsmuldp vs34,vs7, vs4
- xsmuldp vs35,vs26, vs4
-
- xsmuldp vs36,vs6, vs4
- xsmuldp vs37,vs25, vs4
-
- xsmuldp vs38,vs5, vs4
- xsmuldp vs39,vs24, vs4
-
- xsmuldp vs40,vs12, vs4
- xsmuldp vs41,vs31, vs4
-
- xsmuldp vs42,vs11, vs4
- xsmuldp vs43,vs30, vs4
-
- xsmuldp vs44,vs10, vs4
- xsmuldp vs45,vs29, vs4
-
- xsmuldp vs46,vs9, vs4
- xsmuldp vs47,vs28, vs4
- #else
- xsmaddadp vs32,vs8, vs4
- xsmaddadp vs33,vs27, vs4
-
- xsmaddadp vs34,vs7, vs4
- xsmaddadp vs35,vs26, vs4
-
- xsmaddadp vs36,vs6, vs4
- xsmaddadp vs37,vs25, vs4
-
- xsmaddadp vs38,vs5, vs4
- xsmaddadp vs39,vs24, vs4
-
- xsmaddadp vs40,vs12, vs4
- xsmaddadp vs41,vs31, vs4
-
- xsmaddadp vs42,vs11, vs4
- xsmaddadp vs43,vs30, vs4
-
- xsmaddadp vs44,vs10, vs4
- xsmaddadp vs45,vs29, vs4
-
- xsmaddadp vs46,vs9, vs4
- xsmaddadp vs47,vs28, vs4
- #endif
-
- stxssp v0,0(CO)
- stxssp v1,4(CO)
-
- stxssp v2,0(T1)
- stxssp v3,4(T1)
-
- stxssp v4,0(T2)
- stxssp v5,4(T2)
-
- stxssp v6,0(T3)
- stxssp v7,4(T3)
-
- stxssp v8,0(T4)
- stxssp v9,4(T4)
-
- stxssp v10,0(T5)
- stxssp v11,4(T5)
-
- stxssp v12,0(T6)
- stxssp v13,4(T6)
-
- stxssp v14,0(T7)
- stxssp v15,4(T7)
-
-
- addi CO,CO,8
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=8 and M=1
- **********************************************************************************************/
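- /* one A element is broadcast per k-iteration (lxvwsx or xxspltw) and multiplied against the
-    eight B values held in two vectors; vs0 and vs1 carry the running sums */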
- .macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro Zero8x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- .endm
-
- .macro KERNEL8x1
- KERNEL8x1_1 AO,BO, 0
- .endm
-
- .macro KERNEL8x1_2
- KERNEL8x1_2_1 AO,BO, 0
- .endm
-
- .macro KERNEL8x1_1 AREG,BREG,First
- lxvwsx vs8, 0, \AREG
- lxv vs26, 0(\BREG)
- lxv vs27, 16(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- .endif
- addi \AREG, \AREG, 4
- addi \BREG, \BREG, 32
- .endm
-
- .macro KERNEL8x1_2_1 AREG,BREG,First
- lxsd v4, 0(\AREG)
- lxv vs26, 0(\BREG)
- lxv vs27, 16(\BREG)
- lxv vs28, 32(\BREG)
- lxv vs29, 48(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- /* the second k-iteration must accumulate */
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- .endif
- addi \AREG, \AREG, 8
- addi \BREG, \BREG, 64
- .endm
-
- .macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- xxspltw vs8, vs4, 3
- xxspltw vs9, vs4, 2
- xxspltw vs10, vs4, 1
- xxspltw vs11, vs4, 0
- lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
- lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
- lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG)
- lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG)
- lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG)
- lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG)
- lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG)
- lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- /* later k-iterations must accumulate on top of the first products */
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
- xvmaddasp vs0, vs32, vs11
- xvmaddasp vs1, vs33, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs1, vs29, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
- xvmaddasp vs0, vs32, vs11
- xvmaddasp vs1, vs33, vs11
- .endif
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP32(\Index,128)
- .endif
- .endm
-
- .macro SAVE8x1
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- add T4, T2, T10
- add T5, T3, T10
- add T6, T4, T10
- add T7, T5, T10
- /* convert alpha_r to double precision for the scalar multiplies */
- xscvspdp vs4,alpha_r
- /* remember: v0 aliases vs32 (VRs v0-v31 overlap vs32-vs63) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v2,0(T1)
- lxssp v4,0(T2)
- lxssp v6,0(T3)
- lxssp v8,0(T4)
- lxssp v10,0(T5)
- lxssp v12,0(T6)
- lxssp v14,0(T7)
- #endif
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
- xscvspdp vs28, vs1
- xxspltw vs29, vs1, 1
- xxspltw vs30, vs1, 2
- xxspltw vs31, vs1, 3
- xscvspdp vs29,vs29
- xscvspdp vs30,vs30
- xscvspdp vs31,vs31
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs27, vs4
- xsmuldp vs34,vs26, vs4
- xsmuldp vs36,vs25, vs4
- xsmuldp vs38,vs24, vs4
- xsmuldp vs40,vs31, vs4
- xsmuldp vs42,vs30, vs4
- xsmuldp vs44,vs29, vs4
- xsmuldp vs46,vs28, vs4
- #else
- xsmaddadp vs32,vs27, vs4
- xsmaddadp vs34,vs26, vs4
- xsmaddadp vs36,vs25, vs4
- xsmaddadp vs38,vs24, vs4
- xsmaddadp vs40,vs31, vs4
- xsmaddadp vs42,vs30, vs4
- xsmaddadp vs44,vs29, vs4
- xsmaddadp vs46,vs28, vs4
- #endif
- stxssp v0,0(CO)
- stxssp v2,0(T1)
- stxssp v4,0(T2)
- stxssp v6,0(T3)
- stxssp v8,0(T4)
- stxssp v10,0(T5)
- stxssp v12,0(T6)
- stxssp v14,0(T7)
- addi CO,CO,4
- .endm
-
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=16
- **********************************************************************************************/
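- /* sixteen A values per k-iteration live in vs0-vs3; the four B values are loaded into vs24 and
-    expanded into three permuted copies vs25-vs27. The 4x16 tile accumulates in vs32-vs47 and is
-    de-interleaved with the save_permute masks in SAVE4x16 before alpha is applied. */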
-
- .macro LOAD4x16_1
- LOAD4x16 1
- .endm
-
- .macro LOAD4x16_0
- LOAD4x16 0
- .endm
-
- .macro KERNEL4x16_L1_L4 Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero4X16
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
- .endm
-
- .macro LOAD4x16 Zero
-
- lxv vs24, 0(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
- lxv vs2, 32(AO)
- lxv vs3, 48(AO)
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- xxpermdi vs27, vs26, vs26,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs38, vs38, vs38
- xxlxor vs39, vs39, vs39
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs42, vs42, vs42
- xxlxor vs43, vs43, vs43
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
- xxlxor vs46, vs46, vs46
- xxlxor vs47, vs47, vs47
-
- .endif
- .endm
-
- .macro END4x16_NORMAL
- END4x16 0, AO, BO, 64,16
- .endm
-
- .macro END4x16 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
- .endif
- .endm
-
- .macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
-
- lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- xxpermdi vs27, vs26, vs26,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
- lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
-
- lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
- lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
- lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
- lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
-
- .if \Complete==0
- lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
-
- lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
- lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
- lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
- lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
- addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
-
- .endm
-
- .macro KERNEL4x16 First
-
- LOAD4x16 0
- END4x16 \First, AO, BO, 64,16
- .endm
-
- .macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
- xvmulsp vs34, vs2,vs24
- xvmulsp vs35, vs3,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
- xvmulsp vs38, vs2,vs25
- xvmulsp vs39, vs3,vs25
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
- xvmaddasp vs34, vs2,vs24
- xvmaddasp vs35, vs3,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
- xvmaddasp vs38, vs2,vs25
- xvmaddasp vs39, vs3,vs25
- .endif
-
- xxpermdi vs11, vs10, vs10,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
- xvmulsp vs42, vs2,vs26
- xvmulsp vs43, vs3,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
- xvmulsp vs46, vs2,vs27
- xvmulsp vs47, vs3,vs27
-
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
- xvmaddasp vs42, vs2,vs26
- xvmaddasp vs43, vs3,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
- xvmaddasp vs46, vs2,vs27
- xvmaddasp vs47, vs3,vs27
-
-
- .endif
- .if \Complete==0
- lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
- lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
- lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
- lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
- xvmulsp vs34, vs6,vs8
- xvmulsp vs35, vs7,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
- xvmulsp vs38, vs6,vs9
- xvmulsp vs39, vs7,vs9
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
- xvmaddasp vs34, vs6,vs8
- xvmaddasp vs35, vs7,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
- xvmaddasp vs38, vs6,vs9
- xvmaddasp vs39, vs7,vs9
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
- xvmulsp vs42, vs6,vs10
- xvmulsp vs43, vs7,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
- xvmulsp vs46, vs6,vs11
- xvmulsp vs47, vs7,vs11
-
-
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
- xvmaddasp vs42, vs6,vs10
- xvmaddasp vs43, vs7,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
- xvmaddasp vs46, vs6,vs11
- xvmaddasp vs47, vs7,vs11
-
-
-
- .endif
-
- .endm
-
-
- .macro SAVE4x16
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
-
-
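- /* regroup the vs32-vs47 accumulators into store order (vs8-vs19 and vs24-vs27) with the
-    merge/xxperm sequence below; they are then scaled by alpha and, unless TRMMKERNEL,
-    added to the C vectors loaded in between */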
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxmrglw vs16, vs34, vs46
- xxmrglw vs18, vs38, vs42
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxmrghw vs4, vs38, vs42
- xxmrghw vs5, vs34, vs46
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxmrglw vs24, vs35, vs47
- xxmrglw vs26, vs39, vs43
-
- xxlor vs17, vs16, vs16
- xxlor vs19, vs18, vs18
-
- xxmrghw vs30, vs39, vs43
- xxmrghw vs31, vs35, vs47
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- #ifndef TRMMKERNEL
- lxv vs32, 0(CO)
- lxv vs33, 16(CO)
- lxv vs34, 32(CO)
- lxv vs35, 48(CO)
- #endif
- xxlor vs25, vs24, vs24
- xxlor vs27, vs26, vs26
-
- #ifndef TRMMKERNEL
- lxv vs36, 0(T1)
- lxv vs37, 16(T1)
- lxv vs38, 32(T1)
- lxv vs39, 48(T1)
- #endif
- #ifndef TRMMKERNEL
- lxv vs40, 0(T2)
- lxv vs41, 16(T2)
- lxv vs42, 32(T2)
- lxv vs43, 48(T2)
- #endif
- #ifndef TRMMKERNEL
- lxv vs44, 0(T3)
- lxv vs45, 16(T3)
- lxv vs46, 32(T3)
- lxv vs47, 48(T3)
- #endif
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
- xxperm vs16, vs4, save_permute_1
- xxperm vs18, vs5, save_permute_1
-
- xxperm vs17, vs4, save_permute_2
- xxperm vs19, vs5, save_permute_2
-
- xxperm vs24, vs30, save_permute_1
- xxperm vs26, vs31, save_permute_1
-
- xxperm vs25, vs30, save_permute_2
- xxperm vs27, vs31, save_permute_2
-
-
- /* scale by alpha; in the non-TRMM path this also accumulates into the C values loaded above */
-
- #ifdef TRMMKERNEL
- xvmulsp vs32, vs8, alpha_r
- xvmulsp vs33, vs12, alpha_r
- xvmulsp vs34, vs16, alpha_r
- xvmulsp vs35, vs24, alpha_r
- xvmulsp vs36, vs9, alpha_r
- xvmulsp vs37, vs13, alpha_r
- xvmulsp vs38, vs17, alpha_r
- xvmulsp vs39, vs25, alpha_r
- #else
- xvmaddasp vs32, vs8, alpha_r
- xvmaddasp vs33, vs12, alpha_r
- xvmaddasp vs34, vs16, alpha_r
- xvmaddasp vs35, vs24, alpha_r
- xvmaddasp vs36, vs9, alpha_r
- xvmaddasp vs37, vs13, alpha_r
- xvmaddasp vs38, vs17, alpha_r
- xvmaddasp vs39, vs25, alpha_r
- #endif
-
-
-
- #ifdef TRMMKERNEL
- xvmulsp vs40, vs10, alpha_r
- xvmulsp vs41, vs14, alpha_r
- xvmulsp vs42, vs18, alpha_r
- xvmulsp vs43, vs26, alpha_r
- xvmulsp vs44, vs11, alpha_r
- xvmulsp vs45, vs15, alpha_r
- xvmulsp vs46, vs19, alpha_r
- xvmulsp vs47, vs27, alpha_r
- #else
-
- xvmaddasp vs40, vs10, alpha_r
- xvmaddasp vs41, vs14, alpha_r
- xvmaddasp vs42, vs18, alpha_r
- xvmaddasp vs43, vs26, alpha_r
- xvmaddasp vs44, vs11, alpha_r
- xvmaddasp vs45, vs15, alpha_r
- xvmaddasp vs46, vs19, alpha_r
- xvmaddasp vs47, vs27, alpha_r
-
- #endif
-
- stxv vs32, 0(CO)
- stxv vs33, 16(CO)
- stxv vs34, 32(CO)
- stxv vs35, 48(CO)
-
- stxv vs36, 0(T1)
- stxv vs37, 16(T1)
- stxv vs38, 32(T1)
- stxv vs39, 48(T1)
-
- stxv vs40, 0(T2)
- stxv vs41, 16(T2)
- stxv vs42, 32(T2)
- stxv vs43, 48(T2)
- stxv vs44, 0(T3)
- stxv vs45, 16(T3)
- stxv vs46, 32(T3)
- stxv vs47, 48(T3)
-
- addi CO,CO,64
-
-
- .endm
-
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=8
- **********************************************************************************************/
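- /* same layout as the 4x16 case but with two A vectors (vs0/vs1); only accumulators
-    vs32/vs33, vs36/vs37, vs40/vs41 and vs44/vs45 are used */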
-
- .macro LOAD4x8_1
- LOAD4x8 1
- .endm
-
- .macro LOAD4x8_0
- LOAD4x8 0
- .endm
-
- .macro KERNEL4x8_L1_L4 Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro END4x8_NORMAL
- END4x8 0, AO, BO, 32,16
- .endm
-
- .macro Zero4X8
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
-
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
-
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
-
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- .endm
-
- .macro LOAD4x8 Zero
-
- lxv vs24, 0(BO)
- lxv vs0, 0(AO)
- lxv vs1, 16(AO)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- xxpermdi vs27, vs26, vs26,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs36, vs36, vs36
- xxlxor vs37, vs37, vs37
- xxlxor vs40, vs40, vs40
- xxlxor vs41, vs41, vs41
- xxlxor vs44, vs44, vs44
- xxlxor vs45, vs45, vs45
-
- .endif
- .endm
-
-
- .macro END4x8 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
- .endif
- .endm
-
- .macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
-
- lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- xxpermdi vs27, vs26, vs26,2
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
-
-
- lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
-
- lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
- lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
-
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- xxpermdi vs11, vs10, vs10,2
-
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
-
- .if \Complete==0
- lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
-
- lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
- lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
-
- .endif
- .if \IsLast==1
- .if \Complete==1
-
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
- addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
- .else
-
- addi \BREG, \BREG, DISP16(\Index,64)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
- .endif
-
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
-
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
-
-
- .endm
-
- .macro KERNEL4x8 First
-
- LOAD4x8 0
- END4x8 \First, AO, BO, 32,16
- .endm
-
- .macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
-
- xxperm vs10, vs8, permute_mask
- xxpermdi vs9, vs8, vs8,2
- .if \First==1
- xvmulsp vs32, vs0,vs24
- xvmulsp vs33, vs1,vs24
-
- xvmulsp vs36, vs0,vs25
- xvmulsp vs37, vs1,vs25
-
- .else
- xvmaddasp vs32, vs0,vs24
- xvmaddasp vs33, vs1,vs24
-
- xvmaddasp vs36, vs0,vs25
- xvmaddasp vs37, vs1,vs25
-
- .endif
-
- xxpermdi vs11, vs10, vs10,2
-
- .if \First==1
- xvmulsp vs40, vs0,vs26
- xvmulsp vs41, vs1,vs26
-
- xvmulsp vs44, vs0,vs27
- xvmulsp vs45, vs1,vs27
-
-
- .else
- xvmaddasp vs40, vs0,vs26
- xvmaddasp vs41, vs1,vs26
-
- xvmaddasp vs44, vs0,vs27
- xvmaddasp vs45, vs1,vs27
-
-
- .endif
- .if \Complete==0
- lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
-
- lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
-
- xxperm vs26, vs24, permute_mask
- xxpermdi vs25, vs24, vs24,2
- .endif
- .if \IsLast==1
- .if \Complete==1
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
- addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
-
- .else
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
- .endif
-
- .if \First==1
- xvmulsp vs32, vs4,vs8
- xvmulsp vs33, vs5,vs8
-
- xvmulsp vs36, vs4,vs9
- xvmulsp vs37, vs5,vs9
-
- .else
- xvmaddasp vs32, vs4,vs8
- xvmaddasp vs33, vs5,vs8
-
- xvmaddasp vs36, vs4,vs9
- xvmaddasp vs37, vs5,vs9
-
- .endif
-
- .if \Complete==0
- xxpermdi vs27, vs26, vs26,2
-
- .endif
- .if \First==1
- xvmulsp vs40, vs4,vs10
- xvmulsp vs41, vs5,vs10
-
- xvmulsp vs44, vs4,vs11
- xvmulsp vs45, vs5,vs11
-
- .else
- xvmaddasp vs40, vs4,vs10
- xvmaddasp vs41, vs5,vs10
-
- xvmaddasp vs44, vs4,vs11
- xvmaddasp vs45, vs5,vs11
-
- .endif
-
- .endm
-
-
- .macro SAVE4x8
-
- slwi T10, LDC , 1
- add T1, CO, LDC
-
- add T2, CO, T10
- add T3, T1, T10
-
-
-
- #ifndef TRMMKERNEL
- lxv vs34, 0(CO)
- lxv vs35, 16(CO)
- lxv vs38, 0(T1)
- lxv vs39, 16(T1)
- lxv vs42, 0(T2)
- lxv vs43, 16(T2)
- lxv vs46, 0(T3)
- lxv vs47, 16(T3)
-
-
- #endif
-
- xxmrglw vs8, vs32, vs44
- xxmrglw vs10, vs36, vs40
-
- xxmrghw vs1, vs32, vs44
- xxmrghw vs0, vs36, vs40
-
- xxmrglw vs12, vs33, vs45
- xxmrglw vs14, vs37, vs41
-
- xxmrghw vs2, vs37, vs41
- xxmrghw vs3, vs33, vs45
-
- xxlor vs9, vs8, vs8
- xxlor vs11, vs10, vs10
-
- xxlor vs13, vs12, vs12
- xxlor vs15, vs14, vs14
-
- xxperm vs8, vs0, save_permute_1
- xxperm vs10, vs1, save_permute_1
- xxperm vs9, vs0, save_permute_2
- xxperm vs11, vs1, save_permute_2
-
- xxperm vs12, vs2, save_permute_1
- xxperm vs14, vs3, save_permute_1
-
- xxperm vs13, vs2, save_permute_2
- xxperm vs15, vs3, save_permute_2
-
-
- /* scale by alpha; in the non-TRMM path this also accumulates into the C values loaded above */
-
- #ifdef TRMMKERNEL
- xvmulsp vs34, vs8, alpha_r
- xvmulsp vs35, vs12, alpha_r
- xvmulsp vs38, vs9, alpha_r
- xvmulsp vs39, vs13, alpha_r
- xvmulsp vs42, vs10, alpha_r
- xvmulsp vs43, vs14, alpha_r
- xvmulsp vs46, vs11, alpha_r
- xvmulsp vs47, vs15, alpha_r
- #else
- xvmaddasp vs34, vs8, alpha_r
- xvmaddasp vs35, vs12, alpha_r
- xvmaddasp vs38, vs9, alpha_r
- xvmaddasp vs39, vs13, alpha_r
- xvmaddasp vs42, vs10, alpha_r
- xvmaddasp vs43, vs14, alpha_r
- xvmaddasp vs46, vs11, alpha_r
- xvmaddasp vs47, vs15, alpha_r
- #endif
-
-
- stxv vs34, 0(CO)
- stxv vs35, 16(CO)
- stxv vs38, 0(T1)
- stxv vs39, 16(T1)
- stxv vs42, 0(T2)
- stxv vs43, 16(T2)
- stxv vs46, 0(T3)
- stxv vs47, 16(T3)
-
-
- addi CO,CO,32
-
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=4
- **********************************************************************************************/
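- /* a single vector covers each operand: the A vector vs0 is replicated into vs1-vs3 and the
-    four accumulators vs32-vs35 are transposed with merge operations in SAVE4x4 */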
-
- .macro LOAD4x4_1
- LOAD4x4 1
- .endm
-
- .macro LOAD4x4_0
- LOAD4x4 0
- .endm
-
- .macro KERNEL4x4_L1_L4 Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
- .macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
- .endm
-
- .macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
- KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
- .endm
-
- .macro Zero4X4
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- .endm
-
- .macro LOAD4x4 Zero
-
- lxv vs0, 0(AO)
- lxv vs24, 0(BO)
-
-
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- .if \Zero==1
- xxlxor vs32, vs32, vs32
- xxlxor vs33, vs33, vs33
- xxlxor vs34, vs34, vs34
- xxlxor vs35, vs35, vs35
-
- .endif
- .endm
-
- .macro END4x4_NORMAL
- END4x4 0, AO, BO, 16,16
- .endm
-
- .macro END4x4 First, AREG, BREG, OffsetA, OffsetB
-
- .if \OffsetB != 0
- addi \BREG, \BREG, \OffsetB
- .endif
- .if \OffsetA != 0
- addi \AREG, \AREG, \OffsetA
- .endif
-
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- .endif
- .endm
-
- .macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
-
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
-
-
- lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
-
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
-
- .if \Complete==0
-
- lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
- lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
-
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
- addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP16(\Index,64)
-
- .endif
- .endif
-
-
- .endm
-
- .macro KERNEL4x4 First
- LOAD4x4 0
- END4x4 \First, AO, BO, 16,16
- .endm
-
- .macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
-
- lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
-
- xxperm vs6, vs4, permute_mask
- xxpermdi vs5, vs4, vs4,2
- xxpermdi vs7, vs6, vs6,2
- .if \First==1
- xvmulsp vs32, vs24, vs0
- xvmulsp vs33, vs24, vs1
- xvmulsp vs34, vs24, vs2
- xvmulsp vs35, vs24, vs3
-
- .else
- xvmaddasp vs32, vs24, vs0
- xvmaddasp vs33, vs24, vs1
- xvmaddasp vs34, vs24, vs2
- xvmaddasp vs35, vs24, vs3
-
- .endif
-
- .if \Complete==0
-
- lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
- lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- xxperm vs2, vs0, permute_mask
- xxpermdi vs1, vs0, vs0,2
- xxpermdi vs3, vs2, vs2,2
- .endif
-
- .if \First==1
- xvmulsp vs32, vs26, vs4
- xvmulsp vs33, vs26, vs5
- xvmulsp vs34, vs26, vs6
- xvmulsp vs35, vs26, vs7
-
-
- .else
- xvmaddasp vs32, vs26, vs4
- xvmaddasp vs33, vs26, vs5
- xvmaddasp vs34, vs26, vs6
- xvmaddasp vs35, vs26, vs7
-
- .endif
-
-
- .if \IsLast==1
- .if \Complete==1
- addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
- addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
-
- .else
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP8(\Index,32)
-
- .endif
- .endif
-
-
- .endm
-
-
- .macro SAVE4x4
- slwi T10, LDC , 1
- add T1, CO, LDC
- #if !defined(TRMMKERNEL)
- lxv vs36, 0(CO)
- lxv vs37, 0(T1)
- #endif
- add T2, CO, T10
- add T3, T1, T10
- #if !defined(TRMMKERNEL)
- lxv vs38, 0(T2)
- lxv vs39, 0(T3)
- #endif
-
- xxmrglw vs0, vs35,vs32
- xxmrglw vs1, vs34,vs33
- xxmrglw vs4, vs32,vs35
- xxmrglw vs5, vs33,vs34
-
-
- xxmrghw vs2, vs35,vs32
- xxmrghw vs3, vs34,vs33
- xxmrghw vs6, vs32,vs35
- xxmrghw vs7, vs33,vs34
-
- xxmrgld vs24, vs1, vs0
- xxmrghd vs25,vs5,vs4
-
- xxmrgld vs26, vs2, vs3
- xxmrghd vs27,vs6,vs7
-
- #if defined(TRMMKERNEL)
- xvmulsp vs36, vs24, alpha_r
- xvmulsp vs37, vs25, alpha_r
- xvmulsp vs38, vs26, alpha_r
- xvmulsp vs39, vs27, alpha_r
- #else
- xvmaddasp vs36, vs24, alpha_r
- xvmaddasp vs37, vs25, alpha_r
- xvmaddasp vs38, vs26, alpha_r
- xvmaddasp vs39, vs27, alpha_r
- #endif
- stxv vs36, 0(CO)
- stxv vs37, 0(T1)
- stxv vs38, 0(T2)
- stxv vs39, 0(T3)
-
-
-
- addi CO,CO,16
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=2
- **********************************************************************************************/
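- /* the two A values are splatted (xxspltw) and multiplied against the 4-float B vector; sums
-    accumulate in vs0 and vs2 and SAVE4x2 applies alpha in double precision before the scalar
-    stores */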
-
-
- .macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
-
- .macro Zero4x2
- xxlxor vs0, vs0, vs0
- xxlxor vs2, vs2, vs2
-
- .endm
-
- .macro KERNEL4x2
- KERNEL4x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 0
- xxspltw vs9, vs36, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs2, vs26, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs2, vs26, vs9
-
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP4(\Index,16)
-
- .endm
-
- .macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
-
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG)
- xxspltw vs8, vs4, 2
- xxspltw vs9, vs4, 3
- xxspltw vs10, vs4, 0
- xxspltw vs11, vs4, 1
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs2, vs26, vs9
-
- /* the second k-iteration must accumulate */
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs2, vs28, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs2, vs26, vs9
-
- xvmaddasp vs0, vs28, vs10
- xvmaddasp vs2, vs28, vs11
- .endif
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE4x2
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- /* convert alpha_r to double precision for the scalar multiplies */
- xscvspdp vs4,alpha_r
- /* remember: v0 aliases vs32 (VRs v0-v31 overlap vs32-vs63) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v1,4(CO)
-
- lxssp v2,0(T1)
- lxssp v3,4(T1)
-
- lxssp v4,0(T2)
- lxssp v5,4(T2)
-
- lxssp v6,0(T3)
- lxssp v7,4(T3)
-
-
- #endif
- xscvspdp vs5, vs2
- xxspltw vs6, vs2, 1
- xxspltw vs7, vs2, 2
- xxspltw vs8, vs2, 3
- xscvspdp vs6,vs6
- xscvspdp vs7,vs7
- xscvspdp vs8,vs8
-
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs8, vs4
- xsmuldp vs33,vs27, vs4
-
- xsmuldp vs34,vs7, vs4
- xsmuldp vs35,vs26, vs4
-
- xsmuldp vs36,vs6, vs4
- xsmuldp vs37,vs25, vs4
-
- xsmuldp vs38,vs5, vs4
- xsmuldp vs39,vs24, vs4
-
-
- #else
- xsmaddadp vs32,vs8, vs4
- xsmaddadp vs33,vs27, vs4
-
- xsmaddadp vs34,vs7, vs4
- xsmaddadp vs35,vs26, vs4
-
- xsmaddadp vs36,vs6, vs4
- xsmaddadp vs37,vs25, vs4
-
- xsmaddadp vs38,vs5, vs4
- xsmaddadp vs39,vs24, vs4
-
-
- #endif
-
- stxssp v0,0(CO)
- stxssp v1,4(CO)
-
- stxssp v2,0(T1)
- stxssp v3,4(T1)
-
- stxssp v4,0(T2)
- stxssp v5,4(T2)
-
- stxssp v6,0(T3)
- stxssp v7,4(T3)
-
-
-
-
- addi CO,CO,8
- .endm
-
-
- /**********************************************************************************************
- * Macros for N=4 and M=1
- **********************************************************************************************/
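- /* one A element is broadcast per k-iteration and multiplied against the 4-float B vector;
-    vs0 carries the running sum and SAVE4x1 writes the four results back as scalars */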
- .macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro Zero4x1
- xxlxor vs0, vs0, vs0
- .endm
-
- .macro KERNEL4x1
- KERNEL4x1_1 AO,BO, 0
- .endm
-
- .macro KERNEL4x1_2
- KERNEL4x1_2_1 AO,BO, 0
- .endm
-
- .macro KERNEL4x1_1 AREG,BREG,First
- lxvwsx vs8, 0, \AREG
- lxv vs26, 0(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- .else
- xvmaddasp vs0, vs26, vs8
- .endif
- addi \AREG, \AREG, 4
- addi \BREG, \BREG, 16
- .endm
-
- .macro KERNEL4x1_2_1 AREG,BREG,First
- lxsd v4, 0(\AREG)
- lxv vs26, 0(\BREG)
- lxv vs28, 16(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmaddasp vs0, vs28, vs9 /* second k-iteration accumulates */
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs0, vs28, vs9
- .endif
- addi \AREG, \AREG, 8
- addi \BREG, \BREG, 32
- .endm
-
- .macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
- lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
- xxspltw vs8, vs4, 3
- xxspltw vs9, vs4, 2
- xxspltw vs10, vs4, 1
- xxspltw vs11, vs4, 0
- lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
- lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
- lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
- lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
- .if \First==1
- xvmulsp vs0, vs26, vs8
- /* later k-iterations must accumulate */
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs0, vs32, vs11
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs0, vs28, vs9
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs0, vs32, vs11
- .endif
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
- .endm
-
- .macro SAVE4x1
- slwi T10, LDC , 1
- add T1, CO, LDC
- add T2, CO, T10
- add T3, T1, T10
- /* convert alpha_r to double precision for the scalar multiplies */
- xscvspdp vs4,alpha_r
- /* remember: v0 aliases vs32 (VRs v0-v31 overlap vs32-vs63) */
- #if !defined(TRMMKERNEL)
- lxssp v0,0(CO)
- lxssp v2,0(T1)
- lxssp v4,0(T2)
- lxssp v6,0(T3)
- #endif
- xscvspdp vs24, vs0
- xxspltw vs25, vs0, 1
- xxspltw vs26, vs0, 2
- xxspltw vs27, vs0, 3
- xscvspdp vs25,vs25
- xscvspdp vs26,vs26
- xscvspdp vs27,vs27
-
- #if defined(TRMMKERNEL)
- xsmuldp vs32,vs27, vs4
- xsmuldp vs34,vs26, vs4
- xsmuldp vs36,vs25, vs4
- xsmuldp vs38,vs24, vs4
- #else
- xsmaddadp vs32,vs27, vs4
- xsmaddadp vs34,vs26, vs4
- xsmaddadp vs36,vs25, vs4
- xsmaddadp vs38,vs24, vs4
- #endif
- stxssp v0,0(CO)
- stxssp v2,0(T1)
- stxssp v4,0(T2)
- stxssp v6,0(T3)
- addi CO,CO,4
- .endm
-
- /****************************N=2 section*****************/
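- /* for N=2 the roles of A and B are swapped relative to the sections above: the two B values of
-    each k-iteration are splatted and multiplied against wide A vectors, so the accumulators are
-    indexed by the A columns (vs0-vs7 for M=16, vs0/vs1/vs4/vs5 for M=8 and M=4) */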
-
- .macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero2x16
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
- xxlxor vs6, vs6, vs6
- xxlxor vs7, vs7, vs7
- .endm
-
- .macro KERNEL2x16
- KERNEL2x16_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs28, vs8
- xvmulsp vs3, vs29, vs8
-
- xvmulsp vs4, vs26, vs9
- xvmulsp vs5, vs27, vs9
- xvmulsp vs6, vs28, vs9
- xvmulsp vs7, vs29, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP16(\Index,64)
-
- .endm
-
-
-
-
- .macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
-
- lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
-
- lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
- lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
- lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
-
- lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
- lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
- lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs2, vs18, vs10
- xvmaddasp vs3, vs19, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
- xvmaddasp vs6, vs18, vs11
- xvmaddasp vs7, vs19, vs11
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs31, vs12
- xvmaddasp vs2, vs32, vs12
- xvmaddasp vs3, vs33, vs12
-
- xvmaddasp vs4, vs30, vs13
- xvmaddasp vs5, vs31, vs13
- xvmaddasp vs6, vs32, vs13
- xvmaddasp vs7, vs33, vs13
-
- xvmaddasp vs0, vs34, vs14
- xvmaddasp vs1, vs35, vs14
- xvmaddasp vs2, vs36, vs14
- xvmaddasp vs3, vs37, vs14
-
- xvmaddasp vs4, vs34, vs15
- xvmaddasp vs5, vs35, vs15
- xvmaddasp vs6, vs36, vs15
- xvmaddasp vs7, vs37, vs15
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
-
- .endm
-
- .macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
- lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
- xvmaddasp vs6, vs28, vs9
- xvmaddasp vs7, vs29, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs2, vs18, vs10
- xvmaddasp vs3, vs19, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
- xvmaddasp vs6, vs18, vs11
- xvmaddasp vs7, vs19, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
-
- .macro SAVE2x16
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- lxv vs18, 32(CO)
- lxv vs19, 48(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
- lxv vs27, 16(T1)
- lxv vs28, 32(T1)
- lxv vs29, 48(T1)
- #endif
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs18, vs2, alpha_r
- xvmulsp vs19, vs3, alpha_r
- xvmulsp vs26, vs4, alpha_r
- xvmulsp vs27, vs5, alpha_r
- xvmulsp vs28, vs6, alpha_r
- xvmulsp vs29, vs7, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs18, vs2, alpha_r
- xvmaddasp vs19, vs3, alpha_r
- xvmaddasp vs26, vs4, alpha_r
- xvmaddasp vs27, vs5, alpha_r
- xvmaddasp vs28, vs6, alpha_r
- xvmaddasp vs29, vs7, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
- stxv vs18, 32(CO)
- stxv vs19, 48(CO)
-
- stxv vs26, 0(T1)
- stxv vs27, 16(T1)
- stxv vs28, 32(T1)
- stxv vs29, 48(T1)
-
- addi CO,CO,64
-
- .endm
-
- /* M=8 N=2 */
-
- .macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero2x8
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
-
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
-
- .endm
-
- .macro KERNEL2x8
- KERNEL2x8_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
-
- xvmulsp vs4, vs26, vs9
- xvmulsp vs5, vs27, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP8(\Index,32)
-
- .endm
-
-
-
-
- .macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
-
- lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
-
- lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
-
- lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
-
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs31, vs12
- xvmaddasp vs4, vs30, vs13
- xvmaddasp vs5, vs31, vs13
-
- xvmaddasp vs0, vs34, vs14
- xvmaddasp vs1, vs35, vs14
- xvmaddasp vs4, vs34, vs15
- xvmaddasp vs5, vs35, vs15
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
- .macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG)
- lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG)
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- xvmaddasp vs4, vs26, vs9
- xvmaddasp vs5, vs27, vs9
-
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs17, vs10
-
- xvmaddasp vs4, vs16, vs11
- xvmaddasp vs5, vs17, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE2x8
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
- lxv vs27, 16(T1)
-
- #endif
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs26, vs4, alpha_r
- xvmulsp vs27, vs5, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs26, vs4, alpha_r
- xvmaddasp vs27, vs5, alpha_r
- #endif
-
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
-
-
- stxv vs26, 0(T1)
- stxv vs27, 16(T1)
-
- addi CO,CO,32
-
- .endm
-
-
- /*M=4*/
-
-
- .macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- /* on save we aggregate vs0+vs4 and vs1+vs5 */
- .macro Zero2x4
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
-
- xxlxor vs4, vs4, vs4
- xxlxor vs5, vs5, vs5
-
- .endm
-
- .macro KERNEL2x4
- KERNEL2x4_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs26, vs9
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP4(\Index,16)
-
- .endm
-
-
-
-
- .macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
-
- lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
- xxspltw vs12, vs39, 3
- xxspltw vs13, vs39, 2
- xxspltw vs14, vs39, 1
- xxspltw vs15, vs39, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs4, vs16, vs10
- xvmaddasp vs5, vs16, vs11
-
-
- xvmaddasp vs0, vs30, vs12
- xvmaddasp vs1, vs30, vs13
- xvmaddasp vs4, vs34, vs14
- xvmaddasp vs5, vs34, vs15
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 3
- xxspltw vs9, vs36, 2
- xxspltw vs10, vs36, 1
- xxspltw vs11, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs4, vs16, vs10
- xvmaddasp vs5, vs16, vs11
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE2x4
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxv vs26, 0(T1)
-
- #endif
- /*aggregate vectors*/
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs26, vs1, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs26, vs1, alpha_r
- #endif
-
- stxv vs16, 0(CO)
- stxv vs26, 0(T1)
-
- addi CO,CO,16
-
- .endm
-
-
- /* M=2 N=2: we use an inner permute here; before, permute_mask reversed all four words (3,2,1,0), now it will reverse within each half (1,0,3,2) */
- .macro SWITCH_PERMUTE_INNER
- xxpermdi permute_mask, permute_mask, permute_mask,2
- .endm
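-
- /* Rough C sketch of the mask switch (illustrative only): with the original
-    permute_mask, xxperm reverses all four words; after swapping the mask's two
-    doublewords it only swaps the words inside each 64-bit half.
-
-    void permute_full(const float s[4], float d[4])   // old mask: 3,2,1,0
-    { d[0]=s[3]; d[1]=s[2]; d[2]=s[1]; d[3]=s[0]; }
-
-    void permute_inner(const float s[4], float d[4])  // after SWITCH_PERMUTE_INNER: 1,0,3,2
-    { d[0]=s[1]; d[1]=s[0]; d[2]=s[3]; d[3]=s[2]; }
- */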
-
- .macro Zero2x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- SWITCH_PERMUTE_INNER
- .endm
-
- .macro KERNEL2x2
- KERNEL2x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxperm vs9, vs36, permute_mask
- lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs37, vs36
- xvmulsp vs1, vs37, vs9
-
- .else
- xvmaddasp vs0, vs37, vs36
- xvmaddasp vs1, vs37, vs9
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP2(\Index,8)
-
- .endm
-
-
-
-
- .macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- xxperm vs9, vs8, permute_mask
- xxperm vs11, vs10, permute_mask
-
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
- xvmaddasp vs0, vs16, vs10
- xvmaddasp vs1, vs16, vs11
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
- .macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG)
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- xxperm vs9, vs8, permute_mask
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs26, vs9
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP4(\Index,16)
- .endif
- .endm
-
-
- .macro SAVE2x2
-
- #ifndef TRMMKERNEL
- lxsd v4 , 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxsd v5 , 0(T1)
-
- #endif
- /*aggregate vectors*/
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
-  /* let's correct the order to {00,01} and {10,11} from {00,11} {01,10} (see the C sketch after this macro) */
- xxperm vs1,vs1, permute_mask
-
-
- xxmrghw vs2 ,vs1,vs0
- xxpermdi vs2,vs2,vs2,2
- xxmrghw vs3 ,vs0,vs1
- #if defined(TRMMKERNEL)
- xvmulsp vs36, vs2, alpha_r
- xvmulsp vs37, vs3, alpha_r
- #else
- xvmaddasp vs36, vs2, alpha_r
- xvmaddasp vs37, vs3, alpha_r
- #endif
- /**** store the two 2-element result rows */
-
-
- stxsd v4, 0(CO)
- stxsd v5, 0(T1)
-
- addi CO,CO,8
-
- .endm
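-
- /* Rough C sketch of the SAVE2x2 untangle (illustrative only; lane order is
-    approximate and c00_sum etc. stand for the accumulated dot products): vs0
-    holds the "straight" products {c00,c11} and vs1 the "crossed" ones
-    {c01,c10}; the permute/merge steps regroup them into row 0 = {c00,c01} and
-    row 1 = {c10,c11} before the alpha update.
-
-    float straight[2] = { c00_sum, c11_sum };     // folded vs0
-    float crossed [2] = { c01_sum, c10_sum };     // folded vs1
-    float row0[2] = { straight[0], crossed[0] };  // -> CO
-    float row1[2] = { crossed[1],  straight[1] }; // -> CO + LDC
-    for (int j = 0; j < 2; j++) {
-        C0[j] += alpha * row0[j];
-        C1[j] += alpha * row1[j];
-    }
- */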
-
- /*--------------------------- M=1 N=2 */
- .macro Zero2x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2,vs2,vs2
- xxlxor vs3,vs3,vs3
- .endm
-
- .macro KERNEL2x1
- KERNEL2x1_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
- KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  calculate one k-step alone, then add it to the batched (vectorised) ones at save time
-  (see the C sketch after SAVE2x1)
- */
- .macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG)
- lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs2, vs37, vs35
- xvmulsp vs3, vs37, vs36
-
- .else
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
- .endif
-
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP1(\Index,4)
-
- .endm
-
-
-
-
- .macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
-
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
- xxmrglw vs5, vs26,vs26
- xxmrghw vs6, vs26,vs26
-
- xvmaddasp vs0, vs8, vs5
- xvmaddasp vs1, vs10, vs6
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP8(\Index,32)
- addi \AREG, \AREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG)
- lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG)
- lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG)
- lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG)
- lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG)
- lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG)
-
-
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
-
- xsmaddadp vs2, vs38, vs39
- xsmaddadp vs3, vs38, vs40
-
-
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP2(\Index,8)
- .endm
-
-
- .macro SAVE2x1
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
- #endif
- add T1, CO, LDC
- #ifndef TRMMKERNEL
- lxssp v5 , 0(T1)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /*aggregate vectors from 2x1_4 */
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- xvaddsp vs0,vs0,vs1
- /*aggregate vectors from 2x1_2 and 2x1_1 into the 2x1_4 sums*/
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs2,vs2,vs6
- xsadddp vs3,vs3,vs5
-
- /**** store last two words*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs2, vs16
- xsmuldp vs37,vs3, vs16
-
- #else
- xsmaddadp vs36,vs2, vs16
- xsmaddadp vs37,vs3, vs16
- #endif
-
- stxssp v4, 0(CO)
- stxssp v5, 0(T1)
-
- addi CO,CO,4
-
- .endm
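-
- /* Rough C sketch of the whole M=1,N=2 flow (illustrative only): multiples of
-    four k-steps are accumulated with vector FMAs (vs0/vs1), the leftover steps
-    are accumulated as double-precision scalars (vs2/vs3), and SAVE2x1 reduces
-    the vector lanes into the scalar sums before scaling by alpha.
-
-    float  vsum0 = 0.f, vsum1 = 0.f;    // stands in for the reduced vs0/vs1
-    double ssum0 = 0.0, ssum1 = 0.0;    // stands in for vs2/vs3
-    int k = 0;
-    for (; k + 4 <= K; k += 4)          // KERNEL2x1_4
-        for (int kk = k; kk < k + 4; kk++) {
-            vsum0 += A[kk] * B[2*kk+0];
-            vsum1 += A[kk] * B[2*kk+1];
-        }
-    for (; k < K; k++) {                // KERNEL2x1_2 / KERNEL2x1
-        ssum0 += (double)A[k] * B[2*k+0];
-        ssum1 += (double)A[k] * B[2*k+1];
-    }
-    C0[0] += alpha * (float)(ssum0 + vsum0);   // done in SAVE2x1
-    C1[0] += alpha * (float)(ssum1 + vsum1);
- */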
-
-
-
- /****************************N=1 section*****************/
-
- .macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x16
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x16
- KERNEL1x16_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
- lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
- xvmulsp vs2, vs28, vs8
- xvmulsp vs3, vs29, vs8
-
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP16(\Index,64)
-
- .endm
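-
- /* Rough C sketch of the scalar-B broadcast used by the N=1 kernels
-    (illustrative only): one float of B is converted back to single-precision
-    lane layout (xscvdpspn) and splatted to all lanes (xxspltw) so it can feed
-    the vector FMAs against 16 elements of A.
-
-    float b = B[k];
-    for (int i = 0; i < 16; i++)        // vs0..vs3, four lanes each
-        acc[i] += A[k*16 + i] * b;
- */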
-
-
-
-
- .macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
-
- lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
- lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
- lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
-
- lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
- lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
- lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
-
- xvmaddasp vs0, vs16, vs9
- xvmaddasp vs1, vs17, vs9
- xvmaddasp vs2, vs18, vs9
- xvmaddasp vs3, vs19, vs9
-
-
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
- xvmaddasp vs2, vs32, vs10
- xvmaddasp vs3, vs33, vs10
-
-
- xvmaddasp vs0, vs34, vs11
- xvmaddasp vs1, vs35, vs11
- xvmaddasp vs2, vs36, vs11
- xvmaddasp vs3, vs37, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP64(\Index,256)
- .endif
-
- .endm
-
- .macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
- lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
- lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
- lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
- lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
- lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
- xvmaddasp vs2, vs28, vs8
- xvmaddasp vs3, vs29, vs8
-
-
- xvmaddasp vs0, vs16, vs9
- xvmaddasp vs1, vs17, vs9
- xvmaddasp vs2, vs18, vs9
- xvmaddasp vs3, vs19, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
-
- .macro SAVE1x16
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- lxv vs18, 32(CO)
- lxv vs19, 48(CO)
- #endif
-
-
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- xvmulsp vs18, vs2, alpha_r
- xvmulsp vs19, vs3, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- xvmaddasp vs18, vs2, alpha_r
- xvmaddasp vs19, vs3, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
- stxv vs18, 32(CO)
- stxv vs19, 48(CO)
-
- addi CO,CO,64
-
- .endm
-
- /* M=8 N=1 */
-
- .macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x8
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x8
- KERNEL1x8_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- xvmulsp vs1, vs27, vs8
-
-
- .else
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP8(\Index,32)
-
- .endm
-
-
-
-
- .macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
-
- lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
-
- lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
- lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
-
- xvmaddasp vs2, vs16, vs9
- xvmaddasp vs3, vs17, vs9
-
-
- xvmaddasp vs0, vs30, vs10
- xvmaddasp vs1, vs31, vs10
-
-
- xvmaddasp vs2, vs34, vs11
- xvmaddasp vs3, vs35, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP32(\Index,128)
- .endif
-
- .endm
-
- .macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
- lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs8
-
-
- xvmaddasp vs2, vs16, vs9
- xvmaddasp vs3, vs17, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
-
- .macro SAVE1x8
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- lxv vs17, 16(CO)
- #endif
- /* aggregate vs0 vs2 and vs1 vs3*/
- xvaddsp vs0,vs0,vs2
- xvaddsp vs1,vs1,vs3
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- xvmulsp vs17, vs1, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- xvmaddasp vs17, vs1, alpha_r
- #endif
- stxv vs16, 0(CO)
- stxv vs17, 16(CO)
-
- addi CO,CO,32
-
- .endm
- /*M=4*/
-
- .macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
-
- .macro Zero1x4
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2, vs2
- xxlxor vs3, vs3, vs3
- .endm
-
- .macro KERNEL1x4
- KERNEL1x4_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
- xscvdpspn vs36,vs36
- xxspltw vs8, vs36, 0
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
-
-
- .if \First==1
- xvmulsp vs0, vs26, vs8
- .else
- xvmaddasp vs0, vs26, vs8
-
- .endif
-
- addi \BREG, \BREG, DISP1(\Index,4)
- addi \AREG, \AREG, DISP4(\Index,16)
-
- .endm
-
-
-
-
- .macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
-
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
-
-
- xxspltw vs8, vs38, 3
- xxspltw vs9, vs38, 2
-
- lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
- lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
-
-
- xxspltw vs10, vs38, 1
- xxspltw vs11, vs38, 0
-
-
- xvmaddasp vs0, vs26, vs8
-
- xvmaddasp vs1, vs27, vs9
-
- xvmaddasp vs2, vs30, vs10
-
-
- xvmaddasp vs3, vs31, vs11
-
-
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP4(\Index,16)
- addi \AREG, \AREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
- xxspltw vs8, vs36, 1
- xxspltw vs9, vs36, 0
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
- lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
-
-
- xvmaddasp vs0, vs26, vs8
- xvmaddasp vs1, vs27, vs9
-
-
- .if \IsLast==1
- addi \BREG, \BREG, DISP2(\Index,8)
- addi \AREG, \AREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro SAVE1x4
-
- #ifndef TRMMKERNEL
- lxv vs16, 0(CO)
- #endif
- /* aggregate */
- xvaddsp vs0,vs0,vs2
- xvaddsp vs1,vs1,vs3
- xvaddsp vs0,vs1,vs0
- #if defined(TRMMKERNEL)
- xvmulsp vs16, vs0, alpha_r
- #else
- xvmaddasp vs16, vs0, alpha_r
- #endif
- stxv vs16, 0(CO)
-
- addi CO,CO,16
-
- .endm
-
- /* M=2 N=1*/
- .macro Zero1x2
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2,vs2,vs2
- xxlxor vs3,vs3,vs3
- .endm
-
- .macro KERNEL1x2
- KERNEL1x2_1 AO,BO, 0, 0,0,0
- .endm
- .macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  calculate one k-step alone, then add it to the batched (vectorised) ones at save time
- */
- .macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG)
- lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
-
-
- .if \First==1
- xvmuldp vs2, vs37, vs35
- xvmuldp vs3, vs37, vs36
-
- .else
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
- .endif
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP1(\Index,4)
-
- .endm
-
-
-
-
- .macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
- lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG)
-
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
-
- xxmrglw vs5, vs26,vs26
- xxmrghw vs6, vs26,vs26
-
- xvmaddasp vs0, vs8, vs5
- xvmaddasp vs1, vs10, vs6
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG)
- lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG)
- lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG)
- lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG)
- lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG)
- lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG)
-
-
- xsmaddadp vs2, vs37, vs35
- xsmaddadp vs3, vs37, vs36
-
- xsmaddadp vs2, vs38, vs39
- xsmaddadp vs3, vs38, vs40
-
-
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP2(\Index,8)
- .endm
-
-
- .macro SAVE1x2
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
- lxssp v5 , 4(CO)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /*aggregate vectors 1x2_4 */
- xxpermdi vs4,vs0,vs0,2
- xxpermdi vs5,vs1,vs1,2
- xvaddsp vs0,vs0,vs4
- xvaddsp vs1,vs1,vs5
- xvaddsp vs0,vs0,vs1
- /*aggregate vectors from 1x2_2 and 1x2_1 into the 1x2_4 sums*/
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs2,vs2,vs6
- xsadddp vs3,vs3,vs5
-
- /**** store last two words*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs2, vs16
- xsmuldp vs37,vs3, vs16
-
- #else
- xsmaddadp vs36,vs2, vs16
- xsmaddadp vs37,vs3, vs16
- #endif
-
- stxssp v4, 0(CO)
- stxssp v5, 4(CO)
-
- addi CO,CO,8
-
- .endm
- /*///////////////// N=1 M=1 //////////////////*/
- .macro Zero1x1
- xxlxor vs0, vs0, vs0
- xxlxor vs1, vs1, vs1
- xxlxor vs2, vs2,vs2
- xxlxor vs3,vs3,vs3
- xxlxor vs4,vs4,vs4
- .endm
-
- .macro KERNEL1x1
- KERNEL1x1_1 AO,BO, 1, 0,0,0
- .endm
-
- .macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
-
- .macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
- KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
- .endm
- /*
-  calculate one k-step alone (First==1 initialises vs4 instead of accumulating into it)
- */
- .macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
-
-
- lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG)
- lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)
-
-
- .if \First==1
- xvmuldp vs4, vs37, vs35
-
- .else
- xsmaddadp vs4, vs37, vs35
- .endif
-
- addi \AREG, \AREG, DISP1(\Index,4)
- addi \BREG, \BREG, DISP1(\Index,4)
-
- .endm
-
-
- .macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG)
- lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG)
- lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG)
- lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG)
- lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG)
- lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG)
- lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG)
- lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG)
- xvmaddasp vs0, vs8, vs26
- xvmaddasp vs1, vs9, vs16
- xvmaddasp vs2, vs10, vs17
- xvmaddasp vs3, vs11, vs18
- .if \IsLast==1
- addi \AREG, \AREG, DISP16(\Index,64)
- addi \BREG, \BREG, DISP16(\Index,64)
- .endif
-
- .endm
-
- .macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
- lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG)
- lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG)
- lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG)
- xvmaddasp vs0, vs8, vs26
- xvmaddasp vs1, vs9, vs16
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP8(\Index,32)
- addi \BREG, \BREG, DISP8(\Index,32)
- .endif
-
- .endm
-
-
- .macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
- lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)
-
- xvmaddasp vs0, vs8, vs26
-
-
- .if \IsLast==1
- addi \AREG, \AREG, DISP4(\Index,16)
- addi \BREG, \BREG, DISP4(\Index,16)
- .endif
-
- .endm
-
- .macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
-
- lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
- lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)
-
- xvmaddasp vs0, vs36, vs37
-
- addi \AREG, \AREG, DISP2(\Index,8)
- addi \BREG, \BREG, DISP2(\Index,8)
- .endm
-
-
- .macro SAVE1x1
-
- #ifndef TRMMKERNEL
- lxssp v4 , 0(CO)
-
- #endif
-
- /*convert alpha_r for multiply*/
- xscvspdp vs16,alpha_r
-
- /*aggregate vectors */
- xvaddsp vs0,vs0,vs1
- xvaddsp vs2,vs2,vs3
- xvaddsp vs0,vs0,vs2
-
- xxpermdi vs7,vs0,vs0,2
- xvaddsp vs0,vs0,vs7
- /*aggregate vectors 1x1_2 and 1x1_1 into 1x1_4*/
- xscvspdp vs5, vs0
- xxspltw vs6, vs0, 1
- xscvspdp vs6,vs6
- xsadddp vs7,vs5,vs6
- xsadddp vs4,vs4,vs7
-
- /**** store the last word*/
- #if defined(TRMMKERNEL)
- xsmuldp vs36,vs4, vs16
-
- #else
- xsmaddadp vs36,vs4, vs16
- #endif
-
- stxssp v4, 0(CO)
-
- addi CO,CO,4
-
- .endm
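-
- /* Rough C sketch of the 1x1 reduction (illustrative only): the four vector
-    accumulators are added pairwise, the surviving vector is reduced across its
-    lanes, and the result is added to the scalar tail sum (vs4) before the
-    alpha update. scalar_tail is a placeholder name.
-
-    float  v[4];                  // = vs0 + vs1 + vs2 + vs3, folded lane-wise
-    double dot = scalar_tail;     // vs4, from KERNEL1x1 / KERNEL1x1_2
-    dot += (double)v[0] + v[1] + v[2] + v[3];
-    C[0] += (float)(alpha * dot);
- */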
-
-
-
-
- /****************************TRMM POINTER REFRESH MACROS*************************/
-
- .macro SHIFT_REG REG1,REG2,SHIFT_VAL
- .if \SHIFT_VAL==16
- slwi \REG1, \REG2, 6
- .elseif \SHIFT_VAL==8
- slwi \REG1, \REG2, 5
- .elseif \SHIFT_VAL==4
- slwi \REG1, \REG2, 4
- .elseif \SHIFT_VAL==2
- slwi \REG1, \REG2, 3
- .elseif \SHIFT_VAL==1
- slwi \REG1, \REG2, 2
- .endif
- .endm
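-
- /* Rough C sketch (illustrative only): SHIFT_REG converts an iteration count
-    into a byte offset for a packed panel that holds SHIFT_VAL floats per
-    iteration, i.e. off * SHIFT_VAL * sizeof(float), implemented as a left shift.
-
-    static inline long shift_reg(long off, int vals_per_iter)
-    {
-        // vals_per_iter is 16, 8, 4, 2 or 1; sizeof(float) == 4
-        return off * vals_per_iter * 4;   // 16 -> off<<6, 8 -> off<<5, ... 1 -> off<<2
-    }
- */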
-
- /*
- //#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- // ptrbb = bb;
- // #else
- // ptrba += off*16;
- // ptrbb = bb + off*2;
- // #endif
- */
- .macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
- #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /* ptrbb = bb;*/
- mr \PTR_B,\B_VAL /* refresh BPOINT */
-
- #else
- /*
- // ptrba =ptrba+ off*C_A;
- // ptrbb = bb + off*C_B;
- */
- SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
- SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
- add \PTR_B, \B_VAL , T4 /* Add values to BO */
- add \PTR_A, \PTR_A, T2 /* Add values to AO */
- #endif
- .endm
-
-
- /*
- // #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- // temp = bk-off;
- // #elif defined(LEFT)
- // temp = off+16; // number of values in A
- // #else
- // temp = off+2; // number of values in B
- // #endif
- */
- .macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
- #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
- /* temp = bk-off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
-
- #elif defined(LEFT)
- /* temp = off+INCR_A; // number of values in A */
- addi \TEMP_BK, \OFF_VAL, \INCR_A
- #else
- /* temp = off+INCR_B // number of values in B*/
- addi \TEMP_BK,\OFF_VAL, \INCR_B
- #endif
-
- .endm
- /*
- // #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- // temp = bk - off;
- // #ifdef LEFT
- // temp -= 16; // number of values in A
- // #else
- // temp -= 2; // number of values in B
- // #endif
- // ptrba += temp*16;
- // ptrbb += temp*2;
- // #endif
-
- // #ifdef LEFT
- // off += 16; // number of values in A
- // #endif
- */
-
-
- .macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
-
- #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
- /*temp = bk - off;*/
- sub \TEMP_BK,\BK_VAL,\OFF_VAL
- #ifdef LEFT
- /*temp -= C_A; // number of values in A*/
- addi \TEMP_BK,\TEMP_BK,-\C_A
- #else
- /*temp -= C_B; // number of values in B*/
- addi \TEMP_BK,\TEMP_BK,-\C_B
- #endif
- /*ptrba += temp*C_A;
- ptrbb += temp*C_B;*/
- SHIFT_REG T4,\TEMP_BK,\C_A
- SHIFT_REG T2,\TEMP_BK,\C_B
- add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/
- add \PTR_B, \PTR_B,T2
-
- #endif
-
- #ifdef LEFT
- /*off += C_A; // number of values in A*/
- addi \OFF_VAL,\OFF_VAL,\C_A
- #endif
- .endm
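-
- /* Rough C sketch of REFRESH_AFTER_SAVE in element terms (illustrative only;
-    C_A / C_B are the values consumed per iteration from A and B, as in the
-    reference code quoted above; the macro itself works in bytes via SHIFT_REG):
-
-    #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-        long temp = bk - off;
-    #ifdef LEFT
-        temp -= C_A;            // number of values in A
-    #else
-        temp -= C_B;            // number of values in B
-    #endif
-        ptrba += temp * C_A;    // SHIFT_REG supplies the *sizeof(float) scaling
-        ptrbb += temp * C_B;
-    #endif
-    #ifdef LEFT
-        off += C_A;             // number of values in A
-    #endif
- */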
|