
dgemm_kernel_8x2_piledriver.S 94 kB

/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*********************************************************************
*
* 2013/11/13 Saar
* BLASTEST : OK
* CTEST : OK
* TEST : OK
*
*
* 2013/10/31 Saar
*
* Parameters:
* UNROLL_M 8
* UNROLL_N 2
* DGEMM_P 768
* DGEMM_Q 168
* DGEMM_R 12288
* A_PR1 512
* B_PR1 256
*
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
*
* 4608x4608 83.9 GFLOPS with 8 threads on 4 modules (ACML: 78.4 GFLOPS)
* 4608x4608 80.9 GFLOPS with 4 threads on 4 modules (ACML: 78.4 GFLOPS)
* 4608x4608 41.3 GFLOPS with 2 threads on 2 modules (ACML: 40.9 GFLOPS)
* 4608x4608 20.7 GFLOPS with 1 thread on 1 module (ACML: 20.8 GFLOPS)
*
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
*
* 13824x13824 234.5 GFLOPS with 32 threads on 16 modules (ACML: 88.5 GFLOPS) !strange thermal behavior
* 13824x13824 241.9 GFLOPS with 16 threads on 16 modules (ACML: 191.5 GFLOPS) !strange thermal behavior
* 9216x9216 137.6 GFLOPS with 8 threads on 8 modules (ACML: 106.5 GFLOPS)
* 4608x4608 75.7 GFLOPS with 4 threads on 4 modules (ACML: 56.3 GFLOPS)
* 4608x4608 38.6 GFLOPS with 2 threads on 2 modules (ACML: 34.1 GFLOPS)
* 4608x4608 19.6 GFLOPS with 1 thread on 1 module (ACML: 18.3 GFLOPS)
*
*********************************************************************/
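//
// Rough C sketch of what one register-blocked tile below computes, shown for the
// 8x3 case used in the main loop (the 4x3/2x3/1x3 and *x2/*x1 macros are the
// narrower tails). Names such as packed_A, packed_B and acc are illustrative
// only; the packed layouts assumed here are the usual OpenBLAS ones, with
// DGEMM_P/DGEMM_Q/DGEMM_R as the M-, K- and N-direction cache-block sizes and
// A_PR1/B_PR1 as prefetch distances in bytes.
//
//   static void dgemm_8x3_ref(long K, double alpha,
//                             const double *packed_A,  /* K panels of 8 A values */
//                             const double *packed_B,  /* K panels of 3 B values */
//                             double *C, long ldc)     /* column-major 8x3 tile  */
//   {
//       double acc[8][3] = {{0.0}};
//       for (long k = 0; k < K; k++)                   /* KERNEL8x3_* macros     */
//           for (long j = 0; j < 3; j++)
//               for (long i = 0; i < 8; i++)
//                   acc[i][j] += packed_A[k*8 + i] * packed_B[k*3 + j];
//       for (long j = 0; j < 3; j++)                   /* SAVE8x3                */
//           for (long i = 0; i < 8; i++)
//               C[j*ldc + i] = alpha * acc[i][j] + C[j*ldc + i];
//   }
//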
#define ASSEMBLER
#include "common.h"
#define OLD_M %rdi
#define OLD_N %rsi
#define M %r13
#define J %r14
#define OLD_K %rdx
#define A %rcx
#define B %r8
#define C %r9
#define LDC %r10
#define I %r11
#define AO %rdi
#define BO %rsi
#define CO1 %r15
#define K %r12
#define BI %rbp
#define SP %rbx
#define BO1 %rdi
#define BO2 %r15
#ifndef WINDOWS_ABI
#define STACKSIZE 96
#else
#define STACKSIZE 256
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)
#endif
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
#define N 40(%rsp)
#define ALPHA 48(%rsp)
#define OFFSET 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
movl $0, 4096 * 4(%rsp);\
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
movl $0, 4096 * 3(%rsp);\
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
movl $0, 4096 * 2(%rsp);\
movl $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
movl $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif
// Bulldozer has only the 4-operand FMA4 form (vfmaddpd); Piledriver and later also support 3-operand FMA3 (vfmadd231pd)
#if defined(BULLDOZER)
#define VFMADD231PD_( y1,y2,y0 ) vfmaddpd y0,y1,y2,y0
#define VFMADD231SD_( x1,x2,x0 ) vfmaddsd x0,x1,x2,x0
#else
#define VFMADD231PD_( y1,y2,y0 ) vfmadd231pd y2,y1,y0
#define VFMADD231SD_( x1,x2,x0 ) vfmadd231sd x2,x1,x0
#endif
#define A_PR1 512
#define B_PR1 256
#define C_PR1 64
// clear the 12 xmm accumulators that hold the 8x3 block of C
.macro INIT8x3
vxorpd %xmm4 , %xmm4 , %xmm4
vxorpd %xmm5 , %xmm5 , %xmm5
vxorpd %xmm6 , %xmm6 , %xmm6
vxorpd %xmm7 , %xmm7 , %xmm7
vxorpd %xmm8 , %xmm8 , %xmm8
vxorpd %xmm9 , %xmm9 , %xmm9
vxorpd %xmm10, %xmm10, %xmm10
vxorpd %xmm11, %xmm11, %xmm11
vxorpd %xmm12, %xmm12, %xmm12
vxorpd %xmm13, %xmm13, %xmm13
vxorpd %xmm14, %xmm14, %xmm14
vxorpd %xmm15, %xmm15, %xmm15
.endm
.macro KERNEL8x3_INIT
vmovddup -12 * SIZE(BO), %xmm1
vmovups -16 * SIZE(AO), %xmm0
prefetcht0 A_PR1(AO)
vmulpd %xmm1,%xmm0,%xmm4
vmovddup -11 * SIZE(BO), %xmm2
vmulpd %xmm2,%xmm0,%xmm5
vmovddup -10 * SIZE(BO), %xmm3
vmulpd %xmm3,%xmm0,%xmm6
vmovups -14 * SIZE(AO), %xmm0
vmulpd %xmm1,%xmm0,%xmm7
vmulpd %xmm2,%xmm0,%xmm8
vmulpd %xmm3,%xmm0,%xmm9
vmovups -12 * SIZE(AO), %xmm0
vmulpd %xmm1,%xmm0,%xmm10
vmulpd %xmm2,%xmm0,%xmm11
addq $ 3 * SIZE, BO
vmulpd %xmm3,%xmm0,%xmm12
vmovups -10 * SIZE(AO), %xmm0
vmulpd %xmm1,%xmm0,%xmm13
vmovddup -12 * SIZE(BO), %xmm1
vmulpd %xmm2,%xmm0,%xmm14
vmovddup -11 * SIZE(BO), %xmm2
vmulpd %xmm3,%xmm0,%xmm15
.endm
.macro KERNEL8x3_M1
vmovups -16 * SIZE(AO), %xmm0
prefetcht0 A_PR1(AO)
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups -14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups -12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups -10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup -12 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup -11 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M2
vmovups -8 * SIZE(AO), %xmm0
prefetcht0 A_PR1+64(AO)
vmovddup -10 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups -6 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups -4 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups -2 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup -9 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup -8 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M3
vmovups 0 * SIZE(AO), %xmm0
prefetcht0 A_PR1+128(AO)
vmovddup -7 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups 2 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups 4 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups 6 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup -6 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup -5 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M4
vmovups 8 * SIZE(AO), %xmm0
prefetcht0 A_PR1+192(AO)
vmovddup -4 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups 10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups 12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups 14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup -3 * SIZE(BO), %xmm1
addq $ 32 * SIZE, AO
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup -2 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M5
vmovups -16 * SIZE(AO), %xmm0
prefetcht0 A_PR1(AO)
vmovddup -1 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups -14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups -12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups -10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup 0 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup 1 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M6
vmovups -8 * SIZE(AO), %xmm0
prefetcht0 A_PR1+64(AO)
vmovddup 2 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups -6 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups -4 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups -2 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup 3 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup 4 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M7
vmovups 0 * SIZE(AO), %xmm0
prefetcht0 A_PR1+128(AO)
vmovddup 5 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups 2 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups 4 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups 6 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup 6 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup 7 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_M8
vmovups 8 * SIZE(AO), %xmm0
prefetcht0 A_PR1+192(AO)
vmovddup 8 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups 10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups 12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups 14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
vmovddup 9 * SIZE(BO), %xmm1
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
vmovddup 10 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
vmovddup 11 * SIZE(BO), %xmm3
addq $ 32 * SIZE, AO
addq $ 24 * SIZE, BO
.endm
.macro KERNEL8x3_E
vmovups 8 * SIZE(AO), %xmm0
prefetcht0 A_PR1+192(AO)
vmovddup 8 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups 10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups 12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups 14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
addq $ 32 * SIZE, AO
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
addq $ 21 * SIZE, BO
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
.macro KERNEL8x3_SUBN
vmovddup -12 * SIZE(BO), %xmm1
vmovups -16 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm4 )
vmovddup -11 * SIZE(BO), %xmm2
VFMADD231PD_( %xmm2,%xmm0,%xmm5 )
vmovddup -10 * SIZE(BO), %xmm3
VFMADD231PD_( %xmm3,%xmm0,%xmm6 )
vmovups -14 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm7 )
VFMADD231PD_( %xmm2,%xmm0,%xmm8 )
VFMADD231PD_( %xmm3,%xmm0,%xmm9 )
vmovups -12 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm10 )
VFMADD231PD_( %xmm2,%xmm0,%xmm11 )
VFMADD231PD_( %xmm3,%xmm0,%xmm12 )
vmovups -10 * SIZE(AO), %xmm0
VFMADD231PD_( %xmm1,%xmm0,%xmm13 )
addq $ 3 * SIZE, BO
VFMADD231PD_( %xmm2,%xmm0,%xmm14 )
addq $ 8 * SIZE, AO
VFMADD231PD_( %xmm3,%xmm0,%xmm15 )
.endm
// scale the accumulators by alpha, add the existing C values, and store the 8x3 block back
.macro SAVE8x3
vmovddup ALPHA, %xmm0
vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13
vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
vfmaddpd 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12
vfmaddpd 6 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15
vmovups %xmm4 , (CO1)
vmovups %xmm7 , 2 * SIZE(CO1)
vmovups %xmm10, 4 * SIZE(CO1)
vmovups %xmm13, 6 * SIZE(CO1)
vmovups %xmm5 , (CO1, LDC)
vmovups %xmm8 , 2 * SIZE(CO1, LDC)
vmovups %xmm11, 4 * SIZE(CO1, LDC)
vmovups %xmm14, 6 * SIZE(CO1, LDC)
vmovups %xmm6 , (CO1, LDC, 2)
vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2)
vmovups %xmm12, 4 * SIZE(CO1, LDC, 2)
vmovups %xmm15, 6 * SIZE(CO1, LDC, 2)
prefetcht0 C_PR1(CO1)
prefetcht0 C_PR1(CO1,LDC)
prefetcht0 C_PR1(CO1,LDC,2)
addq $ 8 * SIZE, CO1 # coffset += 8
.endm
/*******************************************************************************************/
#define KERNEL4x3_1(xx) \
vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\

#define KERNEL4x3_2(xx) \
vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\

#define KERNEL4x3_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\

#define KERNEL4x3_4(xx) \
vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\
addq $12, BI ;\
addq $16, %rax ;\

#define KERNEL4x3_SUB(xx) \
vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\

/*******************************************************************************************/
#define KERNEL2x3_1(xx) \
vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_2(xx) \
vmovddup -3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -2 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup 2 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL2x3_4(xx) \
vmovddup 3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 4 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup 5 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\
addq $12, BI ;\
addq $8, %rax ;\

#define KERNEL2x3_SUB(xx) \
vmovddup -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovddup -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddpd %xmm6,%xmm3,%xmm0,%xmm6 ;\

/*******************************************************************************************/
#define KERNEL1x3_1(xx) \
vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_2(xx) \
vmovsd -3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -2 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovsd -1 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_3(xx) \
vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovsd 2 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\

#define KERNEL1x3_4(xx) \
vmovsd 3 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd 4 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovsd 5 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\
addq $12, BI ;\
addq $4, %rax ;\

#define KERNEL1x3_SUB(xx) \
vmovsd -6 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -5 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovsd -4 * SIZE(BO, BI, 8), %xmm3 ;\
vfmaddsd %xmm6,%xmm3,%xmm0,%xmm6 ;\

/*******************************************************************************************
* 2 lines of N
*******************************************************************************************/
#define KERNEL8x2_1(xx) \
prefetcht0 A_PR1(AO,%rax,8) ;\
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\

#define KERNEL8x2_2(xx) \
prefetcht0 A_PR1+64(AO,%rax,8) ;\
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\
vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\

#define KERNEL8x2_3(xx) \
prefetcht0 A_PR1+128(AO,%rax,8) ;\
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\
vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\

#define KERNEL8x2_4(xx) \
prefetcht0 A_PR1+192(AO,%rax,8) ;\
vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\
vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\
addq $8, BI ;\
addq $32, %rax ;\

#define KERNEL8x2_SUB(xx) \
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vfmaddpd %xmm11,%xmm2,%xmm0,%xmm11 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
vfmaddpd %xmm14,%xmm2,%xmm0,%xmm14 ;\

/*******************************************************************************************/
#define KERNEL4x2_1(xx) \
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\

#define KERNEL4x2_2(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\

#define KERNEL4x2_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\

#define KERNEL4x2_4(xx) \
vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\
addq $8, BI ;\
addq $16, %rax ;\

#define KERNEL4x2_SUB(xx) \
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vfmaddpd %xmm8,%xmm2,%xmm0,%xmm8 ;\

/*******************************************************************************************/
#define KERNEL2x2_1(xx) \
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL2x2_2(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL2x2_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL2x2_4(xx) \
vmovddup 2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup 3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\
addq $8, BI ;\
addq $8, %rax ;\

#define KERNEL2x2_SUB(xx) \
vmovddup -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovddup -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddpd %xmm5,%xmm2,%xmm0,%xmm5 ;\

/*******************************************************************************************/
#define KERNEL1x2_1(xx) \
vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL1x2_2(xx) \
vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL1x2_3(xx) \
vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd 1 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\

#define KERNEL1x2_4(xx) \
vmovsd 2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd 3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\
addq $8, BI ;\
addq $4, %rax ;\

#define KERNEL1x2_SUB(xx) \
vmovsd -4 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovsd -3 * SIZE(BO, BI, 8), %xmm2 ;\
vfmaddsd %xmm5,%xmm2,%xmm0,%xmm5 ;\

/*******************************************************************************************
* 1 line of N
*******************************************************************************************/
#define KERNEL8x1_1(xx) \
prefetcht0 A_PR1(AO,%rax,8) ;\
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\

#define KERNEL8x1_2(xx) \
prefetcht0 A_PR1+64(AO,%rax,8) ;\
vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\

#define KERNEL8x1_3(xx) \
prefetcht0 A_PR1+128(AO,%rax,8) ;\
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups 0 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups 2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vmovups 4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vmovups 6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\

#define KERNEL8x1_4(xx) \
prefetcht0 A_PR1+192(AO,%rax,8) ;\
vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups 8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups 10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vmovups 12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vmovups 14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\
addq $4, BI ;\
addq $32, %rax ;\

#define KERNEL8x1_SUB(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm13,%xmm1,%xmm0,%xmm13 ;\

/*******************************************************************************************/
#define KERNEL4x1_1(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\

#define KERNEL4x1_2(xx) \
vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\

#define KERNEL4x1_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -8 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -6 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\

#define KERNEL4x1_4(xx) \
vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -4 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -2 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\
addq $4, BI ;\
addq $16, %rax ;\

#define KERNEL4x1_SUB(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm7,%xmm1,%xmm0,%xmm7 ;\

/*******************************************************************************************/
#define KERNEL2x1_1(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL2x1_2(xx) \
vmovddup -1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL2x1_3(xx) \
vmovddup 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -12 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL2x1_4(xx) \
vmovddup 1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -10 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\
addq $4, BI ;\
addq $8, %rax ;\

#define KERNEL2x1_SUB(xx) \
vmovddup -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovups -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddpd %xmm4,%xmm1,%xmm0,%xmm4 ;\

/*******************************************************************************************/
#define KERNEL1x1_1(xx) \
vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL1x1_2(xx) \
vmovsd -1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -15 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL1x1_3(xx) \
vmovsd 0 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -14 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\

#define KERNEL1x1_4(xx) \
vmovsd 1 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -13 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\
addq $4, BI ;\
addq $4, %rax ;\

#define KERNEL1x1_SUB(xx) \
vmovsd -2 * SIZE(BO, BI, 8), %xmm1 ;\
vmovsd -16 * SIZE(AO, %rax, 8), %xmm0 ;\
vfmaddsd %xmm4,%xmm1,%xmm0,%xmm4 ;\

/*******************************************************************************************/
  919. #if !defined(TRMMKERNEL)
  920. PROLOGUE
  921. PROFCODE
  922. subq $STACKSIZE, %rsp
  923. movq %rbx, (%rsp)
  924. movq %rbp, 8(%rsp)
  925. movq %r12, 16(%rsp)
  926. movq %r13, 24(%rsp)
  927. movq %r14, 32(%rsp)
  928. movq %r15, 40(%rsp)
  929. vzeroupper
  930. #ifdef WINDOWS_ABI
  931. movq %rdi, 48(%rsp)
  932. movq %rsi, 56(%rsp)
  933. movups %xmm6, 64(%rsp)
  934. movups %xmm7, 80(%rsp)
  935. movups %xmm8, 96(%rsp)
  936. movups %xmm9, 112(%rsp)
  937. movups %xmm10, 128(%rsp)
  938. movups %xmm11, 144(%rsp)
  939. movups %xmm12, 160(%rsp)
  940. movups %xmm13, 176(%rsp)
  941. movups %xmm14, 192(%rsp)
  942. movups %xmm15, 208(%rsp)
  943. movq ARG1, OLD_M
  944. movq ARG2, OLD_N
  945. movq ARG3, OLD_K
  946. movq OLD_A, A
  947. movq OLD_B, B
  948. movq OLD_C, C
  949. movq OLD_LDC, LDC
  950. vmovaps %xmm3, %xmm0
  951. #else
  952. movq STACKSIZE + 8(%rsp), LDC
  953. #endif
  954. movq %rsp, SP # save old stack
  955. subq $128 + L_BUFFER_SIZE, %rsp
  956. andq $-4096, %rsp # align stack
  957. STACK_TOUCH
  958. cmpq $0, OLD_M
  959. je .L999
  960. cmpq $0, OLD_N
  961. je .L999
  962. cmpq $0, OLD_K
  963. je .L999
  964. movq OLD_M, M
  965. movq OLD_N, N
  966. movq OLD_K, K
  967. vmovsd %xmm0, ALPHA
  968. salq $BASE_SHIFT, LDC
  969. movq N, %rax
  970. xorq %rdx, %rdx
  971. movq $6, %rdi
  972. divq %rdi // N / 6
  973. movq %rax, Ndiv6 // N / 6
  974. movq %rdx, Nmod6 // N % 6
  975. movq Ndiv6, J
  976. cmpq $0, J
  977. je .L2_0
  978. ALIGN_4
  979. .L6_01:
  980. // copy to sub buffer
  981. movq K, %rax
  982. salq $1,%rax // K * 2
  983. movq B, BO1
  984. leaq (B,%rax,8), BO2 // next offset to BO2
  985. leaq BUFFER1, BO // first buffer to BO
  986. movq K, %rax
  987. sarq $2, %rax // K / 4
  988. jz .L6_02a
  989. ALIGN_4
  990. .L6_02:
  991. prefetcht0 B_PR1(BO1)
  992. prefetcht0 B_PR1(BO2)
  993. prefetchw B_PR1(BO)
  994. vmovups (BO1), %xmm0
  995. vmovups 2*SIZE(BO1), %xmm2
  996. vmovups 4*SIZE(BO1), %xmm4
  997. vmovups 6*SIZE(BO1), %xmm6
  998. vmovsd (BO2), %xmm1
  999. vmovsd 2*SIZE(BO2), %xmm3
  1000. vmovsd 4*SIZE(BO2), %xmm5
  1001. vmovsd 6*SIZE(BO2), %xmm7
  1002. vmovups %xmm0, (BO)
  1003. vmovsd %xmm1, 2*SIZE(BO)
  1004. vmovups %xmm2, 3*SIZE(BO)
  1005. vmovsd %xmm3, 5*SIZE(BO)
  1006. vmovups %xmm4, 6*SIZE(BO)
  1007. vmovsd %xmm5, 8*SIZE(BO)
  1008. vmovups %xmm6, 9*SIZE(BO)
  1009. vmovsd %xmm7,11*SIZE(BO)
  1010. addq $ 8*SIZE,BO1
  1011. addq $ 8*SIZE,BO2
  1012. addq $ 12*SIZE,BO
  1013. decq %rax
  1014. jnz .L6_02
  1015. .L6_02a:
  1016. movq K, %rax
  1017. andq $3, %rax // K % 4
  1018. jz .L6_02c
  1019. ALIGN_4
  1020. .L6_02b:
  1021. vmovups (BO1), %xmm0
  1022. vmovsd (BO2), %xmm1
  1023. vmovups %xmm0, (BO)
  1024. vmovsd %xmm1, 2*SIZE(BO)
  1025. addq $ 2*SIZE,BO1
  1026. addq $ 2*SIZE,BO2
  1027. addq $ 3*SIZE,BO
  1028. decq %rax
  1029. jnz .L6_02b
  1030. .L6_02c:
  1031. movq K, %rax
  1032. salq $1,%rax // K * 2
  1033. leaq (B,%rax,8), BO1 // next offset to BO1
1034. leaq (BO1,%rax,8), BO2 // next offset to BO2
  1035. leaq BUFFER2, BO // second buffer to BO
  1036. movq K, %rax
1037. sarq $2, %rax // K / 4
  1038. jz .L6_03a
  1039. ALIGN_4
  1040. .L6_03:
  1041. prefetcht0 B_PR1(BO2)
  1042. prefetchw B_PR1(BO)
  1043. vmovups (BO2), %xmm0
  1044. vmovups 2*SIZE(BO2), %xmm2
  1045. vmovups 4*SIZE(BO2), %xmm4
  1046. vmovups 6*SIZE(BO2), %xmm6
  1047. vmovsd 1*SIZE(BO1), %xmm1
  1048. vmovsd 3*SIZE(BO1), %xmm3
  1049. vmovsd 5*SIZE(BO1), %xmm5
  1050. vmovsd 7*SIZE(BO1), %xmm7
  1051. vmovsd %xmm1, 0*SIZE(BO)
  1052. vmovups %xmm0, 1*SIZE(BO)
  1053. vmovsd %xmm3, 3*SIZE(BO)
  1054. vmovups %xmm2, 4*SIZE(BO)
  1055. vmovsd %xmm5, 6*SIZE(BO)
  1056. vmovups %xmm4, 7*SIZE(BO)
  1057. vmovsd %xmm7, 9*SIZE(BO)
  1058. vmovups %xmm6,10*SIZE(BO)
  1059. addq $ 8*SIZE,BO1
  1060. addq $ 8*SIZE,BO2
  1061. addq $ 12*SIZE,BO
  1062. decq %rax
  1063. jnz .L6_03
  1064. .L6_03a:
  1065. movq K, %rax
  1066. andq $3, %rax // K % 4
  1067. jz .L6_03c
  1068. ALIGN_4
  1069. .L6_03b:
  1070. vmovsd 1*SIZE(BO1), %xmm0
  1071. vmovups (BO2), %xmm1
  1072. vmovsd %xmm0, (BO)
  1073. vmovups %xmm1, 1*SIZE(BO)
  1074. addq $ 2*SIZE,BO1
  1075. addq $ 2*SIZE,BO2
  1076. addq $ 3*SIZE,BO
  1077. decq %rax
  1078. jnz .L6_03b
  1079. .L6_03c:
  1080. movq BO2, B // next offset of B
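/* B re-packing for the 8x3 kernel: B arrives in 2-wide packed panels
 * (two values per k-step, so BO2 = B + 2*K elements picks the next panel).
 * The two copy loops above merge a 6-column block into two 3-wide streams
 * so the kernel can walk B with a single index BI:
 *
 *     BUFFER1[3*k+0..2] = { P0[2*k], P0[2*k+1], P1[2*k]   }   // cols 0,1,2
 *     BUFFER2[3*k+0..2] = { P1[2*k+1], P2[2*k], P2[2*k+1] }   // cols 3,4,5
 *
 * where P0, P1, P2 denote the three incoming 2-wide panels (illustrative
 * names, not symbols used in this file).  B is then advanced past the
 * whole 6-column block.
 */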
  1081. .L6_10:
  1082. movq C, CO1
  1083. leaq (C, LDC, 2), C
  1084. leaq (C, LDC, 1), C // c += 3 * ldc
  1085. movq A, AO // aoffset = a
  1086. addq $16 * SIZE, AO
  1087. movq M, I
  1088. sarq $3, I // i = (m >> 3)
  1089. je .L6_20
  1090. ALIGN_4
  1091. .L6_11:
  1092. leaq BUFFER1, BO // first buffer to BO
  1093. addq $12 * SIZE, BO
  1094. movq K, %rax
  1095. sarq $3, %rax // K / 8
  1096. cmpq $3, %rax
  1097. jl .L6_13
  1098. prefetcht0 B_PR1(BO)
  1099. prefetcht0 B_PR1+64(BO)
  1100. prefetcht0 B_PR1+128(BO)
  1101. KERNEL8x3_INIT
  1102. KERNEL8x3_M2
  1103. KERNEL8x3_M3
  1104. KERNEL8x3_M4
  1105. KERNEL8x3_M5
  1106. KERNEL8x3_M6
  1107. KERNEL8x3_M7
  1108. KERNEL8x3_M8
  1109. subq $2, %rax
  1110. ALIGN_5
  1111. .L6_12:
  1112. prefetcht0 B_PR1-24(BO)
  1113. prefetcht0 B_PR1+40(BO)
  1114. KERNEL8x3_M1
  1115. KERNEL8x3_M2
  1116. KERNEL8x3_M3
  1117. KERNEL8x3_M4
  1118. KERNEL8x3_M5
  1119. prefetcht0 B_PR1+104(BO)
  1120. KERNEL8x3_M6
  1121. KERNEL8x3_M7
  1122. KERNEL8x3_M8
  1123. dec %rax
  1124. jne .L6_12
  1125. .L6_12_E:
  1126. prefetcht0 B_PR1(BO)
  1127. prefetcht0 B_PR1+64(BO)
  1128. KERNEL8x3_M1
  1129. KERNEL8x3_M2
  1130. KERNEL8x3_M3
  1131. KERNEL8x3_M4
  1132. KERNEL8x3_M5
  1133. KERNEL8x3_M6
  1134. KERNEL8x3_M7
  1135. KERNEL8x3_E
  1136. jmp .L6_16
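/* The 8x3 macro-kernel is software pipelined: KERNEL8x3_INIT starts the
 * first k-step (and sets up the accumulators), M1..M8 form the steady
 * state - one k-step of 8 A values against the 3-wide B stream each - and
 * KERNEL8x3_E drains the operands loaded ahead.  With K/8 >= 3 the path
 * above runs one INIT..M8 group, then (K/8 - 2) looped groups (.L6_12),
 * then the closing group (.L6_12_E); the K/8 == 2 and == 1 cases below
 * unroll the same sequence without the loop, K/8 == 0 only initializes
 * the accumulators (INIT8x3), and K % 8 is mopped up with KERNEL8x3_SUBN
 * at .L6_17.  Roughly (illustrative C):
 *
 *     first_group();                              // INIT, M2..M8
 *     for (i = 0; i < K/8 - 2; i++) group();      // .L6_12
 *     last_group();                               // .L6_12_E (M1..M7, E)
 *     for (i = 0; i < K % 8; i++) one_step();     // .L6_17
 */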
  1137. .L6_13:
  1138. test $2, %rax
  1139. jz .L6_14
  1140. KERNEL8x3_INIT
  1141. KERNEL8x3_M2
  1142. KERNEL8x3_M3
  1143. KERNEL8x3_M4
  1144. KERNEL8x3_M5
  1145. KERNEL8x3_M6
  1146. KERNEL8x3_M7
  1147. KERNEL8x3_M8
  1148. KERNEL8x3_M1
  1149. KERNEL8x3_M2
  1150. KERNEL8x3_M3
  1151. KERNEL8x3_M4
  1152. KERNEL8x3_M5
  1153. KERNEL8x3_M6
  1154. KERNEL8x3_M7
  1155. KERNEL8x3_E
  1156. jmp .L6_16
  1157. .L6_14:
  1158. test $1, %rax
  1159. jz .L6_15
  1160. KERNEL8x3_INIT
  1161. KERNEL8x3_M2
  1162. KERNEL8x3_M3
  1163. KERNEL8x3_M4
  1164. KERNEL8x3_M5
  1165. KERNEL8x3_M6
  1166. KERNEL8x3_M7
  1167. KERNEL8x3_E
  1168. jmp .L6_16
  1169. .L6_15:
  1170. INIT8x3
  1171. .L6_16:
  1172. movq K, %rax
1173. andq $7, %rax # if (k & 7)
  1174. je .L6_19
  1175. ALIGN_4
  1176. .L6_17:
  1177. KERNEL8x3_SUBN
  1178. dec %rax
  1179. jne .L6_17
  1180. ALIGN_4
  1181. .L6_19:
  1182. SAVE8x3
  1183. decq I # i --
  1184. jg .L6_11
  1185. /**************************************************************************
  1186. * Rest of M
  1187. ***************************************************************************/
  1188. .L6_20:
  1189. // Test rest of M
  1190. testq $7, M
  1191. jz .L7_10 // to next 3 lines of N
  1192. testq $4, M
  1193. jz .L6_30
  1194. ALIGN_4
  1195. .L6_21:
  1196. leaq BUFFER1, BO // first buffer to BO
  1197. addq $6 * SIZE, BO
  1198. vzeroall
  1199. movq K, %rax
  1200. andq $-8, %rax
  1201. je .L6_26
  1202. movq %rax, BI // Index for BO
  1203. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1204. salq $2, %rax // rax = rax * 4 ; number of values
  1205. leaq (AO, %rax, 8), AO
  1206. leaq (BO, BI, 8), BO
  1207. negq BI
  1208. negq %rax
  1209. ALIGN_4
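/* Negative-index loop idiom (used by all KERNELmxn_* loops in this file):
 * AO and BO are first advanced to the end of the data for this tile and
 * %rax / BI are negated, so the macros address (AO,%rax,8) and (BO,BI,8)
 * with negative, rising indices.  The addq inside the *_4 and *_SUB
 * macros then sets the flags that the following je / jl tests, so the
 * loop needs no separate cmp.  In C terms (illustrative):
 *
 *     a_end = a + 4*k;  i = -4*(long)k;
 *     do { ...use a_end[i]...; i += 4; } while (i < 0);
 */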
  1210. .L6_22:
  1211. KERNEL4x3_1(xxx)
  1212. KERNEL4x3_2(xxx)
  1213. KERNEL4x3_3(xxx)
  1214. KERNEL4x3_4(xxx)
  1215. KERNEL4x3_1(xxx)
  1216. KERNEL4x3_2(xxx)
  1217. KERNEL4x3_3(xxx)
  1218. KERNEL4x3_4(xxx)
  1219. je .L6_26
  1220. KERNEL4x3_1(xxx)
  1221. KERNEL4x3_2(xxx)
  1222. KERNEL4x3_3(xxx)
  1223. KERNEL4x3_4(xxx)
  1224. KERNEL4x3_1(xxx)
  1225. KERNEL4x3_2(xxx)
  1226. KERNEL4x3_3(xxx)
  1227. KERNEL4x3_4(xxx)
  1228. je .L6_26
  1229. jmp .L6_22
  1230. ALIGN_4
  1231. .L6_26:
  1232. movq K, %rax
1233. andq $7, %rax # if (k & 7)
  1234. je .L6_29
  1235. movq %rax, BI // Index for BO
  1236. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1237. salq $2, %rax // rax = rax * 4 ; number of values
  1238. leaq (AO, %rax, 8), AO
  1239. leaq (BO, BI, 8), BO
  1240. negq BI
  1241. negq %rax
  1242. ALIGN_4
  1243. .L6_27:
  1244. KERNEL4x3_SUB(xxx)
  1245. addq $3, BI
  1246. addq $4, %rax
  1247. jl .L6_27
  1248. ALIGN_4
  1249. .L6_29:
  1250. vmovddup ALPHA, %xmm0
  1251. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1252. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1253. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1254. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1255. vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1256. vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1257. vmovups %xmm4 , (CO1)
  1258. vmovups %xmm7 , 2 * SIZE(CO1)
  1259. vmovups %xmm5 , (CO1, LDC)
  1260. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  1261. vmovups %xmm6 , (CO1, LDC, 2)
  1262. vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2)
  1263. addq $4 * SIZE, CO1 # coffset += 4
  1264. ALIGN_4
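/* Store path for the 4x3 remainder tile: ALPHA is broadcast into xmm0 and
 * each accumulator pair is folded into C with a single FMA, i.e. every
 * stored element is C = alpha * acc + C.  The same update in scalar C
 * (illustrative):
 *
 *     for (j = 0; j < 3; j++)
 *         for (i = 0; i < 4; i++)
 *             C[i + j*ldc] += alpha * acc[i][j];
 */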
  1265. .L6_30:
  1266. testq $2, M
  1267. jz .L6_40
  1268. ALIGN_4
  1269. .L6_31:
  1270. leaq BUFFER1, BO // first buffer to BO
  1271. addq $6 * SIZE, BO
  1272. vzeroall
  1273. movq K, %rax
  1274. andq $-8, %rax
  1275. je .L6_36
  1276. movq %rax, BI // Index for BO
  1277. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1278. salq $1, %rax // rax = rax *2 ; number of values
  1279. leaq (AO, %rax, 8), AO
  1280. leaq (BO, BI, 8), BO
  1281. negq BI
  1282. negq %rax
  1283. ALIGN_4
  1284. .L6_32:
  1285. KERNEL2x3_1(xxx)
  1286. KERNEL2x3_2(xxx)
  1287. KERNEL2x3_3(xxx)
  1288. KERNEL2x3_4(xxx)
  1289. KERNEL2x3_1(xxx)
  1290. KERNEL2x3_2(xxx)
  1291. KERNEL2x3_3(xxx)
  1292. KERNEL2x3_4(xxx)
  1293. je .L6_36
  1294. KERNEL2x3_1(xxx)
  1295. KERNEL2x3_2(xxx)
  1296. KERNEL2x3_3(xxx)
  1297. KERNEL2x3_4(xxx)
  1298. KERNEL2x3_1(xxx)
  1299. KERNEL2x3_2(xxx)
  1300. KERNEL2x3_3(xxx)
  1301. KERNEL2x3_4(xxx)
  1302. je .L6_36
  1303. jmp .L6_32
  1304. ALIGN_4
  1305. .L6_36:
  1306. movq K, %rax
1307. andq $7, %rax # if (k & 7)
  1308. je .L6_39
  1309. movq %rax, BI // Index for BO
  1310. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1311. salq $1, %rax // rax = rax *2 ; number of values
  1312. leaq (AO, %rax, 8), AO
  1313. leaq (BO, BI, 8), BO
  1314. negq BI
  1315. negq %rax
  1316. ALIGN_4
  1317. .L6_37:
  1318. KERNEL2x3_SUB(xxx)
  1319. addq $3, BI
  1320. addq $2, %rax
  1321. jl .L6_37
  1322. ALIGN_4
  1323. .L6_39:
  1324. vmovddup ALPHA, %xmm0
  1325. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1326. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1327. vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1328. vmovups %xmm4 , (CO1)
  1329. vmovups %xmm5 , (CO1, LDC)
  1330. vmovups %xmm6 , (CO1, LDC, 2)
  1331. addq $2 * SIZE, CO1 # coffset += 2
  1332. ALIGN_4
  1333. .L6_40:
  1334. testq $1, M
  1335. jz .L7_10 // to next 3 lines of N
  1336. ALIGN_4
  1337. .L6_41:
  1338. leaq BUFFER1, BO // first buffer to BO
  1339. addq $6 * SIZE, BO
  1340. vzeroall
  1341. movq K, %rax
  1342. andq $-8, %rax
  1343. je .L6_46
  1344. movq %rax, BI // Index for BO
  1345. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1346. leaq (AO, %rax, 8), AO
  1347. leaq (BO, BI, 8), BO
  1348. negq BI
  1349. negq %rax
  1350. ALIGN_4
  1351. .L6_42:
  1352. KERNEL1x3_1(xxx)
  1353. KERNEL1x3_2(xxx)
  1354. KERNEL1x3_3(xxx)
  1355. KERNEL1x3_4(xxx)
  1356. KERNEL1x3_1(xxx)
  1357. KERNEL1x3_2(xxx)
  1358. KERNEL1x3_3(xxx)
  1359. KERNEL1x3_4(xxx)
  1360. je .L6_46
  1361. KERNEL1x3_1(xxx)
  1362. KERNEL1x3_2(xxx)
  1363. KERNEL1x3_3(xxx)
  1364. KERNEL1x3_4(xxx)
  1365. KERNEL1x3_1(xxx)
  1366. KERNEL1x3_2(xxx)
  1367. KERNEL1x3_3(xxx)
  1368. KERNEL1x3_4(xxx)
  1369. je .L6_46
  1370. jmp .L6_42
  1371. ALIGN_4
  1372. .L6_46:
  1373. movq K, %rax
1374. andq $7, %rax # if (k & 7)
  1375. je .L6_49
  1376. movq %rax, BI // Index for BO
  1377. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1378. leaq (AO, %rax, 8), AO
  1379. leaq (BO, BI, 8), BO
  1380. negq BI
  1381. negq %rax
  1382. ALIGN_4
  1383. .L6_47:
  1384. KERNEL1x3_SUB(xxx)
  1385. addq $3, BI
  1386. addq $1, %rax
  1387. jl .L6_47
  1388. ALIGN_4
  1389. .L6_49:
  1390. vmovddup ALPHA, %xmm0
  1391. vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
  1392. vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1393. vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1394. vmovsd %xmm4 , (CO1)
  1395. vmovsd %xmm5 , (CO1, LDC)
  1396. vmovsd %xmm6 , (CO1, LDC, 2)
  1397. addq $1 * SIZE, CO1 # coffset += 1
  1398. ALIGN_4
  1399. /***************************************************************************************************************/
  1400. .L7_10:
  1401. movq C, CO1
  1402. leaq (C, LDC, 2), C
  1403. leaq (C, LDC, 1), C // c += 3 * ldc
  1404. movq A, AO // aoffset = a
  1405. addq $16 * SIZE, AO
  1406. movq M, I
  1407. sarq $3, I // i = (m >> 3)
  1408. je .L7_20
  1409. ALIGN_4
  1410. .L7_11:
1411. leaq BUFFER2, BO // second buffer to BO
  1412. addq $12 * SIZE, BO
  1413. movq K, %rax
  1414. sarq $3, %rax // K / 8
  1415. cmpq $3, %rax
  1416. jl .L7_13
  1417. prefetcht0 B_PR1(BO)
  1418. prefetcht0 B_PR1+64(BO)
  1419. prefetcht0 B_PR1+128(BO)
  1420. KERNEL8x3_INIT
  1421. KERNEL8x3_M2
  1422. KERNEL8x3_M3
  1423. KERNEL8x3_M4
  1424. KERNEL8x3_M5
  1425. KERNEL8x3_M6
  1426. KERNEL8x3_M7
  1427. KERNEL8x3_M8
  1428. subq $2, %rax
  1429. ALIGN_5
  1430. .L7_12:
  1431. prefetcht0 B_PR1-24(BO)
  1432. prefetcht0 B_PR1+40(BO)
  1433. KERNEL8x3_M1
  1434. KERNEL8x3_M2
  1435. KERNEL8x3_M3
  1436. KERNEL8x3_M4
  1437. prefetcht0 B_PR1+104(BO)
  1438. KERNEL8x3_M5
  1439. KERNEL8x3_M6
  1440. KERNEL8x3_M7
  1441. KERNEL8x3_M8
  1442. dec %rax
  1443. jne .L7_12
  1444. .L7_12_E:
  1445. prefetcht0 B_PR1(BO)
  1446. prefetcht0 B_PR1+64(BO)
  1447. KERNEL8x3_M1
  1448. KERNEL8x3_M2
  1449. KERNEL8x3_M3
  1450. KERNEL8x3_M4
  1451. KERNEL8x3_M5
  1452. KERNEL8x3_M6
  1453. KERNEL8x3_M7
  1454. KERNEL8x3_E
  1455. jmp .L7_16
  1456. .L7_13:
  1457. test $2, %rax
  1458. jz .L7_14
  1459. KERNEL8x3_INIT
  1460. KERNEL8x3_M2
  1461. KERNEL8x3_M3
  1462. KERNEL8x3_M4
  1463. KERNEL8x3_M5
  1464. KERNEL8x3_M6
  1465. KERNEL8x3_M7
  1466. KERNEL8x3_M8
  1467. KERNEL8x3_M1
  1468. KERNEL8x3_M2
  1469. KERNEL8x3_M3
  1470. KERNEL8x3_M4
  1471. KERNEL8x3_M5
  1472. KERNEL8x3_M6
  1473. KERNEL8x3_M7
  1474. KERNEL8x3_E
  1475. jmp .L7_16
  1476. .L7_14:
  1477. test $1, %rax
  1478. jz .L7_15
  1479. KERNEL8x3_INIT
  1480. KERNEL8x3_M2
  1481. KERNEL8x3_M3
  1482. KERNEL8x3_M4
  1483. KERNEL8x3_M5
  1484. KERNEL8x3_M6
  1485. KERNEL8x3_M7
  1486. KERNEL8x3_E
  1487. jmp .L7_16
  1488. .L7_15:
  1489. INIT8x3
  1490. .L7_16:
  1491. movq K, %rax
1492. andq $7, %rax # if (k & 7)
  1493. je .L7_19
  1494. ALIGN_4
  1495. .L7_17:
  1496. KERNEL8x3_SUBN
  1497. dec %rax
  1498. jne .L7_17
  1499. ALIGN_4
  1500. .L7_19:
  1501. SAVE8x3
  1502. decq I # i --
  1503. jg .L7_11
  1504. ALIGN_4
  1505. .L7_20:
  1506. // Test rest of M
  1507. testq $7, M
  1508. jz .L7_60 // to next 6 lines of N
  1509. testq $4, M
  1510. jz .L7_30
  1511. ALIGN_4
  1512. .L7_21:
  1513. leaq BUFFER2, BO // second buffer to BO
  1514. addq $6 * SIZE, BO
  1515. vzeroall
  1516. movq K, %rax
  1517. andq $-8, %rax
  1518. je .L7_26
  1519. movq %rax, BI // Index for BO
  1520. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1521. salq $2, %rax // rax = rax * 4 ; number of values
  1522. leaq (AO, %rax, 8), AO
  1523. leaq (BO, BI, 8), BO
  1524. negq BI
  1525. negq %rax
  1526. ALIGN_4
  1527. .L7_22:
  1528. KERNEL4x3_1(xxx)
  1529. KERNEL4x3_2(xxx)
  1530. KERNEL4x3_3(xxx)
  1531. KERNEL4x3_4(xxx)
  1532. KERNEL4x3_1(xxx)
  1533. KERNEL4x3_2(xxx)
  1534. KERNEL4x3_3(xxx)
  1535. KERNEL4x3_4(xxx)
  1536. je .L7_26
  1537. KERNEL4x3_1(xxx)
  1538. KERNEL4x3_2(xxx)
  1539. KERNEL4x3_3(xxx)
  1540. KERNEL4x3_4(xxx)
  1541. KERNEL4x3_1(xxx)
  1542. KERNEL4x3_2(xxx)
  1543. KERNEL4x3_3(xxx)
  1544. KERNEL4x3_4(xxx)
  1545. je .L7_26
  1546. jmp .L7_22
  1547. ALIGN_4
  1548. .L7_26:
  1549. movq K, %rax
1550. andq $7, %rax # if (k & 7)
  1551. je .L7_29
  1552. movq %rax, BI // Index for BO
  1553. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1554. salq $2, %rax // rax = rax * 4 ; number of values
  1555. leaq (AO, %rax, 8), AO
  1556. leaq (BO, BI, 8), BO
  1557. negq BI
  1558. negq %rax
  1559. ALIGN_4
  1560. .L7_27:
  1561. KERNEL4x3_SUB(xxx)
  1562. addq $3, BI
  1563. addq $4, %rax
  1564. jl .L7_27
  1565. ALIGN_4
  1566. .L7_29:
  1567. vmovddup ALPHA, %xmm0
  1568. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1569. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1570. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1571. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1572. vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1573. vfmaddpd 2 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9
  1574. vmovups %xmm4 , (CO1)
  1575. vmovups %xmm7 , 2 * SIZE(CO1)
  1576. vmovups %xmm5 , (CO1, LDC)
  1577. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  1578. vmovups %xmm6 , (CO1, LDC, 2)
  1579. vmovups %xmm9 , 2 * SIZE(CO1, LDC, 2)
  1580. addq $4 * SIZE, CO1 # coffset += 4
  1581. ALIGN_4
  1582. .L7_30:
  1583. testq $2, M
  1584. jz .L7_40
  1585. ALIGN_4
  1586. .L7_31:
  1587. leaq BUFFER2, BO // second buffer to BO
  1588. addq $6 * SIZE, BO
  1589. vzeroall
  1590. movq K, %rax
  1591. andq $-8, %rax
  1592. je .L7_36
  1593. movq %rax, BI // Index for BO
  1594. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1595. salq $1, %rax // rax = rax *2 ; number of values
  1596. leaq (AO, %rax, 8), AO
  1597. leaq (BO, BI, 8), BO
  1598. negq BI
  1599. negq %rax
  1600. ALIGN_4
  1601. .L7_32:
  1602. KERNEL2x3_1(xxx)
  1603. KERNEL2x3_2(xxx)
  1604. KERNEL2x3_3(xxx)
  1605. KERNEL2x3_4(xxx)
  1606. KERNEL2x3_1(xxx)
  1607. KERNEL2x3_2(xxx)
  1608. KERNEL2x3_3(xxx)
  1609. KERNEL2x3_4(xxx)
  1610. je .L7_36
  1611. KERNEL2x3_1(xxx)
  1612. KERNEL2x3_2(xxx)
  1613. KERNEL2x3_3(xxx)
  1614. KERNEL2x3_4(xxx)
  1615. KERNEL2x3_1(xxx)
  1616. KERNEL2x3_2(xxx)
  1617. KERNEL2x3_3(xxx)
  1618. KERNEL2x3_4(xxx)
  1619. je .L7_36
  1620. jmp .L7_32
  1621. ALIGN_4
  1622. .L7_36:
  1623. movq K, %rax
1624. andq $7, %rax # if (k & 7)
  1625. je .L7_39
  1626. movq %rax, BI // Index for BO
  1627. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1628. salq $1, %rax // rax = rax *2 ; number of values
  1629. leaq (AO, %rax, 8), AO
  1630. leaq (BO, BI, 8), BO
  1631. negq BI
  1632. negq %rax
  1633. ALIGN_4
  1634. .L7_37:
  1635. KERNEL2x3_SUB(xxx)
  1636. addq $3, BI
  1637. addq $2, %rax
  1638. jl .L7_37
  1639. ALIGN_4
  1640. .L7_39:
  1641. vmovddup ALPHA, %xmm0
  1642. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1643. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1644. vfmaddpd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1645. vmovups %xmm4 , (CO1)
  1646. vmovups %xmm5 , (CO1, LDC)
  1647. vmovups %xmm6 , (CO1, LDC, 2)
  1648. addq $2 * SIZE, CO1 # coffset += 2
  1649. ALIGN_4
  1650. .L7_40:
  1651. testq $1, M
  1652. jz .L7_60 // to next 6 lines of N
  1653. ALIGN_4
  1654. .L7_41:
  1655. leaq BUFFER2, BO // second buffer to BO
  1656. addq $6 * SIZE, BO
  1657. vzeroall
  1658. movq K, %rax
  1659. andq $-8, %rax
  1660. je .L7_46
  1661. movq %rax, BI // Index for BO
  1662. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1663. leaq (AO, %rax, 8), AO
  1664. leaq (BO, BI, 8), BO
  1665. negq BI
  1666. negq %rax
  1667. ALIGN_4
  1668. .L7_42:
  1669. KERNEL1x3_1(xxx)
  1670. KERNEL1x3_2(xxx)
  1671. KERNEL1x3_3(xxx)
  1672. KERNEL1x3_4(xxx)
  1673. KERNEL1x3_1(xxx)
  1674. KERNEL1x3_2(xxx)
  1675. KERNEL1x3_3(xxx)
  1676. KERNEL1x3_4(xxx)
  1677. je .L7_46
  1678. KERNEL1x3_1(xxx)
  1679. KERNEL1x3_2(xxx)
  1680. KERNEL1x3_3(xxx)
  1681. KERNEL1x3_4(xxx)
  1682. KERNEL1x3_1(xxx)
  1683. KERNEL1x3_2(xxx)
  1684. KERNEL1x3_3(xxx)
  1685. KERNEL1x3_4(xxx)
  1686. je .L7_46
  1687. jmp .L7_42
  1688. ALIGN_4
  1689. .L7_46:
  1690. movq K, %rax
1691. andq $7, %rax # if (k & 7)
  1692. je .L7_49
  1693. movq %rax, BI // Index for BO
  1694. leaq (BI,BI,2), BI // BI = BI * 3 ; number of values
  1695. leaq (AO, %rax, 8), AO
  1696. leaq (BO, BI, 8), BO
  1697. negq BI
  1698. negq %rax
  1699. ALIGN_4
  1700. .L7_47:
  1701. KERNEL1x3_SUB(xxx)
  1702. addq $3, BI
  1703. addq $1, %rax
  1704. jl .L7_47
  1705. ALIGN_4
  1706. .L7_49:
  1707. vmovddup ALPHA, %xmm0
  1708. vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
  1709. vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1710. vfmaddsd (CO1, LDC, 2),%xmm0, %xmm6,%xmm6
  1711. vmovsd %xmm4 , (CO1)
  1712. vmovsd %xmm5 , (CO1, LDC)
  1713. vmovsd %xmm6 , (CO1, LDC, 2)
  1714. addq $1 * SIZE, CO1 # coffset += 1
  1715. .L7_60:
  1716. decq J // j --
  1717. jg .L6_01
  1718. .L2_0:
  1719. cmpq $0, Nmod6 // N % 6 == 0
  1720. je .L999
  1721. /************************************************************************************************
  1722. * Loop for Nmod6 / 2 > 0
  1723. *************************************************************************************************/
  1724. movq Nmod6, J
  1725. sarq $1, J // j = j / 2
  1726. je .L1_0
  1727. ALIGN_4
  1728. .L2_01:
  1729. // copy to sub buffer
  1730. movq B, BO1
  1731. leaq BUFFER1, BO // first buffer to BO
  1732. movq K, %rax
  1733. ALIGN_4
  1734. .L2_02b:
  1735. vmovups (BO1), %xmm0
  1736. vmovups %xmm0, (BO)
  1737. addq $2*SIZE,BO1
  1738. addq $2*SIZE,BO
  1739. decq %rax
  1740. jnz .L2_02b
  1741. .L2_02c:
  1742. movq BO1, B // next offset of B
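/* For the 2-column tail the packed B panel already has the 2-values-per-k
 * layout the 8x2 kernel consumes, so the loop above is a plain streaming
 * copy into the aligned local buffer (presumably for alignment/locality):
 *     for (k = 0; k < K; k++) { buf[2*k] = b[2*k]; buf[2*k+1] = b[2*k+1]; }
 */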
  1743. .L2_10:
  1744. movq C, CO1
  1745. leaq (C, LDC, 2), C // c += 2 * ldc
  1746. movq A, AO // aoffset = a
  1747. addq $16 * SIZE, AO
  1748. movq M, I
  1749. sarq $3, I // i = (m >> 3)
  1750. je .L2_20
  1751. ALIGN_4
  1752. .L2_11:
  1753. leaq BUFFER1, BO // first buffer to BO
  1754. addq $4 * SIZE, BO
  1755. vzeroall
  1756. movq K, %rax
  1757. andq $-8, %rax // K = K - ( K % 8 )
  1758. je .L2_16
  1759. movq %rax, BI // Index for BO
  1760. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1761. salq $3, %rax // rax = rax * 8 ; number of values
  1762. leaq (AO, %rax, 8), AO
  1763. leaq (BO, BI, 8), BO
  1764. negq BI
  1765. negq %rax
  1766. ALIGN_4
  1767. .L2_12:
  1768. KERNEL8x2_1(xxx)
  1769. KERNEL8x2_2(xxx)
  1770. KERNEL8x2_3(xxx)
  1771. KERNEL8x2_4(xxx)
  1772. KERNEL8x2_1(xxx)
  1773. KERNEL8x2_2(xxx)
  1774. KERNEL8x2_3(xxx)
  1775. KERNEL8x2_4(xxx)
  1776. je .L2_16
  1777. KERNEL8x2_1(xxx)
  1778. KERNEL8x2_2(xxx)
  1779. KERNEL8x2_3(xxx)
  1780. KERNEL8x2_4(xxx)
  1781. KERNEL8x2_1(xxx)
  1782. KERNEL8x2_2(xxx)
  1783. KERNEL8x2_3(xxx)
  1784. KERNEL8x2_4(xxx)
  1785. je .L2_16
  1786. jmp .L2_12
  1787. ALIGN_4
  1788. .L2_16:
  1789. movq K, %rax
1790. andq $7, %rax # if (k & 7)
  1791. je .L2_19
  1792. movq %rax, BI // Index for BO
  1793. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1794. salq $3, %rax // rax = rax * 8 ; number of values
  1795. leaq (AO, %rax, 8), AO
  1796. leaq (BO, BI, 8), BO
  1797. negq BI
  1798. negq %rax
  1799. ALIGN_4
  1800. .L2_17:
  1801. KERNEL8x2_SUB(xxx)
  1802. addq $2, BI
  1803. addq $8, %rax
  1804. jl .L2_17
  1805. ALIGN_4
  1806. .L2_19:
  1807. vmovddup ALPHA, %xmm0
  1808. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1809. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1810. vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  1811. vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  1812. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1813. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1814. vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  1815. vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  1816. vmovups %xmm4 , (CO1)
  1817. vmovups %xmm7 , 2 * SIZE(CO1)
  1818. vmovups %xmm10, 4 * SIZE(CO1)
  1819. vmovups %xmm13, 6 * SIZE(CO1)
  1820. vmovups %xmm5 , (CO1, LDC)
  1821. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  1822. vmovups %xmm11, 4 * SIZE(CO1, LDC)
  1823. vmovups %xmm14, 6 * SIZE(CO1, LDC)
  1824. addq $8 * SIZE, CO1 # coffset += 8
  1825. decq I # i --
  1826. jg .L2_11
  1827. ALIGN_4
  1828. /**************************************************************************
  1829. * Rest of M
  1830. ***************************************************************************/
  1831. .L2_20:
  1832. // Test rest of M
  1833. testq $7, M
  1834. jz .L2_60 // to next 2 lines of N
  1835. testq $4, M
  1836. jz .L2_30
  1837. ALIGN_4
  1838. .L2_21:
  1839. leaq BUFFER1, BO // first buffer to BO
  1840. addq $4 * SIZE, BO
  1841. vzeroall
  1842. movq K, %rax
  1843. andq $-8, %rax
  1844. je .L2_26
  1845. movq %rax, BI // Index for BO
  1846. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1847. salq $2, %rax // rax = rax * 4 ; number of values
  1848. leaq (AO, %rax, 8), AO
  1849. leaq (BO, BI, 8), BO
  1850. negq BI
  1851. negq %rax
  1852. ALIGN_4
  1853. .L2_22:
  1854. KERNEL4x2_1(xxx)
  1855. KERNEL4x2_2(xxx)
  1856. KERNEL4x2_3(xxx)
  1857. KERNEL4x2_4(xxx)
  1858. KERNEL4x2_1(xxx)
  1859. KERNEL4x2_2(xxx)
  1860. KERNEL4x2_3(xxx)
  1861. KERNEL4x2_4(xxx)
  1862. je .L2_26
  1863. KERNEL4x2_1(xxx)
  1864. KERNEL4x2_2(xxx)
  1865. KERNEL4x2_3(xxx)
  1866. KERNEL4x2_4(xxx)
  1867. KERNEL4x2_1(xxx)
  1868. KERNEL4x2_2(xxx)
  1869. KERNEL4x2_3(xxx)
  1870. KERNEL4x2_4(xxx)
  1871. je .L2_26
  1872. jmp .L2_22
  1873. ALIGN_4
  1874. .L2_26:
  1875. movq K, %rax
1876. andq $7, %rax # if (k & 7)
  1877. je .L2_29
  1878. movq %rax, BI // Index for BO
  1879. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1880. salq $2, %rax // rax = rax * 4 ; number of values
  1881. leaq (AO, %rax, 8), AO
  1882. leaq (BO, BI, 8), BO
  1883. negq BI
  1884. negq %rax
  1885. ALIGN_4
  1886. .L2_27:
  1887. KERNEL4x2_SUB(xxx)
  1888. addq $2, BI
  1889. addq $4, %rax
  1890. jl .L2_27
  1891. ALIGN_4
  1892. .L2_29:
  1893. vmovddup ALPHA, %xmm0
  1894. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1895. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  1896. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1897. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  1898. vmovups %xmm4 , (CO1)
  1899. vmovups %xmm7 , 2 * SIZE(CO1)
  1900. vmovups %xmm5 , (CO1, LDC)
  1901. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  1902. addq $4 * SIZE, CO1 # coffset += 4
  1903. ALIGN_4
  1904. .L2_30:
  1905. testq $2, M
  1906. jz .L2_40
  1907. ALIGN_4
  1908. .L2_31:
  1909. leaq BUFFER1, BO // first buffer to BO
  1910. addq $4 * SIZE, BO
  1911. vzeroall
  1912. movq K, %rax
  1913. andq $-8, %rax
  1914. je .L2_36
  1915. movq %rax, BI // Index for BO
  1916. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1917. salq $1, %rax // rax = rax *2 ; number of values
  1918. leaq (AO, %rax, 8), AO
  1919. leaq (BO, BI, 8), BO
  1920. negq BI
  1921. negq %rax
  1922. ALIGN_4
  1923. .L2_32:
  1924. KERNEL2x2_1(xxx)
  1925. KERNEL2x2_2(xxx)
  1926. KERNEL2x2_3(xxx)
  1927. KERNEL2x2_4(xxx)
  1928. KERNEL2x2_1(xxx)
  1929. KERNEL2x2_2(xxx)
  1930. KERNEL2x2_3(xxx)
  1931. KERNEL2x2_4(xxx)
  1932. je .L2_36
  1933. KERNEL2x2_1(xxx)
  1934. KERNEL2x2_2(xxx)
  1935. KERNEL2x2_3(xxx)
  1936. KERNEL2x2_4(xxx)
  1937. KERNEL2x2_1(xxx)
  1938. KERNEL2x2_2(xxx)
  1939. KERNEL2x2_3(xxx)
  1940. KERNEL2x2_4(xxx)
  1941. je .L2_36
  1942. jmp .L2_32
  1943. ALIGN_4
  1944. .L2_36:
  1945. movq K, %rax
1946. andq $7, %rax # if (k & 7)
  1947. je .L2_39
  1948. movq %rax, BI // Index for BO
  1949. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1950. salq $1, %rax // rax = rax *2 ; number of values
  1951. leaq (AO, %rax, 8), AO
  1952. leaq (BO, BI, 8), BO
  1953. negq BI
  1954. negq %rax
  1955. ALIGN_4
  1956. .L2_37:
  1957. KERNEL2x2_SUB(xxx)
  1958. addq $2, BI
  1959. addq $2, %rax
  1960. jl .L2_37
  1961. ALIGN_4
  1962. .L2_39:
  1963. vmovddup ALPHA, %xmm0
  1964. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  1965. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  1966. vmovups %xmm4 , (CO1)
  1967. vmovups %xmm5 , (CO1, LDC)
  1968. addq $2 * SIZE, CO1 # coffset += 2
  1969. ALIGN_4
  1970. .L2_40:
  1971. testq $1, M
  1972. jz .L2_60 // to next 2 lines of N
  1973. ALIGN_4
  1974. .L2_41:
  1975. leaq BUFFER1, BO // first buffer to BO
  1976. addq $4 * SIZE, BO
  1977. vzeroall
  1978. movq K, %rax
  1979. andq $-8, %rax
  1980. je .L2_46
  1981. movq %rax, BI // Index for BO
  1982. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  1983. leaq (AO, %rax, 8), AO
  1984. leaq (BO, BI, 8), BO
  1985. negq BI
  1986. negq %rax
  1987. ALIGN_4
  1988. .L2_42:
  1989. KERNEL1x2_1(xxx)
  1990. KERNEL1x2_2(xxx)
  1991. KERNEL1x2_3(xxx)
  1992. KERNEL1x2_4(xxx)
  1993. KERNEL1x2_1(xxx)
  1994. KERNEL1x2_2(xxx)
  1995. KERNEL1x2_3(xxx)
  1996. KERNEL1x2_4(xxx)
  1997. je .L2_46
  1998. KERNEL1x2_1(xxx)
  1999. KERNEL1x2_2(xxx)
  2000. KERNEL1x2_3(xxx)
  2001. KERNEL1x2_4(xxx)
  2002. KERNEL1x2_1(xxx)
  2003. KERNEL1x2_2(xxx)
  2004. KERNEL1x2_3(xxx)
  2005. KERNEL1x2_4(xxx)
  2006. je .L2_46
  2007. jmp .L2_42
  2008. ALIGN_4
  2009. .L2_46:
  2010. movq K, %rax
2011. andq $7, %rax # if (k & 7)
  2012. je .L2_49
  2013. movq %rax, BI // Index for BO
  2014. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2015. leaq (AO, %rax, 8), AO
  2016. leaq (BO, BI, 8), BO
  2017. negq BI
  2018. negq %rax
  2019. ALIGN_4
  2020. .L2_47:
  2021. KERNEL1x2_SUB(xxx)
  2022. addq $2, BI
  2023. addq $1, %rax
  2024. jl .L2_47
  2025. ALIGN_4
  2026. .L2_49:
  2027. vmovddup ALPHA, %xmm0
  2028. vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
  2029. vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5
  2030. vmovsd %xmm4 , (CO1)
  2031. vmovsd %xmm5 , (CO1, LDC)
  2032. addq $1 * SIZE, CO1 # coffset += 1
  2033. ALIGN_4
  2034. .L2_60:
  2035. decq J // j --
  2036. jg .L2_01 // next 2 lines of N
  2037. .L1_0:
  2038. /************************************************************************************************
  2039. * Loop for Nmod6 % 2 > 0
  2040. *************************************************************************************************/
  2041. movq Nmod6, J
  2042. andq $1, J // j % 2
  2043. je .L999
  2044. ALIGN_4
  2045. .L1_01:
  2046. // copy to sub buffer
  2047. movq B, BO1
  2048. leaq BUFFER1, BO // first buffer to BO
  2049. movq K, %rax
  2050. ALIGN_4
  2051. .L1_02b:
  2052. vmovsd (BO1), %xmm0
  2053. vmovsd %xmm0, (BO)
  2054. addq $1*SIZE,BO1
  2055. addq $1*SIZE,BO
  2056. decq %rax
  2057. jnz .L1_02b
  2058. .L1_02c:
  2059. movq BO1, B // next offset of B
  2060. .L1_10:
  2061. movq C, CO1
  2062. leaq (C, LDC, 1), C // c += 1 * ldc
  2063. movq A, AO // aoffset = a
  2064. addq $16 * SIZE, AO
  2065. movq M, I
  2066. sarq $3, I // i = (m >> 3)
  2067. je .L1_20
  2068. ALIGN_4
  2069. .L1_11:
  2070. leaq BUFFER1, BO // first buffer to BO
  2071. addq $2 * SIZE, BO
  2072. vzeroall
  2073. movq K, %rax
  2074. andq $-8, %rax // K = K - ( K % 8 )
  2075. je .L1_16
  2076. movq %rax, BI // Index for BO
  2077. salq $3, %rax // rax = rax * 8 ; number of values
  2078. leaq (AO, %rax, 8), AO
  2079. leaq (BO, BI, 8), BO
  2080. negq BI
  2081. negq %rax
  2082. ALIGN_4
  2083. .L1_12:
  2084. KERNEL8x1_1(xxx)
  2085. KERNEL8x1_2(xxx)
  2086. KERNEL8x1_3(xxx)
  2087. KERNEL8x1_4(xxx)
  2088. KERNEL8x1_1(xxx)
  2089. KERNEL8x1_2(xxx)
  2090. KERNEL8x1_3(xxx)
  2091. KERNEL8x1_4(xxx)
  2092. je .L1_16
  2093. KERNEL8x1_1(xxx)
  2094. KERNEL8x1_2(xxx)
  2095. KERNEL8x1_3(xxx)
  2096. KERNEL8x1_4(xxx)
  2097. KERNEL8x1_1(xxx)
  2098. KERNEL8x1_2(xxx)
  2099. KERNEL8x1_3(xxx)
  2100. KERNEL8x1_4(xxx)
  2101. je .L1_16
  2102. jmp .L1_12
  2103. ALIGN_4
  2104. .L1_16:
  2105. movq K, %rax
2106. andq $7, %rax # if (k & 7)
  2107. je .L1_19
  2108. movq %rax, BI // Index for BO
  2109. salq $3, %rax // rax = rax * 8 ; number of values
  2110. leaq (AO, %rax, 8), AO
  2111. leaq (BO, BI, 8), BO
  2112. negq BI
  2113. negq %rax
  2114. ALIGN_4
  2115. .L1_17:
  2116. KERNEL8x1_SUB(xxx)
  2117. addq $1, BI
  2118. addq $8, %rax
  2119. jl .L1_17
  2120. ALIGN_4
  2121. .L1_19:
  2122. vmovddup ALPHA, %xmm0
  2123. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2124. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2125. vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  2126. vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  2127. vmovups %xmm4 , (CO1)
  2128. vmovups %xmm7 , 2 * SIZE(CO1)
  2129. vmovups %xmm10, 4 * SIZE(CO1)
  2130. vmovups %xmm13, 6 * SIZE(CO1)
  2131. addq $8 * SIZE, CO1 # coffset += 8
  2132. decq I # i --
  2133. jg .L1_11
  2134. ALIGN_4
  2135. /**************************************************************************
  2136. * Rest of M
  2137. ***************************************************************************/
  2138. .L1_20:
  2139. // Test rest of M
  2140. testq $7, M
  2141. jz .L999
  2142. testq $4, M
  2143. jz .L1_30
  2144. ALIGN_4
  2145. .L1_21:
  2146. leaq BUFFER1, BO // first buffer to BO
  2147. addq $2 * SIZE, BO
  2148. vzeroall
  2149. movq K, %rax
  2150. andq $-8, %rax
  2151. je .L1_26
  2152. movq %rax, BI // Index for BO
  2153. salq $2, %rax // rax = rax * 4 ; number of values
  2154. leaq (AO, %rax, 8), AO
  2155. leaq (BO, BI, 8), BO
  2156. negq BI
  2157. negq %rax
  2158. ALIGN_4
  2159. .L1_22:
  2160. KERNEL4x1_1(xxx)
  2161. KERNEL4x1_2(xxx)
  2162. KERNEL4x1_3(xxx)
  2163. KERNEL4x1_4(xxx)
  2164. KERNEL4x1_1(xxx)
  2165. KERNEL4x1_2(xxx)
  2166. KERNEL4x1_3(xxx)
  2167. KERNEL4x1_4(xxx)
  2168. je .L1_26
  2169. KERNEL4x1_1(xxx)
  2170. KERNEL4x1_2(xxx)
  2171. KERNEL4x1_3(xxx)
  2172. KERNEL4x1_4(xxx)
  2173. KERNEL4x1_1(xxx)
  2174. KERNEL4x1_2(xxx)
  2175. KERNEL4x1_3(xxx)
  2176. KERNEL4x1_4(xxx)
  2177. je .L1_26
  2178. jmp .L1_22
  2179. ALIGN_4
  2180. .L1_26:
  2181. movq K, %rax
2182. andq $7, %rax # if (k & 7)
  2183. je .L1_29
  2184. movq %rax, BI // Index for BO
  2185. salq $2, %rax // rax = rax * 4 ; number of values
  2186. leaq (AO, %rax, 8), AO
  2187. leaq (BO, BI, 8), BO
  2188. negq BI
  2189. negq %rax
  2190. ALIGN_4
  2191. .L1_27:
  2192. KERNEL4x1_SUB(xxx)
  2193. addq $1, BI
  2194. addq $4, %rax
  2195. jl .L1_27
  2196. ALIGN_4
  2197. .L1_29:
  2198. vmovddup ALPHA, %xmm0
  2199. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2200. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2201. vmovups %xmm4 , (CO1)
  2202. vmovups %xmm7 , 2 * SIZE(CO1)
  2203. addq $4 * SIZE, CO1 # coffset += 4
  2204. ALIGN_4
  2205. .L1_30:
  2206. testq $2, M
  2207. jz .L1_40
  2208. ALIGN_4
  2209. .L1_31:
  2210. leaq BUFFER1, BO // first buffer to BO
  2211. addq $2 * SIZE, BO
  2212. vzeroall
  2213. movq K, %rax
  2214. andq $-8, %rax
  2215. je .L1_36
  2216. movq %rax, BI // Index for BO
  2217. salq $1, %rax // rax = rax *2 ; number of values
  2218. leaq (AO, %rax, 8), AO
  2219. leaq (BO, BI, 8), BO
  2220. negq BI
  2221. negq %rax
  2222. ALIGN_4
  2223. .L1_32:
  2224. KERNEL2x1_1(xxx)
  2225. KERNEL2x1_2(xxx)
  2226. KERNEL2x1_3(xxx)
  2227. KERNEL2x1_4(xxx)
  2228. KERNEL2x1_1(xxx)
  2229. KERNEL2x1_2(xxx)
  2230. KERNEL2x1_3(xxx)
  2231. KERNEL2x1_4(xxx)
  2232. je .L1_36
  2233. KERNEL2x1_1(xxx)
  2234. KERNEL2x1_2(xxx)
  2235. KERNEL2x1_3(xxx)
  2236. KERNEL2x1_4(xxx)
  2237. KERNEL2x1_1(xxx)
  2238. KERNEL2x1_2(xxx)
  2239. KERNEL2x1_3(xxx)
  2240. KERNEL2x1_4(xxx)
  2241. je .L1_36
  2242. jmp .L1_32
  2243. ALIGN_4
  2244. .L1_36:
  2245. movq K, %rax
2246. andq $7, %rax # if (k & 7)
  2247. je .L1_39
  2248. movq %rax, BI // Index for BO
  2249. salq $1, %rax // rax = rax *2 ; number of values
  2250. leaq (AO, %rax, 8), AO
  2251. leaq (BO, BI, 8), BO
  2252. negq BI
  2253. negq %rax
  2254. ALIGN_4
  2255. .L1_37:
  2256. KERNEL2x1_SUB(xxx)
  2257. addq $1, BI
  2258. addq $2, %rax
  2259. jl .L1_37
  2260. ALIGN_4
  2261. .L1_39:
  2262. vmovddup ALPHA, %xmm0
  2263. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2264. vmovups %xmm4 , (CO1)
  2265. addq $2 * SIZE, CO1 # coffset += 2
  2266. ALIGN_4
  2267. .L1_40:
  2268. testq $1, M
  2269. jz .L999
  2270. ALIGN_4
  2271. .L1_41:
  2272. leaq BUFFER1, BO // first buffer to BO
  2273. addq $2 * SIZE, BO
  2274. vzeroall
  2275. movq K, %rax
  2276. andq $-8, %rax
  2277. je .L1_46
  2278. movq %rax, BI // Index for BO
  2279. leaq (AO, %rax, 8), AO
  2280. leaq (BO, BI, 8), BO
  2281. negq BI
  2282. negq %rax
  2283. ALIGN_4
  2284. .L1_42:
  2285. KERNEL1x1_1(xxx)
  2286. KERNEL1x1_2(xxx)
  2287. KERNEL1x1_3(xxx)
  2288. KERNEL1x1_4(xxx)
  2289. KERNEL1x1_1(xxx)
  2290. KERNEL1x1_2(xxx)
  2291. KERNEL1x1_3(xxx)
  2292. KERNEL1x1_4(xxx)
  2293. je .L1_46
  2294. KERNEL1x1_1(xxx)
  2295. KERNEL1x1_2(xxx)
  2296. KERNEL1x1_3(xxx)
  2297. KERNEL1x1_4(xxx)
  2298. KERNEL1x1_1(xxx)
  2299. KERNEL1x1_2(xxx)
  2300. KERNEL1x1_3(xxx)
  2301. KERNEL1x1_4(xxx)
  2302. je .L1_46
  2303. jmp .L1_42
  2304. ALIGN_4
  2305. .L1_46:
  2306. movq K, %rax
2307. andq $7, %rax # if (k & 7)
  2308. je .L1_49
  2309. movq %rax, BI // Index for BO
  2310. leaq (AO, %rax, 8), AO
  2311. leaq (BO, BI, 8), BO
  2312. negq BI
  2313. negq %rax
  2314. ALIGN_4
  2315. .L1_47:
  2316. KERNEL1x1_SUB(xxx)
  2317. addq $1, BI
  2318. addq $1, %rax
  2319. jl .L1_47
  2320. ALIGN_4
  2321. .L1_49:
  2322. vmovddup ALPHA, %xmm0
  2323. vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
  2324. vmovsd %xmm4 , (CO1)
  2325. addq $1 * SIZE, CO1 # coffset += 1
  2326. ALIGN_4
  2327. .L999:
  2328. movq SP, %rsp
  2329. movq (%rsp), %rbx
  2330. movq 8(%rsp), %rbp
  2331. movq 16(%rsp), %r12
  2332. movq 24(%rsp), %r13
  2333. movq 32(%rsp), %r14
  2334. movq 40(%rsp), %r15
  2335. #ifdef WINDOWS_ABI
  2336. movq 48(%rsp), %rdi
  2337. movq 56(%rsp), %rsi
  2338. movups 64(%rsp), %xmm6
  2339. movups 80(%rsp), %xmm7
  2340. movups 96(%rsp), %xmm8
  2341. movups 112(%rsp), %xmm9
  2342. movups 128(%rsp), %xmm10
  2343. movups 144(%rsp), %xmm11
  2344. movups 160(%rsp), %xmm12
  2345. movups 176(%rsp), %xmm13
  2346. movups 192(%rsp), %xmm14
  2347. movups 208(%rsp), %xmm15
  2348. #endif
  2349. addq $STACKSIZE, %rsp
  2350. ret
  2351. EPILOGUE
  2352. #else
  2353. /*************************************************************************************
  2354. * TRMM Kernel
  2355. *************************************************************************************/
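/* The TRMM body below mirrors the GEMM body above, with three changes:
 * N is blocked by 2 (and 1) only, the KK / KKK bookkeeping restricts each
 * tile to the k-range that actually overlaps the triangular factor, and
 * the store path overwrites C (C = alpha * acc) instead of accumulating
 * into it, since TRMM has no beta/accumulate term.
 */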
  2356. PROLOGUE
  2357. PROFCODE
  2358. subq $STACKSIZE, %rsp
  2359. movq %rbx, (%rsp)
  2360. movq %rbp, 8(%rsp)
  2361. movq %r12, 16(%rsp)
  2362. movq %r13, 24(%rsp)
  2363. movq %r14, 32(%rsp)
  2364. movq %r15, 40(%rsp)
  2365. vzeroupper
  2366. #ifdef WINDOWS_ABI
  2367. movq %rdi, 48(%rsp)
  2368. movq %rsi, 56(%rsp)
  2369. movups %xmm6, 64(%rsp)
  2370. movups %xmm7, 80(%rsp)
  2371. movups %xmm8, 96(%rsp)
  2372. movups %xmm9, 112(%rsp)
  2373. movups %xmm10, 128(%rsp)
  2374. movups %xmm11, 144(%rsp)
  2375. movups %xmm12, 160(%rsp)
  2376. movups %xmm13, 176(%rsp)
  2377. movups %xmm14, 192(%rsp)
  2378. movups %xmm15, 208(%rsp)
  2379. movq ARG1, OLD_M
  2380. movq ARG2, OLD_N
  2381. movq ARG3, OLD_K
  2382. movq OLD_A, A
  2383. movq OLD_B, B
  2384. movq OLD_C, C
  2385. movq OLD_LDC, LDC
  2386. #ifdef TRMMKERNEL
  2387. movsd OLD_OFFSET, %xmm12
  2388. #endif
  2389. vmovaps %xmm3, %xmm0
  2390. #else
  2391. movq STACKSIZE + 8(%rsp), LDC
  2392. #ifdef TRMMKERNEL
  2393. movsd STACKSIZE + 16(%rsp), %xmm12
  2394. #endif
  2395. #endif
  2396. movq %rsp, SP # save old stack
  2397. subq $128 + L_BUFFER_SIZE, %rsp
  2398. andq $-4096, %rsp # align stack
  2399. STACK_TOUCH
  2400. cmpq $0, OLD_M
  2401. je .L999
  2402. cmpq $0, OLD_N
  2403. je .L999
  2404. cmpq $0, OLD_K
  2405. je .L999
  2406. movq OLD_M, M
  2407. movq OLD_N, N
  2408. movq OLD_K, K
  2409. vmovsd %xmm0, ALPHA
  2410. salq $BASE_SHIFT, LDC
  2411. movq N, %rax
  2412. xorq %rdx, %rdx
  2413. movq $2, %rdi
  2414. divq %rdi // N / 2
  2415. movq %rax, Ndiv6 // N / 2
  2416. movq %rdx, Nmod6 // N % 2
  2417. #ifdef TRMMKERNEL
  2418. vmovsd %xmm12, OFFSET
  2419. vmovsd %xmm12, KK
  2420. #ifndef LEFT
  2421. negq KK
  2422. #endif
  2423. #endif
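/* TRMM offset handling: OFFSET seeds KK.  With LEFT not defined, KK starts
 * negated and is advanced by the number of B columns consumed per block
 * (addq $2, KK at .L2_60); with LEFT defined, KK is reloaded from OFFSET at
 * each column block and advanced by the height of each row tile instead.
 * KK is what the per-tile KKK computation below is based on.
 */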
  2424. movq Ndiv6, J
  2425. cmpq $0, J
  2426. je .L1_0
  2427. ALIGN_4
  2428. .L2_0:
  2429. .L2_01:
  2430. // copy to sub buffer
  2431. movq B, BO1
  2432. leaq BUFFER1, BO // first buffer to BO
  2433. movq K, %rax
  2434. ALIGN_4
  2435. .L2_02b:
  2436. vmovups (BO1), %xmm0
  2437. vmovups %xmm0, (BO)
  2438. addq $2*SIZE,BO1
  2439. addq $2*SIZE,BO
  2440. decq %rax
  2441. jnz .L2_02b
  2442. .L2_02c:
  2443. movq BO1, B // next offset of B
  2444. .L2_10:
  2445. movq C, CO1
  2446. leaq (C, LDC, 2), C // c += 2 * ldc
  2447. #if defined(TRMMKERNEL) && defined(LEFT)
  2448. movq OFFSET, %rax
  2449. movq %rax, KK
  2450. #endif
  2451. movq A, AO // aoffset = a
  2452. addq $16 * SIZE, AO
  2453. movq M, I
  2454. sarq $3, I // i = (m >> 3)
  2455. je .L2_20
  2456. ALIGN_4
  2457. .L2_11:
  2458. #if !defined(TRMMKERNEL) || \
  2459. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2460. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2461. leaq BUFFER1, BO // first buffer to BO
  2462. addq $4 * SIZE, BO
  2463. #else
  2464. movq KK, %rax
  2465. leaq BUFFER1, BO // first buffer to BO
  2466. addq $4 * SIZE, BO
  2467. movq %rax, BI // Index for BO
  2468. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2469. leaq (BO, BI, 8), BO
  2470. salq $3, %rax // rax = rax * 8 ; number of values
  2471. leaq (AO, %rax, 8), AO
  2472. #endif
  2473. vzeroall
  2474. #ifndef TRMMKERNEL
  2475. movq K, %rax
  2476. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2477. movq K, %rax
  2478. subq KK, %rax
  2479. movq %rax, KKK
  2480. #else
  2481. movq KK, %rax
  2482. #ifdef LEFT
  2483. addq $8, %rax // number of values in AO
  2484. #else
  2485. addq $2, %rax // number of values in BO
  2486. #endif
  2487. movq %rax, KKK
  2488. #endif
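/* TRMM inner length: KKK is the number of k-steps this 8x2 tile really
 * contributes, derived from KK.  In C terms (illustrative, following the
 * preprocessor cases above):
 *
 *     if (LEFT != TRANSA) kkk = K - kk;               // trailing part of panel
 *     else                kkk = kk + (LEFT ? 8 : 2);  // leading part
 *
 * The loop bounds below then use kkk in place of K.
 */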
  2489. andq $-8, %rax // K = K - ( K % 8 )
  2490. je .L2_16
  2491. movq %rax, BI // Index for BO
  2492. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2493. salq $3, %rax // rax = rax * 8 ; number of values
  2494. leaq (AO, %rax, 8), AO
  2495. leaq (BO, BI, 8), BO
  2496. negq BI
  2497. negq %rax
  2498. ALIGN_4
  2499. .L2_12:
  2500. KERNEL8x2_1(xxx)
  2501. KERNEL8x2_2(xxx)
  2502. KERNEL8x2_3(xxx)
  2503. KERNEL8x2_4(xxx)
  2504. KERNEL8x2_1(xxx)
  2505. KERNEL8x2_2(xxx)
  2506. KERNEL8x2_3(xxx)
  2507. KERNEL8x2_4(xxx)
  2508. je .L2_16
  2509. KERNEL8x2_1(xxx)
  2510. KERNEL8x2_2(xxx)
  2511. KERNEL8x2_3(xxx)
  2512. KERNEL8x2_4(xxx)
  2513. KERNEL8x2_1(xxx)
  2514. KERNEL8x2_2(xxx)
  2515. KERNEL8x2_3(xxx)
  2516. KERNEL8x2_4(xxx)
  2517. je .L2_16
  2518. jmp .L2_12
  2519. ALIGN_4
  2520. .L2_16:
  2521. #ifndef TRMMKERNEL
  2522. movq K, %rax
  2523. #else
  2524. movq KKK, %rax
  2525. #endif
2526. andq $7, %rax # if (k & 7)
  2527. je .L2_19
  2528. movq %rax, BI // Index for BO
  2529. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2530. salq $3, %rax // rax = rax * 8 ; number of values
  2531. leaq (AO, %rax, 8), AO
  2532. leaq (BO, BI, 8), BO
  2533. negq BI
  2534. negq %rax
  2535. ALIGN_4
  2536. .L2_17:
  2537. KERNEL8x2_SUB(xxx)
  2538. addq $2, BI
  2539. addq $8, %rax
  2540. jl .L2_17
  2541. ALIGN_4
  2542. .L2_19:
  2543. vmovddup ALPHA, %xmm0
  2544. #ifndef TRMMKERNEL
  2545. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2546. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2547. vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
  2548. vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13
  2549. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  2550. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  2551. vfmaddpd 4 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11
  2552. vfmaddpd 6 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14
  2553. #else
  2554. vmulpd %xmm0, %xmm4,%xmm4
  2555. vmulpd %xmm0, %xmm7,%xmm7
  2556. vmulpd %xmm0, %xmm10,%xmm10
  2557. vmulpd %xmm0, %xmm13,%xmm13
  2558. vmulpd %xmm0, %xmm5,%xmm5
  2559. vmulpd %xmm0, %xmm8,%xmm8
  2560. vmulpd %xmm0, %xmm11,%xmm11
  2561. vmulpd %xmm0, %xmm14,%xmm14
  2562. #endif
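/* GEMM vs. TRMM store: the GEMM path loads C and accumulates
 * (C = alpha * acc + C, one vfmaddpd per pair of elements), while the
 * TRMM path overwrites C with alpha * acc (plain vmulpd), because TRMM
 * has no accumulate term.  In C (illustrative):
 *
 *     c_out = TRMMKERNEL ? alpha * acc : alpha * acc + c_in;
 */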
  2563. vmovups %xmm4 , (CO1)
  2564. vmovups %xmm7 , 2 * SIZE(CO1)
  2565. vmovups %xmm10, 4 * SIZE(CO1)
  2566. vmovups %xmm13, 6 * SIZE(CO1)
  2567. vmovups %xmm5 , (CO1, LDC)
  2568. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  2569. vmovups %xmm11, 4 * SIZE(CO1, LDC)
  2570. vmovups %xmm14, 6 * SIZE(CO1, LDC)
  2571. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2572. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2573. movq K, %rax
  2574. subq KKK, %rax
  2575. movq %rax, BI // Index for BO
  2576. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2577. leaq (BO, BI, 8), BO
  2578. salq $3, %rax // rax = rax * 8 ; number of values
  2579. leaq (AO, %rax, 8), AO
  2580. #endif
  2581. #if defined(TRMMKERNEL) && defined(LEFT)
  2582. addq $8, KK
  2583. #endif
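/* In the preprocessor cases above (the ones where the tile used only the
 * leading KKK k-steps), AO and BO are advanced past the K - KKK unused
 * steps so both point at the next packed panel; for the left-side case KK
 * then grows by the tile height (8 here) so the next row tile gets the
 * correct k-range.
 */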
  2584. addq $8 * SIZE, CO1 # coffset += 8
  2585. decq I # i --
  2586. jg .L2_11
  2587. ALIGN_4
  2588. /**************************************************************************
  2589. * Rest of M
  2590. ***************************************************************************/
  2591. .L2_20:
  2592. // Test rest of M
  2593. testq $7, M
  2594. jz .L2_60 // to next 2 lines of N
  2595. testq $4, M
  2596. jz .L2_30
  2597. ALIGN_4
  2598. .L2_21:
  2599. #if !defined(TRMMKERNEL) || \
  2600. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2601. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2602. leaq BUFFER1, BO // first buffer to BO
  2603. addq $4 * SIZE, BO
  2604. #else
  2605. movq KK, %rax
  2606. leaq BUFFER1, BO // first buffer to BO
  2607. addq $4 * SIZE, BO
  2608. movq %rax, BI // Index for BO
  2609. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2610. leaq (BO, BI, 8), BO
  2611. salq $2, %rax // rax = rax * 4 ; number of values
  2612. leaq (AO, %rax, 8), AO
  2613. #endif
  2614. vzeroall
  2615. #ifndef TRMMKERNEL
  2616. movq K, %rax
  2617. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2618. movq K, %rax
  2619. subq KK, %rax
  2620. movq %rax, KKK
  2621. #else
  2622. movq KK, %rax
  2623. #ifdef LEFT
  2624. addq $4, %rax // number of values in AO
  2625. #else
  2626. addq $2, %rax // number of values in BO
  2627. #endif
  2628. movq %rax, KKK
  2629. #endif
  2630. andq $-8, %rax
  2631. je .L2_26
  2632. movq %rax, BI // Index for BO
  2633. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2634. salq $2, %rax // rax = rax * 4 ; number of values
  2635. leaq (AO, %rax, 8), AO
  2636. leaq (BO, BI, 8), BO
  2637. negq BI
  2638. negq %rax
  2639. ALIGN_4
  2640. .L2_22:
  2641. KERNEL4x2_1(xxx)
  2642. KERNEL4x2_2(xxx)
  2643. KERNEL4x2_3(xxx)
  2644. KERNEL4x2_4(xxx)
  2645. KERNEL4x2_1(xxx)
  2646. KERNEL4x2_2(xxx)
  2647. KERNEL4x2_3(xxx)
  2648. KERNEL4x2_4(xxx)
  2649. je .L2_26
  2650. KERNEL4x2_1(xxx)
  2651. KERNEL4x2_2(xxx)
  2652. KERNEL4x2_3(xxx)
  2653. KERNEL4x2_4(xxx)
  2654. KERNEL4x2_1(xxx)
  2655. KERNEL4x2_2(xxx)
  2656. KERNEL4x2_3(xxx)
  2657. KERNEL4x2_4(xxx)
  2658. je .L2_26
  2659. jmp .L2_22
  2660. ALIGN_4
  2661. .L2_26:
  2662. #ifndef TRMMKERNEL
  2663. movq K, %rax
  2664. #else
  2665. movq KKK, %rax
  2666. #endif
2667. andq $7, %rax # if (k & 7)
  2668. je .L2_29
  2669. movq %rax, BI // Index for BO
  2670. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2671. salq $2, %rax // rax = rax * 4 ; number of values
  2672. leaq (AO, %rax, 8), AO
  2673. leaq (BO, BI, 8), BO
  2674. negq BI
  2675. negq %rax
  2676. ALIGN_4
  2677. .L2_27:
  2678. KERNEL4x2_SUB(xxx)
  2679. addq $2, BI
  2680. addq $4, %rax
  2681. jl .L2_27
  2682. ALIGN_4
  2683. .L2_29:
  2684. vmovddup ALPHA, %xmm0
  2685. #ifndef TRMMKERNEL
  2686. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2687. vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
  2688. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  2689. vfmaddpd 2 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8
  2690. #else
  2691. vmulpd %xmm0, %xmm4,%xmm4
  2692. vmulpd %xmm0, %xmm7,%xmm7
  2693. vmulpd %xmm0, %xmm5,%xmm5
  2694. vmulpd %xmm0, %xmm8,%xmm8
  2695. #endif
  2696. vmovups %xmm4 , (CO1)
  2697. vmovups %xmm7 , 2 * SIZE(CO1)
  2698. vmovups %xmm5 , (CO1, LDC)
  2699. vmovups %xmm8 , 2 * SIZE(CO1, LDC)
  2700. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2701. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2702. movq K, %rax
  2703. subq KKK, %rax
  2704. movq %rax, BI // Index for BO
  2705. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2706. leaq (BO, BI, 8), BO
  2707. salq $2, %rax // rax = rax * 4 ; number of values
  2708. leaq (AO, %rax, 8), AO
  2709. #endif
  2710. #if defined(TRMMKERNEL) && defined(LEFT)
  2711. addq $4, KK
  2712. #endif
  2713. addq $4 * SIZE, CO1 # coffset += 4
  2714. ALIGN_4
  2715. .L2_30:
  2716. testq $2, M
  2717. jz .L2_40
  2718. ALIGN_4
  2719. .L2_31:
  2720. #if !defined(TRMMKERNEL) || \
  2721. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2722. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2723. leaq BUFFER1, BO // first buffer to BO
  2724. addq $4 * SIZE, BO
  2725. #else
  2726. movq KK, %rax
  2727. leaq BUFFER1, BO // first buffer to BO
  2728. addq $4 * SIZE, BO
  2729. movq %rax, BI // Index for BO
  2730. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2731. leaq (BO, BI, 8), BO
  2732. salq $1, %rax // rax = rax * 2 ; number of values
  2733. leaq (AO, %rax, 8), AO
  2734. #endif
  2735. vzeroall
  2736. #ifndef TRMMKERNEL
  2737. movq K, %rax
  2738. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2739. movq K, %rax
  2740. subq KK, %rax
  2741. movq %rax, KKK
  2742. #else
  2743. movq KK, %rax
  2744. #ifdef LEFT
  2745. addq $2, %rax // number of values in AO
  2746. #else
  2747. addq $2, %rax // number of values in BO
  2748. #endif
  2749. movq %rax, KKK
  2750. #endif
  2751. andq $-8, %rax
  2752. je .L2_36
  2753. movq %rax, BI // Index for BO
  2754. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2755. salq $1, %rax // rax = rax *2 ; number of values
  2756. leaq (AO, %rax, 8), AO
  2757. leaq (BO, BI, 8), BO
  2758. negq BI
  2759. negq %rax
  2760. ALIGN_4
  2761. .L2_32:
  2762. KERNEL2x2_1(xxx)
  2763. KERNEL2x2_2(xxx)
  2764. KERNEL2x2_3(xxx)
  2765. KERNEL2x2_4(xxx)
  2766. KERNEL2x2_1(xxx)
  2767. KERNEL2x2_2(xxx)
  2768. KERNEL2x2_3(xxx)
  2769. KERNEL2x2_4(xxx)
  2770. je .L2_36
  2771. KERNEL2x2_1(xxx)
  2772. KERNEL2x2_2(xxx)
  2773. KERNEL2x2_3(xxx)
  2774. KERNEL2x2_4(xxx)
  2775. KERNEL2x2_1(xxx)
  2776. KERNEL2x2_2(xxx)
  2777. KERNEL2x2_3(xxx)
  2778. KERNEL2x2_4(xxx)
  2779. je .L2_36
  2780. jmp .L2_32
  2781. ALIGN_4
  2782. .L2_36:
  2783. #ifndef TRMMKERNEL
  2784. movq K, %rax
  2785. #else
  2786. movq KKK, %rax
  2787. #endif
2788. andq $7, %rax # if (k & 7)
  2789. je .L2_39
  2790. movq %rax, BI // Index for BO
  2791. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2792. salq $1, %rax // rax = rax *2 ; number of values
  2793. leaq (AO, %rax, 8), AO
  2794. leaq (BO, BI, 8), BO
  2795. negq BI
  2796. negq %rax
  2797. ALIGN_4
  2798. .L2_37:
  2799. KERNEL2x2_SUB(xxx)
  2800. addq $2, BI
  2801. addq $2, %rax
  2802. jl .L2_37
  2803. ALIGN_4
  2804. .L2_39:
  2805. vmovddup ALPHA, %xmm0
  2806. #ifndef TRMMKERNEL
  2807. vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
  2808. vfmaddpd (CO1, LDC),%xmm0, %xmm5,%xmm5
  2809. #else
  2810. vmulpd %xmm0, %xmm4,%xmm4
  2811. vmulpd %xmm0, %xmm5,%xmm5
  2812. #endif
  2813. vmovups %xmm4 , (CO1)
  2814. vmovups %xmm5 , (CO1, LDC)
  2815. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2816. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2817. movq K, %rax
  2818. subq KKK, %rax
  2819. movq %rax, BI // Index for BO
  2820. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2821. leaq (BO, BI, 8), BO
  2822. salq $1, %rax // rax = rax * 2 ; number of values
  2823. leaq (AO, %rax, 8), AO
  2824. #endif
  2825. #if defined(TRMMKERNEL) && defined(LEFT)
  2826. addq $2, KK
  2827. #endif
  2828. addq $2 * SIZE, CO1 # coffset += 2
  2829. ALIGN_4
  2830. .L2_40:
  2831. testq $1, M
  2832. jz .L2_60 // to next 2 lines of N
  2833. ALIGN_4
  2834. .L2_41:
  2835. #if !defined(TRMMKERNEL) || \
  2836. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2837. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2838. leaq BUFFER1, BO // first buffer to BO
  2839. addq $4 * SIZE, BO
  2840. #else
  2841. movq KK, %rax
  2842. leaq BUFFER1, BO // first buffer to BO
  2843. addq $4 * SIZE, BO
  2844. movq %rax, BI // Index for BO
  2845. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2846. leaq (BO, BI, 8), BO
  2847. leaq (AO, %rax, 8), AO
  2848. #endif
  2849. vzeroall
  2850. #ifndef TRMMKERNEL
  2851. movq K, %rax
  2852. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2853. movq K, %rax
  2854. subq KK, %rax
  2855. movq %rax, KKK
  2856. #else
  2857. movq KK, %rax
  2858. #ifdef LEFT
  2859. addq $1, %rax // number of values in AO
  2860. #else
  2861. addq $2, %rax // number of values in BO
  2862. #endif
  2863. movq %rax, KKK
  2864. #endif
  2865. andq $-8, %rax
  2866. je .L2_46
  2867. movq %rax, BI // Index for BO
  2868. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2869. leaq (AO, %rax, 8), AO
  2870. leaq (BO, BI, 8), BO
  2871. negq BI
  2872. negq %rax
  2873. ALIGN_4
  2874. .L2_42:
  2875. KERNEL1x2_1(xxx)
  2876. KERNEL1x2_2(xxx)
  2877. KERNEL1x2_3(xxx)
  2878. KERNEL1x2_4(xxx)
  2879. KERNEL1x2_1(xxx)
  2880. KERNEL1x2_2(xxx)
  2881. KERNEL1x2_3(xxx)
  2882. KERNEL1x2_4(xxx)
  2883. je .L2_46
  2884. KERNEL1x2_1(xxx)
  2885. KERNEL1x2_2(xxx)
  2886. KERNEL1x2_3(xxx)
  2887. KERNEL1x2_4(xxx)
  2888. KERNEL1x2_1(xxx)
  2889. KERNEL1x2_2(xxx)
  2890. KERNEL1x2_3(xxx)
  2891. KERNEL1x2_4(xxx)
  2892. je .L2_46
  2893. jmp .L2_42
  2894. ALIGN_4
  2895. .L2_46:
  2896. #ifndef TRMMKERNEL
  2897. movq K, %rax
  2898. #else
  2899. movq KKK, %rax
  2900. #endif
2901. andq $7, %rax # if (k & 7)
  2902. je .L2_49
  2903. movq %rax, BI // Index for BO
  2904. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2905. leaq (AO, %rax, 8), AO
  2906. leaq (BO, BI, 8), BO
  2907. negq BI
  2908. negq %rax
  2909. ALIGN_4
  2910. .L2_47:
  2911. KERNEL1x2_SUB(xxx)
  2912. addq $2, BI
  2913. addq $1, %rax
  2914. jl .L2_47
  2915. ALIGN_4
  2916. .L2_49:
  2917. vmovddup ALPHA, %xmm0
  2918. #ifndef TRMMKERNEL
  2919. vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
  2920. vfmaddsd (CO1, LDC),%xmm0, %xmm5,%xmm5
  2921. #else
  2922. vmulsd %xmm0, %xmm4,%xmm4
  2923. vmulsd %xmm0, %xmm5,%xmm5
  2924. #endif
  2925. vmovsd %xmm4 , (CO1)
  2926. vmovsd %xmm5 , (CO1, LDC)
  2927. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2928. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2929. movq K, %rax
  2930. subq KKK, %rax
  2931. movq %rax, BI // Index for BO
  2932. leaq (BI,BI,1), BI // BI = BI * 2 ; number of values
  2933. leaq (BO, BI, 8), BO
  2934. leaq (AO, %rax, 8), AO
  2935. #endif
  2936. #if defined(TRMMKERNEL) && defined(LEFT)
  2937. addq $1, KK
  2938. #endif
  2939. addq $1 * SIZE, CO1 # coffset += 1
  2940. ALIGN_4
  2941. .L2_60:
  2942. #if defined(TRMMKERNEL) && !defined(LEFT)
  2943. addq $2, KK
  2944. #endif
  2945. decq J // j --
  2946. jg .L2_01 // next 2 lines of N
  2947. .L1_0:
  2948. /************************************************************************************************
  2949. * Loop for Nmod6 % 2 > 0
  2950. *************************************************************************************************/
  2951. movq Nmod6, J
  2952. andq $1, J // j % 2
  2953. je .L999
  2954. ALIGN_4
  2955. .L1_01:
  2956. // copy to sub buffer
  2957. movq B, BO1
  2958. leaq BUFFER1, BO // first buffer to BO
  2959. movq K, %rax
  2960. ALIGN_4
  2961. .L1_02b:
  2962. vmovsd (BO1), %xmm0
  2963. vmovsd %xmm0, (BO)
  2964. addq $1*SIZE,BO1
  2965. addq $1*SIZE,BO
  2966. decq %rax
  2967. jnz .L1_02b
  2968. .L1_02c:
  2969. movq BO1, B // next offset of B
  2970. .L1_10:
  2971. movq C, CO1
  2972. leaq (C, LDC, 1), C // c += 1 * ldc
  2973. #if defined(TRMMKERNEL) && defined(LEFT)
  2974. movq OFFSET, %rax
  2975. movq %rax, KK
  2976. #endif
  2977. movq A, AO // aoffset = a
  2978. addq $16 * SIZE, AO
  2979. movq M, I
  2980. sarq $3, I // i = (m >> 3)
  2981. je .L1_20
  2982. ALIGN_4
  2983. .L1_11:
  2984. #if !defined(TRMMKERNEL) || \
  2985. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2986. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2987. leaq BUFFER1, BO // first buffer to BO
  2988. addq $2 * SIZE, BO
  2989. #else
  2990. movq KK, %rax
  2991. leaq BUFFER1, BO // first buffer to BO
  2992. addq $2 * SIZE, BO
  2993. movq %rax, BI // Index for BO
  2994. leaq (BO, BI, 8), BO
  2995. salq $3, %rax // rax = rax * 8 ; number of values
  2996. leaq (AO, %rax, 8), AO
  2997. #endif
  2998. vzeroall
  2999. #ifndef TRMMKERNEL
  3000. movq K, %rax
  3001. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  3002. movq K, %rax
  3003. subq KK, %rax
  3004. movq %rax, KKK
  3005. #else
  3006. movq KK, %rax
  3007. #ifdef LEFT
  3008. addq $8, %rax // number of values in AO
  3009. #else
  3010. addq $1, %rax // number of values in BO
  3011. #endif
  3012. movq %rax, KKK
  3013. #endif
  3014. andq $-8, %rax // K = K - ( K % 8 )
  3015. je .L1_16
  3016. movq %rax, BI // Index for BO
  3017. salq $3, %rax // rax = rax * 8 ; number of values
  3018. leaq (AO, %rax, 8), AO
  3019. leaq (BO, BI, 8), BO
  3020. negq BI
  3021. negq %rax
  3022. ALIGN_4
  3023. .L1_12:
  3024. KERNEL8x1_1(xxx)
  3025. KERNEL8x1_2(xxx)
  3026. KERNEL8x1_3(xxx)
  3027. KERNEL8x1_4(xxx)
  3028. KERNEL8x1_1(xxx)
  3029. KERNEL8x1_2(xxx)
  3030. KERNEL8x1_3(xxx)
  3031. KERNEL8x1_4(xxx)
  3032. je .L1_16
  3033. KERNEL8x1_1(xxx)
  3034. KERNEL8x1_2(xxx)
  3035. KERNEL8x1_3(xxx)
  3036. KERNEL8x1_4(xxx)
  3037. KERNEL8x1_1(xxx)
  3038. KERNEL8x1_2(xxx)
  3039. KERNEL8x1_3(xxx)
  3040. KERNEL8x1_4(xxx)
  3041. je .L1_16
  3042. jmp .L1_12
  3043. ALIGN_4
  3044. .L1_16:
  3045. #ifndef TRMMKERNEL
  3046. movq K, %rax
  3047. #else
  3048. movq KKK, %rax
  3049. #endif
3050. andq $7, %rax # if (k & 7)
  3051. je .L1_19
  3052. movq %rax, BI // Index for BO
  3053. salq $3, %rax // rax = rax * 8 ; number of values
  3054. leaq (AO, %rax, 8), AO
  3055. leaq (BO, BI, 8), BO
  3056. negq BI
  3057. negq %rax
  3058. ALIGN_4
  3059. .L1_17:
  3060. KERNEL8x1_SUB(xxx)
  3061. addq $1, BI
  3062. addq $8, %rax
  3063. jl .L1_17
  3064. ALIGN_4
.L1_19:

        vmovddup ALPHA, %xmm0

#ifndef TRMMKERNEL
        vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
        vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
        vfmaddpd 4 * SIZE(CO1),%xmm0, %xmm10,%xmm10
        vfmaddpd 6 * SIZE(CO1),%xmm0, %xmm13,%xmm13
#else
        vmulpd %xmm0, %xmm4,%xmm4
        vmulpd %xmm0, %xmm7,%xmm7
        vmulpd %xmm0, %xmm10,%xmm10
        vmulpd %xmm0, %xmm13,%xmm13
#endif

        vmovups %xmm4 , (CO1)
        vmovups %xmm7 , 2 * SIZE(CO1)
        vmovups %xmm10, 4 * SIZE(CO1)
        vmovups %xmm13, 6 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        salq $3, %rax // rax = rax * 8 ; number of values
        leaq (AO, %rax, 8), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $8, KK
#endif

        addq $8 * SIZE, CO1 # coffset += 8
        decq I # i --
        jg .L1_11
        ALIGN_4

/**************************************************************************
* Rest of M
***************************************************************************/
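/*
 * Remaining rows (M % 8): the blocks below handle 4, then 2, then 1 leftover
 * rows with progressively narrower micro-kernels.  Each block repeats the
 * structure of the 8x1 case: set up AO/BO (with TRMM offsets if enabled), run
 * the unrolled K loop, handle the K % 8 remainder, then scale by alpha and
 * store.
 */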
.L1_20:
        // Test rest of M

        testq $7, M
        jz .L999

        testq $4, M
        jz .L1_30

        ALIGN_4

.L1_21:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
#else
        movq KK, %rax
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        salq $2, %rax // rax = rax * 4 ; number of values
        leaq (AO, %rax, 8), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $4, %rax // number of values in AO
#else
        addq $1, %rax // number of values in BO
#endif
        movq %rax, KKK
#endif

        andq $-8, %rax
        je .L1_26

        movq %rax, BI // Index for BO
        salq $2, %rax // rax = rax * 4 ; number of values
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_22:

        KERNEL4x1_1(xxx)
        KERNEL4x1_2(xxx)
        KERNEL4x1_3(xxx)
        KERNEL4x1_4(xxx)
        KERNEL4x1_1(xxx)
        KERNEL4x1_2(xxx)
        KERNEL4x1_3(xxx)
        KERNEL4x1_4(xxx)

        je .L1_26

        KERNEL4x1_1(xxx)
        KERNEL4x1_2(xxx)
        KERNEL4x1_3(xxx)
        KERNEL4x1_4(xxx)
        KERNEL4x1_1(xxx)
        KERNEL4x1_2(xxx)
        KERNEL4x1_3(xxx)
        KERNEL4x1_4(xxx)

        je .L1_26
        jmp .L1_22
        ALIGN_4

.L1_26:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif

        andq $7, %rax # k % 8 remainder
        je .L1_29

        movq %rax, BI // Index for BO
        salq $2, %rax // rax = rax * 4 ; number of values
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_27:
        KERNEL4x1_SUB(xxx)

        addq $1, BI
        addq $4, %rax
        jl .L1_27
        ALIGN_4

.L1_29:

        vmovddup ALPHA, %xmm0

#ifndef TRMMKERNEL
        vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
        vfmaddpd 2 * SIZE(CO1),%xmm0, %xmm7,%xmm7
#else
        vmulpd %xmm0, %xmm4,%xmm4
        vmulpd %xmm0, %xmm7,%xmm7
#endif

        vmovups %xmm4 , (CO1)
        vmovups %xmm7 , 2 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        salq $2, %rax // rax = rax * 4 ; number of values
        leaq (AO, %rax, 8), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $4, KK
#endif

        addq $4 * SIZE, CO1 # coffset += 4
        ALIGN_4
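/*
 * The 2x1 and 1x1 blocks below follow the same pattern with narrower
 * accumulators: the 2x1 case keeps a single packed register (%xmm4), the 1x1
 * case works on one scalar with vfmaddsd/vmulsd.
 */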
.L1_30:
        testq $2, M
        jz .L1_40

        ALIGN_4

.L1_31:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
#else
        movq KK, %rax
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        salq $1, %rax // rax = rax * 2 ; number of values
        leaq (AO, %rax, 8), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $2, %rax // number of values in AO
#else
        addq $1, %rax // number of values in BO
#endif
        movq %rax, KKK
#endif

        andq $-8, %rax
        je .L1_36

        movq %rax, BI // Index for BO
        salq $1, %rax // rax = rax * 2 ; number of values
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_32:

        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        je .L1_36

        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        je .L1_36
        jmp .L1_32
        ALIGN_4

.L1_36:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif

        andq $7, %rax # k % 8 remainder
        je .L1_39

        movq %rax, BI // Index for BO
        salq $1, %rax // rax = rax * 2 ; number of values
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_37:
        KERNEL2x1_SUB(xxx)

        addq $1, BI
        addq $2, %rax
        jl .L1_37
        ALIGN_4

.L1_39:

        vmovddup ALPHA, %xmm0

#ifndef TRMMKERNEL
        vfmaddpd (CO1),%xmm0, %xmm4,%xmm4
#else
        vmulpd %xmm0, %xmm4,%xmm4
#endif

        vmovups %xmm4 , (CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        salq $1, %rax // rax = rax * 2 ; number of values
        leaq (AO, %rax, 8), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $2, KK
#endif

        addq $2 * SIZE, CO1 # coffset += 2
        ALIGN_4
.L1_40:
        testq $1, M
        jz .L999

        ALIGN_4

.L1_41:
#if !defined(TRMMKERNEL) || \
    (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
#else
        movq KK, %rax
        leaq BUFFER1, BO // first buffer to BO
        addq $2 * SIZE, BO
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        leaq (AO, %rax, 8), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq K, %rax
        subq KK, %rax
        movq %rax, KKK
#else
        movq KK, %rax
#ifdef LEFT
        addq $1, %rax // number of values in AO
#else
        addq $1, %rax // number of values in BO
#endif
        movq %rax, KKK
#endif

        andq $-8, %rax
        je .L1_46

        movq %rax, BI // Index for BO
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_42:

        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        je .L1_46

        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        je .L1_46
        jmp .L1_42
        ALIGN_4

.L1_46:
#ifndef TRMMKERNEL
        movq K, %rax
#else
        movq KKK, %rax
#endif

        andq $7, %rax # k % 8 remainder
        je .L1_49

        movq %rax, BI // Index for BO
        leaq (AO, %rax, 8), AO
        leaq (BO, BI, 8), BO
        negq BI
        negq %rax
        ALIGN_4

.L1_47:
        KERNEL1x1_SUB(xxx)

        addq $1, BI
        addq $1, %rax
        jl .L1_47
        ALIGN_4

.L1_49:

        vmovddup ALPHA, %xmm0

#ifndef TRMMKERNEL
        vfmaddsd (CO1),%xmm0, %xmm4,%xmm4
#else
        vmulsd %xmm0, %xmm4,%xmm4
#endif

        vmovsd %xmm4 , (CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq K, %rax
        subq KKK, %rax
        movq %rax, BI // Index for BO
        leaq (BO, BI, 8), BO
        leaq (AO, %rax, 8), AO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq $1, KK
#endif

        addq $1 * SIZE, CO1 # coffset += 1
        ALIGN_4
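/*
 * Common exit: restore the stack pointer and the callee-saved registers that
 * were saved on entry (plus rdi/rsi and the non-volatile xmm6-xmm15 under the
 * Windows ABI), release the stack frame and return.
 */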
.L999:
        movq SP, %rsp
        movq (%rsp), %rbx
        movq 8(%rsp), %rbp
        movq 16(%rsp), %r12
        movq 24(%rsp), %r13
        movq 32(%rsp), %r14
        movq 40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq 48(%rsp), %rdi
        movq 56(%rsp), %rsi
        movups 64(%rsp), %xmm6
        movups 80(%rsp), %xmm7
        movups 96(%rsp), %xmm8
        movups 112(%rsp), %xmm9
        movups 128(%rsp), %xmm10
        movups 144(%rsp), %xmm11
        movups 160(%rsp), %xmm12
        movups 176(%rsp), %xmm13
        movups 192(%rsp), %xmm14
        movups 208(%rsp), %xmm15
#endif

        addq $STACKSIZE, %rsp
        ret

        EPILOGUE

#endif