You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

gemm_kernel_8x4_barcelona.S 68 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253
  1. /*********************************************************************/
  2. /* Copyright 2009, 2010 The University of Texas at Austin. */
  3. /* All rights reserved. */
  4. /* */
  5. /* Redistribution and use in source and binary forms, with or */
  6. /* without modification, are permitted provided that the following */
  7. /* conditions are met: */
  8. /* */
  9. /* 1. Redistributions of source code must retain the above */
  10. /* copyright notice, this list of conditions and the following */
  11. /* disclaimer. */
  12. /* */
  13. /* 2. Redistributions in binary form must reproduce the above */
  14. /* copyright notice, this list of conditions and the following */
  15. /* disclaimer in the documentation and/or other materials */
  16. /* provided with the distribution. */
  17. /* */
  18. /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
  19. /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
  20. /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
  21. /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
  22. /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
  23. /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
  24. /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
  25. /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
  26. /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
  27. /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
  28. /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
  29. /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
  30. /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
  31. /* POSSIBILITY OF SUCH DAMAGE. */
  32. /* */
  33. /* The views and conclusions contained in the software and */
  34. /* documentation are those of the authors and should not be */
  35. /* interpreted as representing official policies, either expressed */
  36. /* or implied, of The University of Texas at Austin. */
  37. /*********************************************************************/
  38. #define ASSEMBLER
  39. #include "common.h"
  40. #define OLD_M %rdi
  41. #define OLD_N %rsi
  42. #define M %r13
  43. #define N %r14
  44. #define K %rdx
  45. #define A %rcx
  46. #define B %r8
  47. #define C %r9
  48. #define LDC %r10
  49. #define I %r11
  50. #define AO %rdi
  51. #define BO %rsi
  52. #define CO1 %r15
  53. #define CO2 %r12
  54. #define BB %rbp
  55. #ifndef WINDOWS_ABI
  56. #define STACKSIZE 64
  57. #else
  58. #define STACKSIZE 256
  59. #define OLD_A 40 + STACKSIZE(%rsp)
  60. #define OLD_B 48 + STACKSIZE(%rsp)
  61. #define OLD_C 56 + STACKSIZE(%rsp)
  62. #define OLD_LDC 64 + STACKSIZE(%rsp)
  63. #define OLD_OFFSET 72 + STACKSIZE(%rsp)
  64. #endif
  65. #define ALPHA 0(%rsp)
  66. #define J 16(%rsp)
  67. #define OFFSET 24(%rsp)
  68. #define KK 32(%rsp)
  69. #define KKK 40(%rsp)
  70. #define BUFFER 128(%rsp)
  71. #define PREFETCH prefetch
  72. #define PREFETCHSIZE (16 * 17 + 0)
  73. #define RPREFETCHSIZE (16 * 4 + 0)
  74. #define WPREFETCHSIZE (16 * 9 + 0)
  75. #define KERNEL1(xx) \
  76. mulps %xmm1, %xmm0 ;\
  77. mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
  78. addps %xmm0, %xmm8 ;\
  79. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\
  80. movaps %xmm2, %xmm0 ;\
  81. addps %xmm1, %xmm12 ;\
  82. movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\
  83. mulps %xmm3, %xmm2 ;\
  84. mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\
  85. addps %xmm2, %xmm9 ;\
  86. movaps %xmm0, %xmm2 ;\
  87. addps %xmm3, %xmm13 ;\
  88. movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\
  89. mulps %xmm1, %xmm0 ;\
  90. mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
  91. addps %xmm0, %xmm10 ;\
  92. movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\
  93. addps %xmm1, %xmm14 ;\
  94. movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\
  95. mulps %xmm3, %xmm2 ;\
  96. mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\
  97. addps %xmm2, %xmm11 ;\
  98. addps %xmm3, %xmm15 ;\
  99. movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\
  100. movaps %xmm0, %xmm2
  101. #define KERNEL2(xx) \
  102. mulps %xmm1, %xmm0 ;\
  103. mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\
  104. addps %xmm0, %xmm8 ;\
  105. movaps %xmm2, %xmm0 ;\
  106. addps %xmm1, %xmm12 ;\
  107. movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\
  108. mulps %xmm3, %xmm2 ;\
  109. mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\
  110. addps %xmm2, %xmm9 ;\
  111. movaps %xmm0, %xmm2 ;\
  112. addps %xmm3, %xmm13 ;\
  113. movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\
  114. mulps %xmm1, %xmm0 ;\
  115. mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\
  116. addps %xmm0, %xmm10 ;\
  117. addps %xmm1, %xmm14 ;\
  118. mulps %xmm3, %xmm2 ;\
  119. mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\
  120. addps %xmm2, %xmm11 ;\
  121. addps %xmm3, %xmm15 ;\
  122. movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\
  123. movaps %xmm4, %xmm2
  124. #define KERNEL3(xx) \
  125. mulps %xmm5, %xmm4 ;\
  126. mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\
  127. addps %xmm4, %xmm8 ;\
  128. movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\
  129. movaps %xmm2, %xmm4 ;\
  130. addps %xmm5, %xmm12 ;\
  131. movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\
  132. mulps %xmm3, %xmm2 ;\
  133. mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\
  134. addps %xmm2, %xmm9 ;\
  135. movaps %xmm4, %xmm2 ;\
  136. addps %xmm3, %xmm13 ;\
  137. movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\
  138. mulps %xmm5, %xmm4 ;\
  139. mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\
  140. addps %xmm4, %xmm10 ;\
  141. movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\
  142. addps %xmm5, %xmm14 ;\
  143. movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\
  144. mulps %xmm3, %xmm2 ;\
  145. mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\
  146. addps %xmm2, %xmm11 ;\
  147. addps %xmm3, %xmm15 ;\
  148. movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\
  149. movaps %xmm4, %xmm2
  150. #define KERNEL4(xx) \
  151. mulps %xmm5, %xmm4 ;\
  152. mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\
  153. movaps (AO, %rax, 4), %xmm6 ;\
  154. addps %xmm4, %xmm8 ;\
  155. movaps %xmm2, %xmm4 ;\
  156. addps %xmm5, %xmm12 ;\
  157. movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\
  158. mulps %xmm3, %xmm2 ;\
  159. mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\
  160. addps %xmm2, %xmm9 ;\
  161. movaps %xmm4, %xmm2 ;\
  162. addps %xmm3, %xmm13 ;\
  163. movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\
  164. mulps %xmm5, %xmm4 ;\
  165. mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\
  166. addps %xmm4, %xmm10 ;\
  167. addps %xmm5, %xmm14 ;\
  168. movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\
  169. mulps %xmm3, %xmm2 ;\
  170. mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\
  171. addps %xmm2, %xmm11 ;\
  172. addps %xmm3, %xmm15 ;\
  173. movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\
  174. movaps %xmm6, %xmm2
  175. #define KERNEL5(xx) \
  176. mulps %xmm1, %xmm6 ;\
  177. mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\
  178. addps %xmm6, %xmm8 ;\
  179. movaps %xmm2, %xmm6 ;\
  180. addps %xmm1, %xmm12 ;\
  181. movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\
  182. mulps %xmm3, %xmm2 ;\
  183. mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\
  184. addps %xmm2, %xmm9 ;\
  185. movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\
  186. movaps %xmm6, %xmm2 ;\
  187. addps %xmm3, %xmm13 ;\
  188. movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\
  189. mulps %xmm1, %xmm6 ;\
  190. mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\
  191. addps %xmm6, %xmm10 ;\
  192. movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\
  193. addps %xmm1, %xmm14 ;\
  194. movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\
  195. mulps %xmm3, %xmm2 ;\
  196. mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\
  197. addps %xmm2, %xmm11 ;\
  198. addps %xmm3, %xmm15 ;\
  199. movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\
  200. movaps %xmm6, %xmm2
  201. #define KERNEL6(xx) \
  202. mulps %xmm1, %xmm6 ;\
  203. mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\
  204. addps %xmm6, %xmm8 ;\
  205. movaps %xmm2, %xmm6 ;\
  206. addps %xmm1, %xmm12 ;\
  207. movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\
  208. mulps %xmm3, %xmm2 ;\
  209. mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\
  210. addps %xmm2, %xmm9 ;\
  211. movaps %xmm6, %xmm2 ;\
  212. addps %xmm3, %xmm13 ;\
  213. movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\
  214. mulps %xmm1, %xmm6 ;\
  215. mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\
  216. addps %xmm6, %xmm10 ;\
  217. movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\
  218. addps %xmm1, %xmm14 ;\
  219. mulps %xmm3, %xmm2 ;\
  220. mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\
  221. addps %xmm2, %xmm11 ;\
  222. addps %xmm3, %xmm15 ;\
  223. movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\
  224. movaps %xmm7, %xmm2
  225. #define KERNEL7(xx) \
  226. mulps %xmm5, %xmm7 ;\
  227. mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\
  228. addps %xmm7, %xmm8 ;\
  229. movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\
  230. movaps %xmm2, %xmm7 ;\
  231. addps %xmm5, %xmm12 ;\
  232. movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\
  233. mulps %xmm3, %xmm2 ;\
  234. mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\
  235. addps %xmm2, %xmm9 ;\
  236. movaps %xmm7, %xmm2 ;\
  237. addps %xmm3, %xmm13 ;\
  238. movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\
  239. mulps %xmm5, %xmm7 ;\
  240. mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\
  241. addps %xmm7, %xmm10 ;\
  242. movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\
  243. addps %xmm5, %xmm14 ;\
  244. movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\
  245. mulps %xmm3, %xmm2 ;\
  246. mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\
  247. addps %xmm2, %xmm11 ;\
  248. addps %xmm3, %xmm15 ;\
  249. movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\
  250. movaps %xmm7, %xmm2
  251. #define KERNEL8(xx) \
  252. mulps %xmm5, %xmm7 ;\
  253. mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\
  254. addps %xmm7, %xmm8 ;\
  255. movaps %xmm2, %xmm7 ;\
  256. addps %xmm5, %xmm12 ;\
  257. movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\
  258. mulps %xmm3, %xmm2 ;\
  259. mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\
  260. addps %xmm2, %xmm9 ;\
  261. movaps %xmm7, %xmm2 ;\
  262. addps %xmm3, %xmm13 ;\
  263. movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\
  264. mulps %xmm5, %xmm7 ;\
  265. mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\
  266. addps %xmm7, %xmm10 ;\
  267. movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\
  268. addps %xmm5, %xmm14 ;\
  269. movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\
  270. mulps %xmm3, %xmm2 ;\
  271. mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\
  272. addps %xmm2, %xmm11 ;\
  273. addps %xmm3, %xmm15 ;\
  274. movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\
  275. movaps %xmm0, %xmm2 ;\
  276. addq $16 * SIZE, %rax
  277. #define KERNEL_SUB1(xx) \
  278. mulps %xmm1, %xmm0 ;\
  279. mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
  280. addps %xmm0, %xmm8 ;\
  281. movaps %xmm2, %xmm0 ;\
  282. addps %xmm1, %xmm12 ;\
  283. movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\
  284. mulps %xmm3, %xmm2 ;\
  285. mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\
  286. addps %xmm2, %xmm9 ;\
  287. movaps %xmm0, %xmm2 ;\
  288. addps %xmm3, %xmm13 ;\
  289. movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\
  290. mulps %xmm1, %xmm0 ;\
  291. mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\
  292. addps %xmm0, %xmm10 ;\
  293. movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\
  294. addps %xmm1, %xmm14 ;\
  295. movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\
  296. mulps %xmm3, %xmm2 ;\
  297. mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\
  298. addps %xmm2, %xmm11 ;\
  299. addps %xmm3, %xmm15 ;\
  300. movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\
  301. movaps %xmm0, %xmm2
  302. #define KERNEL_SUB2(xx) \
  303. mulps %xmm1, %xmm0 ;\
  304. mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\
  305. addps %xmm0, %xmm8 ;\
  306. movaps %xmm2, %xmm0 ;\
  307. addps %xmm1, %xmm12 ;\
  308. movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\
  309. mulps %xmm3, %xmm2 ;\
  310. mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\
  311. addps %xmm2, %xmm9 ;\
  312. movaps %xmm0, %xmm2 ;\
  313. addps %xmm3, %xmm13 ;\
  314. movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\
  315. mulps %xmm1, %xmm0 ;\
  316. mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\
  317. addps %xmm0, %xmm10 ;\
  318. movaps (AO, %rax, 4), %xmm0 ;\
  319. addps %xmm1, %xmm14 ;\
  320. movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\
  321. mulps %xmm3, %xmm2 ;\
  322. mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\
  323. addps %xmm2, %xmm11 ;\
  324. addps %xmm3, %xmm15 ;\
  325. movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\
  326. movaps %xmm4, %xmm2
  327. #define KERNEL_SUB3(xx) \
  328. mulps %xmm5, %xmm4 ;\
  329. mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\
  330. addps %xmm4, %xmm8 ;\
  331. movaps %xmm2, %xmm4 ;\
  332. addps %xmm5, %xmm12 ;\
  333. movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\
  334. mulps %xmm3, %xmm2 ;\
  335. mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\
  336. addps %xmm2, %xmm9 ;\
  337. movaps %xmm4, %xmm2 ;\
  338. addps %xmm3, %xmm13 ;\
  339. movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\
  340. mulps %xmm5, %xmm4 ;\
  341. mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\
  342. addps %xmm4, %xmm10 ;\
  343. movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\
  344. addps %xmm5, %xmm14 ;\
  345. movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\
  346. mulps %xmm3, %xmm2 ;\
  347. mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\
  348. addps %xmm2, %xmm11 ;\
  349. addps %xmm3, %xmm15 ;\
  350. movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\
  351. movaps %xmm4, %xmm2
  352. #define KERNEL_SUB4(xx) \
  353. mulps %xmm5, %xmm4 ;\
  354. mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\
  355. addps %xmm4, %xmm8 ;\
  356. movaps %xmm2, %xmm4 ;\
  357. addps %xmm5, %xmm12 ;\
  358. movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\
  359. mulps %xmm3, %xmm2 ;\
  360. mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\
  361. addps %xmm2, %xmm9 ;\
  362. movaps %xmm4, %xmm2 ;\
  363. addps %xmm3, %xmm13 ;\
  364. movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\
  365. mulps %xmm5, %xmm4 ;\
  366. mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\
  367. addps %xmm4, %xmm10 ;\
  368. addps %xmm5, %xmm14 ;\
  369. mulps %xmm3, %xmm2 ;\
  370. mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\
  371. addps %xmm2, %xmm11 ;\
  372. addps %xmm3, %xmm15 ;\
  373. movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\
  374. movaps %xmm0, %xmm2
  375. #if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL)
  376. .align 32768
  377. #endif
  378. PROLOGUE
  379. PROFCODE
  380. subq $STACKSIZE, %rsp
  381. movq %rbx, 0(%rsp)
  382. movq %rbp, 8(%rsp)
  383. movq %r12, 16(%rsp)
  384. movq %r13, 24(%rsp)
  385. movq %r14, 32(%rsp)
  386. movq %r15, 40(%rsp)
  387. #ifdef WINDOWS_ABI
  388. movq %rdi, 48(%rsp)
  389. movq %rsi, 56(%rsp)
  390. movups %xmm6, 64(%rsp)
  391. movups %xmm7, 80(%rsp)
  392. movups %xmm8, 96(%rsp)
  393. movups %xmm9, 112(%rsp)
  394. movups %xmm10, 128(%rsp)
  395. movups %xmm11, 144(%rsp)
  396. movups %xmm12, 160(%rsp)
  397. movups %xmm13, 176(%rsp)
  398. movups %xmm14, 192(%rsp)
  399. movups %xmm15, 208(%rsp)
  400. movq ARG1, OLD_M
  401. movq ARG2, OLD_N
  402. movq ARG3, K
  403. movq OLD_A, A
  404. movq OLD_B, B
  405. movq OLD_C, C
  406. movq OLD_LDC, LDC
  407. #ifdef TRMMKERNEL
  408. movsd OLD_OFFSET, %xmm12
  409. #endif
  410. movaps %xmm3, %xmm0
  411. #else
  412. movq 72(%rsp), LDC
  413. #ifdef TRMMKERNEL
  414. movsd 80(%rsp), %xmm12
  415. #endif
  416. #endif
  417. movq %rsp, %rbx # save old stack
  418. subq $128 + LOCAL_BUFFER_SIZE, %rsp
  419. andq $-4096, %rsp # align stack
  420. STACK_TOUCHING
  421. movq OLD_M, M
  422. movq OLD_N, N
  423. shufps $0, %xmm0, %xmm0
  424. movaps %xmm0, ALPHA
  425. #ifdef TRMMKERNEL
  426. movsd %xmm12, OFFSET
  427. movsd %xmm12, KK
  428. #ifndef LEFT
  429. negq KK
  430. #endif
  431. #endif
  432. subq $-32 * SIZE, A
  433. leaq (, LDC, SIZE), LDC
  434. movq N, J
  435. sarq $2, J # j = (n >> 2)
  436. jle .L50
  437. .L01:
  438. #if defined(TRMMKERNEL) && defined(LEFT)
  439. movq OFFSET, %rax
  440. movq %rax, KK
  441. #endif
  442. /* Copying to Sub Buffer */
  443. leaq BUFFER, BO
  444. movq K, %rax
  445. sarq $2, %rax
  446. jle .L03
  447. ALIGN_4
  448. .L02:
  449. prefetch (RPREFETCHSIZE + 0) * SIZE(B)
  450. movaps 0 * SIZE(B), %xmm3
  451. movaps 4 * SIZE(B), %xmm7
  452. movaps 8 * SIZE(B), %xmm11
  453. movaps 12 * SIZE(B), %xmm15
  454. prefetchw (WPREFETCHSIZE + 0) * SIZE(BO)
  455. pshufd $0x00, %xmm3, %xmm0
  456. pshufd $0x55, %xmm3, %xmm1
  457. pshufd $0xaa, %xmm3, %xmm2
  458. pshufd $0xff, %xmm3, %xmm3
  459. prefetchw (WPREFETCHSIZE + 16) * SIZE(BO)
  460. pshufd $0x00, %xmm7, %xmm4
  461. pshufd $0x55, %xmm7, %xmm5
  462. pshufd $0xaa, %xmm7, %xmm6
  463. pshufd $0xff, %xmm7, %xmm7
  464. movaps %xmm0, 0 * SIZE(BO)
  465. movaps %xmm1, 4 * SIZE(BO)
  466. movaps %xmm2, 8 * SIZE(BO)
  467. movaps %xmm3, 12 * SIZE(BO)
  468. movaps %xmm4, 16 * SIZE(BO)
  469. movaps %xmm5, 20 * SIZE(BO)
  470. movaps %xmm6, 24 * SIZE(BO)
  471. movaps %xmm7, 28 * SIZE(BO)
  472. prefetchw (WPREFETCHSIZE + 32) * SIZE(BO)
  473. pshufd $0x00, %xmm11, %xmm0
  474. pshufd $0x55, %xmm11, %xmm1
  475. pshufd $0xaa, %xmm11, %xmm2
  476. pshufd $0xff, %xmm11, %xmm3
  477. prefetchw (WPREFETCHSIZE + 48) * SIZE(BO)
  478. pshufd $0x00, %xmm15, %xmm4
  479. pshufd $0x55, %xmm15, %xmm5
  480. pshufd $0xaa, %xmm15, %xmm6
  481. pshufd $0xff, %xmm15, %xmm7
  482. movaps %xmm0, 32 * SIZE(BO)
  483. movaps %xmm1, 36 * SIZE(BO)
  484. movaps %xmm2, 40 * SIZE(BO)
  485. movaps %xmm3, 44 * SIZE(BO)
  486. movaps %xmm4, 48 * SIZE(BO)
  487. movaps %xmm5, 52 * SIZE(BO)
  488. movaps %xmm6, 56 * SIZE(BO)
  489. movaps %xmm7, 60 * SIZE(BO)
  490. addq $16 * SIZE, B
  491. addq $64 * SIZE, BO
  492. decq %rax
  493. jne .L02
  494. ALIGN_4
  495. .L03:
  496. movq K, %rax
  497. andq $3, %rax
  498. BRANCH
  499. jle .L10
  500. ALIGN_4
  501. .L04:
  502. movaps 0 * SIZE(B), %xmm3
  503. pshufd $0x00, %xmm3, %xmm0
  504. pshufd $0x55, %xmm3, %xmm1
  505. pshufd $0xaa, %xmm3, %xmm2
  506. pshufd $0xff, %xmm3, %xmm3
  507. movaps %xmm0, 0 * SIZE(BO)
  508. movaps %xmm1, 4 * SIZE(BO)
  509. movaps %xmm2, 8 * SIZE(BO)
  510. movaps %xmm3, 12 * SIZE(BO)
  511. addq $ 4 * SIZE, B
  512. addq $16 * SIZE, BO
  513. decq %rax
  514. jne .L04
  515. ALIGN_4
  516. .L10:
  517. movq C, CO1
  518. leaq (C, LDC, 1), CO2
  519. movq A, AO
  520. leaq (RPREFETCHSIZE + 0) * SIZE(B), BB
  521. movq M, I
  522. sarq $3, I # i = (m >> 3)
  523. jle .L20
  524. ALIGN_4
  525. .L11:
  526. #if !defined(TRMMKERNEL) || \
  527. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  528. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  529. leaq 32 * SIZE + BUFFER, BO
  530. #else
  531. leaq 32 * SIZE + BUFFER, BO
  532. movq KK, %rax
  533. leaq (, %rax, 8), %rax
  534. leaq (AO, %rax, 4), AO
  535. leaq (BO, %rax, 8), BO
  536. #endif
  537. movaps -32 * SIZE(AO), %xmm0
  538. movaps -32 * SIZE(BO), %xmm1
  539. xorps %xmm8, %xmm8
  540. movaps -28 * SIZE(BO), %xmm3
  541. xorps %xmm9, %xmm9
  542. movaps -16 * SIZE(AO), %xmm4
  543. xorps %xmm10, %xmm10
  544. movaps 0 * SIZE(BO), %xmm5
  545. xorps %xmm11, %xmm11
  546. prefetch -20 * SIZE(BB)
  547. prefetchw 3 * SIZE(CO1)
  548. xorps %xmm12, %xmm12
  549. prefetchw 7 * SIZE(CO2)
  550. xorps %xmm13, %xmm13
  551. prefetchw 3 * SIZE(CO1, LDC, 2)
  552. xorps %xmm14, %xmm14
  553. prefetchw 7 * SIZE(CO2, LDC, 2)
  554. xorps %xmm15, %xmm15
  555. movaps %xmm0, %xmm2
  556. #ifndef TRMMKERNEL
  557. movq K, %rax
  558. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  559. movq K, %rax
  560. subq KK, %rax
  561. movq %rax, KKK
  562. #else
  563. movq KK, %rax
  564. #ifdef LEFT
  565. addq $8, %rax
  566. #else
  567. addq $4, %rax
  568. #endif
  569. movq %rax, KKK
  570. #endif
  571. andq $-8, %rax
  572. leaq (, %rax, 8), %rax
  573. leaq (AO, %rax, 4), AO
  574. leaq (BO, %rax, 8), BO
  575. negq %rax
  576. NOBRANCH
  577. je .L15
  578. ALIGN_3
  579. .L12:
  580. KERNEL1(32 * 0)
  581. KERNEL2(32 * 0)
  582. KERNEL3(32 * 0)
  583. KERNEL4(32 * 0)
  584. KERNEL5(32 * 0)
  585. KERNEL6(32 * 0)
  586. KERNEL7(32 * 0)
  587. KERNEL8(32 * 0)
  588. NOBRANCH
  589. je .L15
  590. KERNEL1(32 * 0)
  591. KERNEL2(32 * 0)
  592. KERNEL3(32 * 0)
  593. KERNEL4(32 * 0)
  594. KERNEL5(32 * 0)
  595. KERNEL6(32 * 0)
  596. KERNEL7(32 * 0)
  597. KERNEL8(32 * 0)
  598. NOBRANCH
  599. je .L15
  600. KERNEL1(32 * 0)
  601. KERNEL2(32 * 0)
  602. KERNEL3(32 * 0)
  603. KERNEL4(32 * 0)
  604. KERNEL5(32 * 0)
  605. KERNEL6(32 * 0)
  606. KERNEL7(32 * 0)
  607. KERNEL8(32 * 0)
  608. NOBRANCH
  609. je .L15
  610. KERNEL1(32 * 0)
  611. KERNEL2(32 * 0)
  612. KERNEL3(32 * 0)
  613. KERNEL4(32 * 0)
  614. KERNEL5(32 * 0)
  615. KERNEL6(32 * 0)
  616. KERNEL7(32 * 0)
  617. KERNEL8(32 * 0)
  618. NOBRANCH
  619. je .L15
  620. KERNEL1(32 * 0)
  621. KERNEL2(32 * 0)
  622. KERNEL3(32 * 0)
  623. KERNEL4(32 * 0)
  624. KERNEL5(32 * 0)
  625. KERNEL6(32 * 0)
  626. KERNEL7(32 * 0)
  627. KERNEL8(32 * 0)
  628. NOBRANCH
  629. je .L15
  630. KERNEL1(32 * 0)
  631. KERNEL2(32 * 0)
  632. KERNEL3(32 * 0)
  633. KERNEL4(32 * 0)
  634. KERNEL5(32 * 0)
  635. KERNEL6(32 * 0)
  636. KERNEL7(32 * 0)
  637. KERNEL8(32 * 0)
  638. NOBRANCH
  639. je .L15
  640. KERNEL1(32 * 0)
  641. KERNEL2(32 * 0)
  642. KERNEL3(32 * 0)
  643. KERNEL4(32 * 0)
  644. KERNEL5(32 * 0)
  645. KERNEL6(32 * 0)
  646. KERNEL7(32 * 0)
  647. KERNEL8(32 * 0)
  648. NOBRANCH
  649. je .L15
  650. KERNEL1(32 * 0)
  651. KERNEL2(32 * 0)
  652. KERNEL3(32 * 0)
  653. KERNEL4(32 * 0)
  654. KERNEL5(32 * 0)
  655. KERNEL6(32 * 0)
  656. KERNEL7(32 * 0)
  657. KERNEL8(32 * 0)
  658. BRANCH
  659. jl .L12
  660. ALIGN_4
  661. .L15:
  662. prefetch 16 * SIZE(BB)
  663. subq $-32 * SIZE, BB
  664. movaps ALPHA, %xmm7
  665. #ifndef TRMMKERNEL
  666. movq K, %rax
  667. #else
  668. movq KKK, %rax
  669. #endif
  670. testq $4, %rax
  671. je .L16
  672. xorq %rax, %rax
  673. ALIGN_3
  674. KERNEL_SUB1(32 * 0)
  675. KERNEL_SUB2(32 * 0)
  676. KERNEL_SUB3(32 * 0)
  677. KERNEL_SUB4(32 * 0)
  678. addq $32 * SIZE, AO
  679. addq $64 * SIZE, BO
  680. ALIGN_3
  681. .L16:
  682. #ifndef TRMMKERNEL
  683. movq K, %rax
  684. #else
  685. movq KKK, %rax
  686. #endif
  687. andq $3, %rax # if (k & 1)
  688. je .L18
  689. leaq (, %rax, 8), %rax
  690. leaq (AO, %rax, 4), AO
  691. leaq (BO, %rax, 8), BO
  692. negq %rax
  693. ALIGN_4
  694. .L17:
  695. mulps %xmm1, %xmm0
  696. mulps -28 * SIZE(AO, %rax, 4), %xmm1
  697. addps %xmm0, %xmm8
  698. movaps %xmm2, %xmm0
  699. addps %xmm1, %xmm12
  700. movaps -24 * SIZE(BO, %rax, 8), %xmm1
  701. mulps %xmm3, %xmm2
  702. mulps -28 * SIZE(AO, %rax, 4), %xmm3
  703. addps %xmm2, %xmm9
  704. movaps %xmm0, %xmm2
  705. addps %xmm3, %xmm13
  706. movaps -20 * SIZE(BO, %rax, 8), %xmm3
  707. mulps %xmm1, %xmm0
  708. mulps -28 * SIZE(AO, %rax, 4), %xmm1
  709. addps %xmm0, %xmm10
  710. movaps -24 * SIZE(AO, %rax, 4), %xmm0
  711. addps %xmm1, %xmm14
  712. movaps -16 * SIZE(BO, %rax, 8), %xmm1
  713. mulps %xmm3, %xmm2
  714. mulps -28 * SIZE(AO, %rax, 4), %xmm3
  715. addps %xmm2, %xmm11
  716. addps %xmm3, %xmm15
  717. movaps -12 * SIZE(BO, %rax, 8), %xmm3
  718. movaps %xmm0, %xmm2
  719. addq $SIZE * 2, %rax
  720. jl .L17
  721. ALIGN_4
  722. .L18:
  723. #ifndef TRMMKERNEL
  724. movups 0 * SIZE(CO1), %xmm0
  725. movups 4 * SIZE(CO1), %xmm1
  726. movups 0 * SIZE(CO2), %xmm2
  727. movups 4 * SIZE(CO2), %xmm3
  728. #endif
  729. mulps %xmm7, %xmm8
  730. mulps %xmm7, %xmm9
  731. mulps %xmm7, %xmm10
  732. mulps %xmm7, %xmm11
  733. mulps %xmm7, %xmm12
  734. mulps %xmm7, %xmm13
  735. mulps %xmm7, %xmm14
  736. mulps %xmm7, %xmm15
  737. #ifndef TRMMKERNEL
  738. movups 0 * SIZE(CO1, LDC, 2), %xmm4
  739. movups 4 * SIZE(CO1, LDC, 2), %xmm5
  740. movups 0 * SIZE(CO2, LDC, 2), %xmm6
  741. movups 4 * SIZE(CO2, LDC, 2), %xmm7
  742. addps %xmm0, %xmm8
  743. addps %xmm1, %xmm12
  744. addps %xmm2, %xmm9
  745. addps %xmm3, %xmm13
  746. #endif
  747. movsd %xmm8, 0 * SIZE(CO1)
  748. movhps %xmm8, 2 * SIZE(CO1)
  749. movsd %xmm12, 4 * SIZE(CO1)
  750. movhps %xmm12, 6 * SIZE(CO1)
  751. movsd %xmm9, 0 * SIZE(CO2)
  752. movhps %xmm9, 2 * SIZE(CO2)
  753. movsd %xmm13, 4 * SIZE(CO2)
  754. movhps %xmm13, 6 * SIZE(CO2)
  755. #ifndef TRMMKERNEL
  756. addps %xmm4, %xmm10
  757. addps %xmm5, %xmm14
  758. addps %xmm6, %xmm11
  759. addps %xmm7, %xmm15
  760. #endif
  761. movsd %xmm10, 0 * SIZE(CO1, LDC, 2)
  762. movhps %xmm10, 2 * SIZE(CO1, LDC, 2)
  763. movsd %xmm14, 4 * SIZE(CO1, LDC, 2)
  764. movhps %xmm14, 6 * SIZE(CO1, LDC, 2)
  765. movsd %xmm11, 0 * SIZE(CO2, LDC, 2)
  766. movhps %xmm11, 2 * SIZE(CO2, LDC, 2)
  767. movsd %xmm15, 4 * SIZE(CO2, LDC, 2)
  768. movhps %xmm15, 6 * SIZE(CO2, LDC, 2)
  769. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  770. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  771. movq K, %rax
  772. subq KKK, %rax
  773. leaq (,%rax, 8), %rax
  774. leaq (AO, %rax, 4), AO
  775. leaq (BO, %rax, 8), BO
  776. #endif
  777. #if defined(TRMMKERNEL) && defined(LEFT)
  778. addq $8, KK
  779. #endif
  780. addq $8 * SIZE, CO1 # coffset += 4
  781. addq $8 * SIZE, CO2 # coffset += 4
  782. decq I # i --
  783. jg .L11
  784. ALIGN_4
  785. .L20:
  786. testq $4, M
  787. je .L30
  788. #if !defined(TRMMKERNEL) || \
  789. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  790. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  791. leaq BUFFER, BO
  792. #else
  793. leaq BUFFER, BO
  794. movq KK, %rax
  795. leaq (, %rax, 8), %rax
  796. leaq (AO, %rax, 2), AO
  797. leaq (BO, %rax, 8), BO
  798. #endif
  799. movaps -32 * SIZE(AO), %xmm8
  800. movaps -16 * SIZE(AO), %xmm10
  801. movaps 0 * SIZE(BO), %xmm9
  802. movaps 16 * SIZE(BO), %xmm11
  803. movaps 32 * SIZE(BO), %xmm13
  804. movaps 48 * SIZE(BO), %xmm15
  805. xorps %xmm0, %xmm0
  806. xorps %xmm1, %xmm1
  807. xorps %xmm2, %xmm2
  808. xorps %xmm3, %xmm3
  809. #ifndef TRMMKERNEL
  810. movq K, %rax
  811. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  812. movq K, %rax
  813. subq KK, %rax
  814. movq %rax, KKK
  815. #else
  816. movq KK, %rax
  817. #ifdef LEFT
  818. addq $4, %rax
  819. #else
  820. addq $4, %rax
  821. #endif
  822. movq %rax, KKK
  823. #endif
  824. sarq $3, %rax
  825. je .L25
  826. ALIGN_4
  827. .L22:
  828. mulps %xmm8, %xmm9
  829. addps %xmm9, %xmm0
  830. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  831. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  832. #endif
  833. movaps 4 * SIZE(BO), %xmm9
  834. mulps %xmm8, %xmm9
  835. addps %xmm9, %xmm1
  836. movaps 8 * SIZE(BO), %xmm9
  837. mulps %xmm8, %xmm9
  838. mulps 12 * SIZE(BO), %xmm8
  839. addps %xmm9, %xmm2
  840. movaps 64 * SIZE(BO), %xmm9
  841. addps %xmm8, %xmm3
  842. movaps -28 * SIZE(AO), %xmm8
  843. mulps %xmm8, %xmm11
  844. addps %xmm11, %xmm0
  845. movaps 20 * SIZE(BO), %xmm11
  846. mulps %xmm8, %xmm11
  847. addps %xmm11, %xmm1
  848. movaps 24 * SIZE(BO), %xmm11
  849. mulps %xmm8, %xmm11
  850. mulps 28 * SIZE(BO), %xmm8
  851. addps %xmm11, %xmm2
  852. movaps 80 * SIZE(BO), %xmm11
  853. addps %xmm8, %xmm3
  854. movaps -24 * SIZE(AO), %xmm8
  855. mulps %xmm8, %xmm13
  856. addps %xmm13, %xmm0
  857. movaps 36 * SIZE(BO), %xmm13
  858. mulps %xmm8, %xmm13
  859. addps %xmm13, %xmm1
  860. movaps 40 * SIZE(BO), %xmm13
  861. mulps %xmm8, %xmm13
  862. mulps 44 * SIZE(BO), %xmm8
  863. addps %xmm13, %xmm2
  864. movaps 96 * SIZE(BO), %xmm13
  865. addps %xmm8, %xmm3
  866. movaps -20 * SIZE(AO), %xmm8
  867. mulps %xmm8, %xmm15
  868. addps %xmm15, %xmm0
  869. movaps 52 * SIZE(BO), %xmm15
  870. mulps %xmm8, %xmm15
  871. addps %xmm15, %xmm1
  872. movaps 56 * SIZE(BO), %xmm15
  873. mulps %xmm8, %xmm15
  874. mulps 60 * SIZE(BO), %xmm8
  875. addps %xmm15, %xmm2
  876. movaps 112 * SIZE(BO), %xmm15
  877. addps %xmm8, %xmm3
  878. movaps 0 * SIZE(AO), %xmm8
  879. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  880. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  881. #endif
  882. mulps %xmm10, %xmm9
  883. addps %xmm9, %xmm0
  884. movaps 68 * SIZE(BO), %xmm9
  885. mulps %xmm10, %xmm9
  886. addps %xmm9, %xmm1
  887. movaps 72 * SIZE(BO), %xmm9
  888. mulps %xmm10, %xmm9
  889. mulps 76 * SIZE(BO), %xmm10
  890. addps %xmm9, %xmm2
  891. movaps 128 * SIZE(BO), %xmm9
  892. addps %xmm10, %xmm3
  893. movaps -12 * SIZE(AO), %xmm10
  894. mulps %xmm10, %xmm11
  895. addps %xmm11, %xmm0
  896. movaps 84 * SIZE(BO), %xmm11
  897. mulps %xmm10, %xmm11
  898. addps %xmm11, %xmm1
  899. movaps 88 * SIZE(BO), %xmm11
  900. mulps %xmm10, %xmm11
  901. mulps 92 * SIZE(BO), %xmm10
  902. addps %xmm11, %xmm2
  903. movaps 144 * SIZE(BO), %xmm11
  904. addps %xmm10, %xmm3
  905. movaps -8 * SIZE(AO), %xmm10
  906. mulps %xmm10, %xmm13
  907. addps %xmm13, %xmm0
  908. movaps 100 * SIZE(BO), %xmm13
  909. mulps %xmm10, %xmm13
  910. addps %xmm13, %xmm1
  911. movaps 104 * SIZE(BO), %xmm13
  912. mulps %xmm10, %xmm13
  913. mulps 108 * SIZE(BO), %xmm10
  914. addps %xmm13, %xmm2
  915. movaps 160 * SIZE(BO), %xmm13
  916. addps %xmm10, %xmm3
  917. movaps -4 * SIZE(AO), %xmm10
  918. mulps %xmm10, %xmm15
  919. addps %xmm15, %xmm0
  920. movaps 116 * SIZE(BO), %xmm15
  921. mulps %xmm10, %xmm15
  922. addps %xmm15, %xmm1
  923. movaps 120 * SIZE(BO), %xmm15
  924. mulps %xmm10, %xmm15
  925. mulps 124 * SIZE(BO), %xmm10
  926. addps %xmm15, %xmm2
  927. movaps 176 * SIZE(BO), %xmm15
  928. addps %xmm10, %xmm3
  929. movaps 16 * SIZE(AO), %xmm10
  930. addq $ 32 * SIZE, AO
  931. addq $128 * SIZE, BO
  932. decq %rax
  933. jne .L22
  934. ALIGN_4
  935. .L25:
  936. #ifndef TRMMKERNEL
  937. movq K, %rax
  938. #else
  939. movq KKK, %rax
  940. #endif
  941. movaps ALPHA, %xmm15
  942. andq $7, %rax # if (k & 1)
  943. BRANCH
  944. je .L28
  945. ALIGN_4
  946. .L26:
  947. mulps %xmm8, %xmm9
  948. addps %xmm9, %xmm0
  949. movaps 4 * SIZE(BO), %xmm9
  950. mulps %xmm8, %xmm9
  951. addps %xmm9, %xmm1
  952. movaps 8 * SIZE(BO), %xmm9
  953. mulps %xmm8, %xmm9
  954. mulps 12 * SIZE(BO), %xmm8
  955. addps %xmm9, %xmm2
  956. movaps 16 * SIZE(BO), %xmm9
  957. addps %xmm8, %xmm3
  958. movaps -28 * SIZE(AO), %xmm8
  959. addq $ 4 * SIZE, AO # aoffset += 4
  960. addq $16 * SIZE, BO # boffset1 += 8
  961. decq %rax
  962. jg .L26
  963. ALIGN_4
  964. .L28:
  965. mulps %xmm15, %xmm0
  966. mulps %xmm15, %xmm1
  967. mulps %xmm15, %xmm2
  968. mulps %xmm15, %xmm3
  969. #ifndef TRMMKERNEL
  970. movsd 0 * SIZE(CO1), %xmm8
  971. movhps 2 * SIZE(CO1), %xmm8
  972. movsd 0 * SIZE(CO2), %xmm10
  973. movhps 2 * SIZE(CO2), %xmm10
  974. movsd 0 * SIZE(CO1, LDC, 2), %xmm12
  975. movhps 2 * SIZE(CO1, LDC, 2), %xmm12
  976. movsd 0 * SIZE(CO2, LDC, 2), %xmm14
  977. movhps 2 * SIZE(CO2, LDC, 2), %xmm14
  978. addps %xmm8, %xmm0
  979. addps %xmm10, %xmm1
  980. addps %xmm12, %xmm2
  981. addps %xmm14, %xmm3
  982. #endif
  983. movsd %xmm0, 0 * SIZE(CO1)
  984. movhps %xmm0, 2 * SIZE(CO1)
  985. movsd %xmm1, 0 * SIZE(CO2)
  986. movhps %xmm1, 2 * SIZE(CO2)
  987. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  988. movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
  989. movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
  990. movhps %xmm3, 2 * SIZE(CO2, LDC, 2)
  991. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  992. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  993. movq K, %rax
  994. subq KKK, %rax
  995. leaq (,%rax, 8), %rax
  996. leaq (AO, %rax, 2), AO
  997. leaq (BO, %rax, 8), BO
  998. #endif
  999. #if defined(TRMMKERNEL) && defined(LEFT)
  1000. addq $4, KK
  1001. #endif
  1002. addq $4 * SIZE, CO1 # coffset += 4
  1003. addq $4 * SIZE, CO2 # coffset += 4
  1004. ALIGN_4
  1005. .L30:
  1006. testq $2, M
  1007. je .L40
  1008. #if !defined(TRMMKERNEL) || \
  1009. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1010. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1011. leaq BUFFER, BO
  1012. #else
  1013. leaq BUFFER, BO
  1014. movq KK, %rax
  1015. leaq (, %rax, 8), %rax
  1016. leaq (AO, %rax, 1), AO
  1017. leaq (BO, %rax, 8), BO
  1018. #endif
  1019. movaps -32 * SIZE(AO), %xmm8
  1020. movaps -24 * SIZE(AO), %xmm10
  1021. movaps 0 * SIZE(BO), %xmm9
  1022. movaps 16 * SIZE(BO), %xmm11
  1023. movaps 32 * SIZE(BO), %xmm13
  1024. movaps 48 * SIZE(BO), %xmm15
  1025. xorps %xmm0, %xmm0
  1026. xorps %xmm1, %xmm1
  1027. xorps %xmm2, %xmm2
  1028. xorps %xmm3, %xmm3
  1029. #ifndef TRMMKERNEL
  1030. movq K, %rax
  1031. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1032. movq K, %rax
  1033. subq KK, %rax
  1034. movq %rax, KKK
  1035. #else
  1036. movq KK, %rax
  1037. #ifdef LEFT
  1038. addq $2, %rax
  1039. #else
  1040. addq $4, %rax
  1041. #endif
  1042. movq %rax, KKK
  1043. #endif
  1044. sarq $3, %rax
  1045. je .L35
  1046. ALIGN_4
  1047. .L32:
  1048. mulps %xmm8, %xmm9
  1049. addps %xmm9, %xmm0
  1050. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1051. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1052. #endif
  1053. movsd 4 * SIZE(BO), %xmm9
  1054. mulps %xmm8, %xmm9
  1055. addps %xmm9, %xmm1
  1056. movsd 8 * SIZE(BO), %xmm9
  1057. mulps %xmm8, %xmm9
  1058. addps %xmm9, %xmm2
  1059. movsd 12 * SIZE(BO), %xmm9
  1060. mulps %xmm8, %xmm9
  1061. movsd -30 * SIZE(AO), %xmm8
  1062. addps %xmm9, %xmm3
  1063. movsd 64 * SIZE(BO), %xmm9
  1064. mulps %xmm8, %xmm11
  1065. addps %xmm11, %xmm0
  1066. movsd 20 * SIZE(BO), %xmm11
  1067. mulps %xmm8, %xmm11
  1068. addps %xmm11, %xmm1
  1069. movsd 24 * SIZE(BO), %xmm11
  1070. mulps %xmm8, %xmm11
  1071. addps %xmm11, %xmm2
  1072. movsd 28 * SIZE(BO), %xmm11
  1073. mulps %xmm8, %xmm11
  1074. movsd -28 * SIZE(AO), %xmm8
  1075. addps %xmm11, %xmm3
  1076. movsd 80 * SIZE(BO), %xmm11
  1077. mulps %xmm8, %xmm13
  1078. addps %xmm13, %xmm0
  1079. movsd 36 * SIZE(BO), %xmm13
  1080. mulps %xmm8, %xmm13
  1081. addps %xmm13, %xmm1
  1082. movsd 40 * SIZE(BO), %xmm13
  1083. mulps %xmm8, %xmm13
  1084. addps %xmm13, %xmm2
  1085. movsd 44 * SIZE(BO), %xmm13
  1086. mulps %xmm8, %xmm13
  1087. movsd -26 * SIZE(AO), %xmm8
  1088. addps %xmm13, %xmm3
  1089. movsd 96 * SIZE(BO), %xmm13
  1090. mulps %xmm8, %xmm15
  1091. addps %xmm15, %xmm0
  1092. movsd 52 * SIZE(BO), %xmm15
  1093. mulps %xmm8, %xmm15
  1094. addps %xmm15, %xmm1
  1095. movsd 56 * SIZE(BO), %xmm15
  1096. mulps %xmm8, %xmm15
  1097. addps %xmm15, %xmm2
  1098. movsd 60 * SIZE(BO), %xmm15
  1099. mulps %xmm8, %xmm15
  1100. movsd -16 * SIZE(AO), %xmm8
  1101. addps %xmm15, %xmm3
  1102. movsd 112 * SIZE(BO), %xmm15
  1103. mulps %xmm10, %xmm9
  1104. addps %xmm9, %xmm0
  1105. movsd 68 * SIZE(BO), %xmm9
  1106. mulps %xmm10, %xmm9
  1107. addps %xmm9, %xmm1
  1108. movsd 72 * SIZE(BO), %xmm9
  1109. mulps %xmm10, %xmm9
  1110. addps %xmm9, %xmm2
  1111. movsd 76 * SIZE(BO), %xmm9
  1112. mulps %xmm10, %xmm9
  1113. movsd -22 * SIZE(AO), %xmm10
  1114. addps %xmm9, %xmm3
  1115. movsd 128 * SIZE(BO), %xmm9
  1116. mulps %xmm10, %xmm11
  1117. addps %xmm11, %xmm0
  1118. movsd 84 * SIZE(BO), %xmm11
  1119. mulps %xmm10, %xmm11
  1120. addps %xmm11, %xmm1
  1121. movsd 88 * SIZE(BO), %xmm11
  1122. mulps %xmm10, %xmm11
  1123. addps %xmm11, %xmm2
  1124. movsd 92 * SIZE(BO), %xmm11
  1125. mulps %xmm10, %xmm11
  1126. movsd -20 * SIZE(AO), %xmm10
  1127. addps %xmm11, %xmm3
  1128. movsd 144 * SIZE(BO), %xmm11
  1129. mulps %xmm10, %xmm13
  1130. addps %xmm13, %xmm0
  1131. movsd 100 * SIZE(BO), %xmm13
  1132. mulps %xmm10, %xmm13
  1133. addps %xmm13, %xmm1
  1134. movsd 104 * SIZE(BO), %xmm13
  1135. mulps %xmm10, %xmm13
  1136. addps %xmm13, %xmm2
  1137. movsd 108 * SIZE(BO), %xmm13
  1138. mulps %xmm10, %xmm13
  1139. movsd -18 * SIZE(AO), %xmm10
  1140. addps %xmm13, %xmm3
  1141. movsd 160 * SIZE(BO), %xmm13
  1142. mulps %xmm10, %xmm15
  1143. addps %xmm15, %xmm0
  1144. movsd 116 * SIZE(BO), %xmm15
  1145. mulps %xmm10, %xmm15
  1146. addps %xmm15, %xmm1
  1147. movsd 120 * SIZE(BO), %xmm15
  1148. mulps %xmm10, %xmm15
  1149. addps %xmm15, %xmm2
  1150. movsd 124 * SIZE(BO), %xmm15
  1151. mulps %xmm10, %xmm15
  1152. movsd -8 * SIZE(AO), %xmm10
  1153. addps %xmm15, %xmm3
  1154. movsd 176 * SIZE(BO), %xmm15
  1155. addq $ 16 * SIZE, AO
  1156. addq $128 * SIZE, BO
  1157. decq %rax
  1158. jne .L32
  1159. ALIGN_4
  1160. .L35:
  1161. #ifndef TRMMKERNEL
  1162. movq K, %rax
  1163. #else
  1164. movq KKK, %rax
  1165. #endif
  1166. movaps ALPHA, %xmm15
  1167. andq $7, %rax # if (k & 1)
  1168. BRANCH
  1169. je .L38
  1170. ALIGN_4
  1171. .L36:
  1172. mulps %xmm8, %xmm9
  1173. addps %xmm9, %xmm0
  1174. movsd 4 * SIZE(BO), %xmm9
  1175. mulps %xmm8, %xmm9
  1176. addps %xmm9, %xmm1
  1177. movsd 8 * SIZE(BO), %xmm9
  1178. mulps %xmm8, %xmm9
  1179. addps %xmm9, %xmm2
  1180. movsd 12 * SIZE(BO), %xmm9
  1181. mulps %xmm8, %xmm9
  1182. movsd -30 * SIZE(AO), %xmm8
  1183. addps %xmm9, %xmm3
  1184. movsd 16 * SIZE(BO), %xmm9
  1185. addq $ 2 * SIZE, AO # aoffset += 4
  1186. addq $16 * SIZE, BO # boffset1 += 8
  1187. decq %rax
  1188. jg .L36
  1189. ALIGN_4
  1190. .L38:
  1191. mulps %xmm15, %xmm0
  1192. mulps %xmm15, %xmm1
  1193. mulps %xmm15, %xmm2
  1194. mulps %xmm15, %xmm3
  1195. #ifndef TRMMKERNEL
  1196. movsd 0 * SIZE(CO1), %xmm8
  1197. movsd 0 * SIZE(CO2), %xmm10
  1198. movsd 0 * SIZE(CO1, LDC, 2), %xmm12
  1199. movsd 0 * SIZE(CO2, LDC, 2), %xmm14
  1200. addps %xmm8, %xmm0
  1201. addps %xmm10, %xmm1
  1202. addps %xmm12, %xmm2
  1203. addps %xmm14, %xmm3
  1204. #endif
  1205. movsd %xmm0, 0 * SIZE(CO1)
  1206. movsd %xmm1, 0 * SIZE(CO2)
  1207. movsd %xmm2, 0 * SIZE(CO1, LDC, 2)
  1208. movsd %xmm3, 0 * SIZE(CO2, LDC, 2)
  1209. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1210. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1211. movq K, %rax
  1212. subq KKK, %rax
  1213. leaq (,%rax, 8), %rax
  1214. leaq (AO, %rax, 1), AO
  1215. leaq (BO, %rax, 8), BO
  1216. #endif
  1217. #if defined(TRMMKERNEL) && defined(LEFT)
  1218. addq $2, KK
  1219. #endif
  1220. addq $2 * SIZE, CO1 # coffset += 4
  1221. addq $2 * SIZE, CO2 # coffset += 4
  1222. ALIGN_4
  1223. .L40:
  1224. testq $1, M
  1225. je .L49
  1226. #if !defined(TRMMKERNEL) || \
  1227. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1228. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1229. leaq BUFFER, BO
  1230. #else
  1231. leaq BUFFER, BO
  1232. movq KK, %rax
  1233. leaq (, %rax, 4), %rax
  1234. leaq (AO, %rax, 1), AO
  1235. leaq (BO, %rax, 8), BO
  1236. leaq (BO, %rax, 8), BO
  1237. #endif
  1238. movss -32 * SIZE(AO), %xmm8
  1239. movss -28 * SIZE(AO), %xmm10
  1240. movss 0 * SIZE(BO), %xmm9
  1241. movss 16 * SIZE(BO), %xmm11
  1242. movss 32 * SIZE(BO), %xmm13
  1243. movss 48 * SIZE(BO), %xmm15
  1244. xorps %xmm0, %xmm0
  1245. xorps %xmm1, %xmm1
  1246. xorps %xmm2, %xmm2
  1247. xorps %xmm3, %xmm3
  1248. #ifndef TRMMKERNEL
  1249. movq K, %rax
  1250. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1251. movq K, %rax
  1252. subq KK, %rax
  1253. movq %rax, KKK
  1254. #else
  1255. movq KK, %rax
  1256. #ifdef LEFT
  1257. addq $1, %rax
  1258. #else
  1259. addq $4, %rax
  1260. #endif
  1261. movq %rax, KKK
  1262. #endif
  1263. sarq $3, %rax
  1264. je .L45
  1265. ALIGN_4
  1266. .L42:
  1267. mulss %xmm8, %xmm9
  1268. addss %xmm9, %xmm0
  1269. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1270. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1271. #endif
  1272. movss 4 * SIZE(BO), %xmm9
  1273. mulss %xmm8, %xmm9
  1274. addss %xmm9, %xmm1
  1275. movss 8 * SIZE(BO), %xmm9
  1276. mulss %xmm8, %xmm9
  1277. addss %xmm9, %xmm2
  1278. movss 12 * SIZE(BO), %xmm9
  1279. mulss %xmm8, %xmm9
  1280. movss -31 * SIZE(AO), %xmm8
  1281. addss %xmm9, %xmm3
  1282. movss 64 * SIZE(BO), %xmm9
  1283. mulss %xmm8, %xmm11
  1284. addss %xmm11, %xmm0
  1285. movss 20 * SIZE(BO), %xmm11
  1286. mulss %xmm8, %xmm11
  1287. addss %xmm11, %xmm1
  1288. movss 24 * SIZE(BO), %xmm11
  1289. mulss %xmm8, %xmm11
  1290. addss %xmm11, %xmm2
  1291. movss 28 * SIZE(BO), %xmm11
  1292. mulss %xmm8, %xmm11
  1293. movss -30 * SIZE(AO), %xmm8
  1294. addss %xmm11, %xmm3
  1295. movss 80 * SIZE(BO), %xmm11
  1296. mulss %xmm8, %xmm13
  1297. addss %xmm13, %xmm0
  1298. movss 36 * SIZE(BO), %xmm13
  1299. mulss %xmm8, %xmm13
  1300. addss %xmm13, %xmm1
  1301. movss 40 * SIZE(BO), %xmm13
  1302. mulss %xmm8, %xmm13
  1303. addss %xmm13, %xmm2
  1304. movss 44 * SIZE(BO), %xmm13
  1305. mulss %xmm8, %xmm13
  1306. movss -29 * SIZE(AO), %xmm8
  1307. addss %xmm13, %xmm3
  1308. movss 96 * SIZE(BO), %xmm13
  1309. mulss %xmm8, %xmm15
  1310. addss %xmm15, %xmm0
  1311. movss 52 * SIZE(BO), %xmm15
  1312. mulss %xmm8, %xmm15
  1313. addss %xmm15, %xmm1
  1314. movss 56 * SIZE(BO), %xmm15
  1315. mulss %xmm8, %xmm15
  1316. addss %xmm15, %xmm2
  1317. movss 60 * SIZE(BO), %xmm15
  1318. mulss %xmm8, %xmm15
  1319. movss -24 * SIZE(AO), %xmm8
  1320. addss %xmm15, %xmm3
  1321. movss 112 * SIZE(BO), %xmm15
  1322. mulss %xmm10, %xmm9
  1323. addss %xmm9, %xmm0
  1324. movss 68 * SIZE(BO), %xmm9
  1325. mulss %xmm10, %xmm9
  1326. addss %xmm9, %xmm1
  1327. movss 72 * SIZE(BO), %xmm9
  1328. mulss %xmm10, %xmm9
  1329. addss %xmm9, %xmm2
  1330. movss 76 * SIZE(BO), %xmm9
  1331. mulss %xmm10, %xmm9
  1332. movss -27 * SIZE(AO), %xmm10
  1333. addss %xmm9, %xmm3
  1334. movss 128 * SIZE(BO), %xmm9
  1335. mulss %xmm10, %xmm11
  1336. addss %xmm11, %xmm0
  1337. movss 84 * SIZE(BO), %xmm11
  1338. mulss %xmm10, %xmm11
  1339. addss %xmm11, %xmm1
  1340. movss 88 * SIZE(BO), %xmm11
  1341. mulss %xmm10, %xmm11
  1342. addss %xmm11, %xmm2
  1343. movss 92 * SIZE(BO), %xmm11
  1344. mulss %xmm10, %xmm11
  1345. movss -26 * SIZE(AO), %xmm10
  1346. addss %xmm11, %xmm3
  1347. movss 144 * SIZE(BO), %xmm11
  1348. mulss %xmm10, %xmm13
  1349. addss %xmm13, %xmm0
  1350. movss 100 * SIZE(BO), %xmm13
  1351. mulss %xmm10, %xmm13
  1352. addss %xmm13, %xmm1
  1353. movss 104 * SIZE(BO), %xmm13
  1354. mulss %xmm10, %xmm13
  1355. addss %xmm13, %xmm2
  1356. movss 108 * SIZE(BO), %xmm13
  1357. mulss %xmm10, %xmm13
  1358. movss -25 * SIZE(AO), %xmm10
  1359. addss %xmm13, %xmm3
  1360. movss 160 * SIZE(BO), %xmm13
  1361. mulss %xmm10, %xmm15
  1362. addss %xmm15, %xmm0
  1363. movss 116 * SIZE(BO), %xmm15
  1364. mulss %xmm10, %xmm15
  1365. addss %xmm15, %xmm1
  1366. movss 120 * SIZE(BO), %xmm15
  1367. mulss %xmm10, %xmm15
  1368. addss %xmm15, %xmm2
  1369. movss 124 * SIZE(BO), %xmm15
  1370. mulss %xmm10, %xmm15
  1371. movss -20 * SIZE(AO), %xmm10
  1372. addss %xmm15, %xmm3
  1373. movss 176 * SIZE(BO), %xmm15
  1374. addq $ 8 * SIZE, AO
  1375. addq $128 * SIZE, BO
  1376. decq %rax
  1377. jne .L42
  1378. ALIGN_4
  1379. .L45:
  1380. #ifndef TRMMKERNEL
  1381. movq K, %rax
  1382. #else
  1383. movq KKK, %rax
  1384. #endif
  1385. movaps ALPHA, %xmm15
  1386. andq $7, %rax # if (k & 1)
  1387. BRANCH
  1388. je .L48
  1389. ALIGN_4
  1390. .L46:
  1391. mulps %xmm8, %xmm9
  1392. addps %xmm9, %xmm0
  1393. movss 4 * SIZE(BO), %xmm9
  1394. mulps %xmm8, %xmm9
  1395. addps %xmm9, %xmm1
  1396. movss 8 * SIZE(BO), %xmm9
  1397. mulps %xmm8, %xmm9
  1398. addps %xmm9, %xmm2
  1399. movss 12 * SIZE(BO), %xmm9
  1400. mulps %xmm8, %xmm9
  1401. movss -31 * SIZE(AO), %xmm8
  1402. addps %xmm9, %xmm3
  1403. movss 16 * SIZE(BO), %xmm9
  1404. addq $ 1 * SIZE, AO # aoffset += 4
  1405. addq $16 * SIZE, BO # boffset1 += 8
  1406. decq %rax
  1407. jg .L46
  1408. ALIGN_4
  1409. .L48:
  1410. mulss %xmm15, %xmm0
  1411. mulss %xmm15, %xmm1
  1412. mulss %xmm15, %xmm2
  1413. mulss %xmm15, %xmm3
  1414. #ifndef TRMMKERNEL
  1415. movss 0 * SIZE(CO1), %xmm8
  1416. movss 0 * SIZE(CO2), %xmm10
  1417. movss 0 * SIZE(CO1, LDC, 2), %xmm12
  1418. movss 0 * SIZE(CO2, LDC, 2), %xmm14
  1419. addss %xmm8, %xmm0
  1420. addss %xmm10, %xmm1
  1421. addss %xmm12, %xmm2
  1422. addss %xmm14, %xmm3
  1423. #endif
  1424. movss %xmm0, 0 * SIZE(CO1)
  1425. movss %xmm1, 0 * SIZE(CO2)
  1426. movss %xmm2, 0 * SIZE(CO1, LDC, 2)
  1427. movss %xmm3, 0 * SIZE(CO2, LDC, 2)
  1428. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1429. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1430. movq K, %rax
  1431. subq KKK, %rax
  1432. leaq (,%rax, 4), %rax
  1433. leaq (AO, %rax, 1), AO
  1434. leaq (BO, %rax, 8), BO
  1435. leaq (BO, %rax, 8), BO
  1436. #endif
  1437. #if defined(TRMMKERNEL) && defined(LEFT)
  1438. addq $1, KK
  1439. #endif
  1440. ALIGN_4
  1441. .L49:
  1442. #if defined(TRMMKERNEL) && !defined(LEFT)
  1443. addl $4, KK
  1444. #endif
  1445. leaq (C, LDC, 4), C # c += 4 * ldc
  1446. decq J # j --
  1447. jg .L01
  1448. .L50:
  1449. testq $2, N
  1450. je .L100
  1451. .L51:
  1452. #if defined(TRMMKERNEL) && defined(LEFT)
  1453. movq OFFSET, %rax
  1454. movq %rax, KK
  1455. #endif
  1456. /* Copying to Sub Buffer */
  1457. leaq BUFFER, BO
  1458. movq K, %rax
  1459. sarq $2, %rax
  1460. jle .L53
  1461. ALIGN_4
  1462. .L52:
  1463. prefetch (RPREFETCHSIZE + 0) * SIZE(B)
  1464. movaps 0 * SIZE(B), %xmm3
  1465. movaps 4 * SIZE(B), %xmm7
  1466. prefetchw (WPREFETCHSIZE + 0) * SIZE(BO)
  1467. pshufd $0x00, %xmm3, %xmm0
  1468. pshufd $0x55, %xmm3, %xmm1
  1469. pshufd $0xaa, %xmm3, %xmm2
  1470. pshufd $0xff, %xmm3, %xmm3
  1471. prefetchw (WPREFETCHSIZE + 16) * SIZE(BO)
  1472. pshufd $0x00, %xmm7, %xmm4
  1473. pshufd $0x55, %xmm7, %xmm5
  1474. pshufd $0xaa, %xmm7, %xmm6
  1475. pshufd $0xff, %xmm7, %xmm7
  1476. movaps %xmm0, 0 * SIZE(BO)
  1477. movaps %xmm1, 4 * SIZE(BO)
  1478. movaps %xmm2, 8 * SIZE(BO)
  1479. movaps %xmm3, 12 * SIZE(BO)
  1480. movaps %xmm4, 16 * SIZE(BO)
  1481. movaps %xmm5, 20 * SIZE(BO)
  1482. movaps %xmm6, 24 * SIZE(BO)
  1483. movaps %xmm7, 28 * SIZE(BO)
  1484. addq $ 8 * SIZE, B
  1485. addq $32 * SIZE, BO
  1486. decq %rax
  1487. jne .L52
  1488. ALIGN_4
  1489. .L53:
  1490. movq K, %rax
  1491. andq $3, %rax
  1492. BRANCH
  1493. jle .L60
  1494. ALIGN_4
  1495. .L54:
  1496. movsd 0 * SIZE(B), %xmm3
  1497. pshufd $0x00, %xmm3, %xmm0
  1498. pshufd $0x55, %xmm3, %xmm1
  1499. pshufd $0x00, %xmm7, %xmm4
  1500. pshufd $0x55, %xmm7, %xmm5
  1501. pshufd $0xaa, %xmm7, %xmm6
  1502. pshufd $0xff, %xmm7, %xmm7
  1503. movaps %xmm0, 0 * SIZE(BO)
  1504. movaps %xmm1, 4 * SIZE(BO)
  1505. addq $ 2 * SIZE, B
  1506. addq $ 8 * SIZE, BO
  1507. decq %rax
  1508. jne .L54
  1509. ALIGN_4
  1510. .L60:
  1511. movq C, CO1 # coffset1 = c
  1512. leaq (C, LDC, 1), CO2 # coffset2 = c + ldc
  1513. movq A, AO # aoffset = a
  1514. movq M, I
  1515. sarq $3, I # i = (m >> 3)
  1516. jle .L70
  1517. ALIGN_4
  1518. .L61:
  1519. #if !defined(TRMMKERNEL) || \
  1520. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1521. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1522. leaq BUFFER, BO
  1523. #else
  1524. leaq BUFFER, BO
  1525. movq KK, %rax
  1526. leaq (, %rax, 8), %rax
  1527. leaq (AO, %rax, 4), AO
  1528. leaq (BO, %rax, 4), BO
  1529. #endif
  1530. movaps -32 * SIZE(AO), %xmm8
  1531. movaps -16 * SIZE(AO), %xmm10
  1532. movaps 0 * SIZE(AO), %xmm12
  1533. movaps 16 * SIZE(AO), %xmm14
  1534. movaps 0 * SIZE(BO), %xmm9
  1535. movaps 16 * SIZE(BO), %xmm11
  1536. movaps 32 * SIZE(BO), %xmm13
  1537. movaps 48 * SIZE(BO), %xmm15
  1538. xorps %xmm0, %xmm0
  1539. xorps %xmm1, %xmm1
  1540. prefetchw 4 * SIZE(CO1)
  1541. xorps %xmm4, %xmm4
  1542. prefetchw 4 * SIZE(CO2)
  1543. xorps %xmm5, %xmm5
  1544. #ifndef TRMMKERNEL
  1545. movq K, %rax
  1546. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1547. movq K, %rax
  1548. subq KK, %rax
  1549. movq %rax, KKK
  1550. #else
  1551. movq KK, %rax
  1552. #ifdef LEFT
  1553. addq $8, %rax
  1554. #else
  1555. addq $2, %rax
  1556. #endif
  1557. movq %rax, KKK
  1558. #endif
  1559. sarq $3, %rax
  1560. je .L65
  1561. ALIGN_4
  1562. .L62:
  1563. mulps %xmm8, %xmm9
  1564. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1565. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1566. #endif
  1567. mulps 4 * SIZE(BO), %xmm8
  1568. addps %xmm9, %xmm0
  1569. movaps 0 * SIZE(BO), %xmm9
  1570. addps %xmm8, %xmm1
  1571. movaps -28 * SIZE(AO), %xmm8
  1572. mulps %xmm8, %xmm9
  1573. mulps 4 * SIZE(BO), %xmm8
  1574. addps %xmm9, %xmm4
  1575. movaps 8 * SIZE(BO), %xmm9
  1576. addps %xmm8, %xmm5
  1577. movaps -24 * SIZE(AO), %xmm8
  1578. mulps %xmm8, %xmm9
  1579. mulps 12 * SIZE(BO), %xmm8
  1580. addps %xmm9, %xmm0
  1581. movaps 8 * SIZE(BO), %xmm9
  1582. addps %xmm8, %xmm1
  1583. movaps -20 * SIZE(AO), %xmm8
  1584. mulps %xmm8, %xmm9
  1585. mulps 12 * SIZE(BO), %xmm8
  1586. addps %xmm9, %xmm4
  1587. movaps 64 * SIZE(BO), %xmm9
  1588. addps %xmm8, %xmm5
  1589. movaps 32 * SIZE(AO), %xmm8
  1590. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1591. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  1592. #endif
  1593. mulps %xmm10, %xmm11
  1594. mulps 20 * SIZE(BO), %xmm10
  1595. addps %xmm11, %xmm0
  1596. movaps 16 * SIZE(BO), %xmm11
  1597. addps %xmm10, %xmm1
  1598. movaps -12 * SIZE(AO), %xmm10
  1599. mulps %xmm10, %xmm11
  1600. mulps 20 * SIZE(BO), %xmm10
  1601. addps %xmm11, %xmm4
  1602. movaps 24 * SIZE(BO), %xmm11
  1603. addps %xmm10, %xmm5
  1604. movaps -8 * SIZE(AO), %xmm10
  1605. mulps %xmm10, %xmm11
  1606. mulps 28 * SIZE(BO), %xmm10
  1607. addps %xmm11, %xmm0
  1608. movaps 24 * SIZE(BO), %xmm11
  1609. addps %xmm10, %xmm1
  1610. movaps -4 * SIZE(AO), %xmm10
  1611. mulps %xmm10, %xmm11
  1612. mulps 28 * SIZE(BO), %xmm10
  1613. addps %xmm11, %xmm4
  1614. movaps 80 * SIZE(BO), %xmm11
  1615. addps %xmm10, %xmm5
  1616. movaps 48 * SIZE(AO), %xmm10
  1617. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1618. PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
  1619. #endif
  1620. mulps %xmm12, %xmm13
  1621. mulps 36 * SIZE(BO), %xmm12
  1622. addps %xmm13, %xmm0
  1623. movaps 32 * SIZE(BO), %xmm13
  1624. addps %xmm12, %xmm1
  1625. movaps 4 * SIZE(AO), %xmm12
  1626. mulps %xmm12, %xmm13
  1627. mulps 36 * SIZE(BO), %xmm12
  1628. addps %xmm13, %xmm4
  1629. movaps 40 * SIZE(BO), %xmm13
  1630. addps %xmm12, %xmm5
  1631. movaps 8 * SIZE(AO), %xmm12
  1632. mulps %xmm12, %xmm13
  1633. mulps 44 * SIZE(BO), %xmm12
  1634. addps %xmm13, %xmm0
  1635. movaps 40 * SIZE(BO), %xmm13
  1636. addps %xmm12, %xmm1
  1637. movaps 12 * SIZE(AO), %xmm12
  1638. mulps %xmm12, %xmm13
  1639. mulps 44 * SIZE(BO), %xmm12
  1640. addps %xmm13, %xmm4
  1641. movaps 96 * SIZE(BO), %xmm13
  1642. addps %xmm12, %xmm5
  1643. movaps 64 * SIZE(AO), %xmm12
  1644. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1645. PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
  1646. #endif
  1647. mulps %xmm14, %xmm15
  1648. mulps 52 * SIZE(BO), %xmm14
  1649. addps %xmm15, %xmm0
  1650. movaps 48 * SIZE(BO), %xmm15
  1651. addps %xmm14, %xmm1
  1652. movaps 20 * SIZE(AO), %xmm14
  1653. mulps %xmm14, %xmm15
  1654. mulps 52 * SIZE(BO), %xmm14
  1655. addps %xmm15, %xmm4
  1656. movaps 56 * SIZE(BO), %xmm15
  1657. addps %xmm14, %xmm5
  1658. movaps 24 * SIZE(AO), %xmm14
  1659. mulps %xmm14, %xmm15
  1660. mulps 60 * SIZE(BO), %xmm14
  1661. addps %xmm15, %xmm0
  1662. movaps 56 * SIZE(BO), %xmm15
  1663. addps %xmm14, %xmm1
  1664. movaps 28 * SIZE(AO), %xmm14
  1665. mulps %xmm14, %xmm15
  1666. mulps 60 * SIZE(BO), %xmm14
  1667. addps %xmm15, %xmm4
  1668. movaps 112 * SIZE(BO), %xmm15
  1669. addps %xmm14, %xmm5
  1670. movaps 80 * SIZE(AO), %xmm14
  1671. addq $64 * SIZE, AO
  1672. addq $64 * SIZE, BO
  1673. decq %rax
  1674. jne .L62
  1675. ALIGN_4
# ---- .L65/.L66: k-remainder loop for the 8x2 tile ----
# Reload the effective trip count (K, or KKK for the triangular/TRMM
# build), keep only the low 3 bits (k mod 8), preload ALPHA into xmm15
# for the writeback phase, and process the leftover iterations one at
# a time.
  1676. .L65:
  1677. #ifndef TRMMKERNEL
  1678. movq K, %rax
  1679. #else
  1680. movq KKK, %rax
  1681. #endif
  1682. movaps ALPHA, %xmm15
  1683. andq $7, %rax # if (k & 1)
  1684. BRANCH
  1685. je .L68
  1686. ALIGN_4
# One k-iteration: two 4-wide A vectors (8 rows) times the two
# duplicated B columns, accumulated into xmm0/xmm1/xmm4/xmm5.
  1687. .L66:
  1688. mulps %xmm8, %xmm9
  1689. mulps 4 * SIZE(BO), %xmm8
  1690. addps %xmm9, %xmm0
  1691. movaps 0 * SIZE(BO), %xmm9
  1692. addps %xmm8, %xmm1
  1693. movaps -28 * SIZE(AO), %xmm8
  1694. mulps %xmm8, %xmm9
  1695. mulps 4 * SIZE(BO), %xmm8
  1696. addps %xmm9, %xmm4
  1697. movaps 8 * SIZE(BO), %xmm9
  1698. addps %xmm8, %xmm5
  1699. movaps -24 * SIZE(AO), %xmm8
  1700. addq $8 * SIZE, AO # aoffset += 4
  1701. addq $8 * SIZE, BO # boffset1 += 8
  1702. decq %rax
  1703. jg .L66
  1704. ALIGN_4
# ---- .L68: scale by alpha and write the 8x2 tile back to C ----
# In the plain GEMM build the existing C values are loaded (via
# unaligned movsd/movhps pairs, since C has no alignment guarantee)
# and added in: C = alpha*A*B + C.  In the TRMM build the load/add is
# skipped and the result simply overwrites C.
  1705. .L68:
  1706. #ifndef TRMMKERNEL
  1707. movsd 0 * SIZE(CO1), %xmm8
  1708. movhps 2 * SIZE(CO1), %xmm8
  1709. movsd 4 * SIZE(CO1), %xmm9
  1710. movhps 6 * SIZE(CO1), %xmm9
  1711. movsd 0 * SIZE(CO2), %xmm10
  1712. movhps 2 * SIZE(CO2), %xmm10
  1713. movsd 4 * SIZE(CO2), %xmm11
  1714. movhps 6 * SIZE(CO2), %xmm11
  1715. #endif
# xmm15 still holds ALPHA (loaded at .L65).
  1716. mulps %xmm15, %xmm0
  1717. mulps %xmm15, %xmm4
  1718. mulps %xmm15, %xmm1
  1719. mulps %xmm15, %xmm5
  1720. #ifndef TRMMKERNEL
  1721. addps %xmm8, %xmm0
  1722. addps %xmm9, %xmm4
  1723. addps %xmm10, %xmm1
  1724. addps %xmm11, %xmm5
  1725. #endif
  1726. movsd %xmm0, 0 * SIZE(CO1)
  1727. movhps %xmm0, 2 * SIZE(CO1)
  1728. movsd %xmm4, 4 * SIZE(CO1)
  1729. movhps %xmm4, 6 * SIZE(CO1)
  1730. movsd %xmm1, 0 * SIZE(CO2)
  1731. movhps %xmm1, 2 * SIZE(CO2)
  1732. movsd %xmm5, 4 * SIZE(CO2)
  1733. movhps %xmm5, 6 * SIZE(CO2)
# TRMM bookkeeping: step AO/BO past the untouched remainder of this
# panel ((K - KKK) iterations; 8 A floats and 8 duplicated B floats per
# iteration via the *8 then *4 scaling) and bump KK for LEFT kernels.
  1734. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1735. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1736. movq K, %rax
  1737. subq KKK, %rax
  1738. leaq (,%rax, 8), %rax
  1739. leaq (AO, %rax, 4), AO
  1740. leaq (BO, %rax, 4), BO
  1741. #endif
  1742. #if defined(TRMMKERNEL) && defined(LEFT)
  1743. addq $8, KK
  1744. #endif
# Advance both C column pointers by 8 elements and continue the
# I = M>>3 loop.
  1745. addq $8 * SIZE, CO1 # coffset += 4
  1746. addq $8 * SIZE, CO2 # coffset += 4
  1747. decq I # i --
  1748. jg .L61
  1749. ALIGN_4
# ---- .L70: M & 4 remainder case for the N=2 column pair ----
# Handles a 4-row x 2-column sub-tile with the same structure as the
# 8x2 path: pointer setup, an 8-way unrolled k-loop (.L72), a scalar
# k-remainder loop (.L76), then alpha-scale and writeback (.L78).
  1750. .L70:
  1751. testq $4, M
  1752. je .L80
# For TRMM with offsets, skip AO/BO to this tile's starting position
# (KK iterations; 4 A floats and 8 duplicated B floats each).
  1753. #if !defined(TRMMKERNEL) || \
  1754. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1755. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1756. leaq BUFFER, BO
  1757. #else
  1758. leaq BUFFER, BO
  1759. movq KK, %rax
  1760. leaq (, %rax, 8), %rax
  1761. leaq (AO, %rax, 2), AO
  1762. leaq (BO, %rax, 4), BO
  1763. #endif
# Preload first A/B operands and zero the four accumulators.
  1764. movaps -32 * SIZE(AO), %xmm8
  1765. movaps -16 * SIZE(AO), %xmm10
  1766. movaps 0 * SIZE(BO), %xmm9
  1767. movaps 16 * SIZE(BO), %xmm11
  1768. movaps 32 * SIZE(BO), %xmm13
  1769. movaps 48 * SIZE(BO), %xmm15
  1770. xorps %xmm0, %xmm0
  1771. xorps %xmm1, %xmm1
  1772. xorps %xmm2, %xmm2
  1773. xorps %xmm3, %xmm3
# Effective trip count: K for GEMM, K-KK or KK(+4 rows / +2 cols) for
# the TRMM variants; stored in KKK for the remainder phase.
  1774. #ifndef TRMMKERNEL
  1775. movq K, %rax
  1776. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1777. movq K, %rax
  1778. subq KK, %rax
  1779. movq %rax, KKK
  1780. #else
  1781. movq KK, %rax
  1782. #ifdef LEFT
  1783. addq $4, %rax
  1784. #else
  1785. addq $2, %rax
  1786. #endif
  1787. movq %rax, KKK
  1788. #endif
  1789. sarq $3, %rax
  1790. je .L75
  1791. ALIGN_4
# .L72: 8 k-iterations per pass; next-pass operands are preloaded
# (64/80/96/112 * SIZE(BO), 0/16 * SIZE(AO)) between multiplies.
  1792. .L72:
  1793. mulps %xmm8, %xmm9
  1794. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1795. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1796. #endif
  1797. mulps 4 * SIZE(BO), %xmm8
  1798. addps %xmm9, %xmm0
  1799. movaps 8 * SIZE(BO), %xmm9
  1800. addps %xmm8, %xmm1
  1801. movaps -28 * SIZE(AO), %xmm8
  1802. mulps %xmm8, %xmm9
  1803. mulps 12 * SIZE(BO), %xmm8
  1804. addps %xmm9, %xmm2
  1805. movaps 64 * SIZE(BO), %xmm9
  1806. addps %xmm8, %xmm3
  1807. movaps -24 * SIZE(AO), %xmm8
  1808. mulps %xmm8, %xmm11
  1809. mulps 20 * SIZE(BO), %xmm8
  1810. addps %xmm11, %xmm0
  1811. movaps 24 * SIZE(BO), %xmm11
  1812. addps %xmm8, %xmm1
  1813. movaps -20 * SIZE(AO), %xmm8
  1814. mulps %xmm8, %xmm11
  1815. mulps 28 * SIZE(BO), %xmm8
  1816. addps %xmm11, %xmm2
  1817. movaps 80 * SIZE(BO), %xmm11
  1818. addps %xmm8, %xmm3
  1819. movaps 0 * SIZE(AO), %xmm8
  1820. mulps %xmm10, %xmm13
  1821. mulps 36 * SIZE(BO), %xmm10
  1822. addps %xmm13, %xmm0
  1823. movaps 40 * SIZE(BO), %xmm13
  1824. addps %xmm10, %xmm1
  1825. movaps -12 * SIZE(AO), %xmm10
  1826. mulps %xmm10, %xmm13
  1827. mulps 44 * SIZE(BO), %xmm10
  1828. addps %xmm13, %xmm2
  1829. movaps 96 * SIZE(BO), %xmm13
  1830. addps %xmm10, %xmm3
  1831. movaps -8 * SIZE(AO), %xmm10
  1832. mulps %xmm10, %xmm15
  1833. mulps 52 * SIZE(BO), %xmm10
  1834. addps %xmm15, %xmm0
  1835. movaps 56 * SIZE(BO), %xmm15
  1836. addps %xmm10, %xmm1
  1837. movaps -4 * SIZE(AO), %xmm10
  1838. mulps %xmm10, %xmm15
  1839. mulps 60 * SIZE(BO), %xmm10
  1840. addps %xmm15, %xmm2
  1841. movaps 112 * SIZE(BO), %xmm15
  1842. addps %xmm10, %xmm3
  1843. movaps 16 * SIZE(AO), %xmm10
  1844. addq $32 * SIZE, AO
  1845. addq $64 * SIZE, BO
  1846. decq %rax
  1847. jne .L72
  1848. ALIGN_4
# .L75/.L76: process k mod 8 leftover iterations; ALPHA is staged in
# xmm15 for the writeback below.
  1849. .L75:
  1850. #ifndef TRMMKERNEL
  1851. movq K, %rax
  1852. #else
  1853. movq KKK, %rax
  1854. #endif
  1855. movaps ALPHA, %xmm15
  1856. andq $7, %rax # if (k & 1)
  1857. BRANCH
  1858. je .L78
  1859. ALIGN_4
  1860. .L76:
  1861. mulps %xmm8, %xmm9
  1862. mulps 4 * SIZE(BO), %xmm8
  1863. addps %xmm9, %xmm0
  1864. movaps 8 * SIZE(BO), %xmm9
  1865. addps %xmm8, %xmm1
  1866. movaps -28 * SIZE(AO), %xmm8
  1867. addq $4 * SIZE, AO # aoffset += 4
  1868. addq $8 * SIZE, BO # boffset1 += 8
  1869. decq %rax
  1870. jg .L76
  1871. ALIGN_4
# .L78: fold the secondary accumulators (xmm2/xmm3) into xmm0/xmm1,
# scale by alpha, optionally add existing C, store 4 floats per column.
  1872. .L78:
  1873. #ifndef TRMMKERNEL
  1874. movsd 0 * SIZE(CO1), %xmm8
  1875. movhps 2 * SIZE(CO1), %xmm8
  1876. movsd 0 * SIZE(CO2), %xmm10
  1877. movhps 2 * SIZE(CO2), %xmm10
  1878. #endif
  1879. addps %xmm2, %xmm0
  1880. addps %xmm3, %xmm1
  1881. mulps %xmm15, %xmm0
  1882. mulps %xmm15, %xmm1
  1883. #ifndef TRMMKERNEL
  1884. addps %xmm8, %xmm0
  1885. addps %xmm10, %xmm1
  1886. #endif
  1887. movsd %xmm0, 0 * SIZE(CO1)
  1888. movhps %xmm0, 2 * SIZE(CO1)
  1889. movsd %xmm1, 0 * SIZE(CO2)
  1890. movhps %xmm1, 2 * SIZE(CO2)
# TRMM pointer fixup and KK update, then advance C pointers 4 columns.
  1891. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1892. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1893. movq K, %rax
  1894. subq KKK, %rax
  1895. leaq (,%rax, 8), %rax
  1896. leaq (AO, %rax, 2), AO
  1897. leaq (BO, %rax, 4), BO
  1898. #endif
  1899. #if defined(TRMMKERNEL) && defined(LEFT)
  1900. addq $4, KK
  1901. #endif
  1902. addq $4 * SIZE, CO1 # coffset += 4
  1903. addq $4 * SIZE, CO2 # coffset += 4
  1904. ALIGN_4
# ---- .L80: M & 2 remainder case for the N=2 column pair ----
# 2-row x 2-column sub-tile.  Uses movsd (64-bit, two floats) loads of
# A/B pairs instead of full movaps vectors; otherwise the phase
# structure mirrors the wider cases.
  1905. .L80:
  1906. testq $2, M
  1907. je .L90
  1908. #if !defined(TRMMKERNEL) || \
  1909. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  1910. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  1911. leaq BUFFER, BO
  1912. #else
  1913. leaq BUFFER, BO
  1914. movq KK, %rax
  1915. leaq (, %rax, 8), %rax
  1916. leaq (AO, %rax, 1), AO
  1917. leaq (BO, %rax, 4), BO
  1918. #endif
  1919. movaps -32 * SIZE(AO), %xmm8
  1920. movaps -24 * SIZE(AO), %xmm10
  1921. movaps 0 * SIZE(BO), %xmm9
  1922. movaps 16 * SIZE(BO), %xmm11
  1923. movaps 32 * SIZE(BO), %xmm13
  1924. movaps 48 * SIZE(BO), %xmm15
  1925. xorps %xmm0, %xmm0
  1926. xorps %xmm1, %xmm1
  1927. xorps %xmm2, %xmm2
  1928. xorps %xmm3, %xmm3
  1929. #ifndef TRMMKERNEL
  1930. movq K, %rax
  1931. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  1932. movq K, %rax
  1933. subq KK, %rax
  1934. movq %rax, KKK
# NOTE(review): both #ifdef LEFT arms add 2 here — correct for this
# 2-row x 2-column tile (row count == column count), so the #ifdef is
# redundant but harmless.
  1935. #else
  1936. movq KK, %rax
  1937. #ifdef LEFT
  1938. addq $2, %rax
  1939. #else
  1940. addq $2, %rax
  1941. #endif
  1942. movq %rax, KKK
  1943. #endif
  1944. sarq $3, %rax
  1945. je .L85
  1946. ALIGN_4
# .L82: 8 k-iterations per pass on two rows.
  1947. .L82:
  1948. mulps %xmm8, %xmm9
  1949. addps %xmm9, %xmm0
  1950. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  1951. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  1952. #endif
  1953. movsd 4 * SIZE(BO), %xmm9
  1954. mulps %xmm8, %xmm9
  1955. movsd -30 * SIZE(AO), %xmm8
  1956. addps %xmm9, %xmm1
  1957. movsd 8 * SIZE(BO), %xmm9
  1958. mulps %xmm8, %xmm9
  1959. addps %xmm9, %xmm2
  1960. movsd 12 * SIZE(BO), %xmm9
  1961. mulps %xmm8, %xmm9
  1962. movsd -28 * SIZE(AO), %xmm8
  1963. addps %xmm9, %xmm3
  1964. movsd 64 * SIZE(BO), %xmm9
  1965. mulps %xmm8, %xmm11
  1966. addps %xmm11, %xmm0
  1967. movsd 20 * SIZE(BO), %xmm11
  1968. mulps %xmm8, %xmm11
  1969. movsd -26 * SIZE(AO), %xmm8
  1970. addps %xmm11, %xmm1
  1971. movsd 24 * SIZE(BO), %xmm11
  1972. mulps %xmm8, %xmm11
  1973. addps %xmm11, %xmm2
  1974. movsd 28 * SIZE(BO), %xmm11
  1975. mulps %xmm8, %xmm11
  1976. movsd -16 * SIZE(AO), %xmm8
  1977. addps %xmm11, %xmm3
  1978. movsd 80 * SIZE(BO), %xmm11
  1979. mulps %xmm10, %xmm13
  1980. addps %xmm13, %xmm0
  1981. movsd 36 * SIZE(BO), %xmm13
  1982. mulps %xmm10, %xmm13
  1983. movsd -22 * SIZE(AO), %xmm10
  1984. addps %xmm13, %xmm1
  1985. movsd 40 * SIZE(BO), %xmm13
  1986. mulps %xmm10, %xmm13
  1987. addps %xmm13, %xmm2
  1988. movsd 44 * SIZE(BO), %xmm13
  1989. mulps %xmm10, %xmm13
  1990. movsd -20 * SIZE(AO), %xmm10
  1991. addps %xmm13, %xmm3
  1992. movsd 96 * SIZE(BO), %xmm13
  1993. mulps %xmm10, %xmm15
  1994. addps %xmm15, %xmm0
  1995. movsd 52 * SIZE(BO), %xmm15
  1996. mulps %xmm10, %xmm15
  1997. movsd -18 * SIZE(AO), %xmm10
  1998. addps %xmm15, %xmm1
  1999. movsd 56 * SIZE(BO), %xmm15
  2000. mulps %xmm10, %xmm15
  2001. addps %xmm15, %xmm2
  2002. movsd 60 * SIZE(BO), %xmm15
  2003. mulps %xmm10, %xmm15
  2004. movsd -8 * SIZE(AO), %xmm10
  2005. addps %xmm15, %xmm3
  2006. movsd 112 * SIZE(BO), %xmm15
  2007. addq $16 * SIZE, AO
  2008. addq $64 * SIZE, BO
  2009. decq %rax
  2010. jne .L82
  2011. ALIGN_4
# .L85/.L86: k mod 8 remainder; ALPHA staged in xmm15.
  2012. .L85:
  2013. #ifndef TRMMKERNEL
  2014. movq K, %rax
  2015. #else
  2016. movq KKK, %rax
  2017. #endif
  2018. movaps ALPHA, %xmm15
  2019. andq $7, %rax # if (k & 1)
  2020. BRANCH
  2021. je .L88
  2022. ALIGN_4
  2023. .L86:
  2024. mulps %xmm8, %xmm9
  2025. addps %xmm9, %xmm0
  2026. movsd 4 * SIZE(BO), %xmm9
  2027. mulps %xmm8, %xmm9
  2028. movsd -30 * SIZE(AO), %xmm8
  2029. addps %xmm9, %xmm1
  2030. movsd 8 * SIZE(BO), %xmm9
  2031. addq $2 * SIZE, AO # aoffset += 4
  2032. addq $8 * SIZE, BO # boffset1 += 8
  2033. decq %rax
  2034. jg .L86
  2035. ALIGN_4
# .L88: combine, scale by alpha, writeback two floats per column.
  2036. .L88:
  2037. #ifndef TRMMKERNEL
  2038. movsd 0 * SIZE(CO1), %xmm8
  2039. movsd 0 * SIZE(CO2), %xmm10
  2040. #endif
  2041. addps %xmm2, %xmm0
  2042. addps %xmm3, %xmm1
  2043. mulps %xmm15, %xmm0
  2044. mulps %xmm15, %xmm1
  2045. #ifndef TRMMKERNEL
  2046. addps %xmm8, %xmm0
  2047. addps %xmm10, %xmm1
  2048. #endif
  2049. movsd %xmm0, 0 * SIZE(CO1)
  2050. movsd %xmm1, 0 * SIZE(CO2)
  2051. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2052. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2053. movq K, %rax
  2054. subq KKK, %rax
  2055. leaq (,%rax, 8), %rax
  2056. leaq (AO, %rax, 1), AO
  2057. leaq (BO, %rax, 4), BO
  2058. #endif
  2059. #if defined(TRMMKERNEL) && defined(LEFT)
  2060. addq $2, KK
  2061. #endif
  2062. addq $2 * SIZE, CO1 # coffset += 4
  2063. addq $2 * SIZE, CO2 # coffset += 4
  2064. ALIGN_4
# ---- .L90: M & 1 remainder case for the N=2 column pair ----
# Single-row x 2-column sub-tile, computed with scalar loads (movss)
# but packed multiplies; the final reduction uses scalar addss/mulss.
  2065. .L90:
  2066. testq $1, M
  2067. je .L99
  2068. #if !defined(TRMMKERNEL) || \
  2069. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2070. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2071. leaq BUFFER, BO
  2072. #else
  2073. leaq BUFFER, BO
  2074. movq KK, %rax
  2075. leaq (, %rax, 4), %rax
  2076. leaq (AO, %rax, 1), AO
  2077. leaq (BO, %rax, 8), BO
  2078. #endif
  2079. movss -32 * SIZE(AO), %xmm8
  2080. movss -28 * SIZE(AO), %xmm10
  2081. movss 0 * SIZE(BO), %xmm9
  2082. movss 16 * SIZE(BO), %xmm11
  2083. movss 32 * SIZE(BO), %xmm13
  2084. movss 48 * SIZE(BO), %xmm15
  2085. xorps %xmm0, %xmm0
  2086. xorps %xmm1, %xmm1
  2087. xorps %xmm2, %xmm2
  2088. xorps %xmm3, %xmm3
  2089. #ifndef TRMMKERNEL
  2090. movq K, %rax
  2091. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2092. movq K, %rax
  2093. subq KK, %rax
  2094. movq %rax, KKK
  2095. #else
  2096. movq KK, %rax
  2097. #ifdef LEFT
  2098. addq $1, %rax
  2099. #else
  2100. addq $2, %rax
  2101. #endif
  2102. movq %rax, KKK
  2103. #endif
  2104. sarq $3, %rax
  2105. je .L95
  2106. ALIGN_4
# .L92: 8 k-iterations per pass on one row.
  2107. .L92:
  2108. mulps %xmm8, %xmm9
  2109. addps %xmm9, %xmm0
  2110. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2111. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2112. #endif
  2113. movss 4 * SIZE(BO), %xmm9
  2114. mulps %xmm8, %xmm9
  2115. movss -31 * SIZE(AO), %xmm8
  2116. addps %xmm9, %xmm1
  2117. movss 8 * SIZE(BO), %xmm9
  2118. mulps %xmm8, %xmm9
  2119. addps %xmm9, %xmm2
  2120. movss 12 * SIZE(BO), %xmm9
  2121. mulps %xmm8, %xmm9
  2122. movss -30 * SIZE(AO), %xmm8
  2123. addps %xmm9, %xmm3
  2124. movss 64 * SIZE(BO), %xmm9
  2125. mulps %xmm8, %xmm11
  2126. addps %xmm11, %xmm0
  2127. movss 20 * SIZE(BO), %xmm11
  2128. mulps %xmm8, %xmm11
  2129. movss -29 * SIZE(AO), %xmm8
  2130. addps %xmm11, %xmm1
  2131. movss 24 * SIZE(BO), %xmm11
  2132. mulps %xmm8, %xmm11
  2133. addps %xmm11, %xmm2
  2134. movss 28 * SIZE(BO), %xmm11
  2135. mulps %xmm8, %xmm11
  2136. movss -24 * SIZE(AO), %xmm8
  2137. addps %xmm11, %xmm3
  2138. movss 80 * SIZE(BO), %xmm11
  2139. mulps %xmm10, %xmm13
  2140. addps %xmm13, %xmm0
  2141. movss 36 * SIZE(BO), %xmm13
  2142. mulps %xmm10, %xmm13
  2143. movss -27 * SIZE(AO), %xmm10
  2144. addps %xmm13, %xmm1
  2145. movss 40 * SIZE(BO), %xmm13
  2146. mulps %xmm10, %xmm13
  2147. addps %xmm13, %xmm2
  2148. movss 44 * SIZE(BO), %xmm13
  2149. mulps %xmm10, %xmm13
  2150. movss -26 * SIZE(AO), %xmm10
  2151. addps %xmm13, %xmm3
  2152. movss 96 * SIZE(BO), %xmm13
  2153. mulps %xmm10, %xmm15
  2154. addps %xmm15, %xmm0
  2155. movss 52 * SIZE(BO), %xmm15
  2156. mulps %xmm10, %xmm15
  2157. movss -25 * SIZE(AO), %xmm10
  2158. addps %xmm15, %xmm1
  2159. movss 56 * SIZE(BO), %xmm15
  2160. mulps %xmm10, %xmm15
  2161. addps %xmm15, %xmm2
  2162. movss 60 * SIZE(BO), %xmm15
  2163. mulps %xmm10, %xmm15
  2164. movss -20 * SIZE(AO), %xmm10
  2165. addps %xmm15, %xmm3
  2166. movss 112 * SIZE(BO), %xmm15
  2167. addq $ 8 * SIZE, AO
  2168. addq $64 * SIZE, BO
  2169. decq %rax
  2170. jne .L92
  2171. ALIGN_4
# .L95/.L96: k mod 8 remainder; ALPHA staged in xmm15.
  2172. .L95:
  2173. #ifndef TRMMKERNEL
  2174. movq K, %rax
  2175. #else
  2176. movq KKK, %rax
  2177. #endif
  2178. movaps ALPHA, %xmm15
  2179. andq $7, %rax # if (k & 1)
  2180. BRANCH
  2181. je .L98
  2182. ALIGN_4
  2183. .L96:
  2184. mulps %xmm8, %xmm9
  2185. addps %xmm9, %xmm0
  2186. movss 4 * SIZE(BO), %xmm9
  2187. mulps %xmm8, %xmm9
  2188. movss -31 * SIZE(AO), %xmm8
  2189. addps %xmm9, %xmm1
  2190. movss 8 * SIZE(BO), %xmm9
  2191. addq $1 * SIZE, AO # aoffset += 4
  2192. addq $8 * SIZE, BO # boffset1 += 8
  2193. decq %rax
  2194. jg .L96
  2195. ALIGN_4
# .L98: scalar reduction, alpha scale, writeback of one float per
# column, then TRMM pointer fixup / KK update.
  2196. .L98:
  2197. #ifndef TRMMKERNEL
  2198. movss 0 * SIZE(CO1), %xmm8
  2199. movss 0 * SIZE(CO2), %xmm10
  2200. #endif
  2201. addss %xmm2, %xmm0
  2202. addss %xmm3, %xmm1
  2203. mulss %xmm15, %xmm0
  2204. mulss %xmm15, %xmm1
  2205. #ifndef TRMMKERNEL
  2206. addss %xmm8, %xmm0
  2207. addss %xmm10, %xmm1
  2208. #endif
  2209. movss %xmm0, 0 * SIZE(CO1)
  2210. movss %xmm1, 0 * SIZE(CO2)
  2211. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2212. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2213. movq K, %rax
  2214. subq KKK, %rax
  2215. leaq (,%rax, 4), %rax
  2216. leaq (AO, %rax, 1), AO
  2217. leaq (BO, %rax, 8), BO
  2218. #endif
  2219. #if defined(TRMMKERNEL) && defined(LEFT)
  2220. addq $1, KK
  2221. #endif
  2222. ALIGN_4
# ---- .L99: end of the N=2 column-pair iteration ----
# For RIGHT-side TRMM, advance KK past the two columns just processed;
# then step C by two columns.
# NOTE(review): this uses addl (32-bit) on KK while the other KK
# updates in this file use addq — verify against the upstream source
# whether this is intentional (harmless if KK < 2^31) or a typo.
  2223. .L99:
  2224. #if defined(TRMMKERNEL) && !defined(LEFT)
  2225. addl $2, KK
  2226. #endif
  2227. leaq (C, LDC, 2), C # c += 4 * ldc
  2228. ALIGN_4
# .L100: enter the final single-column (N & 1) pass, if any.
  2229. .L100:
  2230. testq $1, N
  2231. je .L999
# ---- .L101: pack the last B column into BUFFER for the N=1 pass ----
# Each B element is broadcast to a full 4-float vector (pshufd 0x00/
# 0x55/0xaa/0xff) and stored to BO so the compute loops can use aligned
# movaps loads.  Main loop handles 8 elements per pass; .L104 handles
# the K mod 8 tail one element at a time.
  2232. .L101:
  2233. #if defined(TRMMKERNEL) && defined(LEFT)
  2234. movq OFFSET, %rax
  2235. movq %rax, KK
  2236. #endif
  2237. /* Copying to Sub Buffer */
  2238. leaq BUFFER, BO
  2239. movq K, %rax
  2240. sarq $3, %rax
  2241. jle .L103
  2242. ALIGN_4
  2243. .L102:
# Read-prefetch the source panel and write-prefetch the destination
# buffer ahead of the stores.
  2244. prefetch (RPREFETCHSIZE + 0) * SIZE(B)
  2245. movups 0 * SIZE(B), %xmm3
  2246. movups 4 * SIZE(B), %xmm7
  2247. prefetchw (WPREFETCHSIZE + 0) * SIZE(BO)
  2248. pshufd $0x00, %xmm3, %xmm0
  2249. pshufd $0x55, %xmm3, %xmm1
  2250. pshufd $0xaa, %xmm3, %xmm2
  2251. pshufd $0xff, %xmm3, %xmm3
  2252. prefetchw (WPREFETCHSIZE + 16) * SIZE(BO)
  2253. pshufd $0x00, %xmm7, %xmm4
  2254. pshufd $0x55, %xmm7, %xmm5
  2255. pshufd $0xaa, %xmm7, %xmm6
  2256. pshufd $0xff, %xmm7, %xmm7
  2257. movaps %xmm0, 0 * SIZE(BO)
  2258. movaps %xmm1, 4 * SIZE(BO)
  2259. movaps %xmm2, 8 * SIZE(BO)
  2260. movaps %xmm3, 12 * SIZE(BO)
  2261. movaps %xmm4, 16 * SIZE(BO)
  2262. movaps %xmm5, 20 * SIZE(BO)
  2263. movaps %xmm6, 24 * SIZE(BO)
  2264. movaps %xmm7, 28 * SIZE(BO)
  2265. addq $ 8 * SIZE, B
  2266. addq $32 * SIZE, BO
  2267. decq %rax
  2268. jne .L102
  2269. ALIGN_4
# .L103/.L104: broadcast the remaining K mod 8 elements.
  2270. .L103:
  2271. movq K, %rax
  2272. andq $7, %rax
  2273. BRANCH
  2274. jle .L110
  2275. ALIGN_4
  2276. .L104:
  2277. movss 0 * SIZE(B), %xmm3
  2278. pshufd $0x00, %xmm3, %xmm0
  2279. movaps %xmm0, 0 * SIZE(BO)
  2280. addq $ 1 * SIZE, B
  2281. addq $ 4 * SIZE, BO
  2282. decq %rax
  2283. jne .L104
  2284. ALIGN_4
# .L110: reset C/A tile pointers and start the M >> 3 loop for N=1.
  2285. .L110:
  2286. movq C, CO1 # coffset1 = c
  2287. movq A, AO # aoffset = a
  2288. movq M, I
  2289. sarq $3, I # i = (m >> 3)
  2290. jle .L120
  2291. ALIGN_4
# ---- .L111: M=8 x N=1 tile ----
# Eight rows against the single packed B column: xmm8/xmm10/xmm12/xmm14
# carry A vectors, xmm9/xmm11 carry broadcast B values, xmm0/xmm4
# accumulate the low/high 4 rows of C.
  2292. .L111:
  2293. #if !defined(TRMMKERNEL) || \
  2294. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2295. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2296. leaq BUFFER, BO
  2297. #else
  2298. leaq BUFFER, BO
  2299. movq KK, %rax
  2300. leaq (, %rax, 8), %rax
  2301. leaq (AO, %rax, 4), AO
  2302. leaq (BO, %rax, 2), BO
  2303. #endif
  2304. movaps -32 * SIZE(AO), %xmm8
  2305. movaps -16 * SIZE(AO), %xmm10
  2306. movaps 0 * SIZE(AO), %xmm12
  2307. movaps 16 * SIZE(AO), %xmm14
  2308. movaps 0 * SIZE(BO), %xmm9
  2309. movaps 16 * SIZE(BO), %xmm11
  2310. movaps 32 * SIZE(BO), %xmm13
  2311. movaps 48 * SIZE(BO), %xmm15
  2312. xorps %xmm0, %xmm0
  2313. xorps %xmm1, %xmm1
# Write-prefetch the C destination before the long compute loop.
  2314. prefetchw 4 * SIZE(CO1)
  2315. xorps %xmm4, %xmm4
  2316. xorps %xmm5, %xmm5
  2317. #ifndef TRMMKERNEL
  2318. movq K, %rax
  2319. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2320. movq K, %rax
  2321. subq KK, %rax
  2322. movq %rax, KKK
  2323. #else
  2324. movq KK, %rax
  2325. #ifdef LEFT
  2326. addq $8, %rax
  2327. #else
  2328. addq $1, %rax
  2329. #endif
  2330. movq %rax, KKK
  2331. #endif
  2332. sarq $3, %rax
  2333. je .L115
  2334. ALIGN_4
# .L112: 8 k-iterations per pass; A is prefetched in 16-element strides.
  2335. .L112:
  2336. mulps %xmm9, %xmm8
  2337. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2338. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2339. #endif
  2340. mulps -28 * SIZE(AO), %xmm9
  2341. addps %xmm8, %xmm0
  2342. movaps -24 * SIZE(AO), %xmm8
  2343. addps %xmm9, %xmm4
  2344. movaps 4 * SIZE(BO), %xmm9
  2345. mulps %xmm9, %xmm8
  2346. mulps -20 * SIZE(AO), %xmm9
  2347. addps %xmm8, %xmm0
  2348. movaps 32 * SIZE(AO), %xmm8
  2349. addps %xmm9, %xmm4
  2350. movaps 8 * SIZE(BO), %xmm9
  2351. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2352. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  2353. #endif
  2354. mulps %xmm9, %xmm10
  2355. mulps -12 * SIZE(AO), %xmm9
  2356. addps %xmm10, %xmm0
  2357. movaps -8 * SIZE(AO), %xmm10
  2358. addps %xmm9, %xmm4
  2359. movaps 12 * SIZE(BO), %xmm9
  2360. mulps %xmm9, %xmm10
  2361. mulps -4 * SIZE(AO), %xmm9
  2362. addps %xmm10, %xmm0
  2363. movaps 48 * SIZE(AO), %xmm10
  2364. addps %xmm9, %xmm4
  2365. movaps 32 * SIZE(BO), %xmm9
  2366. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2367. PREFETCH (PREFETCHSIZE + 32) * SIZE(AO)
  2368. #endif
  2369. mulps %xmm11, %xmm12
  2370. mulps 4 * SIZE(AO), %xmm11
  2371. addps %xmm12, %xmm0
  2372. movaps 8 * SIZE(AO), %xmm12
  2373. addps %xmm11, %xmm4
  2374. movaps 20 * SIZE(BO), %xmm11
  2375. mulps %xmm11, %xmm12
  2376. mulps 12 * SIZE(AO), %xmm11
  2377. addps %xmm12, %xmm0
  2378. movaps 64 * SIZE(AO), %xmm12
  2379. addps %xmm11, %xmm4
  2380. movaps 24 * SIZE(BO), %xmm11
  2381. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2382. PREFETCH (PREFETCHSIZE + 48) * SIZE(AO)
  2383. #endif
  2384. mulps %xmm11, %xmm14
  2385. mulps 20 * SIZE(AO), %xmm11
  2386. addps %xmm14, %xmm0
  2387. movaps 24 * SIZE(AO), %xmm14
  2388. addps %xmm11, %xmm4
  2389. movaps 28 * SIZE(BO), %xmm11
  2390. mulps %xmm11, %xmm14
  2391. mulps 28 * SIZE(AO), %xmm11
  2392. addps %xmm14, %xmm0
  2393. movaps 80 * SIZE(AO), %xmm14
  2394. addps %xmm11, %xmm4
  2395. movaps 48 * SIZE(BO), %xmm11
  2396. addq $64 * SIZE, AO
  2397. addq $32 * SIZE, BO
  2398. decq %rax
  2399. jne .L112
  2400. ALIGN_4
# .L115/.L116: k mod 8 remainder; ALPHA staged in xmm15.
  2401. .L115:
  2402. #ifndef TRMMKERNEL
  2403. movq K, %rax
  2404. #else
  2405. movq KKK, %rax
  2406. #endif
  2407. movaps ALPHA, %xmm15
  2408. andq $7, %rax # if (k & 1)
  2409. BRANCH
  2410. je .L118
  2411. ALIGN_4
  2412. .L116:
  2413. mulps %xmm9, %xmm8
  2414. mulps -28 * SIZE(AO), %xmm9
  2415. addps %xmm8, %xmm0
  2416. movaps -24 * SIZE(AO), %xmm8
  2417. addps %xmm9, %xmm4
  2418. movaps 4 * SIZE(BO), %xmm9
  2419. addq $8 * SIZE, AO # aoffset += 4
  2420. addq $4 * SIZE, BO # boffset1 += 8
  2421. decq %rax
  2422. jg .L116
  2423. ALIGN_4
# .L118: alpha scale and writeback of 8 floats into the single column.
  2424. .L118:
  2425. #ifndef TRMMKERNEL
  2426. movsd 0 * SIZE(CO1), %xmm8
  2427. movhps 2 * SIZE(CO1), %xmm8
  2428. movsd 4 * SIZE(CO1), %xmm9
  2429. movhps 6 * SIZE(CO1), %xmm9
  2430. #endif
  2431. mulps %xmm15, %xmm0
  2432. mulps %xmm15, %xmm4
  2433. #ifndef TRMMKERNEL
  2434. addps %xmm8, %xmm0
  2435. addps %xmm9, %xmm4
  2436. #endif
  2437. movsd %xmm0, 0 * SIZE(CO1)
  2438. movhps %xmm0, 2 * SIZE(CO1)
  2439. movsd %xmm4, 4 * SIZE(CO1)
  2440. movhps %xmm4, 6 * SIZE(CO1)
  2441. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2442. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2443. movq K, %rax
  2444. subq KKK, %rax
  2445. leaq (,%rax, 8), %rax
  2446. leaq (AO, %rax, 4), AO
  2447. leaq (BO, %rax, 2), BO
  2448. #endif
  2449. #if defined(TRMMKERNEL) && defined(LEFT)
  2450. addq $8, KK
  2451. #endif
  2452. addq $8 * SIZE, CO1 # coffset += 4
  2453. decq I # i --
  2454. jg .L111
  2455. ALIGN_4
# ---- .L120: M & 4 remainder case for N=1 ----
# Four rows against the single B column, accumulating into four
# separate registers (xmm0-xmm3) that are summed at writeback to
# shorten the dependency chains.
  2456. .L120:
  2457. testq $4, M
  2458. je .L130
  2459. #if !defined(TRMMKERNEL) || \
  2460. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2461. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2462. leaq BUFFER, BO
  2463. #else
  2464. leaq BUFFER, BO
  2465. movq KK, %rax
  2466. leaq (, %rax, 8), %rax
  2467. leaq (AO, %rax, 2), AO
  2468. leaq (BO, %rax, 2), BO
  2469. #endif
  2470. movaps -32 * SIZE(AO), %xmm8
  2471. movaps -16 * SIZE(AO), %xmm10
  2472. movaps 0 * SIZE(BO), %xmm9
  2473. movaps 16 * SIZE(BO), %xmm11
  2474. xorps %xmm0, %xmm0
  2475. xorps %xmm1, %xmm1
  2476. xorps %xmm2, %xmm2
  2477. xorps %xmm3, %xmm3
  2478. #ifndef TRMMKERNEL
  2479. movq K, %rax
  2480. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2481. movq K, %rax
  2482. subq KK, %rax
  2483. movq %rax, KKK
  2484. #else
  2485. movq KK, %rax
  2486. #ifdef LEFT
  2487. addq $4, %rax
  2488. #else
  2489. addq $1, %rax
  2490. #endif
  2491. movq %rax, KKK
  2492. #endif
  2493. sarq $3, %rax
  2494. je .L125
  2495. ALIGN_4
# .L122: 8 k-iterations per pass.
  2496. .L122:
  2497. mulps %xmm8, %xmm9
  2498. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2499. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2500. #endif
  2501. movaps -28 * SIZE(AO), %xmm8
  2502. mulps 4 * SIZE(BO), %xmm8
  2503. addps %xmm9, %xmm0
  2504. movaps 32 * SIZE(BO), %xmm9
  2505. addps %xmm8, %xmm1
  2506. movaps -24 * SIZE(AO), %xmm8
  2507. mulps 8 * SIZE(BO), %xmm8
  2508. addps %xmm8, %xmm2
  2509. movaps -20 * SIZE(AO), %xmm8
  2510. mulps 12 * SIZE(BO), %xmm8
  2511. addps %xmm8, %xmm3
  2512. movaps 0 * SIZE(AO), %xmm8
  2513. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2514. PREFETCH (PREFETCHSIZE + 16) * SIZE(AO)
  2515. #endif
  2516. mulps %xmm10, %xmm11
  2517. movaps -12 * SIZE(AO), %xmm10
  2518. mulps 20 * SIZE(BO), %xmm10
  2519. addps %xmm11, %xmm0
  2520. movaps 48 * SIZE(BO), %xmm11
  2521. addps %xmm10, %xmm1
  2522. movaps -8 * SIZE(AO), %xmm10
  2523. mulps 24 * SIZE(BO), %xmm10
  2524. addps %xmm10, %xmm2
  2525. movaps -4 * SIZE(AO), %xmm10
  2526. mulps 28 * SIZE(BO), %xmm10
  2527. addps %xmm10, %xmm3
  2528. movaps 16 * SIZE(AO), %xmm10
  2529. addq $32 * SIZE, AO
  2530. addq $32 * SIZE, BO
  2531. decq %rax
  2532. jne .L122
  2533. ALIGN_4
# .L125/.L126: k mod 8 remainder; ALPHA staged in xmm15.
  2534. .L125:
  2535. #ifndef TRMMKERNEL
  2536. movq K, %rax
  2537. #else
  2538. movq KKK, %rax
  2539. #endif
  2540. movaps ALPHA, %xmm15
  2541. andq $7, %rax # if (k & 1)
  2542. BRANCH
  2543. je .L128
  2544. ALIGN_4
  2545. .L126:
  2546. mulps %xmm8, %xmm9
  2547. movaps -28 * SIZE(AO), %xmm8
  2548. addps %xmm9, %xmm0
  2549. movaps 4 * SIZE(BO), %xmm9
  2550. addq $4 * SIZE, AO # aoffset += 4
  2551. addq $4 * SIZE, BO # boffset1 += 8
  2552. decq %rax
  2553. jg .L126
  2554. ALIGN_4
# .L128: fold the four accumulators, scale by alpha, write 4 floats.
  2555. .L128:
  2556. #ifndef TRMMKERNEL
  2557. movsd 0 * SIZE(CO1), %xmm8
  2558. movhps 2 * SIZE(CO1), %xmm8
  2559. #endif
  2560. addps %xmm1, %xmm0
  2561. addps %xmm3, %xmm2
  2562. addps %xmm2, %xmm0
  2563. mulps %xmm15, %xmm0
  2564. #ifndef TRMMKERNEL
  2565. addps %xmm8, %xmm0
  2566. #endif
  2567. movsd %xmm0, 0 * SIZE(CO1)
  2568. movhps %xmm0, 2 * SIZE(CO1)
  2569. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2570. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2571. movq K, %rax
  2572. subq KKK, %rax
  2573. leaq (,%rax, 8), %rax
  2574. leaq (AO, %rax, 2), AO
  2575. leaq (BO, %rax, 2), BO
  2576. #endif
  2577. #if defined(TRMMKERNEL) && defined(LEFT)
  2578. addq $4, KK
  2579. #endif
  2580. addq $4 * SIZE, CO1 # coffset += 4
  2581. ALIGN_4
# ---- .L130: M & 2 remainder case for N=1 ----
# Two rows against the single B column, using movsd (two-float) loads
# and two accumulators (xmm0/xmm1) folded together at writeback.
  2582. .L130:
  2583. testq $2, M
  2584. je .L140
  2585. #if !defined(TRMMKERNEL) || \
  2586. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2587. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2588. leaq BUFFER, BO
  2589. #else
  2590. leaq BUFFER, BO
  2591. movq KK, %rax
  2592. leaq (, %rax, 8), %rax
  2593. leaq (AO, %rax, 1), AO
  2594. leaq (BO, %rax, 2), BO
  2595. #endif
  2596. movaps -32 * SIZE(AO), %xmm8
  2597. movaps -24 * SIZE(AO), %xmm10
  2598. movaps 0 * SIZE(BO), %xmm9
  2599. movaps 16 * SIZE(BO), %xmm11
  2600. xorps %xmm0, %xmm0
  2601. xorps %xmm1, %xmm1
  2602. xorps %xmm2, %xmm2
  2603. xorps %xmm3, %xmm3
  2604. #ifndef TRMMKERNEL
  2605. movq K, %rax
  2606. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2607. movq K, %rax
  2608. subq KK, %rax
  2609. movq %rax, KKK
  2610. #else
  2611. movq KK, %rax
  2612. #ifdef LEFT
  2613. addq $2, %rax
  2614. #else
  2615. addq $1, %rax
  2616. #endif
  2617. movq %rax, KKK
  2618. #endif
  2619. sarq $3, %rax
  2620. je .L135
  2621. ALIGN_4
# .L132: 8 k-iterations per pass.
  2622. .L132:
  2623. mulps %xmm8, %xmm9
  2624. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2625. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2626. #endif
  2627. movsd -30 * SIZE(AO), %xmm8
  2628. addps %xmm9, %xmm0
  2629. movsd 4 * SIZE(BO), %xmm9
  2630. mulps %xmm8, %xmm9
  2631. movsd -28 * SIZE(AO), %xmm8
  2632. addps %xmm9, %xmm1
  2633. movsd 8 * SIZE(BO), %xmm9
  2634. mulps %xmm8, %xmm9
  2635. movsd -26 * SIZE(AO), %xmm8
  2636. addps %xmm9, %xmm0
  2637. movsd 12 * SIZE(BO), %xmm9
  2638. mulps %xmm8, %xmm9
  2639. movsd -16 * SIZE(AO), %xmm8
  2640. addps %xmm9, %xmm1
  2641. movsd 32 * SIZE(BO), %xmm9
  2642. mulps %xmm10, %xmm11
  2643. movsd -22 * SIZE(AO), %xmm10
  2644. addps %xmm11, %xmm0
  2645. movsd 20 * SIZE(BO), %xmm11
  2646. mulps %xmm10, %xmm11
  2647. movsd -20 * SIZE(AO), %xmm10
  2648. addps %xmm11, %xmm1
  2649. movsd 24 * SIZE(BO), %xmm11
  2650. mulps %xmm10, %xmm11
  2651. movsd -18 * SIZE(AO), %xmm10
  2652. addps %xmm11, %xmm0
  2653. movsd 28 * SIZE(BO), %xmm11
  2654. mulps %xmm10, %xmm11
  2655. movsd -8 * SIZE(AO), %xmm10
  2656. addps %xmm11, %xmm1
  2657. movsd 48 * SIZE(BO), %xmm11
  2658. addq $16 * SIZE, AO
  2659. addq $32 * SIZE, BO
  2660. decq %rax
  2661. jne .L132
  2662. ALIGN_4
# .L135/.L136: k mod 8 remainder; ALPHA staged in xmm15.
  2663. .L135:
  2664. #ifndef TRMMKERNEL
  2665. movq K, %rax
  2666. #else
  2667. movq KKK, %rax
  2668. #endif
  2669. movaps ALPHA, %xmm15
  2670. andq $7, %rax # if (k & 1)
  2671. BRANCH
  2672. je .L138
  2673. ALIGN_4
  2674. .L136:
  2675. mulps %xmm8, %xmm9
  2676. movsd -30 * SIZE(AO), %xmm8
  2677. addps %xmm9, %xmm0
  2678. movsd 4 * SIZE(BO), %xmm9
  2679. addq $2 * SIZE, AO # aoffset += 4
  2680. addq $4 * SIZE, BO # boffset1 += 8
  2681. decq %rax
  2682. jg .L136
  2683. ALIGN_4
# .L138: fold accumulators, scale by alpha, write two floats.
  2684. .L138:
  2685. addps %xmm1, %xmm0
  2686. mulps %xmm15, %xmm0
  2687. #ifndef TRMMKERNEL
  2688. movsd 0 * SIZE(CO1), %xmm8
  2689. addps %xmm8, %xmm0
  2690. #endif
  2691. movsd %xmm0, 0 * SIZE(CO1)
  2692. #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2693. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2694. movq K, %rax
  2695. subq KKK, %rax
  2696. leaq (,%rax, 8), %rax
  2697. leaq (AO, %rax, 1), AO
  2698. leaq (BO, %rax, 2), BO
  2699. #endif
  2700. #if defined(TRMMKERNEL) && defined(LEFT)
  2701. addq $2, KK
  2702. #endif
  2703. addq $2 * SIZE, CO1 # coffset += 4
  2704. ALIGN_4
# ---- .L140: M & 1 remainder case for N=1 (scalar dot product) ----
# One row against the single B column: a straight dot product using
# scalar movss/mulss with four accumulators folded at the end.
  2705. .L140:
  2706. testq $1, M
  2707. je .L999
  2708. #if !defined(TRMMKERNEL) || \
  2709. (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
  2710. (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
  2711. leaq BUFFER, BO
  2712. #else
  2713. leaq BUFFER, BO
  2714. movq KK, %rax
  2715. leaq (, %rax, 4), %rax
  2716. leaq (AO, %rax, 1), AO
  2717. leaq (BO, %rax, 4), BO
  2718. #endif
  2719. movss -32 * SIZE(AO), %xmm8
  2720. movss -28 * SIZE(AO), %xmm10
  2721. movss 0 * SIZE(BO), %xmm9
  2722. movss 16 * SIZE(BO), %xmm11
  2723. xorps %xmm0, %xmm0
  2724. xorps %xmm1, %xmm1
  2725. xorps %xmm2, %xmm2
  2726. xorps %xmm3, %xmm3
  2727. #ifndef TRMMKERNEL
  2728. movq K, %rax
  2729. #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
  2730. movq K, %rax
  2731. subq KK, %rax
  2732. movq %rax, KKK
# Both arms add 1 (1 row, 1 column) — redundant #ifdef kept as-is.
  2733. #else
  2734. movq KK, %rax
  2735. #ifdef LEFT
  2736. addq $1, %rax
  2737. #else
  2738. addq $1, %rax
  2739. #endif
  2740. movq %rax, KKK
  2741. #endif
  2742. sarq $3, %rax
  2743. je .L145
  2744. ALIGN_4
# .L142: 8 k-iterations per pass.
  2745. .L142:
  2746. mulss %xmm8, %xmm9
  2747. #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
  2748. PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
  2749. #endif
  2750. movss -31 * SIZE(AO), %xmm8
  2751. mulss 4 * SIZE(BO), %xmm8
  2752. addss %xmm9, %xmm0
  2753. movss 32 * SIZE(BO), %xmm9
  2754. addss %xmm8, %xmm1
  2755. movss -30 * SIZE(AO), %xmm8
  2756. mulss 8 * SIZE(BO), %xmm8
  2757. addss %xmm8, %xmm2
  2758. movss -29 * SIZE(AO), %xmm8
  2759. mulss 12 * SIZE(BO), %xmm8
  2760. addss %xmm8, %xmm3
  2761. movss -24 * SIZE(AO), %xmm8
  2762. mulss %xmm10, %xmm11
  2763. movss -27 * SIZE(AO), %xmm10
  2764. mulss 20 * SIZE(BO), %xmm10
  2765. addss %xmm11, %xmm0
  2766. movss 48 * SIZE(BO), %xmm11
  2767. addss %xmm10, %xmm1
  2768. movss -26 * SIZE(AO), %xmm10
  2769. mulss 24 * SIZE(BO), %xmm10
  2770. addss %xmm10, %xmm2
  2771. movss -25 * SIZE(AO), %xmm10
  2772. mulss 28 * SIZE(BO), %xmm10
  2773. addss %xmm10, %xmm3
  2774. movss -20 * SIZE(AO), %xmm10
  2775. addq $ 8 * SIZE, AO
  2776. addq $32 * SIZE, BO
  2777. decq %rax
  2778. jne .L142
  2779. ALIGN_4
# .L145/.L146: k mod 8 remainder; scalar ALPHA staged in xmm15.
  2780. .L145:
  2781. #ifndef TRMMKERNEL
  2782. movq K, %rax
  2783. #else
  2784. movq KKK, %rax
  2785. #endif
  2786. movss ALPHA, %xmm15
  2787. andq $7, %rax # if (k & 1)
  2788. BRANCH
  2789. je .L148
  2790. ALIGN_4
  2791. .L146:
  2792. mulss %xmm8, %xmm9
  2793. movss -31 * SIZE(AO), %xmm8
  2794. addss %xmm9, %xmm0
  2795. movss 4 * SIZE(BO), %xmm9
  2796. addq $1 * SIZE, AO
  2797. addq $4 * SIZE, BO
  2798. decq %rax
  2799. jg .L146
  2800. ALIGN_4
# .L148: fold the four scalar accumulators, scale, write one float.
# No TRMM pointer fixup is needed after the final element of the panel.
  2801. .L148:
  2802. addss %xmm1, %xmm0
  2803. addss %xmm3, %xmm2
  2804. addss %xmm2, %xmm0
  2805. mulss %xmm15, %xmm0
  2806. #ifndef TRMMKERNEL
  2807. movss 0 * SIZE(CO1), %xmm8
  2808. addss %xmm8, %xmm0
  2809. #endif
  2810. movss %xmm0, 0 * SIZE(CO1)
  2811. ALIGN_4
# ---- .L999: function epilogue ----
# Restore the stack pointer from %rbx (saved frame base), reload the
# System V callee-saved GPRs from the frame, and on Windows also
# restore rdi/rsi and the callee-saved xmm6-xmm15 required by the
# Microsoft x64 ABI, before releasing the frame and returning.
  2812. .L999:
  2813. movq %rbx, %rsp
  2814. movq 0(%rsp), %rbx
  2815. movq 8(%rsp), %rbp
  2816. movq 16(%rsp), %r12
  2817. movq 24(%rsp), %r13
  2818. movq 32(%rsp), %r14
  2819. movq 40(%rsp), %r15
  2820. #ifdef WINDOWS_ABI
  2821. movq 48(%rsp), %rdi
  2822. movq 56(%rsp), %rsi
  2823. movups 64(%rsp), %xmm6
  2824. movups 80(%rsp), %xmm7
  2825. movups 96(%rsp), %xmm8
  2826. movups 112(%rsp), %xmm9
  2827. movups 128(%rsp), %xmm10
  2828. movups 144(%rsp), %xmm11
  2829. movups 160(%rsp), %xmm12
  2830. movups 176(%rsp), %xmm13
  2831. movups 192(%rsp), %xmm14
  2832. movups 208(%rsp), %xmm15
  2833. #endif
  2834. addq $STACKSIZE, %rsp
  2835. ret
  2836. EPILOGUE