You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

0001-layers-Added-auto-inst-layers.patch 186 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442344334443445344634473448344934503451345234533454345534563457345834593460346134623463346434653466346734683469347034713472347334743475347634773478347934803481348234833484348534863487348834893490349134923493349434953496349734983499350035013502350335043505350635073508350935103511351235133514351535163517351835193520352135223523352435253526352735283529353035313532353335343535353635373538353935403541354235433544354535463547354835493550355135523553355435553556355735583559356035613562356335643565356635673568356935703571357235733574357535763577357835793580358135823583358435853586358735883589359035913592359335943595359635973598359936003601
  1. From 1bb2f039b0ba3f815c8a8fd6ac966a89f6ffe5a2 Mon Sep 17 00:00:00 2001
  2. From: dpankratz <pankratz@ualberta.ca>
  3. Date: Thu, 18 Feb 2021 14:45:45 -0700
  4. Subject: [PATCH] layers: Added auto-inst layers
  5. ---
  6. CMakeLists.txt | 24 +-
  7. build-android/jni/Android.mk | 7 +
  8. docs/auto_instrument.md | 213 +++
  9. layers/CMakeLists.txt | 19 +
  10. layers/auto_inst.cpp | 1205 +++++++++++++++++
  11. layers/auto_inst.h | 465 +++++++
  12. .../auto_inst_divergence_characterization.cpp | 157 +++
  13. .../auto_inst_divergence_characterization.h | 48 +
  14. layers/auto_inst_dyn_shader_trace.cpp | 177 +++
  15. layers/auto_inst_dyn_shader_trace.h | 44 +
  16. layers/auto_inst_dyn_trace_ray_trace.cpp | 223 +++
  17. layers/auto_inst_dyn_trace_ray_trace.h | 55 +
  18. layers/auto_inst_execution_trace.cpp | 174 +++
  19. layers/auto_inst_execution_trace.h | 56 +
  20. layers/auto_inst_simt_efficiency.cpp | 67 +
  21. layers/auto_inst_simt_efficiency.h | 56 +
  22. layers/auto_inst_warp_entry_and_exit.cpp | 61 +
  23. layers/auto_inst_warp_entry_and_exit.h | 52 +
  24. layers/debug_printf.cpp | 2 +-
  25. layers/debug_printf.h | 1 +
  26. layers/generated/chassis.cpp | 53 +-
  27. layers/generated/chassis.h | 13 +
  28. layers/gpu_utils.h | 6 +
  29. layers/gpu_validation.h | 1 +
  30. layers/layer_options.cpp | 13 +
  31. layers/layer_options.h | 14 +-
  32. scripts/known_good.json | 17 +-
  33. 27 files changed, 3215 insertions(+), 8 deletions(-)
  34. create mode 100644 docs/auto_instrument.md
  35. create mode 100644 layers/auto_inst.cpp
  36. create mode 100644 layers/auto_inst.h
  37. create mode 100644 layers/auto_inst_divergence_characterization.cpp
  38. create mode 100644 layers/auto_inst_divergence_characterization.h
  39. create mode 100644 layers/auto_inst_dyn_shader_trace.cpp
  40. create mode 100644 layers/auto_inst_dyn_shader_trace.h
  41. create mode 100644 layers/auto_inst_dyn_trace_ray_trace.cpp
  42. create mode 100644 layers/auto_inst_dyn_trace_ray_trace.h
  43. create mode 100644 layers/auto_inst_execution_trace.cpp
  44. create mode 100644 layers/auto_inst_execution_trace.h
  45. create mode 100644 layers/auto_inst_simt_efficiency.cpp
  46. create mode 100644 layers/auto_inst_simt_efficiency.h
  47. create mode 100644 layers/auto_inst_warp_entry_and_exit.cpp
  48. create mode 100644 layers/auto_inst_warp_entry_and_exit.h
  49. diff --git a/CMakeLists.txt b/CMakeLists.txt
  50. index 994a9ef0..e72924fe 100644
  51. --- a/CMakeLists.txt
  52. +++ b/CMakeLists.txt
  53. @@ -227,6 +227,8 @@ if(BUILD_TESTS OR BUILD_LAYERS)
  54. if (NOT TARGET SPIRV-Tools)
  55. if(NOT SPIRV_TOOLS_INSTALL_DIR)
  56. set(SPIRV_TOOLS_INSTALL_DIR "${GLSLANG_INSTALL_DIR}")
  57. + else()
  58. + message(STATUS "Using Spirv-Tools install located at ${SPIRV_TOOLS_INSTALL_DIR}")
  59. endif()
  60. set(SPIRV_TOOLS_BINARY_ROOT "${SPIRV_TOOLS_INSTALL_DIR}/lib"
  61. @@ -242,7 +244,7 @@ if(BUILD_TESTS OR BUILD_LAYERS)
  62. find_library(SPIRV_TOOLS_LIB NAMES SPIRV-Tools HINTS ${SPIRV_TOOLS_SEARCH_PATH})
  63. find_library(SPIRV_TOOLS_OPT_LIB NAMES SPIRV-Tools-opt HINTS ${SPIRV_TOOLS_OPT_SEARCH_PATH})
  64. - if(WIN32)
  65. + if(WIN32 AND NOT SPIRV_TOOLS_LIB)
  66. add_library(SPIRV-Tools-opt STATIC IMPORTED)
  67. add_library(SPIRV-Tools STATIC IMPORTED)
  68. @@ -269,6 +271,26 @@ if(BUILD_TESTS OR BUILD_LAYERS)
  69. set(SPIRV_TOOLS_INCLUDE_DIR "${spirv-tools_SOURCE_DIR}/include" CACHE PATH "Path to spirv tools headers")
  70. endif()
  71. + if (NOT TARGET SPIRV-Cross)
  72. + if (SPIRV_CROSS_INSTALL_DIR)
  73. + message(STATUS "Using spirv-cross install located at ${SPIRV_CROSS_INSTALL_DIR}")
  74. + endif()
  75. + set(SPIRV_CROSS_INCLUDE_DIR "${SPIRV_CROSS_INSTALL_DIR}/include" CACHE PATH "Path to spirv cross headers")
  76. + set(SPIRV_CROSS_SEARCH_PATH ${SPIRV_CROSS_INSTALL_DIR}/lib)
  77. + find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsl HINTS ${SPIRV_CROSS_SEARCH_PATH})
  78. + find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-core HINTS ${SPIRV_CROSS_SEARCH_PATH})
  79. + if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB)
  80. + find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsld HINTS ${SPIRV_CROSS_SEARCH_PATH})
  81. + find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-cored HINTS ${SPIRV_CROSS_SEARCH_PATH})
  82. + if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB)
  83. + message(FATAL_ERROR "Could not find spirv-cross libs!")
  84. + else()
  85. + message("WARNING: using debug config of SPIRV-Cross libs. Use <--config release> option of update_deps.py to fix.")
  86. + endif()
  87. + endif()
  88. + set(SPIRV_CROSS_LIBRARIES ${SPIRV_CROSS_GLSL_LIB} ${SPIRV_CROSS_CORE_LIB})
  89. + endif()
  90. +
  91. set(GLSLANG_LIBRARIES ${GLSLANG_LIBRARIES} ${SPIRV_TOOLS_LIBRARIES})
  92. endif()
  93. diff --git a/build-android/jni/Android.mk b/build-android/jni/Android.mk
  94. index 871bdf30..e92895b9 100644
  95. --- a/build-android/jni/Android.mk
  96. +++ b/build-android/jni/Android.mk
  97. @@ -44,6 +44,13 @@ LOCAL_SRC_FILES += $(SRC_DIR)/layers/shader_validation.cpp
  98. LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_validation.cpp
  99. LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_utils.cpp
  100. LOCAL_SRC_FILES += $(SRC_DIR)/layers/debug_printf.cpp
  101. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst.cpp
  102. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_shader_trace.cpp
  103. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_trace_ray_trace.cpp
  104. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_execution_trace.cpp
  105. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_simt_efficiency.cpp
  106. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_divergence_characterization.cpp
  107. +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_warp_entry_and_exit.cpp
  108. LOCAL_SRC_FILES += $(SRC_DIR)/layers/best_practices_utils.cpp
  109. LOCAL_SRC_FILES += $(SRC_DIR)/layers/generated/best_practices.cpp
  110. LOCAL_SRC_FILES += $(SRC_DIR)/layers/synchronization_validation.cpp
  111. diff --git a/docs/auto_instrument.md b/docs/auto_instrument.md
  112. new file mode 100644
  113. index 00000000..30d376ce
  114. --- /dev/null
  115. +++ b/docs/auto_instrument.md
  116. @@ -0,0 +1,213 @@
  117. +<!-- markdownlint-disable MD041 -->
  118. +
  119. +[![Khronos Vulkan][1]][2]
  120. +
  121. +[1]: https://vulkan.lunarg.com/img/Vulkan_100px_Dec16.png "https://www.khronos.org/vulkan/"
  122. +[2]: https://www.khronos.org/vulkan/
  123. +
  124. +# Auto-Instrument
  125. +
  126. +[![Creative Commons][3]][4]
  127. +
  128. +[3]: https://i.creativecommons.org/l/by-nd/4.0/88x31.png "Creative Commons License"
  129. +[4]: https://creativecommons.org/licenses/by-nd/4.0/
  130. +
  131. +Auto-Instrument is implemented in the SPIR-V Tools optimizer and the `VK_LAYER_KHRONOS_validation` layer.
  132. +It provides boilerplate for developers to implement custom instrumentation and analyses.
  133. +This document covers the operation of the layer portion of the implementation and subsequently the specific sublayers that perform auto-instrumentation.
  134. +
  135. +## Limitations
  136. +
  137. +Auto-Instrument shares the same limitation as Debug Printf and GPU-assisted validation, an additional bound descriptor set. Currently, Auto-Instrument only allows 1 class of pipeline to be instrumented at once (i.e. 1 of graphics, compute, ray-tracing).
  138. +
  139. +## Basic Operation
  140. +
  141. +The basic operation of Auto-Instrument is to offer the following hooks for subclasses to perform a custom analysis:
  142. +* **InitializeDeviceLayerSettings** provides the opportunity to set layer settings when the Vulkan logical device is created. This is useful to check for active extensions or to check `vk_layer_settings.txt` for sublayer specific settings.
  143. +* **InitializeInstrumentationBuffer** provides the opportunity for subclasses to change the default values in the instrumentation buffer. This is useful for communicating with the instrumentation code. For example, the instrumentation could support a sampling based approach where not all frames collect results. The instrumentation could contain a check for a specific location in the instrumentation buffer and this function would allow the subclass to populate that location.
  144. +* **RegisterPasses** allows the specific subclass to choose which auto-instrumentation pass from SPIRV-Opt to use.
  145. +* **AnalyzeRayTracing** is provided with the data collection from a ray-tracing pipeline invocation as well as the width, height, and depth of the invocation.
  146. +* **AnalyzeGraphics** receives the data from a graphics pipeline invocation.
  147. +* **AnalyzeCompute** receives the data from a compute pipeline invocation and the x, y, and z of the invocation.
  148. +
  149. +By overriding these functions, a subclass is able to implement custom analyses of instrumentation data.
  150. +
  151. +## Enabling Auto-Instrument in Vulkan-ValidationLayers
  152. +
  153. +Auto-Instrument is an object in the KHRONOS_validation layer, so the VK_LAYER_KHRONOS_validation layer must be loaded.
  154. +See the LAYER_CONFIGURATION document for information on enabling the VK_LAYER_KHRONOS_validation layer.
  155. +Validation itself is not necessary for Auto-Instrument and can be disabled without affecting Auto-Instrument functionality.
  156. +
  157. +Auto-Instrument can be enabled through *vk_layer_settings.txt* file that must be in the program's working directory.
  158. +Within the settings file, specify:
  159. +khronos_validation.enables = `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_<specific analysis>_EXT` where `<specific analysis>` is one of the auto_inst subclasses.
  160. +
  161. +Auto-Instrument has been implemented as a state tracker validation object, as a peer to GPU Assisted Validation and Debug Printf.
  162. +Because of this, and coupled with the philosophy that validation objects will not communicate with each other, one should never enable any pair of Auto-Instrument, GPU Assisted Validation and Debug Printf at the same time.
  163. +Auto-Instrument will be disabled if GPU Assisted Validation or Debug Printf is enabled.
  164. +
  165. +When using Auto-Instrument, it is recommended to disable validation, as the debug level of INFO or DEBUG causes the validation layers to produce many messages unrelated to Auto-Instrument, making it difficult to find the desired output.
  166. +
  167. +### Auto-Instrument Requirements
  168. +
  169. +* Validation Layers version: 1.2.135.0
  170. +* Vulkan API version 1.1 or greater
  171. +* VkPhysicalDevice features: fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics
  172. +
  173. +### Auto-Instrument Settings
  174. +
  175. +* `khronos_validation.auto_inst_buffer_size` = `<size in bytes>`
  176. +
  177. +This setting allows you to specify the size of the per-call buffer, in bytes of device memory, for returning instrumentation data.
  178. +The default is 1024 bytes. If the buffer size is too small, Auto-Instrument will report the size that the buffer should be to collect all the instrumentation data. In subsequent executions, the history file will be used to size the instrumentation buffer.
  179. +
  180. +* `khronos_validation.auto_inst_pipeline_to_instrument` = `Graphics` or `Compute` or `RayTracing`
  181. +
  182. +This setting controls the pipeline type that is instrumented. For example, if `Compute` is chosen then compute shaders are instrumented and instrumentation buffers are created for all VkComputePipelines.
  183. +
  184. +* `khronos_validation.auto_inst_to_stdout` = 'false' or 'true'
  185. +
  186. +By default, Auto-Instrument messages are sent to the stdout, but this setting will instead send Auto-Instrument to the debug callback.
  187. +
  188. +* `khronos_validation.auto_inst_base_file_name` = `<base name>`
  189. +
  190. +Auto-Instrument analysis file names can optionally have a base file name prepended. By default there is no common prefix.
  191. +
  192. +* `khronos_validation.auto_inst_create_reference_heatmap` = `false` or `true`
  193. +
  194. +Many of the analyses emit a heatmap. This option allows a reference scale **ReferenceScale\[.bmp\.ppm\]** to be generated where the leftmost pixels correspond to 0.0 and rightmost to 1.0. The pixels in between are interpolated between 0.0 and 1.0.
  195. +
  196. +* `khronos_validation.auto_inst_debug_mode` = `atomics` or `subgroup` or `arraylength`
  197. +
  198. +This is a debug setting designed to help isolate any issues that may be occurring with auto-instrumentation. When present, this setting switches the operation of Auto-Instrument to disregard the current instrumentation mode. Instead, it runs an extremely simple instrumentation pass and analysis to check whether the atomic, subgroup, or arraylength instructions work correctly in isolation.
  199. +
  200. +* `khronos_validation.auto_inst_dump_shaders` = `false` or `true`
  201. +
  202. +When set to true, this setting instructs Auto-Instrument to dump the instrumented shader modules.
  203. +
  204. +* `khronos_validation.auto_inst_shaders_to_instrument` = `stageM, stageN, ...` E.g. `Miss1, ClosestHit2`
  205. +
  206. +By default, all shaders that correspond to the `khronos_validation.auto_inst_pipeline_to_instrument` setting are instrumented. When this setting is activated, only the shaders that match the specified stage and index are instrumented. For example, if `Miss1, ClosestHit2` is passed, then the 1st Miss shader passed via `vkCreateShaderModule` will be instrumented, and similarly the 2nd Closest Hit shader.
  207. +
  208. +### Auto-Instrument Resources
  209. +
  210. +Analogous to GPU Assisted Validation and Debug Printf, Auto-Instrument uses device memory and a descriptor set to allow the shader instrumentation code to return values to the layer.
  211. +See the gpu_validation document for more information.
  212. +
  213. +Auto-Instrument also generates a file containing the runtime instrumentation buffer utilization of previous executions of the application. In subsequent executions, the instrumentation buffers are sized according to this history. This allows many more calls to be instrumented in cases where some calls generate significantly more data.
  214. +
  215. +Auto-Instrument analyses generate output files of the form `<base file name><pipeline type><pipeline invocation index>_frame<frame number>_<analysis specific suffix>`. For example `rt0_frame0_simt_efficiency.csv` is the SIMT Efficiency measurement for the 1st ray-tracing call in the 1st frame of the application.
  216. +
  217. +### Auto-Instrument Subclasses
  218. +
  219. +This section outlines the classes that implement the Auto-Instrument interface to create detailed execution trace profiling.
  220. +
  221. +### Limitation
  222. +
  223. +Many of the analyses require tracking warp execution over time. Obtaining knowledge of which threads belong to a warp requires digging past the abstraction level of SPIR-V and a custom solution for graphics, compute and ray-tracing pipelines. Currently this is only implemented for ray-tracing pipelines so many of the analyses are limited for graphics and compute.
  224. +
  225. +Some applications invoke the RayTracing pipeline with a z-dimension size of >1 which complicates the creation of heatmaps. In future more modes for transforming these higher dimensional calls into heatmaps will be added and exposed as options.
  226. +
  227. +## Divergence Characterization
  228. +
  229. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT` and is currently supported only for Ray Tracing.
  230. +
  231. +### Analysis
  232. +
  233. +This analysis breaks down the effect of indirect-function calls, early thread-exits and control-flow on divergence in terms of number of instructions affected:
  234. +* **Indirect-function call** divergence is caused by thread-varying values for the address passed to an indirect function call. This occurs frequently in ray-tracing when the threads in a warp hit multiple objects. This metric is currently only supported for ray-tracing.
  235. +* **Early thread-exits** divergence occurs when some threads in a warp complete the pipeline while other threads still have work to perform. This also occurs frequently in ray-tracing when some rays miss geometry and others bounce many times, requiring many traversals and intersections.
  236. +* **Control-flow** divergence is caused by, for example, thread-varying values for if statements or thread-varying loop trip counts.
  237. +
  238. +### Output
  239. +
  240. +This analysis outputs a file **divergence_characterization.csv** which contains the respective counts of inactive instruction execution slots caused by the different divergence causes for each pipeline invocation.
  241. +
  242. +## Dynamic Shader Trace
  243. +
  244. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT` and is currently supported only for Ray Tracing.
  245. +
  246. +### Analysis
  247. +
  248. +This analysis reconstructs the number of dynamic shader executions for each pipeline invocation. These values are visualized with thread and warp heatmaps.
  249. +
  250. +### Output
  251. +
  252. +* **dyn_shader_counts.csv** is generated per pipeline invocation and contains the shader execution counts observed at runtime.
  253. +* **shader_execution_heatmap\[.bmp\.ppm\]** is generated per pipeline invocation and contains a heatmap created for each pipeline invocation visualizing each pixel's dynamic shader execution count normalized to the maximum shader execution count.
  254. +* **subgroup_shader_execution_heatmap\[.bmp\.ppm\]** is generated per pipeline invocation and contains a heatmap created for each pipeline invocation visualizing each subgroup's dynamic shader execution count normalized to the maximum shader execution count for a single subgroup.
  255. +
  256. +## Dynamic TraceRay Trace
  257. +
  258. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT` and is currently supported only for Ray Tracing.
  259. +
  260. +### Analysis
  261. +
  262. +This analysis simulates the effect of thread compaction on the execution of a ray-tracing pipeline. Given a particular runtime traceRay invocation, the active threads are repacked into warps and then the number of warp executions required to perform the new traceRay calls is calculated. The analysis further simulates only repacking consecutive `2^k` warps to simulate different hardware buffer sizes.
  263. +
  264. +### Output
  265. +
  266. +* **thread_paths.csv** is generated per pipeline and contains bitmasks representing whether a given thread was active for a runtime invocation of a TraceRay call. For example, if a TraceRay call is contained in an if statement, some threads would have 0 to indicate they skipped the call. For each unique bitmask, the number of threads that took the same path is totalled.
  267. +* **thread_compaction.csv** is generated per pipeline and contains the results of performing thread compaction. The data is output as follows:
  268. + ```
  269. + for each TraceRay callsite:
  270. + for each compaction window size:
  271. + for each runtime execution:
  272. + output active threads/threads required after compaction
  273. + ```
  274. +
  275. + For example:
  276. + ```
  277. + 354|
  278. + 1 0 896173/911296
  279. + ```
  280. + For callsite with id=354, compaction window size=1, visit count = 0, the number of active threads is 896173 and the number of required threads after compaction is 911296.
  281. +
  282. +## Execution Trace
  283. +
  284. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT`.
  285. +
  286. +### Analysis
  287. +
  288. +This analysis determines the pipeline hotspots and the dynamic SPIR-V instruction execution count. This is performed by tracking each runtime execution of a basic block along with the number of active threads.
  289. +
  290. +### Output
  291. +
  292. +* **dyn_opcode_counts.csv** is generated per pipeline invocation and contains the respective runtime execution counts for each SPIR-V opcode.
  293. +* **shader_stage_dyn_executions.glsl** is generated per pipeline invocation for each shader in the pipeline. These files present the dynamic instruction execution count of each basic block as inline comments as follows:
  294. + ```
  295. + if (gl_LaunchIDNV.z != 0u)
  296. + {
  297. + /*thread_executions=460800. SIMT Efficiency=1.000*/
  298. + _1509(2416u, subgroupBallot(true).x);
  299. + ipos.x += (_265.global_ubo.width / 2);
  300. + }
  301. + /*thread_executions=921600. SIMT Efficiency=1.000*/
  302. + _1509(2426u, subgroupBallot(true).x);
  303. + ```
  304. + The function `_1509` is the instrumentation inserted to capture the execution trace. The comment that immediately precedes it corresponds to the data collected from that instrumentation callsite. The first argument to `_1509` (in this example, `2416` and `2426`) is the unique id of the basic block.
  305. +* **hotspots.csv** is generated per pipeline invocation and contains the dynamic execution count of each instruction id in the pipeline. The first argument passed to the instrumentation in the annotated shaders is the instruction id. This can be searched for in the hotspots file or vice-versa.
  306. +
  307. +## SIMT Efficiency
  308. +
  309. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT`.
  310. +
  311. +### Analysis
  312. +
  313. +This analysis computes the SIMT efficiency which is a measure of the utilization of a SIMD architecture. In this case it is calculated as the average fraction of active threads for each basic block execution.
  314. +
  315. +### Output
  316. +
  317. +* **simt_efficiency.csv** is generated per frame with the respective SIMT efficiencies of each pipeline invocation.
  318. +
  319. +## Warp Entry and Exit
  320. +
  321. +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT`.
  322. +
  323. +### Analysis
  324. +
  325. +This analysis counts the number of times the entry and exit of the ray-tracing pipeline are executed. This is designed to measure the effect of independent thread scheduling (ITS) on NVIDIA's Turing and Ampere architectures. This instrumentation measures to what degree the warp has diverged due to ITS.
  326. +
  327. +### Output
  328. +
  329. +* **warp_exits_vs_entires.csv** is generated per frame with the respective exits count/entry count for each pipeline invocation.
  330. diff --git a/layers/CMakeLists.txt b/layers/CMakeLists.txt
  331. index 7b1ba729..76b4b520 100644
  332. --- a/layers/CMakeLists.txt
  333. +++ b/layers/CMakeLists.txt
  334. @@ -232,6 +232,22 @@ set(GPU_UTILITY_LIBRARY_FILES
  335. gpu_utils.cpp
  336. gpu_utils.h)
  337. +set(AUTO_INST_LIBRARY_FILES
  338. + auto_inst.cpp
  339. + auto_inst_divergence_characterization.cpp
  340. + auto_inst_dyn_shader_trace.cpp
  341. + auto_inst_dyn_trace_ray_trace.cpp
  342. + auto_inst_execution_trace.cpp
  343. + auto_inst_simt_efficiency.cpp
  344. + auto_inst_warp_entry_and_exit.cpp
  345. + auto_inst.h
  346. + auto_inst_divergence_characterization.h
  347. + auto_inst_dyn_shader_trace.h
  348. + auto_inst_dyn_trace_ray_trace.h
  349. + auto_inst_execution_trace.h
  350. + auto_inst_simt_efficiency.h
  351. + auto_inst_warp_entry_and_exit.h)
  352. +
  353. set(SYNC_VALIDATION_LIBRARY_FILES
  354. synchronization_validation.cpp
  355. synchronization_validation.h)
  356. @@ -247,13 +263,16 @@ if(BUILD_LAYERS)
  357. ${GPU_UTILITY_LIBRARY_FILES}
  358. ${GPU_ASSISTED_LIBRARY_FILES}
  359. ${DEBUG_PRINTF_LIBRARY_FILES}
  360. + ${AUTO_INST_LIBRARY_FILES}
  361. ${SYNC_VALIDATION_LIBRARY_FILES})
  362. # Khronos validation additional dependencies
  363. target_include_directories(VkLayer_khronos_validation PRIVATE ${GLSLANG_SPIRV_INCLUDE_DIR})
  364. target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_INCLUDE_DIR})
  365. target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_HEADERS_INCLUDE_DIR})
  366. + target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_INCLUDE_DIR})
  367. target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_LIBRARIES})
  368. + target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_LIBRARIES})
  369. # Force generation of the PDB file for Release builds.
  370. # Note that CMake reduces optimization levels for RelWithDebInfo builds.
  371. diff --git a/layers/auto_inst.cpp b/layers/auto_inst.cpp
  372. new file mode 100644
  373. index 00000000..8f3669d3
  374. --- /dev/null
  375. +++ b/layers/auto_inst.cpp
  376. @@ -0,0 +1,1205 @@
  377. +/* Copyright (c) 2020 The Khronos Group Inc.
  378. + *
  379. + * Licensed under the Apache License, Version 2.0 (the "License");
  380. + * you may not use this file except in compliance with the License.
  381. + * You may obtain a copy of the License at
  382. + *
  383. + * http://www.apache.org/licenses/LICENSE-2.0
  384. + *
  385. + * Unless required by applicable law or agreed to in writing, software
  386. + * distributed under the License is distributed on an "AS IS" BASIS,
  387. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  388. + * See the License for the specific language governing permissions and
  389. + * limitations under the License.
  390. + *
  391. + * Author: David Pankratz <pankratz@ualberta.ca>
  392. + */
  393. +
  394. +#include "auto_inst.h"
  395. +#include "spirv-tools/optimizer.hpp"
  396. +#include "spirv-tools/instrument.hpp"
  397. +#if !defined(__ANDROID__)
  398. +#include "spirv_cross/spirv_glsl.hpp"
  399. +#endif
  400. +#include <iostream>
  401. +#include <fstream>
  402. +#include "layer_chassis_dispatch.h"
  403. +#include <regex>
  404. +#include <iostream>
  405. +#include <bitset>
  406. +
  407. +static const VkShaderStageFlags kShaderStageAllRayTracing =
  408. + VK_SHADER_STAGE_ANY_HIT_BIT_NV | VK_SHADER_STAGE_CALLABLE_BIT_NV | VK_SHADER_STAGE_CLOSEST_HIT_BIT_NV |
  409. + VK_SHADER_STAGE_INTERSECTION_BIT_NV | VK_SHADER_STAGE_MISS_BIT_NV | VK_SHADER_STAGE_RAYGEN_BIT_NV;
  410. +
  411. +static const VkShaderStageFlags kShaderStageAllGraphics =
  412. + VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
  413. + VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT | VK_SHADER_STAGE_GEOMETRY_BIT;
  414. +
  415. +static const VkShaderStageFlags kShaderStageAllCompute = VK_SHADER_STAGE_COMPUTE_BIT;
  416. +
  417. +// String literal was determined by VkShaderStageFlags spelling. I.e. VK_SHADER_STAGE_RAYGEN_BIT_KHR => RayGen
  418. +static const std::unordered_map<std::string, uint32_t> ShaderStageFlagLookup = {
  419. + {"RayGen", VK_SHADER_STAGE_RAYGEN_BIT_KHR},
  420. + {"ClosestHit", VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR},
  421. + {"Callable", VK_SHADER_STAGE_CALLABLE_BIT_KHR},
  422. + {"Miss", VK_SHADER_STAGE_MISS_BIT_KHR},
  423. + {"AnyHit", VK_SHADER_STAGE_ANY_HIT_BIT_KHR},
  424. + {"Intersection", VK_SHADER_STAGE_INTERSECTION_BIT_KHR},
  425. + {"Geometry", VK_SHADER_STAGE_GEOMETRY_BIT},
  426. + {"Fragment", VK_SHADER_STAGE_FRAGMENT_BIT},
  427. + {"Compute", VK_SHADER_STAGE_COMPUTE_BIT},
  428. + {"TessellationControl", VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT},
  429. + {"TessellationEvaluation", VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT},
  430. + {"Vertex", VK_SHADER_STAGE_VERTEX_BIT}};
  431. +
  432. +// Convenience function for reporting problems.
  433. +template <typename T>
  434. +void AutoInst::ReportSetupProblem(T object, std::string specific_message) const {
  435. + if (use_stdout)
  436. + std::cerr << specific_message;
  437. + else
  438. + LogError(object, "UNASSIGNED-AUTO-INST ", "Detail: (%s)", specific_message.c_str());
  439. +}
  440. +
  441. +template <typename T>
  442. +void AutoInst::ReportInfo(T object, std::string specific_message) const {
  443. + if (use_stdout)
  444. + std::cout << specific_message;
  445. + else
  446. + LogInfo(object, "UNASSIGNED-AUTO-inst", "%s", specific_message.c_str());
  447. +}
  448. +
  449. +void AutoInst::CreateImage(uint32_t width, uint32_t height, std::vector<char> &colors, std::string file_name) const {
  450. +#if defined(_WIN32)
  451. + std::ofstream ofs;
  452. + ofs.open(file_name + ".bmp", std::ios_base::binary);
  453. +
  454. + ReportInfo(device, "Creating BMP with dim=" + std::to_string(width) + "x" + std::to_string(height) + " from " +
  455. + std::to_string(colors.size()) + "\n");
  456. +
  457. + const int BYTES_PER_PIXEL = 3;
  458. +
  459. + BITMAPFILEHEADER tWBFH;
  460. + tWBFH.bfType = 0x4d42;
  461. + tWBFH.bfSize = 14 + 40 + (width * height * BYTES_PER_PIXEL);
  462. + tWBFH.bfReserved1 = 0;
  463. + tWBFH.bfReserved2 = 0;
  464. + tWBFH.bfOffBits = 14 + 40;
  465. +
  466. + BITMAPINFOHEADER tW2BH;
  467. + tW2BH.biSize = 40;
  468. + tW2BH.biWidth = width;
  469. + tW2BH.biHeight = height;
  470. + tW2BH.biPlanes = 1;
  471. + tW2BH.biBitCount = BYTES_PER_PIXEL * 8;
  472. + tW2BH.biCompression = 0;
  473. +
  474. + ofs.write((char *)(&tWBFH), 14);
  475. + ofs.write((char *)(&tW2BH), 40);
  476. +
  477. + for (int y = height - 1; y >= 0; y--) {
  478. + uint32_t x = 0;
  479. + for (x = 0; x < width; x++) {
  480. + auto thread_id = BYTES_PER_PIXEL * (y * width + x);
  481. + ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2];
  482. + }
  483. + while (x % 4 != 0) {
  484. + ofs << (char)0;
  485. + x++;
  486. + }
  487. + }
  488. +
  489. + ofs.close();
  490. +#else
  491. +
  492. + const int BYTES_PER_PIXEL = 3;
  493. + std::ofstream ofs(file_name + ".ppm", std::ios_base::out | std::ios_base::binary);
  494. + ofs << "P6" << std::endl << width << ' ' << height << std::endl << "255" << std::endl;
  495. +
  496. + for (uint32_t j = 0; j < height; j++) {
  497. + for (auto i = 0u; i < width; i++) {
  498. + auto thread_id = BYTES_PER_PIXEL * (j * width + i);
  499. + ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2];
  500. + }
  501. + }
  502. +
  503. + ofs.close();
  504. +#endif
  505. +}
  506. +
  507. +std::tuple<char, char, char> AutoInst::UnitIntervalToRGB(float val) const {
  508. + if (val < 0 || val > 1) {
  509. + ReportSetupProblem(device, "Cannot convert a value outside of interval [0,1] to heatmap colour!");
  510. + }
  511. +
  512. + float red, green, blue;
  513. +
  514. + if (val < 0.2)
  515. + red = 1.0f - 5.0f * val;
  516. + else if (val >= 0.2 && val < 0.6)
  517. + red = 0;
  518. + else if (val >= 0.6 && val < 0.8)
  519. + red = 5.0f * (val - 0.6f);
  520. + else
  521. + red = 1.0f;
  522. +
  523. + if (val < 0.4)
  524. + green = 1;
  525. + else if (val >= 0.4 && val < 0.6)
  526. + green = 5.0f * (0.6f - val);
  527. + else
  528. + green = 0;
  529. +
  530. + if (val < 0.2)
  531. + blue = 0;
  532. + else if (val >= 0.2 && val < 0.4)
  533. + blue = 5.0f * (val - 0.2f);
  534. + else if (val >= 0.4 && val < 0.8)
  535. + blue = 1;
  536. + else
  537. + blue = 5.0f * (1.0f - val);
  538. +
  539. + return std::make_tuple((char)(red * 255), (char)(green * 255), (char)(blue * 255));
  540. +}
  541. +
  542. +bool AutoInst::CreateUniqueSubgroupIdMappings(uint32_t *const debug_output_buffer,
  543. + PrimitiveIdToPrimitiveSizeMap &primitive_id2primitive_size,
  544. + ThreadIdToSubgroupIdMap &thread_id2subgroup_id_map, ThreadIdSwizzleMap &thread_id_swizzle_map,
  545. + std::function<uint32_t(uint32_t inst_id)> inst_id2prim_id) const {
  546. + // Sanity check for unique subgroup primitive size
  547. + if (primitive_id2primitive_size.count(spvtools::kAutoInstUniqueSubgroupId) == 0) return false;
  548. +
  549. + uint32_t j = 0;
  550. + uint32_t num_words_written = debug_output_buffer[WORDS_WRITTEN_INDEX];
  551. + while (j < num_words_written) {
  552. + auto inst_id = debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
  553. + auto prim_id = inst_id2prim_id(inst_id);
  554. + if (primitive_id2primitive_size.count(prim_id) == 0) {
  555. + ReportSetupProblem(device,
  556. + "Unknown prim_id=" + std::to_string(prim_id) + " encountered in CreateUniqueSubgroupIdMappings\n.");
  557. + return false;
  558. + }
  559. +
  560. + if (prim_id == spvtools::kAutoInstUniqueSubgroupId) {
  561. + auto unique_id_record = reinterpret_cast<AIUniqueSubgroupIdEntry *>(&debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
  562. + auto subgroup_id = unique_id_record->SubgroupId();
  563. + auto flat_thread_id = unique_id_record->flat_thread_id;
  564. + thread_id2subgroup_id_map[flat_thread_id] = subgroup_id;
  565. + thread_id_swizzle_map[subgroup_id * SUBGROUP_SIZE + unique_id_record->IntraSubgroupId()] = flat_thread_id;
  566. + }
  567. + j += primitive_id2primitive_size[prim_id];
  568. + }
  569. +
  570. + return true;
  571. +}
  572. +
  573. +void AutoInst::TryReadRuntimeSizeCache(AutoInst *device_auto_inst) {
  574. + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) {
  575. + ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. Aborting\n");
  576. + aborted = true;
  577. + }
  578. +
  579. + std::ifstream cache_file;
  580. + cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary);
  581. + if (!cache_file) {
  582. + ReportInfo(device, "Runtime instrumentation buffer requirements cache not found. Defaulting to output_buffer_size.\n");
  583. + return;
  584. + }
  585. +
  586. + size_t num_pipeline_invocations = 0;
  587. + cache_file.read((char *)&num_pipeline_invocations, sizeof(size_t));
  588. + while (!cache_file.eof() && device_auto_inst->BufferSizeRequirementsLookup.size() < num_pipeline_invocations) {
  589. + uint32_t buffer_size = 0;
  590. + cache_file.read((char *)&buffer_size, sizeof(uint32_t));
  591. + device_auto_inst->BufferSizeRequirementsLookup.push_back(buffer_size);
  592. + }
  593. +
  594. + if (num_pipeline_invocations != device_auto_inst->BufferSizeRequirementsLookup.size()) {
  595. + ReportSetupProblem(device, "Warning incomplete cache file detected.\n");
  596. + }
  597. +
  598. + cache_file.close();
  599. +}
  600. +
  601. +void AutoInst::WriteRuntimeSizeCache() {
  602. + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) {
  603. + ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. Aborting\n");
  604. + aborted = true;
  605. + }
  606. +
  607. + std::ofstream cache_file;
  608. + cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary);
  609. + auto pipelines_observed = BufferSizeRequirementsLookup.size();
  610. + cache_file.write((char *)&pipelines_observed, sizeof(size_t));
  611. + for (auto size : BufferSizeRequirementsLookup) {
  612. + cache_file.write((char *)&size, 4);
  613. + }
  614. + cache_file.close();
  615. +}
  616. +
  617. +uint32_t AutoInst::FindShaderStage(std::vector<unsigned int> pgm) const {
  618. + uint32_t stage_flag = 0;
  619. + SHADER_MODULE_STATE shader;
  620. + shader.words = pgm;
  621. + if (shader.words.size() > 0) {
  622. + for (auto insn : shader) {
  623. + if (insn.opcode() == spv::OpEntryPoint) {
  624. + if (stage_flag != 0) {
  625. + // This means there are multiple entrypoints which is not
  626. + // supported by the downstream SPIRV-Opt instrumentation passes.
  627. + ReportSetupProblem(
  628. + device, "Multiple EntryPoints in single shader module encountered! Module will not be instrumented!\n");
  629. + return 0;
  630. + }
  631. + uint32_t offset = insn.offset();
  632. + spv::ExecutionModel ex_model = (spv::ExecutionModel)pgm[offset + 1];
  633. + switch (ex_model) {
  634. + case spv::ExecutionModel::ExecutionModelAnyHitKHR:
  635. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_ANY_HIT_BIT_KHR;
  636. + break;
  637. + case spv::ExecutionModel::ExecutionModelCallableKHR:
  638. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CALLABLE_BIT_KHR;
  639. + break;
  640. + case spv::ExecutionModel::ExecutionModelClosestHitKHR:
  641. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR;
  642. + break;
  643. + case spv::ExecutionModel::ExecutionModelFragment:
  644. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_FRAGMENT_BIT;
  645. + break;
  646. + case spv::ExecutionModel::ExecutionModelGeometry:
  647. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_GEOMETRY_BIT;
  648. + break;
  649. + case spv::ExecutionModel::ExecutionModelGLCompute:
  650. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_COMPUTE_BIT;
  651. + break;
  652. + case spv::ExecutionModel::ExecutionModelIntersectionKHR:
  653. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_INTERSECTION_BIT_KHR;
  654. + break;
  655. + case spv::ExecutionModel::ExecutionModelMissKHR:
  656. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_MISS_BIT_KHR;
  657. + break;
  658. + case spv::ExecutionModel::ExecutionModelRayGenerationKHR:
  659. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_RAYGEN_BIT_KHR;
  660. + break;
  661. + case spv::ExecutionModel::ExecutionModelTessellationControl:
  662. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
  663. + break;
  664. + case spv::ExecutionModel::ExecutionModelTessellationEvaluation:
  665. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
  666. + break;
  667. + case spv::ExecutionModel::ExecutionModelVertex:
  668. + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_VERTEX_BIT;
  669. + break;
  670. + default:
  671. + ReportSetupProblem(device, "Unsupported Shader Stage encountered! Shader will not be instrumented!\n");
  672. + return 0;
  673. + }
  674. + }
  675. + }
  676. + }
  677. +
  678. + return stage_flag;
  679. +}
  680. +
  681. +std::tuple<uint32_t, uint32_t, uint32_t> AutoInst::FindComputeLocalSize(std::vector<unsigned int> pgm) const {
  682. + SHADER_MODULE_STATE shader;
  683. + shader.words = pgm;
  684. +
  685. + if (shader.words.size() > 0) {
  686. + for (auto insn : shader) {
  687. + if (insn.opcode() == spv::OpExecutionMode) {
  688. + uint32_t offset = insn.offset();
  689. + if ((spv::ExecutionMode)pgm[offset + 2] != spv::ExecutionModeLocalSize) {
  690. + ReportSetupProblem(device, "Unable to determine compute LocalSize!\n");
  691. + return std::make_tuple(0, 0, 0);
  692. + }
  693. + return std::make_tuple(pgm[offset + 3], pgm[offset + 4], pgm[offset + 5]);
  694. + }
  695. + }
  696. + }
  697. + ReportSetupProblem(device, "Unable to find OpExecutionMode.\n");
  698. + return std::make_tuple(0, 0, 0);
  699. +}
  700. +
  701. +std::string AutoInst::AnnotateModuleStr(std::string &shader, std::unordered_map<uint32_t, std::string> &inst_id2str) const {
  702. + std::regex pattern("%[0-9]+? = OpFunctionCall %void %[0-9]+? %uint_(.*) *");
  703. +
  704. + std::istringstream ss{shader};
  705. + std::ostringstream out;
  706. + int i = 0;
  707. +
  708. + for (std::string line; std::getline(ss, line, '\n');) {
  709. + std::smatch sm;
  710. + std::regex_search(line, sm, pattern);
  711. + if (sm.size() > 0) {
  712. + uint32_t offset = (uint32_t)atoi(sm[1].str().c_str());
  713. + if (inst_id2str.count(offset) > 0) {
  714. + auto result_id = "%str" + std::to_string(offset) + "_" + std::to_string(i);
  715. + out << result_id << " = OpString "
  716. + << "\"" << inst_id2str[offset] << "\"\n";
  717. + out << "OpLine " << result_id << " "
  718. + << "0 0 "
  719. + << "\n";
  720. + i++;
  721. + }
  722. + } else if (line.find("OpLine") != std::string::npos) {
  723. + // Clear any other OpLines to avoid interference
  724. + continue;
  725. + }
  726. +
  727. + out << line << "\n";
  728. + }
  729. + return out.str();
  730. +}
  731. +
  732. +void AutoInst::TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const {
  733. +#if defined(__ANDROID__)
  734. + ReportInfo(device, "SPIRV-Cross not included on Android. Shader " + file_name + " generated without cross compiling.\n");
  735. + std::ofstream file;
  736. + file.open(file_name + ".spv");
  737. + file << shader;
  738. + file.close();
  739. +#else
  740. + try {
  741. + using namespace spvtools;
  742. + std::ofstream temp;
  743. + SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2);
  744. + std::vector<uint32_t> binary;
  745. + (void)spirvTools.Assemble(shader, &binary, SPV_TEXT_TO_BINARY_OPTION_NONE);
  746. + using namespace spirv_cross;
  747. + CompilerGLSL compiler(binary);
  748. + auto options = compiler.get_common_options();
  749. + options.emit_line_directives = true;
  750. + options.vulkan_semantics = true;
  751. + compiler.set_common_options(options);
  752. + std::string glsl;
  753. + glsl = compiler.compile();
  754. + if (glsl.size() == 0) {
  755. + ReportSetupProblem(device, "Spirv-cross failed. Shader " + file_name + " generated without cross compiling.\n");
  756. + std::ofstream file;
  757. + file.open(file_name + ".glsl");
  758. + file << shader;
  759. + file.close();
  760. + } else {
  761. + // post process line annotations into comments
  762. + std::regex re("#line [0-9]* \"([^\"]*)\"");
  763. + std::ofstream file;
  764. + file.open(file_name + ".glsl");
  765. + file << std::regex_replace(glsl, re, "/*$1*/");
  766. + file.close();
  767. + }
  768. + } catch (...) {
  769. + ReportSetupProblem(device, "Spirv-cross crashed. Shader " + file_name + " generated without cross compiling.\n");
  770. + std::ofstream file;
  771. + file.open(file_name + ".glsl");
  772. + file << shader;
  773. + file.close();
  774. + }
  775. +#endif
  776. +}
  777. +
  778. +// Turn on necessary device features.
  779. +void AutoInst::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *create_info,
  780. + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_create_info) {
  781. + DispatchGetPhysicalDeviceFeatures(gpu, &supported_features);
  782. + VkPhysicalDeviceFeatures features = {};
  783. + features.vertexPipelineStoresAndAtomics = true;
  784. + features.fragmentStoresAndAtomics = true;
  785. + UtilPreCallRecordCreateDevice(gpu, reinterpret_cast<safe_VkDeviceCreateInfo *>(modified_create_info), supported_features,
  786. + features);
  787. +}
  788. +
  789. +// Perform initializations that can be done at Create Device time.
  790. +void AutoInst::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
  791. + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, VkResult result) {
  792. + ValidationStateTracker::PostCallRecordCreateDevice(physicalDevice, pCreateInfo, pAllocator, pDevice, result);
  793. +
  794. + ValidationObject *device_object = GetLayerDataPtr(get_dispatch_key(*pDevice), layer_data_map);
  795. + ValidationObject *validation_data = GetValidationObject(device_object->object_dispatch, this->container_type);
  796. + AutoInst *device_auto_inst = static_cast<AutoInst *>(validation_data);
  797. + device_auto_inst->physicalDevice = physicalDevice;
  798. + device_auto_inst->device = *pDevice;
  799. +
  800. + if (device_auto_inst->phys_dev_props.apiVersion < VK_API_VERSION_1_1) {
  801. + ReportSetupProblem(device, "Auto Inst requires Vulkan 1.1 or later. Auto Inst disabled.");
  802. + device_auto_inst->aborted = true;
  803. + return;
  804. + }
  805. +
  806. + if (!supported_features.fragmentStoresAndAtomics || !supported_features.vertexPipelineStoresAndAtomics) {
  807. + ReportSetupProblem(device,
  808. + "Auto Inst requires fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics. "
  809. + "Auto Inst disabled.");
  810. + device_auto_inst->aborted = true;
  811. + return;
  812. + }
  813. +
  814. + if (enabled[gpu_validation] || enabled[debug_printf]) {
  815. + ReportSetupProblem(device,
  816. + "Auto inst cannot be enabled when gpu assisted validation or debug printf are enabled. "
  817. + "Auto inst disabled.");
  818. + device_auto_inst->aborted = true;
  819. + return;
  820. + }
  821. +
  822. + const char *size_string = getLayerOption("khronos_validation.auto_inst_buffer_size");
  823. + device_auto_inst->output_buffer_size = *size_string ? atoi(size_string) : 1024;
  824. + if (device_auto_inst->output_buffer_size <= 16) {
  825. + ReportSetupProblem(device, "The instrumentation buffer size must be at least 16 bytes");
  826. + device_auto_inst->aborted = true;
  827. + }
  828. + const char *stdout_string = getLayerOption("khronos_validation.auto_inst_to_stdout");
  829. + device_auto_inst->use_stdout = *stdout_string ? !strcmp(stdout_string, "false") : true;
  830. + use_stdout = device_auto_inst->use_stdout;
  831. +
  832. + const char *base_file_name = getLayerOption("khronos_validation.auto_inst_base_file_name");
  833. + device_auto_inst->base_file_name = *base_file_name ? base_file_name : "";
  834. +
  835. + const char *pti = getLayerOption("khronos_validation.auto_inst_pipeline_to_instrument");
  836. + if (!strcmp(pti, "") || !strcmp(pti, "RayTracing")) {
  837. + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_RAY_TRACING_NV;
  838. + if (!device_extensions.vk_nv_ray_tracing && !device_extensions.vk_khr_ray_tracing_pipeline) {
  839. + ReportSetupProblem(device, "Cannot instrument ray-tracing pipeline since ray-tracing is not enabled.\n");
  840. + device_auto_inst->aborted = true;
  841. + return;
  842. + }
  843. +
  844. + ReportInfo(device, "Instrumenting Ray-Tracing Pipeline!\n");
  845. + } else if (!strcmp(pti, "Graphics")) {
  846. + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_GRAPHICS;
  847. + ReportInfo(device, "Instrumenting Graphics Pipeline!\n");
  848. + } else if (!strcmp(pti, "Compute")) {
  849. + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_COMPUTE;
  850. + ReportInfo(device, "Instrumenting Compute Pipeline!\n");
  851. + }
  852. +
  853. + pipeline_to_instrument = device_auto_inst->pipeline_to_instrument;
  854. +
  855. + const char *create_reference_heatmap = getLayerOption("khronos_validation.auto_inst_create_reference_heatmap");
  856. + if (!strcmp(create_reference_heatmap, "true")) {
  857. + ReportInfo(device, "Creating reference heatmap!\n");
  858. + std::vector<char> scale_colors;
  859. +
  860. + const uint32_t scale_width = 256;
  861. + const uint32_t scale_height = 30;
  862. +
  863. + for (int height = 0; height < scale_height; height++) {
  864. + for (float i = 0; i < scale_width; i++) {
  865. + auto rgb = UnitIntervalToRGB(i / 255.0f);
  866. +
  867. + scale_colors.push_back(std::get<0>(rgb));
  868. + scale_colors.push_back(std::get<1>(rgb));
  869. + scale_colors.push_back(std::get<2>(rgb));
  870. + }
  871. + }
  872. +
  873. + CreateImage(scale_width, scale_height, scale_colors, "ReferenceScale");
  874. + }
  875. +
  876. + const char *debug_mode = getLayerOption("khronos_validation.auto_inst_debug_mode");
  877. + if (!strcmp(debug_mode, "atomics")) {
  878. + device_auto_inst->is_debugging_atomic_ops = true;
  879. + } else if (!strcmp(debug_mode, "subgroup")) {
  880. + device_auto_inst->is_debugging_subgroup_ops = true;
  881. + } else if (!strcmp(debug_mode, "arraylength")) {
  882. + device_auto_inst->is_debugging_array_length_op = true;
  883. + }
  884. +
  885. + const char *dump_shaders = getLayerOption("khronos_validation.auto_inst_dump_shaders");
  886. + device_auto_inst->dump_shaders = *dump_shaders ? !strcmp(dump_shaders, "true") : false;
  887. +
  888. + if (device_auto_inst->is_debugging_atomic_ops || device_auto_inst->is_debugging_atomic_ops)
  889. + ReportInfo(device, "Running Auto-Inst in debug mode, normal auto-instrumentation is disabled.\n");
  890. +
  891. + const char *shaders_to_instrument = getLayerOption("khronos_validation.auto_inst_shaders_to_instrument");
  892. + if (shaders_to_instrument) {
  893. + // Format of the option is stageN, stageM, stageL where stage is
  894. + // defined in ShaderStageFlagLookup and N,M,L are integer literals
  895. + std::string shader_list(shaders_to_instrument);
  896. + size_t pos = 0;
  897. + std::string token;
  898. + while (shader_list.length() != 0) {
  899. + while (shader_list[0] == ' ') shader_list.erase(0, 1);
  900. + pos = shader_list.find(',');
  901. + if (pos != std::string::npos) {
  902. + token = shader_list.substr(0, pos);
  903. + } else {
  904. + pos = shader_list.length() - 1;
  905. + token = shader_list;
  906. + }
  907. +
  908. + size_t i = 0;
  909. + while (token[i] < '0' || token[i] > '9') {
  910. + i++;
  911. + }
  912. +
  913. + std::string stage_str = token.substr(0, i);
  914. + auto shader_index = std::atoi(token.substr(i, pos).c_str());
  915. + if (shader_index == 0) {
  916. + ReportSetupProblem(device, "Shader index must be greater than 0. Aborting\n.");
  917. + device_auto_inst->aborted = true;
  918. + }
  919. + if (ShaderStageFlagLookup.count(stage_str) > 0) {
  920. + uint32_t shader_stage = ShaderStageFlagLookup.find(stage_str)->second;
  921. + device_auto_inst->StageToInstIndices[shader_stage].insert(shader_index - 1);
  922. + } else {
  923. + // Assume that if user is specifying this setting, they care about it being correct rather than
  924. + // a more general default.
  925. + ReportSetupProblem(device, "Did not recognize stage " + stage_str + ". Aborting\n.");
  926. + device_auto_inst->aborted = true;
  927. + }
  928. + shader_list.erase(0, pos + 1);
  929. + }
  930. + }
  931. +
  932. + InitializeLayerDeviceSettings(device_auto_inst);
  933. +
  934. + TryReadRuntimeSizeCache(device_auto_inst);
  935. +
  936. + std::vector<VkDescriptorSetLayoutBinding> bindings;
  937. + VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
  938. + VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT | kShaderStageAllRayTracing,
  939. + NULL};
  940. + bindings.push_back(binding);
  941. + UtilPostCallRecordCreateDevice(pCreateInfo, bindings, device_auto_inst, device_auto_inst->phys_dev_props);
  942. +}
  943. +
  944. +void AutoInst::PreCallRecordDestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator) {
  945. + UtilPreCallRecordDestroyDevice(this);
  946. + ValidationStateTracker::PreCallRecordDestroyDevice(device, pAllocator);
  947. + // State Tracker can end up making vma calls through callbacks - don't destroy allocator until ST is done
  948. + if (vmaAllocator) {
  949. + vmaDestroyAllocator(vmaAllocator);
  950. + }
  951. + desc_set_manager.reset();
  952. +}
  953. +
  954. +// Modify the pipeline layout to include our debug descriptor set and any needed padding with the dummy descriptor set.
  955. +void AutoInst::PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
  956. + const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout,
  957. + void *cpl_state_data) {
  958. + if (aborted) {
  959. + return;
  960. + }
  961. +
  962. + create_pipeline_layout_api_state *cpl_state = reinterpret_cast<create_pipeline_layout_api_state *>(cpl_state_data);
  963. +
  964. + if (cpl_state->modified_create_info.setLayoutCount >= adjusted_max_desc_sets) {
  965. + std::ostringstream strm;
  966. + strm << "Pipeline Layout conflict with validation's descriptor set at slot " << desc_set_bind_index << ". "
  967. + << "Application has too many descriptor sets in the pipeline layout to continue with debug printf. "
  968. + << "Not modifying the pipeline layout. "
  969. + << "Instrumented shaders are replaced with non-instrumented shaders.";
  970. + ReportSetupProblem(device, strm.str().c_str());
  971. + } else {
  972. + UtilPreCallRecordCreatePipelineLayout(cpl_state, this, pCreateInfo);
  973. + }
  974. +}
  975. +
  976. +void AutoInst::PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
  977. + const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout,
  978. + VkResult result) {
  979. + ValidationStateTracker::PostCallRecordCreatePipelineLayout(device, pCreateInfo, pAllocator, pPipelineLayout, result);
  980. + if (result != VK_SUCCESS) {
  981. + ReportSetupProblem(device, "Unable to create pipeline layout. Device could become unstable.");
  982. + aborted = true;
  983. + }
  984. +}
  985. +
  986. +// Free the device memory and descriptor set associated with a command buffer.
  987. +void AutoInst::ResetCommandBuffer(VkCommandBuffer commandBuffer) {
  988. + if (aborted) {
  989. + return;
  990. + }
  991. + auto auto_inst_buffer_list = GetBufferInfo(commandBuffer);
  992. + for (auto buffer_info : auto_inst_buffer_list) {
  993. + vmaDestroyBuffer(vmaAllocator, buffer_info.output_mem_block.buffer, buffer_info.output_mem_block.allocation);
  994. + if (buffer_info.desc_set != VK_NULL_HANDLE) {
  995. + desc_set_manager->PutBackDescriptorSet(buffer_info.desc_pool, buffer_info.desc_set);
  996. + }
  997. + }
  998. + command_buffer_map.erase(commandBuffer);
  999. +}
  1000. +
  1001. +// Just gives a warning about a possible deadlock.
  1002. +bool AutoInst::PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
  1003. + VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
  1004. + uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
  1005. + uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier *pBufferMemoryBarriers,
  1006. + uint32_t imageMemoryBarrierCount,
  1007. + const VkImageMemoryBarrier *pImageMemoryBarriers) const {
  1008. + if (srcStageMask & VK_PIPELINE_STAGE_HOST_BIT) {
  1009. + ReportSetupProblem(commandBuffer,
  1010. + "CmdWaitEvents recorded with VK_PIPELINE_STAGE_HOST_BIT set. "
  1011. + "Auto inst waits on queue completion. "
  1012. + "This wait could block the host's signaling of this event, resulting in deadlock.");
  1013. + }
  1014. + return false;
  1015. +}
  1016. +
  1017. +void AutoInst::PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1018. + const VkGraphicsPipelineCreateInfo *pCreateInfos,
  1019. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1020. + void *cgpl_state_data) {
  1021. + std::vector<safe_VkGraphicsPipelineCreateInfo> new_pipeline_create_infos;
  1022. + create_graphics_pipeline_api_state *cgpl_state = reinterpret_cast<create_graphics_pipeline_api_state *>(cgpl_state_data);
  1023. + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, cgpl_state->pipe_state,
  1024. + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_GRAPHICS, this);
  1025. + cgpl_state->printf_create_infos = new_pipeline_create_infos;
  1026. + cgpl_state->pCreateInfos = reinterpret_cast<VkGraphicsPipelineCreateInfo *>(cgpl_state->printf_create_infos.data());
  1027. +}
  1028. +
  1029. +void AutoInst::PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1030. + const VkComputePipelineCreateInfo *pCreateInfos,
  1031. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1032. + void *ccpl_state_data) {
  1033. + std::vector<safe_VkComputePipelineCreateInfo> new_pipeline_create_infos;
  1034. + auto *ccpl_state = reinterpret_cast<create_compute_pipeline_api_state *>(ccpl_state_data);
  1035. + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, ccpl_state->pipe_state,
  1036. + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_COMPUTE, this);
  1037. + ccpl_state->printf_create_infos = new_pipeline_create_infos;
  1038. + ccpl_state->pCreateInfos = reinterpret_cast<VkComputePipelineCreateInfo *>(ccpl_state->gpu_create_infos.data());
  1039. +}
  1040. +
  1041. +void AutoInst::PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1042. + const VkRayTracingPipelineCreateInfoNV *pCreateInfos,
  1043. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1044. + void *crtpl_state_data) {
  1045. + std::vector<safe_VkRayTracingPipelineCreateInfoCommon> new_pipeline_create_infos;
  1046. + auto *crtpl_state = reinterpret_cast<create_ray_tracing_pipeline_api_state *>(crtpl_state_data);
  1047. + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, crtpl_state->pipe_state,
  1048. + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this);
  1049. + crtpl_state->printf_create_infos = new_pipeline_create_infos;
  1050. + crtpl_state->pCreateInfos = reinterpret_cast<VkRayTracingPipelineCreateInfoNV *>(crtpl_state->gpu_create_infos.data());
  1051. +}
  1052. +
  1053. +void AutoInst::PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
  1054. + VkPipelineCache pipelineCache, uint32_t count,
  1055. + const VkRayTracingPipelineCreateInfoKHR *pCreateInfos,
  1056. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1057. + void *crtpl_state_data) {
  1058. + std::vector<safe_VkRayTracingPipelineCreateInfoCommon> new_pipeline_create_infos;
  1059. + auto *crtpl_state = reinterpret_cast<create_ray_tracing_pipeline_khr_api_state *>(crtpl_state_data);
  1060. + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, crtpl_state->pipe_state,
  1061. + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this);
  1062. + crtpl_state->printf_create_infos = new_pipeline_create_infos;
  1063. + crtpl_state->pCreateInfos = reinterpret_cast<VkRayTracingPipelineCreateInfoKHR *>(crtpl_state->printf_create_infos.data());
  1064. +}
  1065. +
  1066. +void AutoInst::PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1067. + const VkGraphicsPipelineCreateInfo *pCreateInfos,
  1068. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1069. + VkResult result, void *cgpl_state_data) {
  1070. + ValidationStateTracker::PostCallRecordCreateGraphicsPipelines(device, pipelineCache, count, pCreateInfos, pAllocator,
  1071. + pPipelines, result, cgpl_state_data);
  1072. + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_GRAPHICS) return;
  1073. + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_GRAPHICS, this);
  1074. +}
  1075. +
  1076. +void AutoInst::PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1077. + const VkComputePipelineCreateInfo *pCreateInfos,
  1078. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1079. + VkResult result, void *ccpl_state_data) {
  1080. + ValidationStateTracker::PostCallRecordCreateComputePipelines(device, pipelineCache, count, pCreateInfos, pAllocator, pPipelines,
  1081. + result, ccpl_state_data);
  1082. + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_COMPUTE) return;
  1083. + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_COMPUTE, this);
  1084. +}
  1085. +
  1086. +void AutoInst::PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1087. + const VkRayTracingPipelineCreateInfoNV *pCreateInfos,
  1088. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1089. + VkResult result, void *crtpl_state_data) {
  1090. + ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesNV(device, pipelineCache, count, pCreateInfos, pAllocator,
  1091. + pPipelines, result, crtpl_state_data);
  1092. + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return;
  1093. + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this);
  1094. +}
  1095. +
  1096. +void AutoInst::PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
  1097. + VkPipelineCache pipelineCache, uint32_t count,
  1098. + const VkRayTracingPipelineCreateInfoKHR *pCreateInfos,
  1099. + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
  1100. + VkResult result, void *crtpl_state_data) {
  1101. + ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesKHR(
  1102. + device, deferredOperation, pipelineCache, count, pCreateInfos, pAllocator, pPipelines, result, crtpl_state_data);
  1103. + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return;
  1104. + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this);
  1105. +}
  1106. +
  1107. +// Remove all the shader trackers associated with this destroyed pipeline.
  1108. +void AutoInst::PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) {
  1109. + for (auto it = shader_map.begin(); it != shader_map.end();) {
  1110. + if (it->second.pipeline == pipeline) {
  1111. + it = shader_map.erase(it);
  1112. + } else {
  1113. + ++it;
  1114. + }
  1115. + }
  1116. + ValidationStateTracker::PreCallRecordDestroyPipeline(device, pipeline, pAllocator);
  1117. +}
  1118. +// Call the SPIR-V Optimizer to run the instrumentation pass on the shader.
  1119. +bool AutoInst::InstrumentShader(const VkShaderModuleCreateInfo *pCreateInfo, std::vector<unsigned int> &new_pgm,
  1120. + uint32_t *unique_shader_id) {
  1121. + if (aborted) return false;
  1122. + if (pCreateInfo->pCode[0] != spv::MagicNumber) return false;
  1123. +
  1124. + // Load original shader SPIR-V
  1125. + uint32_t num_words = static_cast<uint32_t>(pCreateInfo->codeSize / 4);
  1126. + new_pgm.clear();
  1127. + new_pgm.reserve(num_words);
  1128. + new_pgm.insert(new_pgm.end(), &pCreateInfo->pCode[0], &pCreateInfo->pCode[num_words]);
  1129. +
  1130. + auto stage = FindShaderStage(new_pgm);
  1131. + if (stage == 0) return false;
  1132. +
  1133. + // Check against pipeline_to_instrument setting
  1134. + switch (pipeline_to_instrument) {
  1135. + case VK_PIPELINE_BIND_POINT_COMPUTE:
  1136. + if ((kShaderStageAllCompute & stage) == 0) return false;
  1137. + break;
  1138. + case VK_PIPELINE_BIND_POINT_GRAPHICS:
  1139. + if ((kShaderStageAllGraphics & stage) == 0) return false;
  1140. + break;
  1141. + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
  1142. + if ((kShaderStageAllRayTracing & stage) == 0) return false;
  1143. + break;
  1144. + default:
  1145. + break;
  1146. + }
  1147. +
  1148. + // Check again shaders_to_instrument setting
  1149. + if (StageToInstIndices.size() > 0) {
  1150. + auto stage_index = Stage2SeenCount[stage];
  1151. + Stage2SeenCount[stage]++;
  1152. + if (StageToInstIndices[stage].count(stage_index) == 0) {
  1153. + // The shader stage and index was not found in the user provided setting so skip instrumenting.
  1154. + return false;
  1155. + }
  1156. + }
  1157. +
  1158. + // Call the optimizer to instrument the shader.
  1159. + // Use the unique_shader_module_id as a shader ID so we can look up its handle later in the shader_map.
  1160. + // If descriptor indexing is enabled, enable length checks and updated descriptor checks
  1161. + using namespace spvtools;
  1162. + spv_target_env target_env = SPV_ENV_VULKAN_1_2;
  1163. +
  1164. + const spvtools::MessageConsumer auto_inst_console_message_consumer =
  1165. + [this](spv_message_level_t level, const char *, const spv_position_t &position, const char *message) -> void {
  1166. + switch (level) {
  1167. + case SPV_MSG_FATAL:
  1168. + case SPV_MSG_INTERNAL_ERROR:
  1169. + case SPV_MSG_ERROR:
  1170. + this->LogError(this->device, "UNASSIGNED-Debug-Printf", "Error during shader instrumentation: line %zu: %s",
  1171. + position.index, message);
  1172. + break;
  1173. + default:
  1174. + break;
  1175. + }
  1176. + };
  1177. +
  1178. + Optimizer optimizer(target_env);
  1179. + optimizer.SetMessageConsumer(auto_inst_console_message_consumer);
  1180. + if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) {
  1181. + optimizer.RegisterPass(spvtools::CreateAutoInstDebugPass(desc_set_bind_index, unique_shader_module_id,
  1182. + is_debugging_atomic_ops, is_debugging_subgroup_ops));
  1183. + } else {
  1184. + RegisterPasses(&optimizer, desc_set_bind_index, unique_shader_module_id);
  1185. + }
  1186. + if (optimizer.GetPassNames().size() == 0) return false;
  1187. + bool pass = optimizer.Run(new_pgm.data(), new_pgm.size(), &new_pgm);
  1188. + if (!pass) {
  1189. + ReportSetupProblem(
  1190. + device, "Failure to instrument shader " + ShaderStageToString(stage) + ". Proceeding with non-instrumented shader.\n");
  1191. + }
  1192. +
  1193. + if (dump_shaders) {
  1194. + std::ofstream shader_dump_file;
  1195. + shader_dump_file.open(ShaderStageToString(stage) + std::to_string(unique_shader_module_id) + ".spv",
  1196. + std::ios_base::binary | std::ios_base::out);
  1197. + shader_dump_file.write((char *)new_pgm.data(), new_pgm.size() * sizeof(uint32_t));
  1198. + shader_dump_file.close();
  1199. + }
  1200. +
  1201. + instrumentation_map[unique_shader_module_id] = std::vector<unsigned int>(new_pgm);
  1202. + *unique_shader_id = unique_shader_module_id++;
  1203. + return pass;
  1204. +}
  1205. +// Create the instrumented shader data to provide to the driver.
  1206. +void AutoInst::PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo,
  1207. + const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule,
  1208. + void *csm_state_data) {
  1209. + create_shader_module_api_state *csm_state = reinterpret_cast<create_shader_module_api_state *>(csm_state_data);
  1210. +
  1211. + bool pass = InstrumentShader(pCreateInfo, csm_state->instrumented_pgm, &csm_state->unique_shader_id);
  1212. + if (pass) {
  1213. + csm_state->instrumented_create_info.pCode = csm_state->instrumented_pgm.data();
  1214. + csm_state->instrumented_create_info.codeSize = csm_state->instrumented_pgm.size() * sizeof(unsigned int);
  1215. + }
  1216. +}
  1217. +
  1218. +void AutoInst::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point,
  1219. + uint32_t operation_index, uint32_t *const debug_output_buffer) {
  1220. + if (pipeline_bind_point != pipeline_to_instrument) return;
  1221. + if (aborted) return;
  1222. +
  1223. + // debug mode tests
  1224. + if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) {
  1225. + if (is_debugging_atomic_ops) {
  1226. + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0 ? "Atomics instrumentation did not produce a result!\n"
  1227. + : "Atomics instrumentation produced a result!\n";
  1228. + ReportInfo(device, message);
  1229. + }
  1230. +
  1231. + if (is_debugging_subgroup_ops) {
  1232. + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0
  1233. + ? "Subgroup instrumentation did not produce a result!\n"
  1234. + : "Subgroup instrumentation produced a result" +
  1235. + std::bitset<SUBGROUP_SIZE>(debug_output_buffer[BUFFER_DEBUG_LOCATION]).to_string() + "!\n";
  1236. + ReportInfo(device, message);
  1237. + }
  1238. +
  1239. + if (is_debugging_array_length_op) {
  1240. + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0
  1241. + ? "ArrayLength instrumentation did not produce a result!\n"
  1242. + : "ArrayLength instrumentation produced a result=" +
  1243. + std::to_string(4 * debug_output_buffer[BUFFER_DEBUG_LOCATION]) + " bytes !\n";
  1244. + ReportInfo(device, message);
  1245. + }
  1246. +
  1247. + return;
  1248. + }
  1249. +
  1250. + auto words_written = debug_output_buffer[WORDS_WRITTEN_INDEX];
  1251. +
  1252. + uint32_t bytes_consumed = sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS + sizeof(uint32_t) * words_written;
  1253. +
  1254. + VmaAllocationInfo alloc_info;
  1255. + (void)vmaGetAllocationInfo(vmaAllocator, GetBufferInfo(command_buffer)[operation_index].output_mem_block.allocation,
  1256. + &alloc_info);
  1257. + auto buffer_size = alloc_info.size;
  1258. + auto creation_index = GetBufferInfo(command_buffer)[operation_index].output_mem_block.creation_index;
  1259. +
  1260. + auto overflowed = bytes_consumed > buffer_size;
  1261. + if (overflowed) {
  1262. + auto message =
  1263. + "Output buffer size is " + std::to_string(buffer_size) + " bytes which is less than the " +
  1264. + std::to_string(bytes_consumed) +
  1265. + " bytes that the instrumentation could have written. Please rerun the application to get analysis results.\n";
  1266. + ReportInfo(device, message.c_str());
  1267. + }
  1268. +
  1269. + auto bytes_consumed_for_vma = [](uint32_t raw_bytes) {
  1270. + // set to next highest multiple of 1024
  1271. + return (raw_bytes & (~1023)) + 1024;
  1272. + };
  1273. +
  1274. + if (BufferSizeRequirementsLookup.size() <= creation_index) {
  1275. + BufferSizeRequirementsLookup.resize(creation_index + 1, sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS);
  1276. + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
  1277. + WriteRuntimeSizeCache();
  1278. + } else if (BufferSizeRequirementsLookup[creation_index] == output_buffer_size) {
  1279. + // Assume that if the value is the default, we're safe to lower the instrumentation buffer size
  1280. + // to save on device memory usage.
  1281. + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
  1282. + WriteRuntimeSizeCache();
  1283. + } else if (bytes_consumed > BufferSizeRequirementsLookup[creation_index]) {
  1284. + // If the value is not the default, that implies it has already been set by a runtime observation
  1285. + // and therefore it should never decrease.
  1286. + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
  1287. + WriteRuntimeSizeCache();
  1288. + }
  1289. +
  1290. + switch (pipeline_bind_point) {
  1291. + case VK_PIPELINE_BIND_POINT_COMPUTE: {
  1292. + auto cb_state = GetCBState(command_buffer);
  1293. + LAST_BOUND_STATE &last_bound = cb_state->lastBound[pipeline_bind_point];
  1294. + std::tuple<uint32_t, uint32_t, uint32_t> localsize_xyz = std::make_tuple(0, 0, 0);
  1295. + if (last_bound.pipeline_state) {
  1296. + PIPELINE_STATE *p_state = last_bound.pipeline_state;
  1297. + auto shader_state = GetShaderModuleState(p_state->computePipelineCI.stage.module);
  1298. + if (shader_state != NULL) {
  1299. + localsize_xyz = FindComputeLocalSize(shader_state->words);
  1300. + }
  1301. + }
  1302. + if (analysis_index >= compute_launch_records.size()) {
  1303. + ReportSetupProblem(device, "Insufficient launch records to support compute analysis.");
  1304. + break;
  1305. + }
  1306. + auto launch_dims3d = compute_launch_records[analysis_index];
  1307. + uint32_t localsize_x = std::get<0>(localsize_xyz);
  1308. + uint32_t localsize_y = std::get<1>(localsize_xyz);
  1309. + uint32_t localsize_z = std::get<2>(localsize_xyz);
  1310. + if (localsize_x == 0 || localsize_y == 0 || localsize_x == 0) {
  1311. + ReportSetupProblem(device, "Could not determine compute shader local size.\n");
  1312. + }
  1313. + is_analyzing_compute = true;
  1314. + AnalyzeCompute(debug_output_buffer, overflowed, launch_dims3d.x_dim * localsize_x, launch_dims3d.y_dim * localsize_y,
  1315. + launch_dims3d.z_dim * localsize_z);
  1316. + is_analyzing_compute = false;
  1317. + analysis_index++;
  1318. + break;
  1319. + }
  1320. + case VK_PIPELINE_BIND_POINT_GRAPHICS: {
  1321. + analysis_index++;
  1322. + is_analyzing_draw = true;
  1323. + AnalyzeGraphics(debug_output_buffer, overflowed);
  1324. + is_analyzing_draw = false;
  1325. + break;
  1326. + }
  1327. + case VK_PIPELINE_BIND_POINT_RAY_TRACING_NV: {
  1328. + if (analysis_index >= rt_launch_records.size()) {
  1329. + ReportSetupProblem(device, "Insufficient launch records to support ray tracing analysis.");
  1330. + break;
  1331. + }
  1332. + auto launch_dims3d = rt_launch_records[analysis_index];
  1333. + is_analyzing_rt = true;
  1334. + AnalyzeRayTracing(debug_output_buffer, overflowed, launch_dims3d.x_dim, launch_dims3d.y_dim, launch_dims3d.z_dim);
  1335. + is_analyzing_rt = false;
  1336. + analysis_index++;
  1337. + break;
  1338. + }
  1339. + default:
  1340. + ReportSetupProblem(device, "Unsupported pipeline type cannot be analyzed.");
  1341. + break;
  1342. + }
  1343. +
  1344. + memset(debug_output_buffer, 0, buffer_size);
  1345. +}
  1346. +
  1347. +// Issue a memory barrier to make GPU-written data available to host.
  1348. +// Wait for the queue to complete execution.
  1349. +// Check the debug buffers for all the command buffers that were submitted.
  1350. +void AutoInst::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
  1351. + VkResult result) {
  1352. + ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);
  1353. +
  1354. + if (aborted || (result != VK_SUCCESS)) return;
  1355. + bool buffers_present = false;
  1356. + // Don't QueueWaitIdle if there's nothing to process
  1357. + for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
  1358. + const VkSubmitInfo *submit = &pSubmits[submit_idx];
  1359. + for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
  1360. + auto cb_node = GetCBState(submit->pCommandBuffers[i]);
  1361. + if (GetBufferInfo(cb_node->commandBuffer).size()) buffers_present = true;
  1362. + for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) {
  1363. + if (GetBufferInfo(secondaryCmdBuffer->commandBuffer).size()) buffers_present = true;
  1364. + }
  1365. + }
  1366. + }
  1367. + if (!buffers_present) return;
  1368. +
  1369. + UtilSubmitBarrier(queue, this);
  1370. +
  1371. + DispatchQueueWaitIdle(queue);
  1372. +
  1373. + for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
  1374. + const VkSubmitInfo *submit = &pSubmits[submit_idx];
  1375. + for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
  1376. + auto cb_node = GetCBState(submit->pCommandBuffers[i]);
  1377. + UtilProcessInstrumentationBuffer(queue, cb_node, this);
  1378. + for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) {
  1379. + UtilProcessInstrumentationBuffer(queue, secondaryCmdBuffer, this);
  1380. + }
  1381. + }
  1382. + }
  1383. +}
  1384. +
  1385. +void AutoInst::PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
  1386. + uint32_t firstVertex, uint32_t firstInstance) {
  1387. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
  1388. +}
  1389. +
  1390. +void AutoInst::PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
  1391. + uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) {
  1392. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
  1393. +}
  1394. +
  1395. +void AutoInst::PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
  1396. + uint32_t stride) {
  1397. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
  1398. +}
  1399. +
  1400. +void AutoInst::PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
  1401. + uint32_t count, uint32_t stride) {
  1402. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
  1403. +}
  1404. +
  1405. +void AutoInst::PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) {
  1406. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE);
  1407. + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_COMPUTE) {
  1408. + compute_launch_records.push_back({x, y, z});
  1409. + }
  1410. +}
  1411. +
  1412. +void AutoInst::PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) {
  1413. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE);
  1414. +}
  1415. +
  1416. +void AutoInst::PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
  1417. + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
  1418. + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
  1419. + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
  1420. + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
  1421. + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
  1422. + uint32_t width, uint32_t height, uint32_t depth) {
  1423. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV);
  1424. +}
  1425. +
  1426. +void AutoInst::PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
  1427. + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
  1428. + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
  1429. + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
  1430. + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
  1431. + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
  1432. + uint32_t width, uint32_t height, uint32_t depth) {
  1433. + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
  1434. + cb_state->hasTraceRaysCmd = true;
  1435. +
  1436. + rt_launch_records.push_back({width, height, depth});
  1437. +}
  1438. +
  1439. +void AutoInst::PreCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
  1440. + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
  1441. + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
  1442. + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
  1443. + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
  1444. + uint32_t height, uint32_t depth) {
  1445. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
  1446. +}
  1447. +
  1448. +void AutoInst::PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
  1449. + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
  1450. + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
  1451. + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
  1452. + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
  1453. + uint32_t height, uint32_t depth) {
  1454. + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
  1455. + cb_state->hasTraceRaysCmd = true;
  1456. +
  1457. + rt_launch_records.push_back({width, height, depth});
  1458. +}
  1459. +
  1460. +void AutoInst::PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
  1461. + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
  1462. + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
  1463. + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
  1464. + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
  1465. + VkDeviceAddress indirectDeviceAddress) {
  1466. + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
  1467. +}
  1468. +
  1469. +void AutoInst::PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
  1470. + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
  1471. + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
  1472. + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
  1473. + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
  1474. + VkDeviceAddress indirectDeviceAddress) {
  1475. + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
  1476. + cb_state->hasTraceRaysCmd = true;
  1477. +}
  1478. +
  1479. +void AutoInst::PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR *pPresentInfo, VkResult result) {
  1480. + if (aborted) return;
  1481. + // helper variables for recording file names generated by analyses
  1482. + frame_number++;
  1483. + analysis_index = 0;
  1484. + pipeline_creation_index = 0;
  1485. +}
  1486. +
  1487. +void AutoInst::AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point) {
  1488. + if (bind_point != VK_PIPELINE_BIND_POINT_GRAPHICS && bind_point != VK_PIPELINE_BIND_POINT_COMPUTE &&
  1489. + bind_point != VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) {
  1490. + return;
  1491. + }
  1492. +
  1493. + if (pipeline_to_instrument != bind_point) {
  1494. + return;
  1495. + }
  1496. +
  1497. + VkResult result;
  1498. +
  1499. + if (aborted) return;
  1500. +
  1501. + std::vector<VkDescriptorSet> desc_sets;
  1502. + VkDescriptorPool desc_pool = VK_NULL_HANDLE;
  1503. + result = desc_set_manager->GetDescriptorSets(1, &desc_pool, debug_desc_layout, &desc_sets);
  1504. + assert(result == VK_SUCCESS);
  1505. + if (result != VK_SUCCESS) {
  1506. + ReportSetupProblem(device, "Unable to allocate descriptor sets. Device could become unstable.");
  1507. + aborted = true;
  1508. + return;
  1509. + }
  1510. +
  1511. + auto buffer_size = (BufferSizeRequirementsLookup.size() <= pipeline_creation_index)
  1512. + ? output_buffer_size
  1513. + : (uint32_t)(BufferSizeRequirementsLookup[pipeline_creation_index]);
  1514. +
  1515. + VkDescriptorBufferInfo output_desc_buffer_info = {};
  1516. + output_desc_buffer_info.range = buffer_size;
  1517. +
  1518. + auto cb_node = GetCBState(cmd_buffer);
  1519. + if (!cb_node) {
  1520. + ReportSetupProblem(device, "Unrecognized command buffer");
  1521. + aborted = true;
  1522. + return;
  1523. + }
  1524. +
  1525. + // Allocate memory for the output block that the gpu will use to return values for instrumentation
  1526. + AIDeviceMemoryBlock output_block = {};
  1527. + VkBufferCreateInfo bufferInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
  1528. + bufferInfo.size = buffer_size;
  1529. + bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
  1530. + VmaAllocationCreateInfo allocInfo = {};
  1531. + allocInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
  1532. + result = vmaCreateBuffer(vmaAllocator, &bufferInfo, &allocInfo, &output_block.buffer, &output_block.allocation, nullptr);
  1533. +
  1534. + output_block.creation_index = pipeline_creation_index;
  1535. + pipeline_creation_index++;
  1536. + if (result != VK_SUCCESS) {
  1537. + ReportSetupProblem(device, "Unable to allocate device memory. Device could become unstable.");
  1538. + aborted = true;
  1539. + return;
  1540. + }
  1541. +
  1542. + // Clear the output block to zeros so that only values from the gpu will be present
  1543. + uint32_t *pData;
  1544. + result = vmaMapMemory(vmaAllocator, output_block.allocation, (void **)&pData);
  1545. + if (result == VK_SUCCESS) {
  1546. + memset(pData, 0, buffer_size);
  1547. + InitializeInstrumentationBuffer(pData);
  1548. + vmaUnmapMemory(vmaAllocator, output_block.allocation);
  1549. + }
  1550. +
  1551. + VkWriteDescriptorSet desc_writes[1] = {};
  1552. + const uint32_t desc_count = 1;
  1553. +
  1554. + // Write the descriptor
  1555. + output_desc_buffer_info.buffer = output_block.buffer;
  1556. + output_desc_buffer_info.offset = 0;
  1557. +
  1558. + desc_writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
  1559. + desc_writes[0].descriptorCount = 1;
  1560. + desc_writes[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
  1561. + desc_writes[0].pBufferInfo = &output_desc_buffer_info;
  1562. + desc_writes[0].dstSet = desc_sets[0];
  1563. + desc_writes[0].dstBinding = 3;
  1564. + DispatchUpdateDescriptorSets(device, desc_count, desc_writes, 0, NULL);
  1565. +
  1566. + const auto lv_bind_point = ConvertToLvlBindPoint(bind_point);
  1567. + const auto *pipeline_state = cb_node->lastBound[lv_bind_point].pipeline_state;
  1568. + if (pipeline_state) {
  1569. + if (pipeline_state->pipeline_layout->set_layouts.size() <= desc_set_bind_index) {
  1570. + DispatchCmdBindDescriptorSets(cmd_buffer, bind_point, pipeline_state->pipeline_layout->layout, desc_set_bind_index, 1,
  1571. + desc_sets.data(), 0, nullptr);
  1572. + }
  1573. + // Record buffer and memory info in CB state tracking
  1574. + GetBufferInfo(cmd_buffer).emplace_back(output_block, desc_sets[0], desc_pool, bind_point);
  1575. + } else {
  1576. + ReportSetupProblem(device, "Unable to find pipeline state");
  1577. + vmaDestroyBuffer(vmaAllocator, output_block.buffer, output_block.allocation);
  1578. + aborted = true;
  1579. + return;
  1580. + }
  1581. +}
  1582. diff --git a/layers/auto_inst.h b/layers/auto_inst.h
  1583. new file mode 100644
  1584. index 00000000..dd5dbbd9
  1585. --- /dev/null
  1586. +++ b/layers/auto_inst.h
  1587. @@ -0,0 +1,465 @@
  1588. +/* Copyright (c) 2020 The Khronos Group Inc.
  1589. + *
  1590. + * Licensed under the Apache License, Version 2.0 (the "License");
  1591. + * you may not use this file except in compliance with the License.
  1592. + * You may obtain a copy of the License at
  1593. + *
  1594. + * http://www.apache.org/licenses/LICENSE-2.0
  1595. + *
  1596. + * Unless required by applicable law or agreed to in writing, software
  1597. + * distributed under the License is distributed on an "AS IS" BASIS,
  1598. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  1599. + * See the License for the specific language governing permissions and
  1600. + * limitations under the License.
  1601. + *
  1602. + * Author: David Pankratz <pankratz@ualberta.ca>
  1603. + */
  1604. +
  1605. +#pragma once
  1606. +
  1607. +#include "chassis.h"
  1608. +#include "vk_mem_alloc.h"
  1609. +#include "state_tracker.h"
  1610. +#include "gpu_utils.h"
  1611. +#include "spirv-tools/instrument.hpp"
  1612. +#include <map>
  1613. +
  1614. +class AutoInst;
  1615. +
  1616. +struct AIDeviceMemoryBlock {
  1617. + uint32_t creation_index;
  1618. + VkBuffer buffer;
  1619. + VmaAllocation allocation;
  1620. +};
  1621. +
  1622. +struct AIBufferInfo {
  1623. + AIDeviceMemoryBlock output_mem_block;
  1624. + VkDescriptorSet desc_set;
  1625. + VkDescriptorPool desc_pool;
  1626. + VkPipelineBindPoint pipeline_bind_point;
  1627. + AIBufferInfo(AIDeviceMemoryBlock output_mem_block, VkDescriptorSet desc_set, VkDescriptorPool desc_pool,
  1628. + VkPipelineBindPoint pipeline_bind_point)
  1629. + : output_mem_block(output_mem_block), desc_set(desc_set), desc_pool(desc_pool), pipeline_bind_point(pipeline_bind_point){};
  1630. +};
  1631. +
  1632. +struct AIShaderTracker {
  1633. + VkPipeline pipeline;
  1634. + VkShaderModule shader_module;
  1635. + std::vector<unsigned int> pgm;
  1636. + VkShaderStageFlagBits stage;
  1637. +};
  1638. +
  1639. +struct AIUniqueSubgroupIdEntry {
  1640. + uint32_t inst_id;
  1641. + uint32_t flat_thread_id;
  1642. + uint32_t subgroup_ids; // Combined subgroup id and intra subgroup id.
  1643. +
  1644. + inline uint32_t SubgroupId() const { return subgroup_ids & 0x07FFFFFF; }
  1645. + inline uint32_t IntraSubgroupId() const { return (subgroup_ids & 0xF8000000) >> 27; }
  1646. +};
  1647. +
  1648. +struct LaunchDims3D {
  1649. + uint32_t x_dim;
  1650. + uint32_t y_dim;
  1651. + uint32_t z_dim;
  1652. +};
  1653. +
  1654. +class AutoInst : public ValidationStateTracker {
  1655. + VkPhysicalDeviceFeatures supported_features;
  1656. +
  1657. + uint32_t unique_shader_module_id = 0;
  1658. + std::unordered_map<VkCommandBuffer, std::vector<AIBufferInfo>> command_buffer_map;
  1659. + uint32_t output_buffer_size;
  1660. +
  1661. + protected:
  1662. + static const uint32_t SUBGROUP_SIZE = 32;
  1663. +
  1664. + // Reserved words in the buffer
  1665. + static const uint32_t WORDS_WRITTEN_INDEX = 0;
  1666. + static const uint32_t NUM_SUBGROUP_IDS_INDEX = 1;
  1667. +
  1668. + static const uint32_t NUM_BUFFER_RESERVED_WORDS = 2;
  1669. +
  1670. + // Reserved word in debug mode
  1671. + static const uint32_t BUFFER_DEBUG_LOCATION = 1;
  1672. +
  1673. + public:
  1674. + using ThreadIdToSubgroupIdMap = std::unordered_map<uint32_t, uint32_t>;
  1675. +
  1676. + // Map from subgroup_id * SUBGROUP_SIZE + thread_offset to runtime
  1677. + // thread_id
  1678. + using ThreadIdSwizzleMap = std::unordered_map<uint32_t, uint32_t>;
  1679. +
  1680. + // Map from primitive id to the number of words that the primitive wrote
  1681. + // to the StorageBuffer.
  1682. + using PrimitiveIdToPrimitiveSizeMap = std::unordered_map<uint32_t, uint32_t>;
  1683. +
  1684. + // Records each pipeline invocation's launch dim parameters.
  1685. + using LaunchDimRecords = std::vector<LaunchDims3D>;
  1686. +
  1687. + AutoInst() { container_type = LayerObjectTypeAutoInst; }
  1688. +
  1689. + // The pipeline type to instrument
  1690. + VkPipelineBindPoint pipeline_to_instrument = VK_PIPELINE_BIND_POINT_MAX_ENUM;
  1691. +
  1692. + // Records of the rt and compute launch sizes, this is useful for rebuilding
  1693. + // the frames for visualization
  1694. + LaunchDimRecords rt_launch_records;
  1695. + LaunchDimRecords compute_launch_records;
  1696. +
  1697. + // Helper variables for properly naming files output by the analysis
  1698. + std::string base_file_name;
  1699. + // index of next instrumented pipeline to analyze
  1700. + uint32_t analysis_index = 0;
  1701. + // index of next instrumented pipeline to create. Used for determining runtime
  1702. + // buffer size requirements in the case that previous runs were recorded.
  1703. + uint32_t pipeline_creation_index = 0;
  1704. + uint32_t frame_number = 0;
  1705. + bool is_analyzing_rt = false;
  1706. + bool is_analyzing_draw = false;
  1707. + bool is_analyzing_compute = false;
  1708. +
  1709. + // Variables for debug modes
  1710. + // This framework relies on atomic operations in SPIR-V for writing
  1711. + // instrumentation results to the StorageBuffer and Subgroup
  1712. + // operations for determining the active thread mask.
  1713. + bool is_debugging_atomic_ops = false;
  1714. + bool is_debugging_subgroup_ops = false;
  1715. + bool is_debugging_array_length_op = false;
  1716. +
  1717. + // if true, dump instrumented shaders
  1718. + // if false, do nothing.
  1719. + bool dump_shaders = false;
  1720. +
  1721. + // This map specifies the index of the shader stage to instrument. I.e.
  1722. + // if the option is Miss2 then the 2nd Miss shader that is created
  1723. + // will be instrumented.
  1724. + //
  1725. + // If this map is uninitialized (size == 0) then it is assumed that
  1726. + // all shaders should be instrumented.
  1727. + std::unordered_map<uint32_t, std::set<uint32_t>> StageToInstIndices;
  1728. +
  1729. + // Track how many of each ShaderStage has been created.
  1730. + std::unordered_map<uint32_t, uint32_t> Stage2SeenCount;
  1731. +
  1732. + // Track how many bytes were required by previous invocations of a given pipeline.
  1733. + // This data is written to a cache file that is read for subsequent executions
  1734. + // of the application.
  1735. + std::vector<uint32_t> BufferSizeRequirementsLookup;
  1736. +
  1737. + bool aborted = false;
  1738. + bool use_stdout = false;
  1739. + VkDevice device;
  1740. + VkPhysicalDevice physicalDevice;
  1741. + uint32_t adjusted_max_desc_sets;
  1742. + uint32_t desc_set_bind_index;
  1743. + VkDescriptorSetLayout debug_desc_layout = VK_NULL_HANDLE;
  1744. + VkDescriptorSetLayout dummy_desc_layout = VK_NULL_HANDLE;
  1745. + std::unique_ptr<UtilDescriptorSetManager> desc_set_manager;
  1746. + std::unordered_map<uint32_t, AIShaderTracker> shader_map;
  1747. + std::unordered_map<uint32_t, std::vector<unsigned int>> instrumentation_map;
  1748. + PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
  1749. + VmaAllocator vmaAllocator = {};
  1750. + std::map<VkQueue, UtilQueueBarrierCommandInfo> queue_barrier_command_infos;
  1751. + std::vector<AIBufferInfo>& GetBufferInfo(const VkCommandBuffer command_buffer) {
  1752. + auto buffer_list = command_buffer_map.find(command_buffer);
  1753. + if (buffer_list == command_buffer_map.end()) {
  1754. + std::vector<AIBufferInfo> new_list{};
  1755. + command_buffer_map[command_buffer] = new_list;
  1756. + return command_buffer_map[command_buffer];
  1757. + }
  1758. + return buffer_list->second;
  1759. + }
  1760. +
  1761. + // Subclass Hooks
  1762. +
  1763. + // Opportunity for inheriting classes to initialize
  1764. + // and parse vk_settings_file.txt settings.
  1765. + virtual void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) = 0;
  1766. +
  1767. + // Opportunity for inheriting classes to set the
  1768. + // buffer to nonzero values for use-cases like PGO.
  1769. + virtual void InitializeInstrumentationBuffer(uint32_t* buffer) = 0;
  1770. +
  1771. + // Opportunity for inheriting class to register auto-inst pass
  1772. + // as well as other passes of interest (e.g. performance)
  1773. + virtual void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) = 0;
  1774. +
  1775. + virtual void AnalyzeRayTracing(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
  1776. + uint32_t depth) = 0;
  1777. +
  1778. + virtual void AnalyzeGraphics(uint32_t* const output_buffer, bool buffer_overflowed) = 0;
  1779. +
  1780. + virtual void AnalyzeCompute(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) = 0;
  1781. +
  1782. + // Helper functions
  1783. + template <typename T>
  1784. + void ReportSetupProblem(T object, std::string specific_message) const;
  1785. + template <typename T>
  1786. + void ReportInfo(T object, std::string specific_message) const;
  1787. +
  1788. + // This function takes a disassembled SPIR-V module in |shader|
  1789. + // and adds the strings in |inst_id2str| immediately before
  1790. + // the instrumentation callsites with a given inst_id.
  1791. + //
  1792. + // After the function has finished it will return the shader
  1793. + // with annotations. This is designed to be used in conjunction
  1794. + // with SPIRV-cross.
  1795. + std::string AnnotateModuleStr(std::string& shader, std::unordered_map<uint32_t, std::string>& inst_id2str) const;
  1796. +
  1797. + // This function takes an annotated spir-v |shader| module as a string and
  1798. + // attempts to cross-compile it using SPIRV-cross to the corresponding glsl.
  1799. + //
  1800. + // After cross-compiling, a post-processing step changes the
  1801. + // #line directives that are added to valid GLSL comments
  1802. + //
  1803. + // SPIRV-cross fails frequently due to unsupported builtins and the fall-back
  1804. + // path is to emit the module as .spv not .glsl. Either the .spv or .glsl will be
  1805. + // written to |file_name|
  1806. + void TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const;
  1807. +
  1808. + // Returns a file name that includes the base_file_name, analysis type,
  1809. + // frame number and finally the |analysis_specific_suffix|.
  1810. + inline std::string FrameAnalysisFileName(std::string analysis_specific_suffix) const {
  1811. + std::string analysis_type;
  1812. + if (is_analyzing_compute) {
  1813. + analysis_type = "compute";
  1814. + } else if (is_analyzing_draw) {
  1815. + analysis_type = "draw";
  1816. + } else if (is_analyzing_rt) {
  1817. + analysis_type = "rt";
  1818. + } else {
  1819. + analysis_type = "unknown";
  1820. + }
  1821. + return base_file_name + analysis_type + "_frame" + std::to_string(frame_number) + "_" + analysis_specific_suffix;
  1822. + }
  1823. +
  1824. + // Returns a file name that includes the base_file_name, analysis type, analysis specific pipeline invocation index
  1825. + // frame number and finally the |analysis_specific_suffix|.
  1826. + inline std::string PipelineAnalysisFileName(std::string analysis_specific_suffix) const {
  1827. + std::string analysis_type;
  1828. + if (is_analyzing_compute) {
  1829. + analysis_type = "compute";
  1830. + } else if (is_analyzing_draw) {
  1831. + analysis_type = "draw";
  1832. + } else if (is_analyzing_rt) {
  1833. + analysis_type = "rt";
  1834. + } else {
  1835. + analysis_type = "unknown";
  1836. + }
  1837. + return base_file_name + analysis_type + std::to_string(analysis_index) + "_frame" + std::to_string(frame_number) + "_" +
  1838. + analysis_specific_suffix;
  1839. + }
  1840. +
  1841. + static inline std::string ShaderStageToString(uint32_t stage) {
  1842. + switch (stage) {
  1843. + case VK_SHADER_STAGE_RAYGEN_BIT_KHR:
  1844. + return "RayGen";
  1845. + case VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR:
  1846. + return "ClosestHit";
  1847. + case VK_SHADER_STAGE_CALLABLE_BIT_KHR:
  1848. + return "Callable";
  1849. + case VK_SHADER_STAGE_MISS_BIT_KHR:
  1850. + return "Miss";
  1851. + case VK_SHADER_STAGE_ANY_HIT_BIT_KHR:
  1852. + return "AnyHit";
  1853. + case VK_SHADER_STAGE_INTERSECTION_BIT_KHR:
  1854. + return "Intersection";
  1855. + case VK_SHADER_STAGE_GEOMETRY_BIT:
  1856. + return "Geometry";
  1857. + case VK_SHADER_STAGE_FRAGMENT_BIT:
  1858. + return "Fragment";
  1859. + case VK_SHADER_STAGE_COMPUTE_BIT:
  1860. + return "Compute";
  1861. + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
  1862. + return "TessellationControl";
  1863. + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
  1864. + return "TessellationEvaluation";
  1865. + case VK_SHADER_STAGE_VERTEX_BIT:
  1866. + return "Vertex";
  1867. + default:
  1868. + return "Unknown" + std::to_string(stage);
  1869. + }
  1870. + }
  1871. +
  1872. + // Create a PPM file with size |width| * |height| by writing the values in |colors| in row-major order
  1873. + void CreateImage(uint32_t width, uint32_t height, std::vector<char>& colors, std::string file_name) const;
  1874. + // Return a color represented as RGB from a value in the unit interval [0,1].
  1875. + std::tuple<char, char, char> UnitIntervalToRGB(float val) const;
  1876. +
  1877. + // Analyze the SPIR-V module binary |pgm| to determine which execution model it
  1878. + // implements and the corresponding VkShaderStageFlag.
  1879. + //
  1880. + // If the shader stage is not supported, or the shader module implements more than
  1881. + // one execution model then this function returns 0.
  1882. + // Otherwise it returns the single bit representation the shader stage.
  1883. + uint32_t FindShaderStage(std::vector<unsigned int> pgm) const;
  1884. +
  1885. + // Analyze the SPIR-V module binary |pgm| of a compute shader to determine the
  1886. + // localsize that it implements.
  1887. + // If the shader stage is not supported this function returns 0,0,0.
  1888. + // Otherwise it returns the x,y,z values of the localsize.
  1889. + std::tuple<uint32_t, uint32_t, uint32_t> FindComputeLocalSize(std::vector<unsigned int> pgm) const;
  1890. +
  1891. + // File name of cache file containing runtime instrumentation buffer size requirements.
  1892. + inline std::string RuntimeSizeCachePath(VkPipelineBindPoint bind_point) const {
  1893. + std::string pipeline_type;
  1894. + switch (bind_point) {
  1895. + case VK_PIPELINE_BIND_POINT_COMPUTE:
  1896. + pipeline_type = "compute";
  1897. + break;
  1898. + case VK_PIPELINE_BIND_POINT_GRAPHICS:
  1899. + pipeline_type = "graphics";
  1900. + break;
  1901. + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
  1902. + pipeline_type = "ray_tracing";
  1903. + break;
  1904. + default:
  1905. + pipeline_type = "Unknown";
  1906. + break;
  1907. + }
  1908. + return ".ai_runtime_size_cache_" + pipeline_type + ".bin";
  1909. + }
  1910. +
  1911. + // Attempts to read the cache file containing information about this application's
  1912. + // runtime buffer size requirements.
  1913. + //
  1914. + // This function populates the variable BufferSizeRequirementsLookup
  1915. + void TryReadRuntimeSizeCache(AutoInst* device_auto_inst);
  1916. +
  1917. + // Writes the current knowledge of the runtime instrumentation buffer requirements
  1918. + // to the cache file.
  1919. + void WriteRuntimeSizeCache();
  1920. +
  1921. + // Creates mappings from the data output by the 'UniqueSubgroupId' primitive.
  1922. + // |primitive_id2primitive_size| map determines how many words in the buffer
  1923. + // belong to a given primitive type.
  1924. + // |thread_id2subgroup_id_map| allows the lookup from thread id to subgroup id.
  1925. + // |thread_id_swizzle_map| allows for lookup of the original flat thread id
  1926. + // from the unique subgroup id and intra subgroup id.
  1927. + // |inst_id2prim_id| is invoked with the first word of every entry
  1928. + // this allows an analysis to specify custom inst id's and still
  1929. + // relate them to prim ids.
  1930. + // Returns true if creating mappings is successful, false otherwise.
  1931. + bool CreateUniqueSubgroupIdMappings(
  1932. + uint32_t* const debug_output_buffer, PrimitiveIdToPrimitiveSizeMap& primitive_id2primitive_size,
  1933. + ThreadIdToSubgroupIdMap& thread_id2subgroup_id_map, ThreadIdSwizzleMap& thread_id_swizzle_map,
  1934. + std::function<uint32_t(uint32_t inst_id)> inst_id2prim_id = [](uint32_t x) { return x; }) const;
  1935. +
  1936. + // Core auto-inst functionality
  1937. + bool InstrumentShader(const VkShaderModuleCreateInfo* pCreateInfo, std::vector<unsigned int>& new_pgm,
  1938. + uint32_t* unique_shader_id);
  1939. + void AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point);
  1940. +
  1941. + // Validation Layer hooks
  1942. + void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
  1943. + const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_create_info) override;
  1944. + void PostCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
  1945. + const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, VkResult result) override;
  1946. + void PreCallRecordDestroyDevice(VkDevice device, const VkAllocationCallbacks* pAllocator) override;
  1947. + void PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo,
  1948. + const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout,
  1949. + void* cpl_state_data) override;
  1950. + void PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo,
  1951. + const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout,
  1952. + VkResult result) override;
  1953. + void ResetCommandBuffer(VkCommandBuffer commandBuffer);
  1954. + bool PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent* pEvents,
  1955. + VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
  1956. + uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers,
  1957. + uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers,
  1958. + uint32_t imageMemoryBarrierCount,
  1959. + const VkImageMemoryBarrier* pImageMemoryBarriers) const override;
  1960. + void PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1961. + const VkGraphicsPipelineCreateInfo* pCreateInfos,
  1962. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
  1963. + void* cgpl_state_data) override;
  1964. + void PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1965. + const VkComputePipelineCreateInfo* pCreateInfos,
  1966. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
  1967. + void* ccpl_state_data) override;
  1968. + void PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1969. + const VkRayTracingPipelineCreateInfoNV* pCreateInfos,
  1970. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
  1971. + void* crtpl_state_data) override;
  1972. + void PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
  1973. + VkPipelineCache pipelineCache, uint32_t count,
  1974. + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
  1975. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
  1976. + void* crtpl_state_data) override;
  1977. + void PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1978. + const VkGraphicsPipelineCreateInfo* pCreateInfos,
  1979. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
  1980. + void* cgpl_state_data) override;
  1981. + void PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1982. + const VkComputePipelineCreateInfo* pCreateInfos,
  1983. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
  1984. + void* ccpl_state_data) override;
  1985. + void PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
  1986. + const VkRayTracingPipelineCreateInfoNV* pCreateInfos,
  1987. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
  1988. + void* crtpl_state_data) override;
  1989. + void PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
  1990. + VkPipelineCache pipelineCache, uint32_t count,
  1991. + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
  1992. + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
  1993. + VkResult result, void* crtpl_state_data) override;
  1994. +
  1995. + void PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks* pAllocator) override;
  1996. + void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
  1997. + const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
  1998. + void* csm_state_data) override;
  1999. + void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point,
  2000. + uint32_t operation_index, uint32_t* const debug_output_buffer);
  2001. + void PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
  2002. + uint32_t firstInstance) override;
  2003. + void PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
  2004. + uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) override;
  2005. + void PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
  2006. + uint32_t stride) override;
  2007. + void PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
  2008. + uint32_t stride) override;
  2009. + void PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) override;
  2010. + void PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) override;
  2011. + void PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
  2012. + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
  2013. + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
  2014. + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
  2015. + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
  2016. + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
  2017. + uint32_t width, uint32_t height, uint32_t depth) override;
  2018. + void PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
  2019. + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
  2020. + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
  2021. + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
  2022. + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
  2023. + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
  2024. + uint32_t width, uint32_t height, uint32_t depth) override;
  2025. + void PreCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
  2026. + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
  2027. + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
  2028. + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
  2029. + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width,
  2030. + uint32_t height, uint32_t depth) override;
  2031. + void PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
  2032. + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
  2033. + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
  2034. + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
  2035. + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width,
  2036. + uint32_t height, uint32_t depth) override;
  2037. + void PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
  2038. + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
  2039. + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
  2040. + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
  2041. + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
  2042. + VkDeviceAddress indirectDeviceAddress) override;
  2043. + void PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
  2044. + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
  2045. + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
  2046. + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
  2047. + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
  2048. + VkDeviceAddress indirectDeviceAddress) override;
  2049. + void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence,
  2050. + VkResult result) override;
  2051. + void PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* pPresentInfo, VkResult result) override;
  2052. +};
  2053. diff --git a/layers/auto_inst_divergence_characterization.cpp b/layers/auto_inst_divergence_characterization.cpp
  2054. new file mode 100644
  2055. index 00000000..adf2fd18
  2056. --- /dev/null
  2057. +++ b/layers/auto_inst_divergence_characterization.cpp
  2058. @@ -0,0 +1,157 @@
  2059. +/* Copyright (c) 2020 The Khronos Group Inc.
  2060. + *
  2061. + * Licensed under the Apache License, Version 2.0 (the "License");
  2062. + * you may not use this file except in compliance with the License.
  2063. + * You may obtain a copy of the License at
  2064. + *
  2065. + * http://www.apache.org/licenses/LICENSE-2.0
  2066. + *
  2067. + * Unless required by applicable law or agreed to in writing, software
  2068. + * distributed under the License is distributed on an "AS IS" BASIS,
  2069. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2070. + * See the License for the specific language governing permissions and
  2071. + * limitations under the License.
  2072. + *
  2073. + * Author: David Pankratz <pankratz@ualberta.ca>
  2074. + */
  2075. +
  2076. +#include "auto_inst_divergence_characterization.h"
  2077. +#include <bitset>
  2078. +#include <fstream>
  2079. +
  2080. +namespace {
  2081. +
  2082. +struct DivCharRecord {
  2083. + uint32_t inst_id;
  2084. + uint32_t flat_thread_id;
  2085. + uint32_t active_thread_mask;
  2086. +};
  2087. +
  2088. +} // namespace
  2089. +
  2090. +void AutoInstDivergenceCharacterization::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
  2091. +
  2092. +void AutoInstDivergenceCharacterization::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index,
  2093. + uint32_t shader_module_id) {
  2094. + auto static_data_callback = [&](std::unordered_map<uint32_t, uint32_t>&& inst_id2prim_id,
  2095. + std::unordered_map<uint32_t, uint32_t>&& inst_id2inst_count) {
  2096. + inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end());
  2097. + inst_id2inst_count_.insert(inst_id2inst_count.begin(), inst_id2inst_count.end());
  2098. + };
  2099. +
  2100. + optimizer->RegisterPass(
  2101. + spvtools::CreateAutoInstDivergenceCharacterizationPass(desc_bind_index, shader_module_id, static_data_callback));
  2102. +}
  2103. +
  2104. +void AutoInstDivergenceCharacterization::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed,
  2105. + uint32_t width, uint32_t height, uint32_t depth) {
  2106. + if (buffer_overflowed) {
  2107. + ReportSetupProblem(device, "Divergence characterization requires a complete execution trace. Aborting.\n");
  2108. + return;
  2109. + }
  2110. +
  2111. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  2112. + ReportInfo(device, "Analyzing divergence characterization for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  2113. +
  2114. + auto num_threads = width * height * depth;
  2115. +
  2116. + // Create mapping from inst_id to inst_size to determine stride
  2117. + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
  2118. + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
  2119. + {spvtools::kAutoInstDivCharPreTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
  2120. + {spvtools::kAutoInstDivCharPostTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
  2121. + {spvtools::kAutoInstDivCharQuitPipeline, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
  2122. + {spvtools::kAutoInstDivCharShaderEntryPoint, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
  2123. + {spvtools::kAutoInstDivCharActiveThreads, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
  2124. + };
  2125. +
  2126. + // Create subgroup id mapping to be populated
  2127. + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
  2128. + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
  2129. + auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id, thread_id_swizzle,
  2130. + [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; });
  2131. +
  2132. + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
  2133. + ReportSetupProblem(device, "Ray tracing pipeline timing analysis failed to acquire unique warp id maps. Aborting.\n");
  2134. + return;
  2135. + }
  2136. +
  2137. + std::unordered_map<uint32_t, std::vector<DivCharRecord>> subgroup_id2records;
  2138. +
  2139. + // Process the runtime timing data
  2140. + uint32_t j = 0;
  2141. + while (j < runtime_words_written) {
  2142. + auto inst_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
  2143. + auto prim_id = inst_id2prim_id_[inst_id];
  2144. +
  2145. + if (prim_id >= spvtools::kAutoInstDivCharPreTraceRay && prim_id <= spvtools::kAutoInstDivCharQuitPipeline) {
  2146. + auto subgroup_id = thread_id2subgroup_id[device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS + 1]];
  2147. + auto record = *reinterpret_cast<DivCharRecord*>(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
  2148. + subgroup_id2records[subgroup_id].push_back(record);
  2149. + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
  2150. + ReportSetupProblem(device, "Analysis received unrecognized primitive identifier. Aborting.\n");
  2151. + return;
  2152. + }
  2153. +
  2154. + j += prim_id2_prim_size[prim_id];
  2155. + }
  2156. +
  2157. + uint64_t return_divergence = 0;
  2158. + uint64_t control_flow_divergence = 0;
  2159. + uint64_t indirect_function_call_divergence = 0;
  2160. +
  2161. + for (auto warp_itr = subgroup_id2records.begin(); warp_itr != subgroup_id2records.end(); warp_itr++) {
  2162. + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
  2163. + std::vector<bool> recurse_thread_status;
  2164. + bool is_returned = false;
  2165. + bool is_indirect_func_killed = false;
  2166. + for (const auto& offset_itr : warp_itr->second) {
  2167. + auto prim_id = inst_id2prim_id_[offset_itr.inst_id];
  2168. + bool is_thread_active = (offset_itr.active_thread_mask & n) != 0;
  2169. +
  2170. + if (prim_id == spvtools::kAutoInstDivCharQuitPipeline) {
  2171. + is_returned = is_thread_active;
  2172. + continue;
  2173. + } else if (prim_id == spvtools::kAutoInstDivCharPreTraceRay) {
  2174. + recurse_thread_status.push_back(is_thread_active);
  2175. + continue;
  2176. + } else if (prim_id == spvtools::kAutoInstDivCharPostTraceRay) {
  2177. +
  2178. + recurse_thread_status.pop_back();
  2179. + if (is_thread_active) {
  2180. + is_indirect_func_killed = false;
  2181. + }
  2182. + continue;
  2183. + } else if (prim_id == spvtools::kAutoInstDivCharShaderEntryPoint) {
  2184. + is_indirect_func_killed = recurse_thread_status.back() && !is_thread_active;
  2185. + continue;
  2186. + }
  2187. +
  2188. + if (!is_thread_active) {
  2189. + if (inst_id2inst_count_.count(offset_itr.inst_id) == 0) {
  2190. + ReportSetupProblem(device, "Missing static instruction count data. Aborting.\n");
  2191. + return;
  2192. + }
  2193. + uint32_t num_insts = inst_id2inst_count_[offset_itr.inst_id];
  2194. +
  2195. + if (is_returned) {
  2196. + return_divergence += num_insts;
  2197. + } else if (is_indirect_func_killed) {
  2198. + // Thread was active at indirect function callsite but not here
  2199. + indirect_function_call_divergence += num_insts;
  2200. + } else {
  2201. + control_flow_divergence += num_insts;
  2202. + }
  2203. + }
  2204. + }
  2205. + }
  2206. + }
  2207. +
  2208. + ReportInfo(device, "Finished analyzing buffer!\n");
  2209. +
  2210. + std::ofstream csv_file;
  2211. + csv_file.open(FrameAnalysisFileName("divergence_characterization.csv"), std::ios_base::app);
  2212. + csv_file << "inst count, indirect func, early exit, control flow,\n";
  2213. + csv_file << "," << indirect_function_call_divergence << "," << return_divergence << "," << control_flow_divergence << ",\n";
  2214. + csv_file.close();
  2215. +}
  2216. \ No newline at end of file
  2217. diff --git a/layers/auto_inst_divergence_characterization.h b/layers/auto_inst_divergence_characterization.h
  2218. new file mode 100644
  2219. index 00000000..c0226d11
  2220. --- /dev/null
  2221. +++ b/layers/auto_inst_divergence_characterization.h
  2222. @@ -0,0 +1,48 @@
  2223. +/* Copyright (c) 2020 The Khronos Group Inc.
  2224. + *
  2225. + * Licensed under the Apache License, Version 2.0 (the "License");
  2226. + * you may not use this file except in compliance with the License.
  2227. + * You may obtain a copy of the License at
  2228. + *
  2229. + * http://www.apache.org/licenses/LICENSE-2.0
  2230. + *
  2231. + * Unless required by applicable law or agreed to in writing, software
  2232. + * distributed under the License is distributed on an "AS IS" BASIS,
  2233. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2234. + * See the License for the specific language governing permissions and
  2235. + * limitations under the License.
  2236. + *
  2237. + * Author: David Pankratz <pankratz@ualberta.ca>
  2238. + */
  2239. +
  2240. +#pragma once
  2241. +
  2242. +#include "auto_inst.h"
  2243. +
  2244. +class AutoInstDivergenceCharacterization;
  2245. +
  2246. +class AutoInstDivergenceCharacterization : public AutoInst {
  2247. + private:
  2248. + std::unordered_map<uint32_t, uint32_t> inst_id2prim_id_;
  2249. + std::unordered_map<uint32_t, uint32_t> inst_id2inst_count_;
  2250. +
  2251. + public:
  2252. + AutoInstDivergenceCharacterization() : AutoInst() {}
  2253. +
  2254. + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
  2255. +
  2256. + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
  2257. +
  2258. + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
  2259. +
  2260. + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
  2261. + uint32_t depth) override;
  2262. +
  2263. + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
  2264. + ReportSetupProblem(device, "Divergence analysis is not compatible with draw commands.");
  2265. + }
  2266. +
  2267. + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
  2268. + ReportSetupProblem(device, "Divergence analysis is not compatible with compute commands.");
  2269. + }
  2270. +};
  2271. diff --git a/layers/auto_inst_dyn_shader_trace.cpp b/layers/auto_inst_dyn_shader_trace.cpp
  2272. new file mode 100644
  2273. index 00000000..7b11cc0b
  2274. --- /dev/null
  2275. +++ b/layers/auto_inst_dyn_shader_trace.cpp
  2276. @@ -0,0 +1,177 @@
  2277. +/* Copyright (c) 2020 The Khronos Group Inc.
  2278. + *
  2279. + * Licensed under the Apache License, Version 2.0 (the "License");
  2280. + * you may not use this file except in compliance with the License.
  2281. + * You may obtain a copy of the License at
  2282. + *
  2283. + * http://www.apache.org/licenses/LICENSE-2.0
  2284. + *
  2285. + * Unless required by applicable law or agreed to in writing, software
  2286. + * distributed under the License is distributed on an "AS IS" BASIS,
  2287. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2288. + * See the License for the specific language governing permissions and
  2289. + * limitations under the License.
  2290. + *
  2291. + * Author: David Pankratz <pankratz@ualberta.ca>
  2292. + */
  2293. +
  2294. +#include "auto_inst_dyn_shader_trace.h"
  2295. +#include <bitset>
  2296. +#include <fstream>
  2297. +#include <algorithm>
  2298. +
  2299. +namespace {
  2300. +struct ShaderExecutionRecord {
  2301. + uint32_t prim_id;
  2302. + uint32_t flat_thread_id;
  2303. + uint32_t shader_id;
  2304. + uint32_t active_thread_mask;
  2305. +};
  2306. +
  2307. +} // namespace
  2308. +
  2309. +void AutoInstDynShaderTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
  2310. +
  2311. +void AutoInstDynShaderTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
  2312. + optimizer->RegisterPass(spvtools::CreateAutoInstDynShaderTracePass(desc_bind_index, shader_module_id));
  2313. +}
  2314. +
  2315. +void AutoInstDynShaderTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width,
  2316. + uint32_t height, uint32_t depth) {
  2317. + if (buffer_overflowed) {
  2318. + ReportSetupProblem(device, "Ray tracing dynamic shader trace analysis requires a complete execution trace. Aborting.\n");
  2319. + return;
  2320. + }
  2321. +
  2322. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  2323. + ReportInfo(device, "Analyzing " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  2324. +
  2325. + auto num_threads = width * height * depth;
  2326. +
  2327. + // Create mapping from inst_id to inst_size to determine stride
  2328. + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
  2329. + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
  2330. + {spvtools::kAutoInstDynShaderTraceEntryPoint, (uint32_t)(sizeof(ShaderExecutionRecord) / sizeof(uint32_t))},
  2331. + };
  2332. +
  2333. + // Create warp id mapping to be populated
  2334. + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
  2335. + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
  2336. + auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size,
  2337. + thread_id2subgroup_id, thread_id_swizzle);
  2338. +
  2339. + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
  2340. + ReportSetupProblem(device, "Ray tracing pipeline timing analysis failed to acquire unique warp id maps. Aborting.\n");
  2341. + return;
  2342. + }
  2343. +
  2344. + // For heatmap
  2345. + uint32_t max_thread_exe_count = 0;
  2346. + uint32_t max_subgroup_exe_count = 0;
  2347. +
  2348. + std::unordered_map<uint32_t, uint32_t> thread_id2dyn_count;
  2349. + std::unordered_map<uint32_t, uint32_t> subgroup_id2dyn_count;
  2350. + // For CSV, ordered map for sensible output
  2351. + std::map<uint32_t, uint32_t> shader_id2dyn_count;
  2352. +
  2353. + // Process the runtime timing data
  2354. + uint32_t j = 0;
  2355. + while (j < runtime_words_written) {
  2356. + auto prim_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
  2357. +
  2358. + if (prim_id == spvtools::kAutoInstDynShaderTraceEntryPoint) {
  2359. + auto shader_exe_record = reinterpret_cast<ShaderExecutionRecord*>(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
  2360. + auto subgroup_id = thread_id2subgroup_id[shader_exe_record->flat_thread_id];
  2361. + for (uint32_t i = 0; i < SUBGROUP_SIZE; i++) {
  2362. + if (shader_exe_record->active_thread_mask & (1 << i)) {
  2363. + auto shader_id = shader_exe_record->shader_id;
  2364. + // For every active thread ending the pipeline, compute its complete timing
  2365. + auto swizzled_id = thread_id_swizzle[subgroup_id * SUBGROUP_SIZE + i];
  2366. + thread_id2dyn_count[swizzled_id]++;
  2367. + max_thread_exe_count = std::max(max_thread_exe_count, thread_id2dyn_count[swizzled_id]);
  2368. + shader_id2dyn_count[shader_id]++;
  2369. + }
  2370. + }
  2371. + subgroup_id2dyn_count[subgroup_id]++;
  2372. + max_subgroup_exe_count = std::max(max_subgroup_exe_count, subgroup_id2dyn_count[subgroup_id]);
  2373. +
  2374. + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
  2375. + ReportSetupProblem(device, "Encountered unsupported primtive type in Ray tracing thread timing analysis. Aborting.");
  2376. + return;
  2377. + }
  2378. +
  2379. + j += prim_id2_prim_size[prim_id];
  2380. + }
  2381. +
  2382. + // Generate csv
  2383. +
  2384. + {
  2385. + // Output dyn opcode count
  2386. + std::stringstream line0, line1;
  2387. + line0 << "shader,";
  2388. + line1 << "dyn exe count,";
  2389. + for (auto& entry : shader_id2dyn_count) {
  2390. + auto shader_stage_name = ShaderStageToString(shader_map[entry.first].stage);
  2391. + line0 << shader_stage_name << "(" << entry.first << ")"
  2392. + << ",";
  2393. + line1 << entry.second << ",";
  2394. + }
  2395. + line0 << "\n";
  2396. + line1 << "\n";
  2397. +
  2398. + std::ofstream csv_file;
  2399. + csv_file.open(PipelineAnalysisFileName("dyn_shader_counts.csv"));
  2400. + csv_file << line0.str() << line1.str();
  2401. + csv_file.close();
  2402. + }
  2403. +
  2404. + {
  2405. + std::vector<char> colors(num_threads * 3);
  2406. + for (uint32_t y = 0; y < height; y++) {
  2407. + for (uint32_t x = 0; x < width; x++) {
  2408. + for (uint32_t z = 0; z < depth; z++) {
  2409. + auto thread_id = z * (width * height) + y * width + x;
  2410. +
  2411. + auto rgb = UnitIntervalToRGB((float)(thread_id2dyn_count[thread_id] / (float)max_thread_exe_count));
  2412. + uint32_t out_index = 0;
  2413. +
  2414. + if (depth > 1) // This is Quake II RTX specific
  2415. + out_index = y * (width * depth) + x * 2 + z;
  2416. + else
  2417. + out_index = y * width + x;
  2418. +
  2419. + colors[3 * out_index + 0] = std::get<0>(rgb);
  2420. + colors[3 * out_index + 1] = std::get<1>(rgb);
  2421. + colors[3 * out_index + 2] = std::get<2>(rgb);
  2422. + }
  2423. + }
  2424. + }
  2425. +
  2426. + CreateImage(width * depth, height, colors, PipelineAnalysisFileName("shader_execution_heatmap"));
  2427. + }
  2428. +
  2429. + {
  2430. + std::vector<char> colors(num_threads * 3);
  2431. + for (uint32_t y = 0; y < height; y++) {
  2432. + for (uint32_t x = 0; x < width; x++) {
  2433. + for (uint32_t z = 0; z < depth; z++) {
  2434. + auto thread_id = z * (width * height) + y * width + x;
  2435. + auto subgroup_id = thread_id2subgroup_id[thread_id];
  2436. + auto rgb = UnitIntervalToRGB((float)(subgroup_id2dyn_count[subgroup_id] / (float)max_subgroup_exe_count));
  2437. + uint32_t out_index = 0;
  2438. +
  2439. + if (depth > 1) // This is Quake II RTX specific
  2440. + out_index = y * (width * depth) + x * 2 + z;
  2441. + else
  2442. + out_index = y * width + x;
  2443. +
  2444. + colors[3 * out_index + 0] = std::get<0>(rgb);
  2445. + colors[3 * out_index + 1] = std::get<1>(rgb);
  2446. + colors[3 * out_index + 2] = std::get<2>(rgb);
  2447. + }
  2448. + }
  2449. + }
  2450. +
  2451. + CreateImage(width * depth, height, colors, PipelineAnalysisFileName("subgroup_shader_execution_heatmap"));
  2452. + }
  2453. +}
  2454. \ No newline at end of file
  2455. diff --git a/layers/auto_inst_dyn_shader_trace.h b/layers/auto_inst_dyn_shader_trace.h
  2456. new file mode 100644
  2457. index 00000000..02e8b99a
  2458. --- /dev/null
  2459. +++ b/layers/auto_inst_dyn_shader_trace.h
  2460. @@ -0,0 +1,44 @@
  2461. +/* Copyright (c) 2020 The Khronos Group Inc.
  2462. + *
  2463. + * Licensed under the Apache License, Version 2.0 (the "License");
  2464. + * you may not use this file except in compliance with the License.
  2465. + * You may obtain a copy of the License at
  2466. + *
  2467. + * http://www.apache.org/licenses/LICENSE-2.0
  2468. + *
  2469. + * Unless required by applicable law or agreed to in writing, software
  2470. + * distributed under the License is distributed on an "AS IS" BASIS,
  2471. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2472. + * See the License for the specific language governing permissions and
  2473. + * limitations under the License.
  2474. + *
  2475. + * Author: David Pankratz <pankratz@ualberta.ca>
  2476. + */
  2477. +
  2478. +#pragma once
  2479. +
  2480. +#include "auto_inst.h"
  2481. +
  2482. +class AutoInstDynShaderTrace;
  2483. +
  2484. +class AutoInstDynShaderTrace : public AutoInst {
  2485. + public:
  2486. + AutoInstDynShaderTrace() : AutoInst() {}
  2487. +
  2488. + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
  2489. +
  2490. + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
  2491. +
  2492. + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
  2493. +
  2494. + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
  2495. + uint32_t depth) override;
  2496. +
  2497. + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
  2498. + ReportSetupProblem(device, "Dynamic shader trace for graphics is not yet implemented!\n");
  2499. + }
  2500. +
  2501. + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
  2502. + ReportSetupProblem(device, "Dynamic shader trace for compute is not yet implemented!\n");
  2503. + }
  2504. +};
  2505. diff --git a/layers/auto_inst_dyn_trace_ray_trace.cpp b/layers/auto_inst_dyn_trace_ray_trace.cpp
  2506. new file mode 100644
  2507. index 00000000..cec184b0
  2508. --- /dev/null
  2509. +++ b/layers/auto_inst_dyn_trace_ray_trace.cpp
  2510. @@ -0,0 +1,223 @@
  2511. +/* Copyright (c) 2020 The Khronos Group Inc.
  2512. + *
  2513. + * Licensed under the Apache License, Version 2.0 (the "License");
  2514. + * you may not use this file except in compliance with the License.
  2515. + * You may obtain a copy of the License at
  2516. + *
  2517. + * http://www.apache.org/licenses/LICENSE-2.0
  2518. + *
  2519. + * Unless required by applicable law or agreed to in writing, software
  2520. + * distributed under the License is distributed on an "AS IS" BASIS,
  2521. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2522. + * See the License for the specific language governing permissions and
  2523. + * limitations under the License.
  2524. + *
  2525. + * Author: David Pankratz <pankratz@ualberta.ca>
  2526. + */
  2527. +
  2528. +#include "auto_inst_dyn_trace_ray_trace.h"
  2529. +#include <bitset>
  2530. +#include <fstream>
  2531. +
  2532. +namespace {} // namespace
  2533. +
  2534. +void AutoInstDynTraceRayTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {
  2535. + inst_id2prim_id_.clear();
  2536. + merge_id2div_ids_.clear();
  2537. +}
  2538. +
  2539. +void AutoInstDynTraceRayTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
  2540. + auto static_data_callback = [&](std::unordered_map<uint32_t, uint32_t>&& inst_id2prim_id,
  2541. + std::unordered_map<uint32_t, std::vector<uint32_t>>&& merge_id2div_ids) {
  2542. + inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end());
  2543. + merge_id2div_ids_.insert(merge_id2div_ids.begin(), merge_id2div_ids.end());
  2544. + };
  2545. + optimizer->RegisterPass(spvtools::CreateAutoInstDynTraceRayTracePass(desc_bind_index, shader_module_id, static_data_callback));
  2546. +}
  2547. +
  2548. +void AutoInstDynTraceRayTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width,
  2549. + uint32_t height, uint32_t depth) {
  2550. + if (buffer_overflowed) {
  2551. + ReportSetupProblem(device,
  2552. + "Dynamic traceRay trace analysis cannot produce a valid result without a complete execution trace.\n");
  2553. + return;
  2554. + }
  2555. +
  2556. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  2557. + ReportInfo(device, "Analyzing dynamic traceRay trace for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  2558. +
  2559. + auto num_subgroup_ids = device_output_buffer[NUM_SUBGROUP_IDS_INDEX];
  2560. + auto num_threads = width * height * depth;
  2561. +
  2562. + // Create mapping from inst_id to inst_size to determine stride
  2563. + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
  2564. + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
  2565. + {spvtools::kAutoInstTraceRayTracePreTraceRay, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))},
  2566. + {spvtools::kAutoInstTraceRayTraceMergePoint, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))},
  2567. + };
  2568. +
  2569. + // Create subgroup id mapping to be populated
  2570. + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
  2571. + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
  2572. + auto res =
  2573. + CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id,
  2574. + thread_id_swizzle, [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; });
  2575. +
  2576. + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
  2577. + ReportSetupProblem(device, "Failed to acquire unique subgroup id maps. Aborting.\n");
  2578. + return;
  2579. + }
  2580. +
  2581. + {
  2582. + // For thread compaction
  2583. + std::unordered_map<uint32_t, std::unordered_map<uint32_t, std::vector<bool>>> thread_paths;
  2584. + std::unordered_map<uint32_t, std::unordered_map<uint32_t, uint32_t>> merge_visit_count;
  2585. + std::unordered_map<uint32_t, uint32_t> max_visit_count;
  2586. + std::set<uint32_t> points_of_interest;
  2587. +
  2588. + uint32_t j = 0;
  2589. + while (j < runtime_words_written) {
  2590. + auto inst_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
  2591. + auto prim_id = inst_id2prim_id_[inst_id];
  2592. + if (prim_id == spvtools::kAutoInstTraceRayTracePreTraceRay) {
  2593. + // Record a positive result (thread executed traceRay)
  2594. + auto entry = reinterpret_cast<DynTraceRayTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
  2595. + auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id];
  2596. +
  2597. + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
  2598. + auto bit = entry->active_thread_mask & n;
  2599. + if (bit) {
  2600. + auto thread_id = m + SUBGROUP_SIZE * subgroup_id;
  2601. + thread_paths[inst_id][thread_id].push_back(true);
  2602. + max_visit_count[inst_id] = (thread_paths[inst_id][thread_id].size() > max_visit_count[inst_id])
  2603. + ? (uint32_t)thread_paths[inst_id][thread_id].size()
  2604. + : max_visit_count[inst_id];
  2605. + }
  2606. + }
  2607. + if (points_of_interest.count(inst_id) == 0) {
  2608. + points_of_interest.insert(inst_id);
  2609. + }
  2610. +
  2611. + } else if (prim_id == spvtools::kAutoInstTraceRayTraceMergePoint) {
  2612. + auto entry = reinterpret_cast<DynTraceRayTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
  2613. + auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id];
  2614. + // Record negative result if necessary (thread skipped traceRay)
  2615. + for (auto& label_it : merge_id2div_ids_[inst_id]) {
  2616. + if (points_of_interest.count(label_it) == 0) continue;
  2617. + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
  2618. + auto thread_id = m + SUBGROUP_SIZE * subgroup_id;
  2619. + if ((entry->active_thread_mask & n) == 0) continue;
  2620. + merge_visit_count[label_it][thread_id]++;
  2621. +
  2622. + if (thread_paths[label_it][thread_id].size() >= merge_visit_count[label_it][thread_id]) {
  2623. + merge_visit_count[label_it][thread_id] = (uint32_t)thread_paths[label_it][thread_id].size();
  2624. + continue; // Has been set due to active thread taking branch
  2625. + }
  2626. +
  2627. + thread_paths[label_it][thread_id].push_back(false);
  2628. + max_visit_count[label_it] = (thread_paths[label_it][thread_id].size() > max_visit_count[label_it])
  2629. + ? (uint32_t)thread_paths[label_it][thread_id].size()
  2630. + : max_visit_count[label_it];
  2631. + }
  2632. + }
  2633. + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
  2634. + ReportSetupProblem(device, "Unrecognized primitive. Aborting.\n");
  2635. + return;
  2636. + }
  2637. +
  2638. + j += prim_id2_prim_size[prim_id];
  2639. + }
  2640. +
  2641. + // Done analyzing StorageBuffer
  2642. + const int MAX_PATH_LEN = 1024;
  2643. +
  2644. + // Flatten the thread paths according to the maximum dynamic invocation count
  2645. + // of each traceRay callsite
  2646. + // Consider thread A that executed an inner loop once for 3 iterations of an outer loop
  2647. + // vs thread B that executed in the inner loop 3 times for 3 iterations of an outer loop
  2648. + // Before flattening:
  2649. + // thread A: 111
  2650. + // thread B: 111111111
  2651. + // After flattening
  2652. + // thread A: 001001001
  2653. + // thread B: 111111111
  2654. + std::unordered_map<std::bitset<MAX_PATH_LEN>, uint32_t> flat_path_count;
  2655. + std::bitset<MAX_PATH_LEN> flat_thread_path;
  2656. + for (uint32_t thread_id = 0; thread_id < num_threads; thread_id++) {
  2657. + std::size_t k = 0;
  2658. + flat_thread_path.reset();
  2659. + for (auto& label_id : points_of_interest) {
  2660. + j = 0;
  2661. + for (j = 0; j < thread_paths[label_id][thread_id].size(); j++) {
  2662. + if (thread_paths[label_id][thread_id][j]) {
  2663. + flat_thread_path.set(k, 1);
  2664. + }
  2665. + k += 1;
  2666. + }
  2667. + if (j > max_visit_count[label_id]) {
  2668. + ReportSetupProblem(device, "Max visit count not set correctly. Aborting\n");
  2669. + return;
  2670. + }
  2671. + k += max_visit_count[label_id] - j;
  2672. + if (k > MAX_PATH_LEN)
  2673. + ReportSetupProblem(device, ("Encountered more than " + std::to_string(MAX_PATH_LEN) + " branches!").c_str());
  2674. + }
  2675. + flat_path_count[flat_thread_path]++;
  2676. + }
  2677. +
  2678. + // Record thread paths and their respective counts
  2679. + std::ofstream csv_file;
  2680. + csv_file.open(PipelineAnalysisFileName("thread_paths.csv"));
  2681. + csv_file << "path,count,\n";
  2682. + for (auto& path_it : flat_path_count) csv_file << path_it.first << "," << path_it.second << "\n";
  2683. + csv_file.close();
  2684. +
  2685. + csv_file.open(PipelineAnalysisFileName("thread_compaction.csv"));
  2686. +
  2687. + ReportInfo(device, "Done simulated threads\n");
  2688. + for (auto& poi_label : points_of_interest) {
  2689. + csv_file << poi_label << "\n";
  2690. + std::vector<uint32_t> active_threads;
  2691. + std::vector<uint32_t> active_threads_per_window;
  2692. + std::vector<uint32_t> total_threads;
  2693. + for (uint32_t window_size = 1; window_size < num_subgroup_ids * 2; window_size <<= 1) {
  2694. + active_threads.clear();
  2695. + total_threads.clear();
  2696. + // window size unit is subgroups
  2697. + for (uint32_t window_base = 0; window_base < num_subgroup_ids; window_base += window_size) {
  2698. + active_threads_per_window.clear();
  2699. + for (uint32_t window_offset = 0; window_offset < window_size; window_offset++) {
  2700. + if (window_base + window_offset >= num_subgroup_ids) continue;
  2701. + for (uint32_t thread_offset = 0; thread_offset < SUBGROUP_SIZE; thread_offset++) {
  2702. + auto thread_id = (window_base + window_offset) * SUBGROUP_SIZE + thread_offset;
  2703. + auto num_visits = thread_paths[poi_label][thread_id].size();
  2704. + if (active_threads_per_window.size() < num_visits) active_threads_per_window.resize(num_visits);
  2705. + for (uint32_t visit_count = 0; visit_count < num_visits; visit_count++) {
  2706. + if (thread_paths[poi_label][thread_id][visit_count]) active_threads_per_window[visit_count]++;
  2707. + }
  2708. + }
  2709. + }
  2710. +
  2711. + if (active_threads.size() < active_threads_per_window.size()) {
  2712. + active_threads.resize(active_threads_per_window.size());
  2713. + total_threads.resize(active_threads.size());
  2714. + }
  2715. +
  2716. + for (uint32_t visit_count = 0; visit_count < active_threads_per_window.size(); visit_count++) {
  2717. + active_threads[visit_count] += active_threads_per_window[visit_count];
  2718. + total_threads[visit_count] += ((active_threads_per_window[visit_count] / SUBGROUP_SIZE) +
  2719. + ((active_threads_per_window[visit_count] % SUBGROUP_SIZE != 0) ? 1 : 0)) *
  2720. + SUBGROUP_SIZE;
  2721. + }
  2722. + }
  2723. + for (uint32_t visit_count = 0; visit_count < active_threads.size(); visit_count++) {
  2724. + if (active_threads[visit_count] == 0 && total_threads[visit_count] == 0) continue;
  2725. +
  2726. + csv_file << "," << window_size << "," << visit_count << "," << active_threads[visit_count] << "/"
  2727. + << total_threads[visit_count] << "\n";
  2728. + }
  2729. + }
  2730. + }
  2731. + csv_file.close();
  2732. + }
  2733. +}
  2734. \ No newline at end of file
  2735. diff --git a/layers/auto_inst_dyn_trace_ray_trace.h b/layers/auto_inst_dyn_trace_ray_trace.h
  2736. new file mode 100644
  2737. index 00000000..769b0cc4
  2738. --- /dev/null
  2739. +++ b/layers/auto_inst_dyn_trace_ray_trace.h
  2740. @@ -0,0 +1,55 @@
  2741. +/* Copyright (c) 2020 The Khronos Group Inc.
  2742. + *
  2743. + * Licensed under the Apache License, Version 2.0 (the "License");
  2744. + * you may not use this file except in compliance with the License.
  2745. + * You may obtain a copy of the License at
  2746. + *
  2747. + * http://www.apache.org/licenses/LICENSE-2.0
  2748. + *
  2749. + * Unless required by applicable law or agreed to in writing, software
  2750. + * distributed under the License is distributed on an "AS IS" BASIS,
  2751. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2752. + * See the License for the specific language governing permissions and
  2753. + * limitations under the License.
  2754. + *
  2755. + * Author: David Pankratz <pankratz@ualberta.ca>
  2756. + */
  2757. +
  2758. +#pragma once
  2759. +
  2760. +#include "auto_inst.h"
  2761. +
  2762. +class AutoInstDynTraceRayTrace;
  2763. +
  2764. +struct DynTraceRayTraceRecord {
  2765. + uint32_t inst_id;
  2766. + uint32_t flat_thread_id;
  2767. + uint32_t active_thread_mask;
  2768. +};
  2769. +
  2770. +class AutoInstDynTraceRayTrace : public AutoInst {
  2771. + private:
  2772. + std::unordered_map<uint32_t, uint32_t> inst_id2prim_id_;
  2773. + std::unordered_map<uint32_t, std::vector<uint32_t>> merge_id2div_ids_;
  2774. +
  2775. + public:
  2776. + AutoInstDynTraceRayTrace() : AutoInst() {}
  2777. +
  2778. + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
  2779. +
  2780. + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
  2781. +
  2782. + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
  2783. +
  2784. + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
  2785. + uint32_t depth) override;
  2786. +
  2787. +
  2788. + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
  2789. + ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support graphics.\n");
  2790. + };
  2791. +
  2792. + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
  2793. + ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support compute.\n");
  2794. + };
  2795. +};
  2796. diff --git a/layers/auto_inst_execution_trace.cpp b/layers/auto_inst_execution_trace.cpp
  2797. new file mode 100644
  2798. index 00000000..6f27d804
  2799. --- /dev/null
  2800. +++ b/layers/auto_inst_execution_trace.cpp
  2801. @@ -0,0 +1,174 @@
  2802. +/* Copyright (c) 2020 The Khronos Group Inc.
  2803. + *
  2804. + * Licensed under the Apache License, Version 2.0 (the "License");
  2805. + * you may not use this file except in compliance with the License.
  2806. + * You may obtain a copy of the License at
  2807. + *
  2808. + * http://www.apache.org/licenses/LICENSE-2.0
  2809. + *
  2810. + * Unless required by applicable law or agreed to in writing, software
  2811. + * distributed under the License is distributed on an "AS IS" BASIS,
  2812. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2813. + * See the License for the specific language governing permissions and
  2814. + * limitations under the License.
  2815. + *
  2816. + * Author: David Pankratz <pankratz@ualberta.ca>
  2817. + */
  2818. +
  2819. +#include "auto_inst_execution_trace.h"
  2820. +#include <bitset>
  2821. +#include <fstream>
  2822. +
  2823. +namespace {
  2824. +
// One record per instrumented callsite visit: which callsite fired and which
// subgroup lanes were active. Layout must match the device-side pass.
struct ExecutionTraceRecord {
    uint32_t inst_id;             // instrumentation id; top 12 bits are the shader id (see shader_id())
    uint32_t active_thread_mask;  // bit i set => lane i was active at the callsite
};
  2829. +
  2830. +static inline uint32_t shader_id(uint32_t inst_id) { return (inst_id & 0xFFF00000) >> 20; }
  2831. +
  2832. +} // namespace
  2833. +
// No device-level settings need to be shared for the execution-trace
// analysis, so this hook is intentionally a no-op.
void AutoInstExecutionTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
  2835. +
// Registers the SPIR-V execution-trace pass and wires up the callback through
// which the pass hands back its static per-instruction data.
void AutoInstExecutionTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
    // In order to compute the dynamic instruction execution count of the pipeline, it is necessary to know all the other
    // instructions in the same basic block as the instrumentation callsite. This callback allows the auto-inst pass to
    // populate such a mapping.
    // NOTE(review): the lambda captures this object via [&]; it must not be
    // invoked after this layer is destroyed — confirm the pass only calls it
    // while the optimizer runs.
    auto static_data_callback = [&](std::unordered_map<uint32_t, std::set<uint32_t>>&& inst_id2bb_inst_ids,
                                    std::unordered_map<uint32_t, uint32_t>&& inst_id2opcode) {
        // Merge into the layer-wide tables; insert() keeps existing entries on
        // duplicate keys.
        inst_id2bb_inst_ids_.insert(inst_id2bb_inst_ids.begin(), inst_id2bb_inst_ids.end());
        inst_id2opcode_.insert(inst_id2opcode.begin(), inst_id2opcode.end());
    };
    optimizer->RegisterPass(spvtools::CreateAutoInstExecutionTracePass(desc_bind_index, shader_module_id, static_data_callback));
}
  2847. +
  2848. +void AutoInstExecutionTrace::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
  2849. + if (buffer_overflowed) {
  2850. + ReportSetupProblem(device, "Execution trace analysis cannot produce a valid result without a complete execution trace.\n");
  2851. + return;
  2852. + }
  2853. +
  2854. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  2855. + ReportInfo(device, "Analyzing execution trace for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  2856. +
  2857. + struct ActiveTotalThreadCounts {
  2858. + uint32_t active_count;
  2859. + uint32_t total_count;
  2860. +
  2861. + float inline SimtEfficiency() const { return (float)active_count / (float)total_count; }
  2862. + };
  2863. +
  2864. + std::map<uint32_t, uint32_t> opcode2dyn_execution_count;
  2865. +
  2866. + // For annotated shaders
  2867. + std::map<uint32_t, uint32_t> inst_id2dyn_execution_count;
  2868. + std::map<uint32_t, ActiveTotalThreadCounts> inst_id2active_and_total_thread_counts;
  2869. +
  2870. + uint32_t j = 0;
  2871. + while (j < runtime_words_written) {
  2872. + const auto output_record = reinterpret_cast<ExecutionTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
  2873. + uint32_t active_thread_count = (uint32_t)std::bitset<SUBGROUP_SIZE>(output_record->active_thread_mask).count();
  2874. +
  2875. + if (inst_id2bb_inst_ids_.count(output_record->inst_id) == 0) {
  2876. + ReportSetupProblem(device, "Execution trace was unable to locate instrumentation id=" +
  2877. + std::to_string(output_record->inst_id) + "in static mapping. Aborting.\n");
  2878. + return;
  2879. + }
  2880. +
  2881. + // Add to the opcode totals based on how many threads were active
  2882. + for (const auto& inst_id : inst_id2bb_inst_ids_[output_record->inst_id]) {
  2883. + inst_id2dyn_execution_count[inst_id] += active_thread_count;
  2884. + inst_id2active_and_total_thread_counts[inst_id].active_count += active_thread_count;
  2885. + inst_id2active_and_total_thread_counts[inst_id].total_count += SUBGROUP_SIZE;
  2886. + auto opcode = inst_id2opcode_[inst_id];
  2887. + if (inst_id2opcode_.count(inst_id) == 0) {
  2888. + ReportSetupProblem(device, "Encountered instruction id without a corresponding Opcode. Aborting.\n");
  2889. + return;
  2890. + }
  2891. + opcode2dyn_execution_count[opcode] += active_thread_count;
  2892. + }
  2893. +
  2894. + j += sizeof(ExecutionTraceRecord) / sizeof(uint32_t);
  2895. + }
  2896. +
  2897. + {
  2898. + // Output dyn opcode count
  2899. + std::stringstream line0, line1;
  2900. + line0 << "opcode,";
  2901. + line1 << "dyn exe count,";
  2902. + for (auto entry : opcode2dyn_execution_count) {
  2903. + line0 << entry.first << ",";
  2904. + line1 << entry.second << ",";
  2905. + }
  2906. + line0 << "\n";
  2907. + line1 << "\n";
  2908. +
  2909. + std::ofstream csv_file;
  2910. + csv_file.open(PipelineAnalysisFileName("dyn_opcode_counts.csv"));
  2911. + csv_file << line0.str() << line1.str();
  2912. + csv_file.close();
  2913. + }
  2914. + {
  2915. + // Output hotspots
  2916. + std::stringstream line0, line1, line2;
  2917. + line0 << "pc,";
  2918. + line1 << "dyn exe count,";
  2919. + line2 << "simt efficiency,";
  2920. + for (auto entry : inst_id2dyn_execution_count) {
  2921. + line0 << entry.first << ",";
  2922. + line1 << entry.second << ",";
  2923. + line2 << inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency() << ",";
  2924. + }
  2925. + line0 << "\n";
  2926. + line1 << "\n";
  2927. + line2 << "\n";
  2928. +
  2929. + ActiveTotalThreadCounts combined = {0, 0};
  2930. + for (auto entry : inst_id2active_and_total_thread_counts) {
  2931. + combined.active_count += entry.second.active_count;
  2932. + combined.total_count += entry.second.total_count;
  2933. + }
  2934. +
  2935. + std::ofstream csv_file;
  2936. + csv_file.open(PipelineAnalysisFileName("hotspots.csv"));
  2937. + csv_file << line0.str() << line1.str() << line2.str() << "Overall SIMT efficiency=" << combined.SimtEfficiency() << "\n";
  2938. + csv_file.close();
  2939. + }
  2940. + {
  2941. + std::set<uint32_t> shaders_with_data;
  2942. + // Output annotated shaders
  2943. + std::unordered_map<uint32_t, std::string> annotations;
  2944. + for (const auto& entry : inst_id2bb_inst_ids_) {
  2945. + auto instrumentation_id = entry.first;
  2946. + auto visits = inst_id2dyn_execution_count[instrumentation_id];
  2947. + if (visits > 0) {
  2948. + auto simt_efficiency = inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency();
  2949. + shaders_with_data.insert(shader_id(entry.first));
  2950. + annotations[instrumentation_id] =
  2951. + "thread_executions=" + std::to_string(visits) + ". SIMT Efficiency=" + std::to_string(simt_efficiency);
  2952. + }
  2953. + }
  2954. +
  2955. + for (auto entry : instrumentation_map) {
  2956. + if (shaders_with_data.count(entry.first) == 0) continue;
  2957. + using namespace spvtools;
  2958. + SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2);
  2959. + std::string program;
  2960. + spirvTools.SetMessageConsumer([this](spv_message_level_t level, const char* source, const spv_position_t& pos,
  2961. + const char* message) { ReportSetupProblem(this->device, message); });
  2962. + bool res = spirvTools.Disassemble(entry.second, &program, SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES);
  2963. + if (res) {
  2964. + program = AnnotateModuleStr(program, annotations);
  2965. + } else {
  2966. + ReportSetupProblem(device, "Could not disassemble shader with id=" + std::to_string(entry.first) + ". Skipping.\n");
  2967. + continue;
  2968. + }
  2969. + std::ofstream spv_file;
  2970. + auto file_name = ShaderStageToString(shader_map[entry.first].stage) + std::to_string(entry.first) + "_dyn_executions";
  2971. +
  2972. + TryCompileModuleStrToGlsl(program, PipelineAnalysisFileName(file_name));
  2973. + }
  2974. + }
  2975. +}
  2976. \ No newline at end of file
  2977. diff --git a/layers/auto_inst_execution_trace.h b/layers/auto_inst_execution_trace.h
  2978. new file mode 100644
  2979. index 00000000..fb5b4eb0
  2980. --- /dev/null
  2981. +++ b/layers/auto_inst_execution_trace.h
  2982. @@ -0,0 +1,56 @@
  2983. +/* Copyright (c) 2020 The Khronos Group Inc.
  2984. + *
  2985. + * Licensed under the Apache License, Version 2.0 (the "License");
  2986. + * you may not use this file except in compliance with the License.
  2987. + * You may obtain a copy of the License at
  2988. + *
  2989. + * http://www.apache.org/licenses/LICENSE-2.0
  2990. + *
  2991. + * Unless required by applicable law or agreed to in writing, software
  2992. + * distributed under the License is distributed on an "AS IS" BASIS,
  2993. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  2994. + * See the License for the specific language governing permissions and
  2995. + * limitations under the License.
  2996. + *
  2997. + * Author: David Pankratz <pankratz@ualberta.ca>
  2998. + */
  2999. +
  3000. +#pragma once
  3001. +
  3002. +#include "auto_inst.h"
  3003. +
  3004. +class AutoInstExecutionTrace;
  3005. +
// Layer that reconstructs per-instruction dynamic execution counts and SIMT
// efficiency from a device-written execution trace. All three pipeline kinds
// funnel into the shared Analyze() implementation (defined in the .cpp).
class AutoInstExecutionTrace : public AutoInst {
 public:
    // Mapping from instrumented instruction id to the ids of other instructions in the basic block
    std::unordered_map<uint32_t, std::set<uint32_t>> inst_id2bb_inst_ids_;

    // Mapping from instruction id to instruction opcode. Used for calculating dynamic instruction mix.
    std::unordered_map<uint32_t, uint32_t> inst_id2opcode_;

    AutoInstExecutionTrace() : AutoInst() {}

    void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;

    // No pre-seeded buffer values are needed for this analysis.
    void InitializeInstrumentationBuffer(uint32_t* buffer) override{};

    void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;

    // Shared analysis entry point; dispatch dimensions are ignored by this layer.
    void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);

    virtual void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
                                   uint32_t depth) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    // TODO: What are useful dimensions to pass to graphics pipeline analysis
    virtual void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    virtual void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y,
                                uint32_t z) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };
};
  3039. diff --git a/layers/auto_inst_simt_efficiency.cpp b/layers/auto_inst_simt_efficiency.cpp
  3040. new file mode 100644
  3041. index 00000000..0e100509
  3042. --- /dev/null
  3043. +++ b/layers/auto_inst_simt_efficiency.cpp
  3044. @@ -0,0 +1,67 @@
  3045. +/* Copyright (c) 2020 The Khronos Group Inc.
  3046. + *
  3047. + * Licensed under the Apache License, Version 2.0 (the "License");
  3048. + * you may not use this file except in compliance with the License.
  3049. + * You may obtain a copy of the License at
  3050. + *
  3051. + * http://www.apache.org/licenses/LICENSE-2.0
  3052. + *
  3053. + * Unless required by applicable law or agreed to in writing, software
  3054. + * distributed under the License is distributed on an "AS IS" BASIS,
  3055. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  3056. + * See the License for the specific language governing permissions and
  3057. + * limitations under the License.
  3058. + *
  3059. + * Author: David Pankratz <pankratz@ualberta.ca>
  3060. + */
  3061. +
  3062. +#include "auto_inst_simt_efficiency.h"
  3063. +#include <bitset>
  3064. +#include <fstream>
// No device-level settings need to be shared for the SIMT-efficiency
// analysis, so this hook is intentionally a no-op.
void AutoInstSimtEfficiency::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
  3066. +
// Registers the SPIR-V pass that records an active-lane mask per
// instrumented point.
void AutoInstSimtEfficiency::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
    // NOTE(review): the trailing literal `1` is an undocumented pass parameter —
    // confirm its meaning against CreateAutoInstSimtEfficiencyPass and name it.
    optimizer->RegisterPass(spvtools::CreateAutoInstSimtEfficiencyPass(desc_bind_index, shader_module_id, 1));
}
  3070. +
  3071. +void AutoInstSimtEfficiency::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
  3072. + if (buffer_overflowed) {
  3073. + ReportSetupProblem(device, "SIMT Efficiency analysis cannot produce a valid result without a complete execution trace.\n");
  3074. + return;
  3075. + }
  3076. +
  3077. + uint32_t active_thread_count = 0;
  3078. + uint32_t possible_thread_count = 0;
  3079. +
  3080. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  3081. +
  3082. + if (runtime_words_written == 0) {
  3083. + ReportInfo(device, "No data found. Skipping Analysis.\n");
  3084. + return;
  3085. + }
  3086. +
  3087. + ReportInfo(device, "Analyzing SIMT Efficiency for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  3088. +
  3089. + uint32_t j = 0;
  3090. + while (j < runtime_words_written) {
  3091. + uint32_t active_thread_mask = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
  3092. + auto active_threads = (uint32_t)std::bitset<SUBGROUP_SIZE>(active_thread_mask).count();
  3093. +
  3094. + if (active_threads == 0) {
  3095. + ReportSetupProblem(device, "Invalid active thread count encountered. Quitting Analysis!\n");
  3096. + return;
  3097. + }
  3098. + active_thread_count += active_threads;
  3099. + possible_thread_count += SUBGROUP_SIZE;
  3100. + j += sizeof(SimtEfficiencyRecord) / sizeof(uint32_t);
  3101. + }
  3102. +
  3103. + float simt_efficiency = (float)active_thread_count / (float)possible_thread_count;
  3104. +
  3105. + std::ofstream simt_eff_file;
  3106. + simt_eff_file.open(FrameAnalysisFileName("simt_efficiency.csv"), std::ios_base::app);
  3107. + simt_eff_file << simt_efficiency << "\n";
  3108. + simt_eff_file.close();
  3109. +
  3110. + ReportInfo(device, "SIMT Efficiency = " + std::to_string(simt_efficiency * 100.0) + "%\n");
  3111. +}
  3112. \ No newline at end of file
  3113. diff --git a/layers/auto_inst_simt_efficiency.h b/layers/auto_inst_simt_efficiency.h
  3114. new file mode 100644
  3115. index 00000000..d6c5e2ce
  3116. --- /dev/null
  3117. +++ b/layers/auto_inst_simt_efficiency.h
  3118. @@ -0,0 +1,56 @@
  3119. +/* Copyright (c) 2020 The Khronos Group Inc.
  3120. + *
  3121. + * Licensed under the Apache License, Version 2.0 (the "License");
  3122. + * you may not use this file except in compliance with the License.
  3123. + * You may obtain a copy of the License at
  3124. + *
  3125. + * http://www.apache.org/licenses/LICENSE-2.0
  3126. + *
  3127. + * Unless required by applicable law or agreed to in writing, software
  3128. + * distributed under the License is distributed on an "AS IS" BASIS,
  3129. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  3130. + * See the License for the specific language governing permissions and
  3131. + * limitations under the License.
  3132. + *
  3133. + * Author: David Pankratz <pankratz@ualberta.ca>
  3134. + */
  3135. +
  3136. +#pragma once
  3137. +
  3138. +#include "auto_inst.h"
  3139. +
  3140. +class AutoInstSimtEfficiency;
  3141. +
// One trace record: the subgroup's active-lane mask at an instrumented point.
// Exactly one 32-bit word; Analyze() strides the buffer by sizeof()/4.
struct SimtEfficiencyRecord {
    uint32_t active_thread_mask;  // bit i set => lane i active
};
  3145. +
// Layer that measures aggregate SIMT (lane-utilization) efficiency. All three
// pipeline kinds funnel into the shared Analyze() implementation.
class AutoInstSimtEfficiency : public AutoInst {
 public:
    AutoInstSimtEfficiency() : AutoInst() {}

    void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;

    // No pre-seeded buffer values are needed for this analysis.
    void InitializeInstrumentationBuffer(uint32_t* buffer) override{};

    void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;

    // Opportunity for inheriting class to perform hybrid analysis using
    // 1) static_data
    // 2) runtime_data
    // 3) shader_map
    void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);

    // Dispatch dimensions are ignored; every pipeline kind reduces to the same
    // mask-counting pass.
    void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
                           uint32_t depth) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };
};
  3175. diff --git a/layers/auto_inst_warp_entry_and_exit.cpp b/layers/auto_inst_warp_entry_and_exit.cpp
  3176. new file mode 100644
  3177. index 00000000..9c19ce3d
  3178. --- /dev/null
  3179. +++ b/layers/auto_inst_warp_entry_and_exit.cpp
  3180. @@ -0,0 +1,61 @@
  3181. +/* Copyright (c) 2020 The Khronos Group Inc.
  3182. + *
  3183. + * Licensed under the Apache License, Version 2.0 (the "License");
  3184. + * you may not use this file except in compliance with the License.
  3185. + * You may obtain a copy of the License at
  3186. + *
  3187. + * http://www.apache.org/licenses/LICENSE-2.0
  3188. + *
  3189. + * Unless required by applicable law or agreed to in writing, software
  3190. + * distributed under the License is distributed on an "AS IS" BASIS,
  3191. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  3192. + * See the License for the specific language governing permissions and
  3193. + * limitations under the License.
  3194. + *
  3195. + * Author: David Pankratz <pankratz@ualberta.ca>
  3196. + */
  3197. +
  3198. +#include "auto_inst_warp_entry_and_exit.h"
  3199. +#include <fstream>
  3200. +
// No device-level settings need to be shared for this analysis; intentionally
// a no-op.
void AutoInstWarpEntryAndExit::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
  3202. +
// Registers the SPIR-V pass that writes a begin/end marker word as warps enter
// and leave the pipeline (markers are interpreted in Analyze()).
void AutoInstWarpEntryAndExit::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
    optimizer->RegisterPass(spvtools::CreateAutoInstWarpEntryAndExitPass(desc_bind_index, shader_module_id));
}
  3206. +
  3207. +void AutoInstWarpEntryAndExit::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
  3208. + if (buffer_overflowed) {
  3209. + ReportSetupProblem(device, "Analysis cannot produce a valid result without a complete execution trace.\n");
  3210. + return;
  3211. + }
  3212. +
  3213. + uint32_t entry_count = 0;
  3214. + uint32_t exit_count = 0;
  3215. +
  3216. + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
  3217. + ReportInfo(device, "Analyzing Warp Entries vs Exits in " + std::to_string(runtime_words_written * 4) + " bytes! \n");
  3218. +
  3219. + uint32_t j = 0;
  3220. + while (j < runtime_words_written) {
  3221. + uint32_t prim_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
  3222. + if (prim_id == spvtools::kAutoInstWarpEntryAndExitBeginPipeline) {
  3223. + entry_count++;
  3224. + } else if (prim_id == spvtools::kAutoInstWarpEntryAndExitEndPipeline) {
  3225. + exit_count++;
  3226. + } else {
  3227. + ReportSetupProblem(device, "Received unexpected primitive id. Aborting!\n");
  3228. + return;
  3229. + }
  3230. + j++;
  3231. + }
  3232. +
  3233. + float divergence_factor = (float)exit_count / (float)entry_count;
  3234. +
  3235. + std::ofstream csv_file;
  3236. + csv_file.open(FrameAnalysisFileName("exits_vs_entries.csv"), std::ios_base::app);
  3237. + csv_file << divergence_factor << "\n";
  3238. + csv_file.close();
  3239. +
  3240. + ReportInfo(device, "Exits/entries= " + std::to_string(divergence_factor) + "\n");
  3241. +}
  3242. \ No newline at end of file
  3243. diff --git a/layers/auto_inst_warp_entry_and_exit.h b/layers/auto_inst_warp_entry_and_exit.h
  3244. new file mode 100644
  3245. index 00000000..17adfdfb
  3246. --- /dev/null
  3247. +++ b/layers/auto_inst_warp_entry_and_exit.h
  3248. @@ -0,0 +1,52 @@
  3249. +/* Copyright (c) 2020 The Khronos Group Inc.
  3250. + *
  3251. + * Licensed under the Apache License, Version 2.0 (the "License");
  3252. + * you may not use this file except in compliance with the License.
  3253. + * You may obtain a copy of the License at
  3254. + *
  3255. + * http://www.apache.org/licenses/LICENSE-2.0
  3256. + *
  3257. + * Unless required by applicable law or agreed to in writing, software
  3258. + * distributed under the License is distributed on an "AS IS" BASIS,
  3259. + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  3260. + * See the License for the specific language governing permissions and
  3261. + * limitations under the License.
  3262. + *
  3263. + * Author: David Pankratz <pankratz@ualberta.ca>
  3264. + */
  3265. +
  3266. +#pragma once
  3267. +
  3268. +#include "auto_inst.h"
  3269. +
  3270. +class AutoInstWarpEntryAndExit;
  3271. +
// Layer that compares warp pipeline-entry markers against exit markers as a
// coarse divergence indicator. All pipeline kinds share Analyze().
class AutoInstWarpEntryAndExit : public AutoInst {
 public:
    AutoInstWarpEntryAndExit() : AutoInst() {}

    void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;

    // No pre-seeded buffer values are needed for this analysis.
    void InitializeInstrumentationBuffer(uint32_t* buffer) override{};

    void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;

    // Opportunity for inheriting class to perform hybrid analysis using
    // 1) static_data
    // 2) runtime_data
    // 3) shader_map
    void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);

    // Dispatch dimensions are ignored; every pipeline kind reduces to the same
    // marker-counting pass.
    void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
                           uint32_t depth) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };

    void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
        Analyze(debug_output_buffer, buffer_overflowed);
    };
};
  3301. diff --git a/layers/debug_printf.cpp b/layers/debug_printf.cpp
  3302. index e9181812..a8ba17df 100644
  3303. --- a/layers/debug_printf.cpp
  3304. +++ b/layers/debug_printf.cpp
  3305. @@ -80,7 +80,7 @@ void DebugPrintf::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, co
  3306. if (enabled[gpu_validation]) {
  3307. ReportSetupProblem(device,
  3308. - "Debug Printf cannot be enabled when gpu assisted validation is enabled. "
  3309. + "Debug Printf cannot be enabled when gpu assisted validation or auto-inst are enabled. "
  3310. "Debug Printf disabled.");
  3311. device_debug_printf->aborted = true;
  3312. return;
  3313. diff --git a/layers/debug_printf.h b/layers/debug_printf.h
  3314. index 915d5a6d..deb85031 100644
  3315. --- a/layers/debug_printf.h
  3316. +++ b/layers/debug_printf.h
  3317. @@ -45,6 +45,7 @@ struct DPFShaderTracker {
  3318. VkPipeline pipeline;
  3319. VkShaderModule shader_module;
  3320. std::vector<unsigned int> pgm;
  3321. + VkShaderStageFlagBits stage;
  3322. };
  3323. enum vartype { varsigned, varunsigned, varfloat };
  3324. diff --git a/layers/generated/chassis.cpp b/layers/generated/chassis.cpp
  3325. index 9a4799dd..6ecc487a 100644
  3326. --- a/layers/generated/chassis.cpp
  3327. +++ b/layers/generated/chassis.cpp
  3328. @@ -49,6 +49,12 @@ bool wrap_handles = true;
  3329. #include "gpu_validation.h"
  3330. #include "object_lifetime_validation.h"
  3331. #include "debug_printf.h"
  3332. +#include "auto_inst_dyn_shader_trace.h"
  3333. +#include "auto_inst_dyn_trace_ray_trace.h"
  3334. +#include "auto_inst_execution_trace.h"
  3335. +#include "auto_inst_simt_efficiency.h"
  3336. +#include "auto_inst_divergence_characterization.h"
  3337. +#include "auto_inst_warp_entry_and_exit.h"
  3338. #include "stateless_validation.h"
  3339. #include "synchronization_validation.h"
  3340. #include "thread_safety.h"
  3341. @@ -299,6 +305,24 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
  3342. auto sync_validation_obj = new SyncValidator;
  3343. sync_validation_obj->RegisterValidationObject(local_enables[sync_validation], api_version, report_data, local_object_dispatch);
  3344. + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
  3345. + auto_inst_simt_efficiency_obj->RegisterValidationObject(local_enables[auto_inst_simt_efficiency], api_version, report_data, local_object_dispatch);
  3346. +
  3347. + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
  3348. + auto_inst_execution_trace_obj->RegisterValidationObject(local_enables[auto_inst_execution_trace], api_version, report_data, local_object_dispatch);
  3349. +
  3350. + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
  3351. + auto_inst_dyn_trace_ray_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_trace_ray_trace], api_version, report_data, local_object_dispatch);
  3352. +
  3353. + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
  3354. + auto_inst_divergence_characterization_obj->RegisterValidationObject(local_enables[auto_inst_divergence_characterization], api_version, report_data, local_object_dispatch);
  3355. +
  3356. + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
  3357. + auto_inst_warp_entry_and_exit_obj->RegisterValidationObject(local_enables[auto_inst_warp_entry_and_exit], api_version, report_data, local_object_dispatch);
  3358. +
  3359. + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
  3360. + auto_inst_dyn_shader_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_shader_trace], api_version, report_data, local_object_dispatch);
  3361. +
  3362. // If handle wrapping is disabled via the ValidationFeatures extension, override build flag
  3363. if (local_disables[handle_wrapping]) {
  3364. wrap_handles = false;
  3365. @@ -344,6 +368,12 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
  3366. gpu_assisted_obj->FinalizeInstanceValidationObject(framework);
  3367. debug_printf_obj->FinalizeInstanceValidationObject(framework);
  3368. sync_validation_obj->FinalizeInstanceValidationObject(framework);
  3369. + auto_inst_simt_efficiency_obj->FinalizeInstanceValidationObject(framework);
  3370. + auto_inst_execution_trace_obj->FinalizeInstanceValidationObject(framework);
  3371. + auto_inst_dyn_trace_ray_trace_obj->FinalizeInstanceValidationObject(framework);
  3372. + auto_inst_divergence_characterization_obj->FinalizeInstanceValidationObject(framework);
  3373. + auto_inst_warp_entry_and_exit_obj->FinalizeInstanceValidationObject(framework);
  3374. + auto_inst_dyn_shader_trace_obj->FinalizeInstanceValidationObject(framework);
  3375. for (auto intercept : framework->object_dispatch) {
  3376. auto lock = intercept->write_lock();
  3377. @@ -354,7 +384,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
  3378. std::vector<ValidationObject*> local_objs = {
  3379. thread_checker_obj, object_tracker_obj, parameter_validation_obj,
  3380. core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj,
  3381. - sync_validation_obj,
  3382. + sync_validation_obj, auto_inst_simt_efficiency_obj, auto_inst_execution_trace_obj, auto_inst_dyn_trace_ray_trace_obj,
  3383. + auto_inst_divergence_characterization_obj, auto_inst_warp_entry_and_exit_obj, auto_inst_dyn_shader_trace_obj
  3384. };
  3385. for (auto obj : local_objs) {
  3386. if (std::find(local_object_dispatch.begin(), local_object_dispatch.end(), obj) == local_object_dispatch.end()) {
  3387. @@ -490,6 +521,24 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(VkPhysicalDevice gpu, const VkDevice
  3388. auto sync_validation_obj = new SyncValidator;
  3389. sync_validation_obj->InitDeviceValidationObject(enables[sync_validation], instance_interceptor, device_interceptor);
  3390. + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
  3391. + auto_inst_simt_efficiency_obj->InitDeviceValidationObject(enables[auto_inst_simt_efficiency], instance_interceptor, device_interceptor);
  3392. +
  3393. + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
  3394. + auto_inst_execution_trace_obj->InitDeviceValidationObject(enables[auto_inst_execution_trace], instance_interceptor, device_interceptor);
  3395. +
  3396. + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
  3397. + auto_inst_dyn_trace_ray_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_trace_ray_trace], instance_interceptor, device_interceptor);
  3398. +
  3399. + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
  3400. + auto_inst_divergence_characterization_obj->InitDeviceValidationObject(enables[auto_inst_divergence_characterization], instance_interceptor, device_interceptor);
  3401. +
  3402. + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
  3403. + auto_inst_warp_entry_and_exit_obj->InitDeviceValidationObject(enables[auto_inst_warp_entry_and_exit], instance_interceptor, device_interceptor);
  3404. +
  3405. + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
  3406. + auto_inst_dyn_shader_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_shader_trace], instance_interceptor, device_interceptor);
  3407. +
  3408. // Delete unused validation objects to avoid memory leak.
  3409. std::vector<ValidationObject *> local_objs = {
  3410. thread_safety_obj, stateless_validation_obj, object_tracker_obj,
  3411. @@ -568,6 +617,7 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateGraphicsPipelines(
  3412. auto usepCreateInfos = (!cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos;
  3413. if (cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
  3414. + else if (cgpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeAutoInst].pCreateInfos;
  3415. VkResult result = DispatchCreateGraphicsPipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines);
  3416. @@ -604,6 +654,7 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateComputePipelines(
  3417. auto usepCreateInfos = (!ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos;
  3418. if (ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
  3419. + else if (ccpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeAutoInst].pCreateInfos;
  3420. VkResult result = DispatchCreateComputePipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines);
  3421. diff --git a/layers/generated/chassis.h b/layers/generated/chassis.h
  3422. index 23649ec6..5b432a9e 100644
  3423. --- a/layers/generated/chassis.h
  3424. +++ b/layers/generated/chassis.h
  3425. @@ -52,6 +52,12 @@
  3426. #include "vk_safe_struct.h"
  3427. #include "vk_typemap_helper.h"
  3428. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT (VkValidationFeatureEnableEXT)5
  3429. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT (VkValidationFeatureEnableEXT)6
  3430. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT (VkValidationFeatureEnableEXT)7
  3431. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT (VkValidationFeatureEnableEXT)8
  3432. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT (VkValidationFeatureEnableEXT)9
  3433. +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT (VkValidationFeatureEnableEXT)10
  3434. extern std::atomic<uint64_t> global_unique_id;
  3435. @@ -2834,6 +2840,7 @@ enum LayerObjectTypeId {
  3436. LayerObjectTypeBestPractices, // Instance or device best practices layer object
  3437. LayerObjectTypeGpuAssisted, // Instance or device gpu assisted validation layer object
  3438. LayerObjectTypeDebugPrintf, // Instance or device shader debug printf layer object
  3439. + LayerObjectTypeAutoInst, // Instance or device shader auto instrumentation layer object
  3440. LayerObjectTypeCommandCounter, // Command Counter validation object, child of corechecks
  3441. LayerObjectTypeSyncValidation, // Instance or device synchronization validation layer object
  3442. LayerObjectTypeMaxEnum, // Max enum count
  3443. @@ -2899,6 +2906,12 @@ typedef enum EnableFlags {
  3444. vendor_specific_arm,
  3445. debug_printf,
  3446. sync_validation,
  3447. + auto_inst_simt_efficiency,
  3448. + auto_inst_execution_trace,
  3449. + auto_inst_dyn_trace_ray_trace,
  3450. + auto_inst_divergence_characterization,
  3451. + auto_inst_warp_entry_and_exit,
  3452. + auto_inst_dyn_shader_trace,
  3453. // Insert new enables above this line
  3454. kMaxEnableFlags,
  3455. } EnableFlags;
  3456. diff --git a/layers/gpu_utils.h b/layers/gpu_utils.h
  3457. index 01197b94..0e31f00e 100644
  3458. --- a/layers/gpu_utils.h
  3459. +++ b/layers/gpu_utils.h
  3460. @@ -314,11 +314,17 @@ void UtilPostCallRecordPipelineCreations(const uint32_t count, const CreateInfo
  3461. VkShaderModule shader_module = VK_NULL_HANDLE;
  3462. if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
  3463. shader_module = pipeline_state->graphicsPipelineCI.pStages[stage].module;
  3464. + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
  3465. + pipeline_state->graphicsPipelineCI.pStages[stage].stage;
  3466. } else if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
  3467. assert(stage == 0);
  3468. shader_module = pipeline_state->computePipelineCI.stage.module;
  3469. + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
  3470. + pipeline_state->computePipelineCI.stage.stage;
  3471. } else if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) {
  3472. shader_module = pipeline_state->raytracingPipelineCI.pStages[stage].module;
  3473. + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
  3474. + pipeline_state->raytracingPipelineCI.pStages[stage].stage;
  3475. } else {
  3476. assert(false);
  3477. }
  3478. diff --git a/layers/gpu_validation.h b/layers/gpu_validation.h
  3479. index 291cfb85..2ff6e0fc 100644
  3480. --- a/layers/gpu_validation.h
  3481. +++ b/layers/gpu_validation.h
  3482. @@ -54,6 +54,7 @@ struct GpuAssistedShaderTracker {
  3483. VkPipeline pipeline;
  3484. VkShaderModule shader_module;
  3485. std::vector<unsigned int> pgm;
  3486. + VkShaderStageFlagBits stage;
  3487. };
  3488. struct GpuAssistedAccelerationStructureBuildValidationBufferInfo {
  3489. diff --git a/layers/layer_options.cpp b/layers/layer_options.cpp
  3490. index 7a791087..022a9466 100644
  3491. --- a/layers/layer_options.cpp
  3492. +++ b/layers/layer_options.cpp
  3493. @@ -92,6 +92,19 @@ void SetValidationEnable(CHECK_ENABLED &enable_data, const ValidationCheckEnable
  3494. // Set the local enable flag for a single VK_VALIDATION_FEATURE_ENABLE_* flag
  3495. void SetValidationFeatureEnable(CHECK_ENABLED &enable_data, const VkValidationFeatureEnableEXT feature_enable) {
  3496. + if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT) {
  3497. + enable_data[auto_inst_simt_efficiency] = true;
  3498. + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT) {
  3499. + enable_data[auto_inst_execution_trace] = true;
  3500. + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT) {
  3501. + enable_data[auto_inst_dyn_trace_ray_trace] = true;
  3502. + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT) {
  3503. + enable_data[auto_inst_divergence_characterization] = true;
  3504. + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT) {
  3505. + enable_data[auto_inst_warp_entry_and_exit] = true;
  3506. + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT) {
  3507. + enable_data[auto_inst_dyn_shader_trace] = true;
  3508. + }
  3509. switch (feature_enable) {
  3510. case VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT:
  3511. enable_data[gpu_validation] = true;
  3512. diff --git a/layers/layer_options.h b/layers/layer_options.h
  3513. index 861b9abe..d1b5ea68 100644
  3514. --- a/layers/layer_options.h
  3515. +++ b/layers/layer_options.h
  3516. @@ -50,6 +50,12 @@ static const std::unordered_map<std::string, VkValidationFeatureEnableEXT> VkVal
  3517. {"VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT},
  3518. {"VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT},
  3519. {"VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT", VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT},
  3520. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT},
  3521. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT},
  3522. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT},
  3523. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT},
  3524. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT},
  3525. + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT},
  3526. };
  3527. static const std::unordered_map<std::string, VkValidationFeatureEnable> VkValFeatureEnableLookup2 = {
  3528. @@ -93,7 +99,13 @@ static const std::vector<std::string> EnableFlagNameHelper = {
  3529. "VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", // best_practices,
  3530. "VALIDATION_CHECK_ENABLE_VENDOR_SPECIFIC_ARM", // vendor_specific_arm,
  3531. "VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", // debug_printf,
  3532. - "VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION" // sync_validation,
  3533. + "VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION", // sync_validation,
  3534. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", // auto_inst_simt_efficiency
  3535. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", // auto_inst_execution_trace
  3536. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", // auto_inst_dyn_trace_ray_trace
  3537. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", // auto_inst_divergence_characterization
  3538. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", // auto_inst_warp_entry_and_exit
  3539. + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", // auto_inst_dyn_shader_trace
  3540. };
  3541. void ProcessConfigAndEnvSettings(ConfigAndEnvSettings *settings_data);
  3542. diff --git a/scripts/known_good.json b/scripts/known_good.json
  3543. index 9a53e452..3a27076e 100755
  3544. --- a/scripts/known_good.json
  3545. +++ b/scripts/known_good.json
  3546. @@ -29,11 +29,20 @@
  3547. "build_dir": "SPIRV-Headers/build",
  3548. "install_dir": "SPIRV-Headers/build/install",
  3549. "commit": "f027d53ded7e230e008d37c8b47ede7cd308e19d"
  3550. + },
  3551. + {
  3552. + "name": "SPIRV-Cross",
  3553. + "url": "https://github.com/KhronosGroup/SPIRV-Cross.git",
  3554. + "sub_dir": "spirv-cross",
  3555. + "build_dir": "spirv-cross/build",
  3556. + "install_dir": "spirv-cross/build/install",
  3557. + "commit": "e50f7d1ce8e162d0c826e84168cfa234e4de4ec9"
  3558. }
  3559. ],
  3560. - "install_names" : {
  3561. - "glslang" : "GLSLANG_INSTALL_DIR",
  3562. - "Vulkan-Headers" : "VULKAN_HEADERS_INSTALL_DIR",
  3563. - "SPIRV-Headers" : "SPIRV_HEADERS_INSTALL_DIR"
  3564. + "install_names": {
  3565. + "glslang": "GLSLANG_INSTALL_DIR",
  3566. + "Vulkan-Headers": "VULKAN_HEADERS_INSTALL_DIR",
  3567. + "SPIRV-Headers": "SPIRV_HEADERS_INSTALL_DIR",
  3568. + "SPIRV-Cross": "SPIRV_CROSS_INSTALL_DIR"
  3569. }
  3570. }
  3571. --
  3572. 2.29.2.windows.2