From 5414baa006d362c52b3582a60ec45563e1848868 Mon Sep 17 00:00:00 2001 From: davidpankratz Date: Mon, 1 Feb 2021 11:31:03 -0700 Subject: [PATCH] Added Vulkan Vision, a framework for fine-grained analysis of Vulkan pipeline execution on the GPU published at CGO 2021. Analysis includes AI workloads through the compute pipeline. Vulkan Vision inserts dynamic instrumentation to gain unprecedented execution trace information of vulkan applications. For example, the execution trace can be used to determine the hotness of various regions of a compute shader. In addition, the runtime SIMD utilization can be cross-referenced with the hotness to find optimization candidates. Vulkan Vision was accepted to the Code Generation and Optimization 2021 conference --- README.md | 5 + build/scripts/build_vulkan_vision_linux.sh | 74 + build/scripts/build_vulkan_vision_windows.bat | 71 + ecosystem_tools/VulkanVision/README.md | 25 + .../0001-spirv-opt-Add-auto-inst-passes.patch | 3188 ++++++++++++++ .../VulkanVision/st-patches/vvision-st.diff | 1 + .../0001-layers-Added-auto-inst-layers.patch | 3846 +++++++++++++++++ .../VulkanVision/vv-patches/vvision-vv.diff | 1 + 8 files changed, 7211 insertions(+) create mode 100644 build/scripts/build_vulkan_vision_linux.sh create mode 100644 build/scripts/build_vulkan_vision_windows.bat create mode 100644 ecosystem_tools/VulkanVision/README.md create mode 100644 ecosystem_tools/VulkanVision/st-patches/0001-spirv-opt-Add-auto-inst-passes.patch create mode 100644 ecosystem_tools/VulkanVision/st-patches/vvision-st.diff create mode 100644 ecosystem_tools/VulkanVision/vv-patches/0001-layers-Added-auto-inst-layers.patch create mode 100644 ecosystem_tools/VulkanVision/vv-patches/vvision-vv.diff diff --git a/README.md b/README.md index 1bc2356c..2fbbd043 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ - [Community](#community) - [Governance](#governance) - [Communication](#communication) +- [Vulkan Vision](#vulkan-vision) - 
[Contributing](#contributing) - [Release Notes](#release-notes) - [License](#license) @@ -156,6 +157,10 @@ Check out how MindSpore Open Governance [works](https://gitee.com/mindspore/comm - Video Conferencing: TBD - Mailing-list: +## Vulkan Vision + +Vulkan Vision (V-Vision) provides an unprecedented level of detail into the execution of Vulkan applications through dynamic instrumentation. V-Vision supports analyzing AI workloads implemented using the compute pipeline as well as traditional raster and ray-tracing Vulkan applications. To use V-Vision please refer to the [build instructions](https://gitee.com/mindspore/mindspore/ecosystem_tools/VulkanVision/README.md). + ## Contributing Welcome contributions. See our [Contributor Wiki](https://gitee.com/mindspore/mindspore/blob/master/CONTRIBUTING.md) for more details. diff --git a/build/scripts/build_vulkan_vision_linux.sh b/build/scripts/build_vulkan_vision_linux.sh new file mode 100644 index 00000000..79a7c737 --- /dev/null +++ b/build/scripts/build_vulkan_vision_linux.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +start_dir=$(pwd) +cd "$(dirname "$0")"/../../ecosystem_tools/VulkanVision + +if [[ ! 
-d "SPIRV-Tools" ]] +then + echo "Cloning SPIRV-Tools" + git clone https://github.com/KhronosGroup/SPIRV-Tools + cp st-patches/*.patch SPIRV-Tools + cd SPIRV-Tools + git checkout 17ffa89097b26efeb323e6963220326b5ffb2baf + # These are the current stable changes and can be updated with new releases + git apply 0001-spirv-opt-Add-auto-inst-passes.patch + rm *.patch + cd .. +fi + +if [[ ! -d "Vulkan-ValidationLayers" ]] +then + echo "Cloning Vulkan-ValidationLayers" + git clone https://github.com/KhronosGroup/Vulkan-ValidationLayers + cp vv-patches/*.patch Vulkan-ValidationLayers + cd Vulkan-ValidationLayers + git checkout aa076dae88e282d7b6cada4f900b2fa7dac8ed08 + # These are the current stable changes and can be updated with new releases + git apply 0001-layers-Added-auto-inst-layers.patch + rm *.patch + cd .. +fi + + +build_dir=$(pwd) + +echo "Building SPIRV-Tools" +cd SPIRV-Tools +git clone https://github.com/KhronosGroup/SPIRV-Headers.git external/spirv-headers +cd external/spirv-headers +git checkout f027d53ded7e230e008d37c8b47ede7cd308e19d +cd ../.. +git clone https://github.com/google/effcee.git external/effcee +git clone https://github.com/google/re2.git external/re2 +mkdir build +cd build +mkdir install +cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=install .. +cmake --build . --target install --config Release -- -j 4 +cd $build_dir + +echo "Building Vulkan-ValidationLayers" +cd Vulkan-ValidationLayers +mkdir build +cd build +mkdir install +python ../scripts/update_deps.py --config release +cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=install -DSPIRV_TOOLS_INSTALL_DIR=$build_dir/SPIRV-Tools/build/install -C helper.cmake .. +cmake --build . --target install --config Release -- -j 4 + +echo "Build completed at $build_dir"! 
+ +cd $start_dir \ No newline at end of file diff --git a/build/scripts/build_vulkan_vision_windows.bat b/build/scripts/build_vulkan_vision_windows.bat new file mode 100644 index 00000000..55a66709 --- /dev/null +++ b/build/scripts/build_vulkan_vision_windows.bat @@ -0,0 +1,71 @@ +@echo off +REM Copyright 2021 Huawei Technologies Co., Ltd. +REM +REM Licensed under the Apache License, Version 2.0 (the "License"); +REM you may not use this file except in compliance with the License. +REM You may obtain a copy of the License at +REM +REM http://www.apache.org/licenses/LICENSE-2.0 +REM +REM Unless required by applicable law or agreed to in writing, software +REM distributed under the License is distributed on an "AS IS" BASIS, +REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +REM See the License for the specific language governing permissions and +REM limitations under the License. + +set start_dir=%cd% +cd %~dp0..\..\ecosystem_tools\VulkanVision + +IF NOT EXIST SPIRV-Tools ( + echo Cloning SPIRV-Tools + git clone https://github.com/KhronosGroup/SPIRV-Tools + copy st-patches\*.patch SPIRV-Tools + cd SPIRV-Tools + git checkout 17ffa89097b26efeb323e6963220326b5ffb2baf + REM These are the current stable changes and can be updated with new releases + git apply 0001-spirv-opt-Add-auto-inst-passes.patch + del *.patch + cd .. +) + +IF NOT EXIST Vulkan-ValidationLayers ( + echo Cloning Vulkan-ValidationLayers + git clone https://github.com/KhronosGroup/Vulkan-ValidationLayers + copy vv-patches\*.patch Vulkan-ValidationLayers + cd Vulkan-ValidationLayers + git checkout aa076dae88e282d7b6cada4f900b2fa7dac8ed08 + REM These are the current stable changes and can be updated with new releases + git apply 0001-layers-Added-auto-inst-layers.patch + del *.patch + cd .. 
+) + +set build_dir=%cd% + +echo Building SPIRV-Tools +cd SPIRV-Tools +git clone https://github.com/KhronosGroup/SPIRV-Headers.git external\spirv-headers +cd external\spirv-headers +git checkout f027d53ded7e230e008d37c8b47ede7cd308e19d +cd ..\.. +git clone https://github.com/google/effcee.git external\effcee +git clone https://github.com/google/re2.git external\re2 +mkdir build +cd build +mkdir install +cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=install .. +cmake --build . --target install --config Release +cd %build_dir% + +echo Building Vulkan-ValidationLayers +cd Vulkan-ValidationLayers +mkdir build +cd build +mkdir install +python ../scripts/update_deps.py --config release +cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=install -DSPIRV_TOOLS_INSTALL_DIR=%cd%/../../SPIRV-Tools/build/install -C helper.cmake .. +cmake --build . --target install --config Release + +echo Build completed at %build_dir%! + +cd %start_dir% \ No newline at end of file diff --git a/ecosystem_tools/VulkanVision/README.md b/ecosystem_tools/VulkanVision/README.md new file mode 100644 index 00000000..16824a7e --- /dev/null +++ b/ecosystem_tools/VulkanVision/README.md @@ -0,0 +1,25 @@ +# Vulkan Vision + +If Vulkan Vision is useful to you, please cite "Vulkan Vision: Ray Tracing Workload Characterization using Automatic Graphics Instrumentation". + +Vulkan Vision is released as patches on the Khronos Group [Vulkan-ValidationLayers](https://github.com/KhronosGroup/Vulkan-ValidationLayers) and [SPIRV-Tools](https://github.com/KhronosGroup/SPIRV-Tools) repositories. 
+ +To generate a vvision build: + +## Windows + +```bat +..\..\build\scripts\build_vulkan_vision_windows.bat +``` + +## Linux + +```bash +../../build/scripts/build_vulkan_vision_linux.sh +``` + +The completed build will be at `mindspore/ecosystem_tools/VulkanVision/Vulkan-ValidationLayers/build/install` + +V-Vision Documentation will be at `mindspore/ecosystem_tools/VulkanVision/Vulkan-ValidationLayers/docs/auto_instrument.md` + +Documentation for enabling and using Vulkan Validation layers can be found [here](https://vulkan.lunarg.com/doc/sdk/1.2.162.0/windows/layer_configuration.html) \ No newline at end of file diff --git a/ecosystem_tools/VulkanVision/st-patches/0001-spirv-opt-Add-auto-inst-passes.patch b/ecosystem_tools/VulkanVision/st-patches/0001-spirv-opt-Add-auto-inst-passes.patch new file mode 100644 index 00000000..0b712af9 --- /dev/null +++ b/ecosystem_tools/VulkanVision/st-patches/0001-spirv-opt-Add-auto-inst-passes.patch @@ -0,0 +1,3188 @@ +From 48520bd5e6344d792840aa37fe1ba5d564232788 Mon Sep 17 00:00:00 2001 +From: dpankratz +Date: Wed, 27 Jan 2021 09:18:00 -0700 +Subject: [PATCH] spirv-opt: Add auto-inst passes + +--- + Android.mk | 8 + + include/spirv-tools/instrument.hpp | 39 + + include/spirv-tools/optimizer.hpp | 98 ++ + source/opt/CMakeLists.txt | 18 +- + source/opt/auto_inst_debug_pass.cpp | 68 ++ + source/opt/auto_inst_debug_pass.h | 106 ++ + ..._inst_divergence_characterization_pass.cpp | 134 +++ + ...to_inst_divergence_characterization_pass.h | 106 ++ + .../opt/auto_inst_dyn_shader_trace_pass.cpp | 52 + + source/opt/auto_inst_dyn_shader_trace_pass.h | 103 ++ + .../auto_inst_dyn_trace_ray_trace_pass.cpp | 184 ++++ + .../opt/auto_inst_dyn_trace_ray_trace_pass.h | 135 +++ + source/opt/auto_inst_execution_trace_pass.cpp | 68 ++ + source/opt/auto_inst_execution_trace_pass.h | 122 +++ + source/opt/auto_inst_pass.cpp | 927 ++++++++++++++++++ + source/opt/auto_inst_pass.h | 322 ++++++ + source/opt/auto_inst_simt_efficiency_pass.cpp | 39 + + 
source/opt/auto_inst_simt_efficiency_pass.h | 101 ++ + .../auto_inst_warp_entry_and_exit_pass.cpp | 43 + + .../opt/auto_inst_warp_entry_and_exit_pass.h | 99 ++ + source/opt/instrument_pass.cpp | 5 +- + source/opt/instrument_pass.h | 1 + + source/opt/ir_builder.h | 26 + + source/opt/ir_context.cpp | 13 + + source/opt/optimizer.cpp | 60 ++ + source/opt/passes.h | 7 + + 26 files changed, 2882 insertions(+), 2 deletions(-) + create mode 100644 source/opt/auto_inst_debug_pass.cpp + create mode 100644 source/opt/auto_inst_debug_pass.h + create mode 100644 source/opt/auto_inst_divergence_characterization_pass.cpp + create mode 100644 source/opt/auto_inst_divergence_characterization_pass.h + create mode 100644 source/opt/auto_inst_dyn_shader_trace_pass.cpp + create mode 100644 source/opt/auto_inst_dyn_shader_trace_pass.h + create mode 100644 source/opt/auto_inst_dyn_trace_ray_trace_pass.cpp + create mode 100644 source/opt/auto_inst_dyn_trace_ray_trace_pass.h + create mode 100644 source/opt/auto_inst_execution_trace_pass.cpp + create mode 100644 source/opt/auto_inst_execution_trace_pass.h + create mode 100644 source/opt/auto_inst_pass.cpp + create mode 100644 source/opt/auto_inst_pass.h + create mode 100644 source/opt/auto_inst_simt_efficiency_pass.cpp + create mode 100644 source/opt/auto_inst_simt_efficiency_pass.h + create mode 100644 source/opt/auto_inst_warp_entry_and_exit_pass.cpp + create mode 100644 source/opt/auto_inst_warp_entry_and_exit_pass.h + +diff --git a/Android.mk b/Android.mk +index 0b64ea6d..46728c67 100644 +--- a/Android.mk ++++ b/Android.mk +@@ -73,6 +73,14 @@ SPVTOOLS_SRC_FILES := \ + source/val/validate_type.cpp + + SPVTOOLS_OPT_SRC_FILES := \ ++ source/opt/auto_inst_pass.cpp \ ++ source/opt/auto_inst_debug_pass.cpp \ ++ source/opt/auto_inst_divergence_characterization_pass.cpp \ ++ source/opt/auto_inst_dyn_trace_ray_trace_pass.cpp \ ++ source/opt/auto_inst_dyn_shader_trace_pass.cpp \ ++ source/opt/auto_inst_execution_trace_pass.cpp \ ++ 
source/opt/auto_inst_simt_efficiency_pass.cpp \ ++ source/opt/auto_inst_warp_entry_and_exit_pass.cpp \ + source/opt/aggressive_dead_code_elim_pass.cpp \ + source/opt/amd_ext_to_khr.cpp \ + source/opt/basic_block.cpp \ +diff --git a/include/spirv-tools/instrument.hpp b/include/spirv-tools/instrument.hpp +index 2b47a564..5a2b00c9 100644 +--- a/include/spirv-tools/instrument.hpp ++++ b/include/spirv-tools/instrument.hpp +@@ -250,6 +250,45 @@ static const int kDebugInputBuffAddrPtrOffset = 1; + // not a valid buffer, the length associated with the 0x0 address is zero. + static const int kDebugInputBuffAddrLengthOffset = 0; + ++// Auto-Inst Bindings ++// ++// These bindings are used primarily to differentiate different instrumentation ++// primitives which have meaning to the instrumentation generating them ++// and the analysis parsing them. ++// ++// NOTE: do not set these values to 0 since, by default, the instrumentation ++// buffer is 0 initialized and would cause pernicious bugs. ++ ++static const int kAutoInstUniqueSubgroupId = 1; ++ ++// auto_inst_divergence_characterization ++// Primitive to track the beginning of a traceRay ++static const uint32_t kAutoInstDivCharPreTraceRay = 2; ++// Primitive to track the end of a traceRay ++static const uint32_t kAutoInstDivCharPostTraceRay = 3; ++// Primitive to get the active threads in a basic block ++static const uint32_t kAutoInstDivCharActiveThreads = 4; ++// Primitive to get the active threads at the ++// beginning of a ray tracing shader ++static const uint32_t kAutoInstDivCharShaderEntryPoint = 5; ++// Primitive to track when threads have finished executing ++// the pipeline. 
++static const uint32_t kAutoInstDivCharQuitPipeline = 6; ++ ++// auto_inst_dyn_shader_trace ++// Track when shader is executed ++static const uint32_t kAutoInstDynShaderTraceEntryPoint = 2; ++ ++// auto_inst_dyn_trace_ray_trace ++// Track when traceRay call started ++static const uint32_t kAutoInstTraceRayTracePreTraceRay = 2; ++// Track when merge point after traceRay is executed ++static const uint32_t kAutoInstTraceRayTraceMergePoint = 3; ++ ++// auto_inst_warp_entry_and_exit ++static const uint32_t kAutoInstWarpEntryAndExitBeginPipeline = 1; ++static const uint32_t kAutoInstWarpEntryAndExitEndPipeline = 2; ++ + } // namespace spvtools + + #endif // INCLUDE_SPIRV_TOOLS_INSTRUMENT_HPP_ +diff --git a/include/spirv-tools/optimizer.hpp b/include/spirv-tools/optimizer.hpp +index 27352b25..a8cc6d34 100644 +--- a/include/spirv-tools/optimizer.hpp ++++ b/include/spirv-tools/optimizer.hpp +@@ -15,8 +15,10 @@ + #ifndef INCLUDE_SPIRV_TOOLS_OPTIMIZER_HPP_ + #define INCLUDE_SPIRV_TOOLS_OPTIMIZER_HPP_ + ++#include + #include + #include ++#include + #include + #include + #include +@@ -792,6 +794,102 @@ Optimizer::PassToken CreateInstBuffAddrCheckPass(uint32_t desc_set, + Optimizer::PassToken CreateInstDebugPrintfPass(uint32_t desc_set, + uint32_t shader_id); + ++// Create a pass which will help debug problems with other autoinstrumentation ++// passes. ++// ++// If |test_atomic_ops| is true, then an atomicAdd will be added to the shader ++// and its result will be written to the buffer given by |desc_set|. If ++// |test_subgroup_ops| is true, then a subgroupElect will be added to the shader ++// and its result will be written to the buffer. ++// If both are false then a constant will be written to the buffer. 
++Optimizer::PassToken CreateAutoInstDebugPass(uint32_t desc_set, ++ uint32_t shader_id, ++ bool test_atomic_ops, ++ bool test_subgroup_ops); ++ ++// Create a pass which will automatically insert instrumentation to ++// capture the extent of different contributors to divergence. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record if necessary. ++Optimizer::PassToken CreateAutoInstDivergenceCharacterizationPass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function< ++ void(std::unordered_map&& inst_id2prim_id, ++ std::unordered_map&& inst_id2inst_count)> ++ static_data_callback); ++ ++// Create a pass which will automatically insert instrumentation to ++// determine the runtime execution counts of each shader. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record. ++Optimizer::PassToken CreateAutoInstDynShaderTracePass(uint32_t desc_set, ++ uint32_t shader_id); ++ ++// Create a pass which will automatically insert instrumentation to ++// disambiguate runtime traceRay calls found within control-flow. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record if necessary. ++// ++// The |static_data_callback| is called after the instrumentation pass has ++// finished. It is populated with a mapping from instrumentation callsite id ++// to instrumentation type. It is also populated with a mapping from merge point ++// to all the traceRay calls sites that could have executed within the ++// control-flow. 
++Optimizer::PassToken CreateAutoInstDynTraceRayTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function&&, ++ std::unordered_map>&&)> ++ static_data_callback); ++ ++// Create a pass which will automatically insert instrumentation to ++// compute the simt efficiency of the shader module. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// |reserved_words_count| is the number of lower words in the buffer that ++// have a fixed function and are reserved. ++// ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record if necessary. ++Optimizer::PassToken CreateAutoInstSimtEfficiencyPass( ++ uint32_t desc_set, uint32_t shader_id, uint32_t reserved_words_count); ++ ++// Create a pass which will automatically insert instrumentation to ++// capture the number of times the ray tracing pipeline entrypoint is executed ++// vs how many times the exit is executed. On architectures with a SIMD ++// execution model #entries == #exits. On MIMD (or pseudo-MIMD) execution models ++// #entries != #exits. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record if necessary. ++Optimizer::PassToken CreateAutoInstWarpEntryAndExitPass(uint32_t desc_set, ++ uint32_t shader_id); ++ ++ ++// Create a pass which will automatically insert instrumentation to ++// capture the execution trace of the pipeline. ++// ++// The instrumentation will write buffers in debug descriptor set |desc_set|. ++// It will write |shader_id| in each output record to identify the shader ++// module which generated the record. ++// ++// The |static_data_callback| is called after the instrumentation pass has ++// finished. It is populated with a mapping from instrumentation callsite id ++// to the other opcodes in the basic block. 
This data allows the analysis ++// to develop complete dynamic instruction counts of the shader module without ++// needing to transfer the data at runtime. ++Optimizer::PassToken CreateAutoInstExecutionTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function>&&, ++ std::unordered_map&&)> ++ static_data_callback); ++ + // Create a pass to upgrade to the VulkanKHR memory model. + // This pass upgrades the Logical GLSL450 memory model to Logical VulkanKHR. + // Additionally, it modifies memory, image, atomic and barrier operations to +diff --git a/source/opt/CMakeLists.txt b/source/opt/CMakeLists.txt +index f3ac5906..a59b18c8 100644 +--- a/source/opt/CMakeLists.txt ++++ b/source/opt/CMakeLists.txt +@@ -14,6 +14,14 @@ + set(SPIRV_TOOLS_OPT_SOURCES + aggressive_dead_code_elim_pass.h + amd_ext_to_khr.h ++ auto_inst_pass.h ++ auto_inst_debug_pass.h ++ auto_inst_divergence_characterization_pass.h ++ auto_inst_dyn_shader_trace_pass.h ++ auto_inst_dyn_trace_ray_trace_pass.h ++ auto_inst_execution_trace_pass.h ++ auto_inst_simt_efficiency_pass.h ++ auto_inst_warp_entry_and_exit_pass.h + basic_block.h + block_merge_pass.h + block_merge_util.h +@@ -122,6 +130,14 @@ set(SPIRV_TOOLS_OPT_SOURCES + + aggressive_dead_code_elim_pass.cpp + amd_ext_to_khr.cpp ++ auto_inst_pass.cpp ++ auto_inst_debug_pass.cpp ++ auto_inst_divergence_characterization_pass.cpp ++ auto_inst_dyn_shader_trace_pass.cpp ++ auto_inst_dyn_trace_ray_trace_pass.cpp ++ auto_inst_execution_trace_pass.cpp ++ auto_inst_simt_efficiency_pass.cpp ++ auto_inst_warp_entry_and_exit_pass.cpp + basic_block.cpp + block_merge_pass.cpp + block_merge_util.cpp +@@ -167,7 +183,7 @@ set(SPIRV_TOOLS_OPT_SOURCES + inline_pass.cpp + inst_bindless_check_pass.cpp + inst_buff_addr_check_pass.cpp +- inst_debug_printf_pass.cpp ++ inst_debug_printf_pass.cpp + instruction.cpp + instruction_list.cpp + instrument_pass.cpp +diff --git a/source/opt/auto_inst_debug_pass.cpp b/source/opt/auto_inst_debug_pass.cpp +new file mode 100644 
+index 00000000..d4321e92 +--- /dev/null ++++ b/source/opt/auto_inst_debug_pass.cpp +@@ -0,0 +1,68 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#include "auto_inst_debug_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++bool AutoInstDebugPass::PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) { ++ (void)stage_idx; ++ ++ uint32_t buf_id = GetOutputBufferId(); ++ uint32_t buf_uint_ptr_id = GetOutputBufferPtrId(); ++ ++ if (test_atomic_ops_) { ++ Instruction* offset_ptr = builder->AddTernaryOp( ++ buf_uint_ptr_id, SpvOpAccessChain, buf_id, ++ builder->GetUintConstantId(kDebugOutputDataOffset), ++ builder->GetUintConstantId(0)); ++ uint32_t mask_none_id = builder->GetUintConstantId(SpvMemoryAccessMaskNone); ++ uint32_t scope_invok_id = builder->GetUintConstantId(SpvScopeInvocation); ++ (void)builder->AddQuadOp(GetUintId(), SpvOpAtomicIAdd, ++ offset_ptr->result_id(), scope_invok_id, ++ mask_none_id, builder->GetUintConstantId(1)); ++ } else if (test_subgroup_ops_) { ++ Instruction* subgroup_leader_cond = ++ builder->AddUnaryOp(GetBoolId(), SpvOpGroupNonUniformElect, ++ builder->GetUintConstantId(SpvScopeSubgroup)); ++ ++ auto active_thread_mask = ++ GenSubgroupBallotId(builder, subgroup_leader_cond->result_id()); ++ ++ Instruction* offset_ptr = builder->AddTernaryOp( ++ buf_uint_ptr_id, SpvOpAccessChain, buf_id, ++ 
builder->GetUintConstantId(kDebugOutputDataOffset), ++ builder->GetUintConstantId(0)); ++ builder->AddStore(offset_ptr->result_id(), active_thread_mask); ++ ++ } else if (!test_atomic_ops_ && !test_subgroup_ops_) { ++ Instruction* buffer_capacity = ++ builder->AddIdLiteralOp(GetUintId(), SpvOpArrayLength, ++ GetOutputBufferId(), kDebugOutputDataOffset); ++ ++ Instruction* offset_ptr = builder->AddTernaryOp( ++ buf_uint_ptr_id, SpvOpAccessChain, buf_id, ++ builder->GetUintConstantId(kDebugOutputDataOffset), ++ builder->GetUintConstantId(0)); ++ ++ builder->AddStore(offset_ptr->result_id(), buffer_capacity->result_id()); ++ } ++ ++ return true; ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_debug_pass.h b/source/opt/auto_inst_debug_pass.h +new file mode 100644 +index 00000000..7fb59430 +--- /dev/null ++++ b/source/opt/auto_inst_debug_pass.h +@@ -0,0 +1,106 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_DEBUG_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_DEBUG_PASS_H_ ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstDebugPass : public AutoInstPass { ++ public: ++ AutoInstDebugPass(uint32_t desc_set, uint32_t shader_id, bool test_atomic_ops, ++ bool test_subgroup_ops) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt), ++ test_atomic_ops_(test_atomic_ops), ++ test_subgroup_ops_(test_subgroup_ops) {} ++ ++ const char* name() const override { return "auto-inst-debug-pass"; } ++ ++ protected: ++ const bool test_atomic_ops_; ++ const bool test_subgroup_ops_; ++ ++ private: ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override{}; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override{}; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. 
|stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)bb; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute after the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_DEBUG_PASS_H_ +diff --git a/source/opt/auto_inst_divergence_characterization_pass.cpp b/source/opt/auto_inst_divergence_characterization_pass.cpp +new file mode 100644 +index 00000000..a462c05e +--- /dev/null ++++ b/source/opt/auto_inst_divergence_characterization_pass.cpp +@@ -0,0 +1,134 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. 
++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#include "auto_inst_divergence_characterization_pass.h" ++ ++namespace { ++std::unordered_set pseudo_ops = { ++ SpvOpPhi, SpvOpFunction, SpvOpFunctionParameter, ++ SpvOpFunctionEnd, SpvOpVariable, SpvOpLoopMerge}; ++ ++static const std::set kAllRayTracingStages = { ++ SpvExecutionModelRayGenerationNV, SpvExecutionModelAnyHitNV, ++ SpvExecutionModelClosestHitNV, SpvExecutionModelMissNV, ++ SpvExecutionModelIntersectionNV, SpvExecutionModelCallableNV}; ++ ++} // namespace ++ ++namespace spvtools { ++namespace opt { ++ ++void AutoInstDivergenceCharacterizationPass::GenInst( ++ InstructionBuilder* builder, Instruction* instruction, uint32_t prim_id, ++ uint32_t stage_idx) { ++ uint32_t inst_id = GenInstCallsiteId(instruction); ++ ++ auto flat_thread_id = GenFlatRtThreadId(builder, stage_idx); ++ auto active_thread_mask_id = ++ GenSubgroupBallotId(builder, builder->GetBoolConstant(true)->result_id()); ++ ++ GenSubgroupUpdateCall(builder, {builder->GetUintConstantId(inst_id), ++ flat_thread_id, active_thread_mask_id}); ++ ++ inst_id2prim_id_[inst_id] = prim_id; ++} ++ ++void AutoInstDivergenceCharacterizationPass::InitializeInstrumentation() { ++ inst_id2inst_count_.clear(); ++ inst_id2prim_id_.clear(); ++} ++ ++bool AutoInstDivergenceCharacterizationPass::PreInstructionInstrument( ++ Instruction* inst, InstructionBuilder* builder, uint32_t stage_idx) { ++ if (inst->opcode() == SpvOpTraceNV || inst->opcode() == SpvOpTraceRayKHR) { ++ // Record when threads begin a traceRay ++ GenInst(builder, &*builder->GetInsertPoint(), 
kAutoInstDivCharPreTraceRay, ++ stage_idx); ++ return true; ++ } ++ ++ return false; ++}; ++ ++bool AutoInstDivergenceCharacterizationPass::PostInstructionInstrument( ++ Instruction* inst, InstructionBuilder* builder, uint32_t stage_idx) { ++ if (inst->opcode() == SpvOpTraceNV || inst->opcode() == SpvOpTraceRayKHR) { ++ // Record when threads end a traceRay ++ GenInst(builder, &*builder->GetInsertPoint(), kAutoInstDivCharPostTraceRay, ++ stage_idx); ++ return true; ++ } ++ ++ return false; ++} ++ ++bool AutoInstDivergenceCharacterizationPass::PreEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ if (stage_idx == SpvExecutionModelRayGenerationKHR) { ++ inst_id2prim_id_[kAutoInstUniqueSubgroupId] = kAutoInstUniqueSubgroupId; ++ // Create an instrumentation id which will be used by the analysis ++ // to determine how the subsequent words should be understood. ++ auto unique_warp_id_inst_id = ++ builder->GetUintConstantId(kAutoInstUniqueSubgroupId); ++ ++ GenUniqueSubgroupIdFuncCall(builder, unique_warp_id_inst_id, stage_idx); ++ ++ return true; ++ } else if (kAllRayTracingStages.count(stage_idx) != 0) { ++ // Record when threads run a shader during a traceRay ++ GenInst(builder, ++ builder->GetIntConstant(kAutoInstDivCharShaderEntryPoint, false), ++ kAutoInstDivCharShaderEntryPoint, stage_idx); ++ return true; ++ } ++ return false; ++} ++ ++bool AutoInstDivergenceCharacterizationPass::PostEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelRayGenerationKHR) return false; ++ ++ // Record the threads that quit the pipeline ++ GenInst(builder, builder->GetIntConstant(kAutoInstDivCharQuitPipeline, false), ++ kAutoInstDivCharQuitPipeline, stage_idx); ++ ++ return true; ++} ++ ++bool AutoInstDivergenceCharacterizationPass::BasicBlockInstrument( ++ BasicBlock* bb, InstructionBuilder* builder, uint32_t stage_idx) { ++ if (kAllRayTracingStages.count(stage_idx) == 0) return false; ++ ++ 
auto inst = builder->GetInsertPoint(); ++ ++ // Record active threads in each basic block execution ++ GenInst(builder, &*inst, kAutoInstDivCharActiveThreads, stage_idx); ++ ++ uint32_t count = 0; ++ for (auto& ii : *bb) { ++ if (pseudo_ops.count(ii.opcode()) != 0) count++; ++ } ++ ++ inst_id2inst_count_[GenInstCallsiteId(&*inst)] = count; ++ ++ return true; ++} ++ ++void AutoInstDivergenceCharacterizationPass::FinalizeInstrumentation() { ++ static_data_callback_(std::move(inst_id2prim_id_), ++ std::move(inst_id2inst_count_)); ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_divergence_characterization_pass.h b/source/opt/auto_inst_divergence_characterization_pass.h +new file mode 100644 +index 00000000..54c967a9 +--- /dev/null ++++ b/source/opt/auto_inst_divergence_characterization_pass.h +@@ -0,0 +1,106 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_DIVERGENCE_CHARACTERIZATION_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_DIVERGENCE_CHARACTERIZATION_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstDivergenceCharacterizationPass : public AutoInstPass { ++ public: ++ AutoInstDivergenceCharacterizationPass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function< ++ void(std::unordered_map&& inst_id2prim_id, ++ std::unordered_map&& inst_id2inst_count)> ++ static_data_callback) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt), ++ static_data_callback_(static_data_callback) {} ++ ++ const char* name() const override { ++ return "auto-inst-divergence-characterization-pass"; ++ } ++ ++ private: ++ // Mapping from inst callsite id to primitive id ++ std::unordered_map inst_id2prim_id_; ++ ++ // Mapping from inst callsite id to the number of insts in the bb ++ std::unordered_map inst_id2inst_count_; ++ ++ // Callback for sending static data ++ std::function&& inst_id2prim_id, ++ std::unordered_map&& inst_id2inst_count)> ++ static_data_callback_; ++ ++ // Generate instrumentation for this pass ++ void GenInst(InstructionBuilder* builder, Instruction* inst, ++ AutoInstId prim_id, AutoInstId stage_idx); ++ ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_DIVERGENCE_CHARACTERIZATION_PASS_H_ +diff --git a/source/opt/auto_inst_dyn_shader_trace_pass.cpp b/source/opt/auto_inst_dyn_shader_trace_pass.cpp +new file mode 100644 +index 00000000..74416f43 +--- /dev/null ++++ b/source/opt/auto_inst_dyn_shader_trace_pass.cpp +@@ -0,0 +1,52 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++ ++#include "auto_inst_dyn_shader_trace_pass.h" ++ ++namespace { ++ ++static const std::set kAllRayTracingStages = { ++ SpvExecutionModelRayGenerationNV, SpvExecutionModelAnyHitNV, ++ SpvExecutionModelClosestHitNV, SpvExecutionModelMissNV, ++ SpvExecutionModelIntersectionNV, SpvExecutionModelCallableNV}; ++} ++ ++namespace spvtools { ++namespace opt { ++ ++bool AutoInstDynShaderTracePass::PreEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ // Ray Generation begins and ends the ray tracing pipeline ++ if (stage_idx == SpvExecutionModelRayGenerationNV) { ++ // Create an instrumentation id which will be used by the analysis ++ // to determine how the subsequent words should be understood. 
++ auto unique_warp_id_inst_id = ++ builder->GetUintConstantId(kAutoInstUniqueSubgroupId); ++ GenUniqueSubgroupIdFuncCall(builder, unique_warp_id_inst_id, stage_idx); ++ } ++ ++ auto prim_id = builder->GetUintConstantId(kAutoInstDynShaderTraceEntryPoint); ++ auto flat_thread_id = GenFlatRtThreadId(builder, stage_idx); ++ auto shader_id = builder->GetUintConstantId(shader_id_); ++ auto active_thread_mask_id = ++ GenSubgroupBallotId(builder, builder->GetBoolConstant(true)->result_id()); ++ GenSubgroupUpdateCall( ++ builder, {prim_id, flat_thread_id, shader_id, active_thread_mask_id}); ++ ++ return true; ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_dyn_shader_trace_pass.h b/source/opt/auto_inst_dyn_shader_trace_pass.h +new file mode 100644 +index 00000000..840f8942 +--- /dev/null ++++ b/source/opt/auto_inst_dyn_shader_trace_pass.h +@@ -0,0 +1,103 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_DYN_SHADER_TRACE_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_DYN_SHADER_TRACE_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstDynShaderTracePass : public AutoInstPass { ++ public: ++ AutoInstDynShaderTracePass(uint32_t desc_set, uint32_t shader_id) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt) {} ++ ++ const char* name() const override { ++ return "auto-inst-dyn-shader-trace-pass"; ++ } ++ ++ private: ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override{}; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override{}; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)bb; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_DYN_SHADER_TRACE_PASS_H_ +diff --git a/source/opt/auto_inst_dyn_trace_ray_trace_pass.cpp b/source/opt/auto_inst_dyn_trace_ray_trace_pass.cpp +new file mode 100644 +index 00000000..edd63b34 +--- /dev/null ++++ b/source/opt/auto_inst_dyn_trace_ray_trace_pass.cpp +@@ -0,0 +1,184 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++ ++#include "auto_inst_dyn_trace_ray_trace_pass.h" ++ ++namespace { ++ ++static const std::set kAllRayTracingStages = { ++ SpvExecutionModelRayGenerationNV, SpvExecutionModelAnyHitNV, ++ SpvExecutionModelClosestHitNV, SpvExecutionModelMissNV, ++ SpvExecutionModelIntersectionNV, SpvExecutionModelCallableNV}; ++ ++static const int kEntryPointFunctionIdInIdx = 1; ++} // namespace ++ ++namespace spvtools { ++namespace opt { ++ ++void AutoInstDynTraceRayTracePass::GenInst(InstructionBuilder* builder, ++ uint32_t inst_id, uint32_t prim_type, ++ uint32_t stage_idx) { ++ if (inst_id2prim_type_.count(inst_id) != 0) { ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, ++ "Multiple instrumentation sites with the same id detected!\n"); ++ return; ++ } ++ ++ auto flat_thread_id = GenFlatRtThreadId(builder, stage_idx); ++ auto active_thread_mask_id = ++ GenSubgroupBallotId(builder, builder->GetBoolConstant(true)->result_id()); ++ ++ GenSubgroupUpdateCall(builder, {builder->GetUintConstantId(inst_id), ++ flat_thread_id, active_thread_mask_id}); ++ ++ inst_id2prim_type_[inst_id] = prim_type; ++} ++ ++void AutoInstDynTraceRayTracePass::InitializeInstrumentation() { ++ inst_id2prim_type_.clear(); ++ fn_id_2_divergent_ids_.clear(); ++ merge_id_2_divergent_ids_.clear(); ++ loop_begin_bb_ids_.clear(); ++ ++ Function* entry_point_func = nullptr; ++ ++ for (auto e : get_module()->entry_points()) { ++ auto func_id = e.GetSingleWordInOperand(kEntryPointFunctionIdInIdx); ++ entry_point_func = id2function_[func_id]; ++ } ++ ++ auto active_merge_ids = std::stack(); ++ PopulateDivergentLabelsInfo(entry_point_func, active_merge_ids); ++} ++ ++bool AutoInstDynTraceRayTracePass::PreEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelRayGenerationKHR) return false; ++ ++ // Create an instrumentation id which will be used by the 
analysis ++ // to determine how the subsequent words should be understood. ++ auto unique_warp_id_inst_id = ++ builder->GetUintConstantId(kAutoInstUniqueSubgroupId); ++ ++ // This instrumentation is unique across the pipeline so it is sufficient ++ // to choose an inst_id that can never alias with the other types of ++ // instrumentation. ++ inst_id2prim_type_[kAutoInstUniqueSubgroupId] = kAutoInstUniqueSubgroupId; ++ ++ GenUniqueSubgroupIdFuncCall(builder, unique_warp_id_inst_id, stage_idx); ++ ++ return true; ++} ++ ++bool AutoInstDynTraceRayTracePass::BasicBlockInstrument( ++ BasicBlock* bb, InstructionBuilder* builder, uint32_t stage_idx) { ++ if (kAllRayTracingStages.count(stage_idx) == 0) return false; ++ ++ if (merge_id_2_divergent_ids_.count(bb->id()) != 0) { ++ GenInst(builder, bb->id(), kAutoInstTraceRayTraceMergePoint, stage_idx); ++ return true; ++ } ++ ++ for (auto& ii : *bb) { ++ if (ii.opcode() == SpvOpTraceRayKHR || ii.opcode() == SpvOpTraceNV) { ++ builder->SetInsertPoint(&ii); ++ GenInst(builder, bb->id(), kAutoInstTraceRayTracePreTraceRay, stage_idx); ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++void AutoInstDynTraceRayTracePass::FinalizeInstrumentation() { ++ static_data_callback_(std::move(inst_id2prim_type_), ++ std::move(merge_id_2_divergent_ids_)); ++} ++ ++void AutoInstDynTraceRayTracePass::PopulateDivergentLabelsInfo( ++ Function* func, std::stack& active_merge_ids) { ++ if (fn_id_2_divergent_ids_.count(func->result_id()) > 0) return; ++ ++ bool is_divergent_control_flow = active_merge_ids.size() > 0; ++ std::vector divergent_labels; ++ std::stack active_loop_ids; ++ ++ for (auto& blk : *func) { ++ if (active_merge_ids.size() > 0 && blk.id() == active_merge_ids.top()) { ++ // Need to move the merge label to the beginning of the loop ++ // iteration in order to determine how many times the label was visited at ++ // runtime ++ if (active_loop_ids.size() > 0) { ++ if (merge_id_2_divergent_ids_.count(active_merge_ids.top()) > 0) { 
++ loop_begin_bb_ids_.insert(active_loop_ids.top()); ++ } ++ merge_id_2_divergent_ids_[active_loop_ids.top()] = ++ merge_id_2_divergent_ids_[active_merge_ids.top()]; ++ merge_id_2_divergent_ids_.erase(active_merge_ids.top()); ++ active_loop_ids.pop(); ++ } ++ active_merge_ids.pop(); ++ } ++ ++ for (auto& inst : blk) { ++ // Determine divergent labels to track ++ if (inst.opcode() == SpvOpTraceRayKHR || inst.opcode() == SpvOpTraceNV) { ++ divergent_labels.push_back(blk.id()); ++ } else if (inst.opcode() == SpvOpFunctionCall) { ++ // Add divergent labels according to func being called ++ auto func_to_call_id = inst.GetSingleWordOperand(2); ++ if (fn_id_2_divergent_ids_.count(func_to_call_id) == 0) { ++ // recurse if fn not discovered yet ++ PopulateDivergentLabelsInfo(id2function_[func_to_call_id], ++ active_merge_ids); ++ } ++ divergent_labels = fn_id_2_divergent_ids_[func_to_call_id]; ++ ++ } else if (inst.opcode() == SpvOpSelectionMerge || ++ inst.opcode() == SpvOpLoopMerge) { ++ auto merge_id = inst.GetSingleWordOperand(0); ++ ++ if (active_merge_ids.size() == 0) { ++ active_merge_ids.push(merge_id); ++ } else if (inst.opcode() == SpvOpLoopMerge) { ++ active_merge_ids.push(merge_id); ++ active_loop_ids.push(inst.GetSingleWordOperand(1)); ++ } ++ } ++ ++ // Update datastructures with divergent labels ++ if (divergent_labels.size() > 0) { ++ fn_id_2_divergent_ids_[func->result_id()].insert( ++ fn_id_2_divergent_ids_[func->result_id()].end(), ++ divergent_labels.begin(), divergent_labels.end()); ++ ++ // Update all active to-be-merged labels ++ if (active_merge_ids.size() > 0 && !is_divergent_control_flow) { ++ auto id = active_merge_ids.top(); ++ ++ merge_id_2_divergent_ids_[id].insert( ++ merge_id_2_divergent_ids_[id].end(), divergent_labels.begin(), ++ divergent_labels.end()); ++ } ++ ++ divergent_labels.clear(); ++ } ++ } ++ } ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_dyn_trace_ray_trace_pass.h 
b/source/opt/auto_inst_dyn_trace_ray_trace_pass.h +new file mode 100644 +index 00000000..d095398d +--- /dev/null ++++ b/source/opt/auto_inst_dyn_trace_ray_trace_pass.h +@@ -0,0 +1,135 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_DYN_TRACE_RAY_TRACE_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_DYN_TRACE_RAY_TRACE_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstDynTraceRayTracePass : public AutoInstPass { ++ public: ++ AutoInstDynTraceRayTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function&&, ++ std::unordered_map>&&)> ++ static_data_callback) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt), ++ static_data_callback_(static_data_callback) {} ++ ++ const char* name() const override { ++ return "auto-inst-dyn-trace-ray-trace-pass"; ++ } ++ ++ private: ++ // Callback to transfer static data to creator of the pass. ++ std::function&&, ++ std::unordered_map>&&)> ++ static_data_callback_; ++ ++ // Static data which forms a mapping from instrumentation callsite id ++ // to the instructions in the basic block. ++ std::unordered_map inst_id2prim_type_; ++ ++ // Static data which contains the mapping of MergePoint instrumentation ++ // to all the traceRay callsites that must have executed. 
++ std::unordered_map> merge_id_2_divergent_ids_; ++ ++ // Memoization table for each function ++ std::unordered_map> fn_id_2_divergent_ids_; ++ ++ // Which bbs to add instrumentation before to track loop iterations. ++ std::set loop_begin_bb_ids_; ++ ++ // Generate instrumentation for this pass ++ void GenInst(InstructionBuilder* builder, uint32_t inst_id, ++ uint32_t prim_type, uint32_t stage_idx); ++ ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // This function generates an analysis of |func| to determine ++ // 1) the SpvOpTraceRays that exist within control-flow. ++ // 2) where to add instrumentation to detect loop iterations and ++ // opportunities to execute traceRays. ++ // ++ // This allows runtime traceRay calls to be disambiguated. ++ void PopulateDivergentLabelsInfo(Function* func, ++ std::stack& active_merge_ids); ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_DYN_TRACE_RAY_TRACE_PASS_H_ +diff --git a/source/opt/auto_inst_execution_trace_pass.cpp b/source/opt/auto_inst_execution_trace_pass.cpp +new file mode 100644 +index 00000000..3eeff48e +--- /dev/null ++++ b/source/opt/auto_inst_execution_trace_pass.cpp +@@ -0,0 +1,68 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. 
++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#include "auto_inst_execution_trace_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++void AutoInstExecutionTracePass::InitializeInstrumentation() { ++ inst_id2bb_inst_ids_.clear(); ++ inst_id2opcode_.clear(); ++} ++ ++bool AutoInstExecutionTracePass::BasicBlockInstrument( ++ BasicBlock* bb, InstructionBuilder* builder, uint32_t stage_idx) { ++ (void)stage_idx; ++ auto module_offset = uid2offset_[bb->begin()->unique_id()]; ++ if (shader_id_ >= (1 << 12) || module_offset >= (1 << 20)) { ++ std::string message = ++ "Shader id count or shader module size are too large!\n"; ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, message.c_str()); ++ return false; ++ } ++ ++ // Create an instrumentation callsite id that is unique across ++ // the whole pipeline by including the shader id in the upper bits. ++ auto inst = &*builder->GetInsertPoint(); ++ ++ uint32_t inst_id = GenInstCallsiteId(inst); ++ ++ for (auto inst_itr : *bb) { ++ auto other_inst_id = GenInstCallsiteId(&inst_itr); ++ ++ // Create group of instructions which must execute ++ // if the instrumentation executed. ++ inst_id2bb_inst_ids_[inst_id].insert(other_inst_id); ++ // Add opcode to the static metadata map so it can be added ++ // to a dynamic opcode total. ++ inst_id2opcode_[other_inst_id] = inst_itr.opcode(); ++ } ++ // Write the same inst_id as in the static data so that when a ++ // buffer entry is parsed, the inst_id can be used to look up ++ // the other instructions that must have also been executed. 
++ auto active_thread_mask_id = ++ GenSubgroupBallotId(builder, builder->GetBoolConstant(true)->result_id()); ++ GenSubgroupUpdateCall( ++ builder, {builder->GetUintConstantId(inst_id), active_thread_mask_id}); ++ return true; ++} ++ ++void AutoInstExecutionTracePass::FinalizeInstrumentation() { ++ static_data_callback_(std::move(inst_id2bb_inst_ids_), ++ std::move(inst_id2opcode_)); ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_execution_trace_pass.h b/source/opt/auto_inst_execution_trace_pass.h +new file mode 100644 +index 00000000..124f3ecd +--- /dev/null ++++ b/source/opt/auto_inst_execution_trace_pass.h +@@ -0,0 +1,122 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_EXECUTION_TRACE_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_EXECUTION_TRACE_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstExecutionTracePass : public AutoInstPass { ++ public: ++ AutoInstExecutionTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function< ++ void(std::unordered_map>&& ++ inst_id2bb_inst_ids, ++ std::unordered_map&& inst_id2opcode)> ++ static_data_callback) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt), ++ static_data_callback_(static_data_callback) {} ++ ++ const char* name() const override { return "auto-inst-execution-trace-pass"; } ++ ++ private: ++ // Callback to transfer static data to creator of the pass. ++ const std::function>&& inst_id2bb_inst_ids, ++ std::unordered_map&& inst_id2opcode)> ++ static_data_callback_; ++ ++ // Static data which forms a mapping from instrumentation callsite id ++ // to the ids of other instructions in the basic block. ++ std::unordered_map> inst_id2bb_inst_ids_; ++ ++ // Static data which forms a mappign from inst_id to opcode. ++ // This together with |inst_id2bb_inst_ids_| gives the runtime ++ // instruction mix. ++ std::unordered_map inst_id2opcode_; ++ ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_EXECUTION_TRACE_PASS_H_ +diff --git a/source/opt/auto_inst_pass.cpp b/source/opt/auto_inst_pass.cpp +new file mode 100644 +index 00000000..6a5f7f60 +--- /dev/null ++++ b/source/opt/auto_inst_pass.cpp +@@ -0,0 +1,927 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++ ++#include "auto_inst_pass.h" ++ ++namespace { ++ ++// Operand indices of SpvOpEntryPoint ++static const int kEntryPointFunctionIdInIdx = 1; ++static const int kEntryPointExecutionModelInIdx = 0; ++ ++// Type of function from OpFunction ++static const int kFunctionTypeIdx = 3; ++ ++// UniqueSubgroupId function constants ++static const int kUniqueSubgroupIdParamCnt = 1; ++static const int kUniqueSubgroupIdParamInstIdIdx = 0; ++} // namespace ++ ++namespace spvtools { ++namespace opt { ++ ++bool AutoInstPass::HandleInstrumentHooks( ++ BasicBlock::iterator ref_inst_itr, ++ UptrVectorIterator ref_block_itr, uint32_t stage_idx, ++ InstructionBuilder* builder) { ++ bool is_instrumented = false; ++ ++ // Initialize instrumentation validity based on inst opcode. 
++ bool skip_pre_instrumentation = ++ uninstrumentable_pre_ops.count(ref_inst_itr->opcode()) > 0; ++ bool skip_post_instrumentation = ++ uninstrumentable_post_ops.count(ref_inst_itr->opcode()) > 0; ++ ++ if (ref_block_itr->GetParent()->begin() == ref_block_itr) { ++ // Despite the SPIR-V specification stating: ++ // "All OpVariable instructions in a function must be in the first block in ++ // the function. " The real restriction is as follows: "All OpVariable ++ // instructions in a function must be the first instructions in the first ++ // block." Therefore this check prevents instrumenting OpVariables in the ++ // first block of a function. ++ if (ref_inst_itr->opcode() == SpvOpVariable) { ++ skip_pre_instrumentation = true; ++ } ++ if (std::next(ref_inst_itr) != ref_block_itr->end() && ++ std::next(ref_inst_itr)->opcode() == SpvOpVariable) { ++ skip_post_instrumentation = true; ++ } ++ } ++ ++ if (skip_pre_instrumentation && skip_post_instrumentation) return false; ++ ++ if (!skip_pre_instrumentation) { ++ // set insert point to immediately before current inst ++ builder->SetInsertPoint(&*ref_inst_itr); ++ ++ if (instrumented_bb_ids.count(ref_block_itr->id()) == 0) { ++ is_instrumented |= ++ BasicBlockInstrument(&*ref_block_itr, builder, stage_idx); ++ } ++ ++ if (instrumented_inst_ids.count(ref_inst_itr->unique_id()) == 0) { ++ is_instrumented |= ++ PreInstructionInstrument(&*ref_inst_itr, builder, stage_idx); ++ } ++ } ++ if (!skip_post_instrumentation) { ++ if (instrumented_inst_ids.count(ref_inst_itr->unique_id()) == 0) { ++ // Before next inst is after this inst ++ builder->SetInsertPoint(&*std::next(ref_inst_itr)); ++ is_instrumented |= ++ PostInstructionInstrument(&*ref_inst_itr, builder, stage_idx); ++ } ++ } ++ ++ if (is_instrumented) { ++ // Record the bb and inst that were just visited ++ instrumented_bb_ids.insert(ref_block_itr->id()); ++ instrumented_inst_ids.insert(ref_inst_itr->unique_id()); ++ } ++ ++ return is_instrumented; ++} ++ ++void 
AutoInstPass::GenInstProgrammableCode( ++ BasicBlock::iterator ref_inst_itr, ++ UptrVectorIterator ref_block_itr, uint32_t stage_idx) { ++ // Initialize DefUse manager before dismantling module ++ (void)get_def_use_mgr(); ++ ++ InstructionBuilder builder(context(), &*ref_block_itr); ++ ++ bool is_instrumented = ++ HandleInstrumentHooks(ref_inst_itr, ref_block_itr, stage_idx, &builder); ++ if (!is_instrumented) return; ++ has_added_instrumentation_ = true; ++} ++ ++uint32_t AutoInstPass::GenSubgroupBallotId(InstructionBuilder* builder, ++ uint32_t pred_id) { ++ if (!get_feature_mgr()->HasExtension(kSPV_KHR_subgroup_vote)) { ++ context()->AddExtension("SPV_KHR_subgroup_vote"); ++ } ++ ++ if (!get_feature_mgr()->HasCapability(SpvCapabilityGroupNonUniformBallot)) { ++ context()->AddCapability(SpvCapabilityGroupNonUniformBallot); ++ } ++ ++ uint32_t scope_ballot_idx = builder->GetUintConstantId(SpvScopeSubgroup); ++ Instruction* ballot_inst = builder->AddBinaryOp( ++ GetVec4UintId(), SpvOpGroupNonUniformBallot, scope_ballot_idx, pred_id); ++ ++ return builder ++ ->AddIdLiteralOp(GetUintId(), SpvOpCompositeExtract, ++ ballot_inst->result_id(), 0) ++ ->result_id(); ++} ++ ++std::pair AutoInstPass::GenReadClockIds( ++ InstructionBuilder* builder) { ++ if (!get_feature_mgr()->HasExtension(kSPV_KHR_shader_clock)) { ++ context()->AddExtension("SPV_KHR_shader_clock"); ++ } ++ ++ if (!get_feature_mgr()->HasCapability(SpvCapabilityShaderClockKHR)) { ++ context()->AddCapability(SpvCapabilityShaderClockKHR); ++ } ++ ++ auto time_inst = ++ builder->AddUnaryOp(GetVecUintId(2u), SpvOpReadClockKHR, ++ builder->GetUintConstantId(SpvScopeDevice)); ++ Instruction* time_lower = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, time_inst->result_id(), 0); ++ Instruction* time_upper = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, time_inst->result_id(), 1); ++ return std::make_pair(time_lower->result_id(), time_upper->result_id()); ++} ++ ++uint32_t 
AutoInstPass::GenFlatRtThreadId(InstructionBuilder* builder, ++ uint32_t stage_idx) { ++ switch (stage_idx) { ++ case SpvExecutionModelRayGenerationNV: ++ case SpvExecutionModelIntersectionNV: ++ case SpvExecutionModelAnyHitNV: ++ case SpvExecutionModelClosestHitNV: ++ case SpvExecutionModelMissNV: ++ case SpvExecutionModelCallableNV: { ++ auto launch_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInLaunchIdKHR), builder); ++ Instruction* launch_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 0); ++ Instruction* launch_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 1); ++ Instruction* launch_z = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 2); ++ ++ auto launch_size_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInLaunchSizeKHR), builder); ++ Instruction* launch_size_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_size_id, 0); ++ Instruction* launch_size_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_size_id, 1); ++ ++ auto xy_size = builder->AddBinaryOp(GetUintId(), SpvOpIMul, ++ launch_size_x->result_id(), ++ launch_size_y->result_id()); ++ auto z_term = builder->AddBinaryOp( ++ GetUintId(), SpvOpIMul, launch_z->result_id(), xy_size->result_id()); ++ ++ auto y_term = ++ builder->AddBinaryOp(GetUintId(), SpvOpIMul, launch_y->result_id(), ++ launch_size_x->result_id()); ++ ++ auto flat_thread_id = builder->AddBinaryOp( ++ GetUintId(), SpvOpIAdd, z_term->result_id(), y_term->result_id()); ++ flat_thread_id = builder->AddBinaryOp(GetUintId(), SpvOpIAdd, ++ flat_thread_id->result_id(), ++ launch_x->result_id()); ++ return flat_thread_id->result_id(); ++ } ++ ++ default: ++ consumer()( ++ SPV_MSG_ERROR, 0, {0, 0, 0}, ++ "Cannot create a flattened rt thread id for requested shader stage! 
" ++ "Defaulting to 0.\n"); ++ return builder->GetUintConstantId(0); ++ } ++} ++ ++uint32_t AutoInstPass::GenFlatComputeThreadId(InstructionBuilder* builder, ++ uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelGLCompute) { ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, ++ "Cannot create a flattened compute thread id for requested " ++ "shader stage! " ++ "Defaulting to 0.\n"); ++ return builder->GetUintConstantId(0); ++ } ++ auto invocation_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInGlobalInvocationId), builder); ++ Instruction* invocation_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, invocation_id, 0); ++ Instruction* invocation_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, invocation_id, 1); ++ Instruction* invocation_z = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, invocation_id, 2); ++ ++ auto num_workgroups_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInNumWorkgroups), builder); ++ Instruction* num_workgroups_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, num_workgroups_id, 0); ++ Instruction* num_workgroups_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, num_workgroups_id, 1); ++ ++ auto workgroup_size_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInWorkgroupSize), builder); ++ ++ Instruction* workgroup_size_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, workgroup_size_id, 0); ++ Instruction* workgroup_size_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, workgroup_size_id, 1); ++ ++ Instruction* global_size_x = builder->AddBinaryOp( ++ GetUintId(), SpvOpIMul, num_workgroups_x->result_id(), ++ workgroup_size_x->result_id()); ++ ++ Instruction* global_size_y = builder->AddBinaryOp( ++ GetUintId(), SpvOpIMul, num_workgroups_y->result_id(), ++ workgroup_size_y->result_id()); ++ ++ auto xy_size = ++ builder->AddBinaryOp(GetUintId(), SpvOpIMul, 
global_size_x->result_id(), ++ global_size_y->result_id()); ++ auto z_term = builder->AddBinaryOp( ++ GetUintId(), SpvOpIMul, invocation_z->result_id(), xy_size->result_id()); ++ ++ auto y_term = ++ builder->AddBinaryOp(GetUintId(), SpvOpIMul, invocation_y->result_id(), ++ global_size_x->result_id()); ++ ++ auto flat_thread_id = builder->AddBinaryOp( ++ GetUintId(), SpvOpIAdd, z_term->result_id(), y_term->result_id()); ++ flat_thread_id = ++ builder->AddBinaryOp(GetUintId(), SpvOpIAdd, flat_thread_id->result_id(), ++ invocation_x->result_id()); ++ return flat_thread_id->result_id(); ++} ++ ++std::vector AutoInstPass::GenThreadId(InstructionBuilder* builder, ++ uint32_t stage_idx) { ++ switch (stage_idx) { ++ case SpvExecutionModelVertex: { ++ // Load and store VertexId and InstanceId ++ auto vertex_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInVertexIndex), builder); ++ auto instance_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInInstanceIndex), builder); ++ return {vertex_id, instance_id}; ++ } ++ case SpvExecutionModelGLCompute: { ++ // Load and store GlobalInvocationId. ++ ++ uint32_t load_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInGlobalInvocationId), ++ builder); ++ Instruction* x_inst = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, load_id, 0); ++ Instruction* y_inst = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, load_id, 1); ++ Instruction* z_inst = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, load_id, 2); ++ return {x_inst->result_id(), y_inst->result_id(), z_inst->result_id()}; ++ } ++ case SpvExecutionModelTessellationControl: ++ case SpvExecutionModelGeometry: { ++ // Load and store PrimitiveId and InvocationId. 
++ auto primitive_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInPrimitiveId), builder); ++ auto instance_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInInvocationId), builder); ++ return {primitive_id, instance_id}; ++ } ++ ++ case SpvExecutionModelTessellationEvaluation: { ++ // Load and store PrimitiveId and TessCoord.uv ++ auto primitive_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInPrimitiveId), builder); ++ uint32_t load_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInTessCoord), builder); ++ Instruction* uvec3_cast_inst = ++ builder->AddUnaryOp(GetVec3UintId(), SpvOpBitcast, load_id); ++ uint32_t uvec3_cast_id = uvec3_cast_inst->result_id(); ++ Instruction* u_inst = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, uvec3_cast_id, 0); ++ Instruction* v_inst = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, uvec3_cast_id, 1); ++ return {primitive_id, u_inst->result_id(), v_inst->result_id()}; ++ } ++ case SpvExecutionModelFragment: { ++ // Load FragCoord and convert to Uint ++ Instruction* frag_coord_inst = builder->AddUnaryOp( ++ GetVec4FloatId(), SpvOpLoad, ++ context()->GetBuiltinInputVarId(SpvBuiltInFragCoord)); ++ Instruction* uint_frag_coord_inst = builder->AddUnaryOp( ++ GetVec4UintId(), SpvOpBitcast, frag_coord_inst->result_id()); ++ Instruction* x_inst = ++ builder->AddIdLiteralOp(GetUintId(), SpvOpCompositeExtract, ++ uint_frag_coord_inst->result_id(), 0); ++ Instruction* y_inst = ++ builder->AddIdLiteralOp(GetUintId(), SpvOpCompositeExtract, ++ uint_frag_coord_inst->result_id(), 1); ++ Instruction* z_inst = ++ builder->AddIdLiteralOp(GetUintId(), SpvOpCompositeExtract, ++ uint_frag_coord_inst->result_id(), 2); ++ return {x_inst->result_id(), y_inst->result_id(), z_inst->result_id()}; ++ } ++ case SpvExecutionModelRayGenerationNV: ++ case SpvExecutionModelIntersectionNV: ++ case SpvExecutionModelAnyHitNV: ++ case SpvExecutionModelClosestHitNV: ++ case 
SpvExecutionModelMissNV: ++ case SpvExecutionModelCallableNV: { ++ // Load and store LaunchIdNV. ++ auto launch_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInLaunchIdKHR), builder); ++ Instruction* launch_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 0); ++ Instruction* launch_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 1); ++ Instruction* launch_z = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_id, 2); ++ ++ auto launch_size_id = GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInLaunchSizeKHR), builder); ++ Instruction* launch_size_x = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_size_id, 0); ++ Instruction* launch_size_y = builder->AddIdLiteralOp( ++ GetUintId(), SpvOpCompositeExtract, launch_size_id, 1); ++ ++ auto xy_size = builder->AddBinaryOp(GetUintId(), SpvOpIMul, ++ launch_size_x->result_id(), ++ launch_size_y->result_id()); ++ auto z_term = builder->AddBinaryOp( ++ GetUintId(), SpvOpIMul, launch_z->result_id(), xy_size->result_id()); ++ ++ auto y_term = ++ builder->AddBinaryOp(GetUintId(), SpvOpIMul, launch_y->result_id(), ++ launch_size_x->result_id()); ++ ++ auto flat_thread_id = builder->AddBinaryOp( ++ GetUintId(), SpvOpIAdd, z_term->result_id(), y_term->result_id()); ++ flat_thread_id = builder->AddBinaryOp(GetUintId(), SpvOpIAdd, ++ flat_thread_id->result_id(), ++ launch_x->result_id()); ++ return {flat_thread_id->result_id()}; ++ } ++ default: { ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, ++ "Cannot create a thread id for requested shader stage!\n"); ++ return {}; ++ } ++ } ++} ++ ++uint32_t AutoInstPass::GenInstCallsiteId(Instruction* inst) { ++ auto module_offset = uid2offset_[inst->unique_id()]; ++ if (shader_id_ >= (1 << 12) || module_offset >= (1 << 20)) { ++ std::string message = ++ "Shader id count or shader module size are too large!\n"; ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, message.c_str()); ++ 
return 0; ++ } ++ ++ // Create an instrumentation callsite id that is unique across ++ // the whole pipeline by including the shader id in the upper bits. ++ return (shader_id_ << 20) | module_offset; ++} ++ ++uint32_t AutoInstPass::GenSubgroupLocalInvocationId( ++ InstructionBuilder* builder) { ++ if (!get_feature_mgr()->HasCapability(SpvCapabilityGroupNonUniform)) { ++ context()->AddCapability(SpvCapabilityGroupNonUniform); ++ } ++ return GenVarLoad( ++ context()->GetBuiltinInputVarId(SpvBuiltInSubgroupLocalInvocationId), ++ builder); ++} ++ ++std::unique_ptr AutoInstPass::GenIfStatement( ++ uint32_t condition_id, std::unique_ptr curr_block, ++ std::function(InstructionBuilder* builder, ++ std::unique_ptr)> ++ inside_if_callback) { ++ auto output_func = curr_block->GetParent(); ++ InstructionBuilder builder( ++ context(), &*curr_block, ++ IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); ++ uint32_t merge_blk_id = TakeNextId(); ++ uint32_t inside_if_blk_id = TakeNextId(); ++ std::unique_ptr merge_label(NewLabel(merge_blk_id)); ++ std::unique_ptr inside_if_label(NewLabel(inside_if_blk_id)); ++ (void)builder.AddConditionalBranch(condition_id, inside_if_blk_id, ++ merge_blk_id, merge_blk_id, ++ SpvSelectionControlMaskNone); ++ ++ output_func->AddBasicBlock(std::move(curr_block)); ++ curr_block = MakeUnique(std::move(inside_if_label)); ++ curr_block->SetParent(&*output_func); ++ ++ builder.SetInsertPoint(&*curr_block); ++ ++ curr_block = inside_if_callback(&builder, std::move(curr_block)); ++ ++ builder.SetInsertPoint(&*curr_block); ++ if (!curr_block->IsReturn()) (void)builder.AddBranch(merge_blk_id); ++ ++ output_func->AddBasicBlock(std::move(curr_block)); ++ curr_block = MakeUnique(std::move(merge_label)); ++ curr_block->SetParent(&*output_func); ++ return curr_block; ++} ++ ++std::unique_ptr AutoInstPass::GenThreadUpdate( ++ InstructionBuilder* builder, std::unique_ptr curr_block, ++ std::vector element_ids) { ++ uint32_t buf_id = 
GetOutputBufferId(); ++ uint32_t buf_uint_ptr_id = GetOutputBufferPtrId(); ++ if (element_ids.size() > 65535) { ++ std::string message = ++ "ThreadUpdate does not support more than 65535 elements in a single " ++ "entry!"; ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, message.c_str()); ++ } ++ // Compute size of entry that will be written to the buffer. ++ uint32_t entry_size = ++ builder->GetUintConstantId((uint32_t)element_ids.size()); ++ ++ // Update number of words written ++ uint32_t mask_none_id = builder->GetUintConstantId(SpvMemoryAccessMaskNone); ++ uint32_t scope_invok_id = builder->GetUintConstantId(SpvScopeInvocation); ++ Instruction* buffer_consumed_ptr = ++ builder->AddBinaryOp(buf_uint_ptr_id, SpvOpAccessChain, buf_id, ++ builder->GetUintConstantId(kDebugOutputSizeOffset)); ++ Instruction* write_offset = builder->AddQuadOp( ++ GetUintId(), SpvOpAtomicIAdd, buffer_consumed_ptr->result_id(), ++ scope_invok_id, mask_none_id, entry_size); ++ ++ Instruction* updated_consumed_value = builder->AddBinaryOp( ++ GetUintId(), SpvOpIAdd, write_offset->result_id(), entry_size); ++ updated_consumed_value = builder->AddBinaryOp( ++ GetUintId(), SpvOpIAdd, updated_consumed_value->result_id(), ++ builder->GetUintConstantId(reserved_words_count_)); ++ Instruction* buffer_capacity = ++ builder->AddIdLiteralOp(GetUintId(), SpvOpArrayLength, ++ GetOutputBufferId(), kDebugOutputDataOffset); ++ ++ Instruction* out_of_bounds_cond = builder->AddBinaryOp( ++ GetBoolId(), SpvOpUGreaterThanEqual, updated_consumed_value->result_id(), ++ buffer_capacity->result_id()); ++ ++ curr_block = ++ GenIfStatement(out_of_bounds_cond->result_id(), std::move(curr_block), ++ [this](InstructionBuilder* ir_builder, ++ std::unique_ptr block) { ++ ir_builder->AddNullaryOp(0, SpvOpReturn); ++ return block; ++ }); ++ builder->SetInsertPoint(&*curr_block); ++ ++ uint32_t i = reserved_words_count_; ++ for (auto it : element_ids) { ++ Instruction* element_offset = ++ builder->AddBinaryOp(GetUintId(), 
SpvOpIAdd, write_offset->result_id(), ++ builder->GetUintConstantId(i)); ++ Instruction* offset_ptr = builder->AddTernaryOp( ++ buf_uint_ptr_id, SpvOpAccessChain, buf_id, ++ builder->GetUintConstantId(kDebugOutputDataOffset), ++ element_offset->result_id()); ++ (void)builder->AddStore(offset_ptr->result_id(), it); ++ i++; ++ } ++ ++ return curr_block; ++} ++ ++uint32_t AutoInstPass::GetThreadUpdateFuncId(uint32_t num_parameters) { ++ if (param_cnt2thread_update_func_id_[num_parameters] == 0) { ++ // Create function ++ param_cnt2thread_update_func_id_[num_parameters] = TakeNextId(); ++ analysis::TypeManager* type_mgr = context()->get_type_mgr(); ++ std::vector param_types; ++ for (uint32_t c = 0; c < num_parameters; ++c) ++ param_types.push_back(type_mgr->GetType(GetUintId())); ++ analysis::Function func_ty(type_mgr->GetType(GetVoidId()), param_types); ++ analysis::Type* reg_func_ty = type_mgr->GetRegisteredType(&func_ty); ++ std::unique_ptr func_inst( ++ new Instruction(get_module()->context(), SpvOpFunction, GetVoidId(), ++ param_cnt2thread_update_func_id_[num_parameters], ++ {{spv_operand_type_t::SPV_OPERAND_TYPE_LITERAL_INTEGER, ++ {SpvFunctionControlMaskNone}}, ++ {spv_operand_type_t::SPV_OPERAND_TYPE_ID, ++ {type_mgr->GetTypeInstruction(reg_func_ty)}}})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_inst); ++ std::unique_ptr output_func = ++ MakeUnique(std::move(func_inst)); ++ // Add parameters ++ std::vector param_vec; ++ for (uint32_t c = 0; c < num_parameters; ++c) { ++ uint32_t pid = TakeNextId(); ++ param_vec.push_back(pid); ++ std::unique_ptr param_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionParameter, ++ GetUintId(), pid, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*param_inst); ++ output_func->AddParameter(std::move(param_inst)); ++ } ++ ++ // Create first block ++ uint32_t test_blk_id = TakeNextId(); ++ std::unique_ptr test_label(NewLabel(test_blk_id)); ++ std::unique_ptr new_blk_ptr = ++ MakeUnique(std::move(test_label)); ++ 
new_blk_ptr->SetParent(&*output_func); ++ InstructionBuilder builder( ++ context(), &*new_blk_ptr, ++ IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); ++ ++ builder.SetInsertPoint(&*new_blk_ptr); ++ ++ new_blk_ptr = GenThreadUpdate(&builder, std::move(new_blk_ptr), param_vec); ++ ++ builder.SetInsertPoint(&*new_blk_ptr); ++ ++ // Close merge block and function and add function to module ++ (void)builder.AddNullaryOp(0, SpvOpReturn); ++ new_blk_ptr->SetParent(&*output_func); ++ output_func->AddBasicBlock(std::move(new_blk_ptr)); ++ std::unique_ptr func_end_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionEnd, 0, 0, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_end_inst); ++ output_func->SetFunctionEnd(std::move(func_end_inst)); ++ context()->AddFunction(std::move(output_func)); ++ } ++ return param_cnt2thread_update_func_id_[num_parameters]; ++} ++ ++void AutoInstPass::GenThreadUpdateCall(InstructionBuilder* builder, ++ std::vector param_ids) { ++ uint32_t func_id = GetThreadUpdateFuncId((uint32_t)param_ids.size()); ++ ++ std::vector operands = {func_id}; ++ operands.insert(operands.end(), param_ids.begin(), param_ids.end()); ++ ++ (void)builder->AddNaryOp(GetVoidId(), SpvOpFunctionCall, operands); ++} ++ ++std::unique_ptr AutoInstPass::GenSubgroupUpdate( ++ InstructionBuilder* builder, std::unique_ptr curr_block, ++ std::vector element_ids) { ++ Instruction* subgroup_leader_cond = ++ builder->AddUnaryOp(GetBoolId(), SpvOpGroupNonUniformElect, ++ builder->GetUintConstantId(SpvScopeSubgroup)); ++ curr_block = GenIfStatement( ++ subgroup_leader_cond->result_id(), std::move(curr_block), ++ [this, element_ids](InstructionBuilder* ir_builder, ++ std::unique_ptr block) { ++ block = GenThreadUpdate(ir_builder, std::move(block), element_ids); ++ ir_builder->SetInsertPoint(&*block); ++ return block; ++ }); ++ builder->SetInsertPoint(&*curr_block); ++ ++ return curr_block; ++} ++ ++uint32_t 
AutoInstPass::GetSubgroupUpdateFuncId(uint32_t num_parameters) { ++ if (param_cnt2subgroup_update_func_id_[num_parameters] == 0) { ++ // Create function ++ param_cnt2subgroup_update_func_id_[num_parameters] = TakeNextId(); ++ analysis::TypeManager* type_mgr = context()->get_type_mgr(); ++ std::vector param_types; ++ for (uint32_t c = 0; c < num_parameters; ++c) ++ param_types.push_back(type_mgr->GetType(GetUintId())); ++ analysis::Function func_ty(type_mgr->GetType(GetVoidId()), param_types); ++ analysis::Type* reg_func_ty = type_mgr->GetRegisteredType(&func_ty); ++ std::unique_ptr func_inst( ++ new Instruction(get_module()->context(), SpvOpFunction, GetVoidId(), ++ param_cnt2subgroup_update_func_id_[num_parameters], ++ {{spv_operand_type_t::SPV_OPERAND_TYPE_LITERAL_INTEGER, ++ {SpvFunctionControlMaskNone}}, ++ {spv_operand_type_t::SPV_OPERAND_TYPE_ID, ++ {type_mgr->GetTypeInstruction(reg_func_ty)}}})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_inst); ++ std::unique_ptr output_func = ++ MakeUnique(std::move(func_inst)); ++ // Add parameters ++ std::vector param_vec; ++ for (uint32_t c = 0; c < num_parameters; ++c) { ++ uint32_t pid = TakeNextId(); ++ param_vec.push_back(pid); ++ std::unique_ptr param_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionParameter, ++ GetUintId(), pid, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*param_inst); ++ output_func->AddParameter(std::move(param_inst)); ++ } ++ ++ // Create first block ++ uint32_t test_blk_id = TakeNextId(); ++ std::unique_ptr test_label(NewLabel(test_blk_id)); ++ std::unique_ptr new_blk_ptr = ++ MakeUnique(std::move(test_label)); ++ new_blk_ptr->SetParent(&*output_func); ++ InstructionBuilder builder( ++ context(), &*new_blk_ptr, ++ IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); ++ ++ builder.SetInsertPoint(&*new_blk_ptr); ++ ++ new_blk_ptr = GenSubgroupUpdate(&builder, std::move(new_blk_ptr), param_vec); ++ ++ builder.SetInsertPoint(&*new_blk_ptr); ++ ++ // Close 
merge block and function and add function to module ++ (void)builder.AddNullaryOp(0, SpvOpReturn); ++ new_blk_ptr->SetParent(&*output_func); ++ output_func->AddBasicBlock(std::move(new_blk_ptr)); ++ std::unique_ptr func_end_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionEnd, 0, 0, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_end_inst); ++ output_func->SetFunctionEnd(std::move(func_end_inst)); ++ context()->AddFunction(std::move(output_func)); ++ } ++ return param_cnt2subgroup_update_func_id_[num_parameters]; ++} ++ ++void AutoInstPass::GenSubgroupUpdateCall(InstructionBuilder* builder, ++ std::vector param_ids) { ++ uint32_t func_id = GetSubgroupUpdateFuncId((uint32_t)param_ids.size()); ++ ++ std::vector operands = {func_id}; ++ operands.insert(operands.end(), param_ids.begin(), param_ids.end()); ++ ++ (void)builder->AddNaryOp(GetVoidId(), SpvOpFunctionCall, operands); ++} ++ ++void AutoInstPass::GenUniqueSubgroupIdFuncCall(InstructionBuilder* builder, ++ uint32_t inst_id, ++ uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelRayGenerationNV && ++ stage_idx != SpvExecutionModelGLCompute) { ++ std::string message = ++ "Unique function id call cannot be generated unless the shader stage " ++ "is compute or RayGeneration\n"; ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, message.c_str()); ++ return; ++ } ++ ++ uint32_t output_func_id = GetUniqueSubgroupIdFunctionId(stage_idx); ++ (void)builder->AddNaryOp(GetVoidId(), SpvOpFunctionCall, ++ {output_func_id, inst_id}); ++} ++ ++uint32_t AutoInstPass::GetUniqueSubgroupIdFunctionId(uint32_t stage_idx) { ++ if (unique_subgroup_id_function_id_ == 0) { ++ // Create function ++ unique_subgroup_id_function_id_ = TakeNextId(); ++ analysis::TypeManager* type_mgr = context()->get_type_mgr(); ++ std::vector param_types; ++ ++ for (uint32_t c = 0; c < kUniqueSubgroupIdParamCnt; ++c) ++ param_types.push_back(type_mgr->GetType(GetUintId())); ++ ++ analysis::Function func_ty(type_mgr->GetType(GetVoidId()), 
param_types); ++ analysis::Type* reg_func_ty = type_mgr->GetRegisteredType(&func_ty); ++ std::unique_ptr func_inst( ++ new Instruction(get_module()->context(), SpvOpFunction, GetVoidId(), ++ unique_subgroup_id_function_id_, ++ {{spv_operand_type_t::SPV_OPERAND_TYPE_LITERAL_INTEGER, ++ {SpvFunctionControlMaskNone}}, ++ {spv_operand_type_t::SPV_OPERAND_TYPE_ID, ++ {type_mgr->GetTypeInstruction(reg_func_ty)}}})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_inst); ++ std::unique_ptr output_func = ++ MakeUnique(std::move(func_inst)); ++ ++ // Add parameters ++ std::vector param_vec; ++ for (uint32_t c = 0; c < kUniqueSubgroupIdParamCnt; ++c) { ++ uint32_t pid = TakeNextId(); ++ param_vec.push_back(pid); ++ std::unique_ptr param_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionParameter, ++ GetUintId(), pid, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*param_inst); ++ output_func->AddParameter(std::move(param_inst)); ++ } ++ ++ // Create first block ++ uint32_t test_blk_id = TakeNextId(); ++ std::unique_ptr test_label(NewLabel(test_blk_id)); ++ std::unique_ptr new_blk_ptr = ++ MakeUnique(std::move(test_label)); ++ new_blk_ptr->SetParent(&*output_func); ++ InstructionBuilder builder( ++ context(), &*new_blk_ptr, ++ IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); ++ ++ // Create variable to hold subgroup id computed by leader ++ ++ uint32_t varTyPtrId = context()->get_type_mgr()->FindPointerToType( ++ GetUintId(), SpvStorageClassFunction); ++ assert(varTyPtrId && "Cannot create uint ptr type."); ++ auto zero = builder.GetUintConstantId(0); ++ ++ auto new_var_op = ++ builder.AddUnaryOp(varTyPtrId, SpvOpVariable, SpvStorageClassFunction); ++ auto unique_subgroup_ptr_id = new_var_op->result_id(); ++ builder.AddStore(new_var_op->result_id(), zero); ++ ++ Instruction* subgroup_leader_cond = ++ builder.AddUnaryOp(GetBoolId(), SpvOpGroupNonUniformElect, ++ builder.GetUintConstantId(SpvScopeSubgroup)); ++ ++ new_blk_ptr = GenIfStatement( 
++ subgroup_leader_cond->result_id(), std::move(new_blk_ptr), ++ [this, unique_subgroup_ptr_id](InstructionBuilder* ir_builder, ++ std::unique_ptr block) { ++ uint32_t mask_none_id = ++ ir_builder->GetUintConstantId(SpvMemoryAccessMaskNone); ++ uint32_t scope_invok_id = ++ ir_builder->GetUintConstantId(SpvScopeInvocation); ++ Instruction* unique_id_ptr = ir_builder->AddTernaryOp( ++ GetOutputBufferPtrId(), SpvOpAccessChain, GetOutputBufferId(), ++ ir_builder->GetUintConstantId(kDebugOutputDataOffset), ++ ir_builder->GetUintConstantId(0)); ++ Instruction* unique_id = ir_builder->AddQuadOp( ++ GetUintId(), SpvOpAtomicIAdd, unique_id_ptr->result_id(), ++ scope_invok_id, mask_none_id, ir_builder->GetUintConstantId(1)); ++ ++ ir_builder->AddStore(unique_subgroup_ptr_id, unique_id->result_id()); ++ return block; ++ }); ++ builder.SetInsertPoint(&*new_blk_ptr); ++ Instruction* broadcasted_id = ++ builder.AddBinaryOp(GetUintId(), SpvOpGroupNonUniformBroadcastFirst, ++ builder.GetUintConstantId(SpvScopeSubgroup), ++ GenVarLoad(unique_subgroup_ptr_id, &builder)); ++ ++ uint32_t intra_subgroup_id = GenSubgroupLocalInvocationId(&builder); ++ // Shift the thread id in the subgroup in to the top log2(SUBGROUP_SIZE)=5 bits ++ Instruction* shifted_subgroup_id = builder.AddBinaryOp( ++ GetUintId(), SpvOpShiftLeftLogical, intra_subgroup_id, ++ builder.GetUintConstantId(27 /*= 32 - log2(32) */)); ++ // Combine the unique subgroup id and intra subgroup id ++ Instruction* joined_subgroup_ids = builder.AddBinaryOp( ++ GetUintId(), SpvOpBitwiseOr, shifted_subgroup_id->result_id(), ++ broadcasted_id->result_id()); ++ ++ // Generate thread id which will be used to created thread_id -> subgroup_id ++ // mapping ++ auto flat_thread_id = (stage_idx == SpvExecutionModelRayGenerationNV) ++ ? 
GenFlatRtThreadId(&builder, stage_idx) ++ : GenFlatComputeThreadId(&builder, stage_idx); ++ ++ auto inst_id = param_vec[kUniqueSubgroupIdParamInstIdIdx]; ++ ++ new_blk_ptr = GenThreadUpdate( ++ &builder, std::move(new_blk_ptr), ++ {inst_id, flat_thread_id, joined_subgroup_ids->result_id()}); ++ ++ // Close merge block and function and add function to module ++ (void)builder.AddNullaryOp(0, SpvOpReturn); ++ new_blk_ptr->SetParent(&*output_func); ++ output_func->AddBasicBlock(std::move(new_blk_ptr)); ++ std::unique_ptr func_end_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionEnd, 0, 0, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_end_inst); ++ output_func->SetFunctionEnd(std::move(func_end_inst)); ++ context()->AddFunction(std::move(output_func)); ++ } ++ return unique_subgroup_id_function_id_; ++} ++ ++void AutoInstPass::GenInstrumentedEntryPoints() { ++ for (auto entry_point_inst : get_module()->entry_points()) { ++ auto stage_idx = ++ entry_point_inst.GetSingleWordInOperand(kEntryPointExecutionModelInIdx); ++ auto entry_point_func_id = ++ entry_point_inst.GetSingleWordInOperand(kEntryPointFunctionIdInIdx); ++ Instruction* entry_point_func = ++ get_def_use_mgr()->GetDef(entry_point_func_id); ++ ++ auto dummy_func_id = TakeNextId(); ++ analysis::TypeManager* type_mgr = context()->get_type_mgr(); ++ analysis::Function func_ty(type_mgr->GetType(GetVoidId()), {}); ++ analysis::Type* reg_func_ty = type_mgr->GetRegisteredType(&func_ty); ++ auto expected_ty_id = type_mgr->GetId(reg_func_ty); ++ ++ auto entry_point_func_ty_id = ++ entry_point_func->GetSingleWordOperand(kFunctionTypeIdx); ++ ++ if (expected_ty_id != entry_point_func_ty_id) { ++ std::string message = ++ "Could not generate dummy entrypoint due to an unexpected EntryPoint " ++ "function signature."; ++ consumer()(SPV_MSG_ERROR, 0, {0, 0, 0}, message.c_str()); ++ return; ++ } ++ ++ // Create dummy function to original entry point ++ std::unique_ptr func_inst(new Instruction( ++ 
get_module()->context(), SpvOpFunction, GetVoidId(), dummy_func_id, ++ {{spv_operand_type_t::SPV_OPERAND_TYPE_LITERAL_INTEGER, ++ {SpvFunctionControlMaskNone}}, ++ {spv_operand_type_t::SPV_OPERAND_TYPE_ID, ++ {type_mgr->GetTypeInstruction(reg_func_ty)}}})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_inst); ++ ++ std::unique_ptr output_func = ++ MakeUnique(std::move(func_inst)); ++ ++ // Create first block ++ uint32_t test_blk_id = TakeNextId(); ++ std::unique_ptr test_label(NewLabel(test_blk_id)); ++ std::unique_ptr new_blk_ptr = ++ MakeUnique(std::move(test_label)); ++ new_blk_ptr->SetParent(&*output_func); ++ InstructionBuilder builder( ++ context(), &*new_blk_ptr, ++ IRContext::kAnalysisDefUse | IRContext::kAnalysisInstrToBlockMapping); ++ ++ builder.SetInsertPoint(&*new_blk_ptr); ++ ++ // Invoke instrumentation hook ++ auto is_instrumented = PreEntryPointInstrument(&builder, stage_idx); ++ ++ // Call original entrypoint ++ (void)builder.AddNaryOp(GetVoidId(), SpvOpFunctionCall, ++ {entry_point_func_id}); ++ ++ // Invoke instrumentation hook ++ is_instrumented |= PostEntryPointInstrument(&builder, stage_idx); ++ ++ // Close merge block and function and add function to module ++ (void)builder.AddNullaryOp(0, SpvOpReturn); ++ new_blk_ptr->SetParent(&*output_func); ++ output_func->AddBasicBlock(std::move(new_blk_ptr)); ++ std::unique_ptr func_end_inst( ++ new Instruction(get_module()->context(), SpvOpFunctionEnd, 0, 0, {})); ++ get_def_use_mgr()->AnalyzeInstDefUse(&*func_end_inst); ++ output_func->SetFunctionEnd(std::move(func_end_inst)); ++ ++ if (is_instrumented) { ++ // If the instrumentation hooks insert code then ++ // add dummy entrypoint and replace the original ++ // EntryPoint with dummy entrypoint. 
++ context()->AddFunction(std::move(output_func)); ++ context()->ReplaceAllUsesWithPredicate( ++ entry_point_func_id, dummy_func_id, [](Instruction* inst) { ++ return inst->opcode() != SpvOpFunctionCall; ++ }); ++ } ++ } ++} ++ ++Pass::Status AutoInstPass::ProcessImpl() { ++ for (auto fii = get_module()->begin(); fii != get_module()->end(); ++fii) { ++ auto bb = fii->begin(); ++ bb->IsLoopHeader(); ++ } ++ ++ InstProcessFunction pfn = ++ [this](BasicBlock::iterator ref_inst_itr, ++ UptrVectorIterator ref_block_itr, uint32_t stage_idx, ++ std::vector>* new_blocks) { ++ (void)new_blocks; ++ GenInstProgrammableCode(ref_inst_itr, ref_block_itr, stage_idx); ++ }; ++ InstProcessEntryPointCallTree(pfn); ++ ++ // Add new entrypoint after other instrumentation to avoid it also being ++ // instrumented. ++ GenInstrumentedEntryPoints(); ++ ++ context()->BuildInvalidAnalyses(IRContext::kAnalysisDefUse | ++ IRContext::kAnalysisInstrToBlockMapping); ++ return has_added_instrumentation_ ? Status::SuccessWithChange ++ : Status::SuccessWithoutChange; ++} ++ ++Pass::Status AutoInstPass::Process() { ++ // Initialize base class ++ InitializeInstrument(); ++ ++ // init auto instrumentation metadata ++ instrumented_bb_ids.clear(); ++ instrumented_inst_ids.clear(); ++ has_added_instrumentation_ = false; ++ ++ // initialize inheriting class ++ InitializeInstrumentation(); ++ ++ auto res = ProcessImpl(); ++ ++ // finalize inheriting class ++ FinalizeInstrumentation(); ++ ++ // insert instrumentation ++ return res; ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_pass.h b/source/opt/auto_inst_pass.h +new file mode 100644 +index 00000000..ed91a44d +--- /dev/null ++++ b/source/opt/auto_inst_pass.h +@@ -0,0 +1,322 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. 
++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_PASS_H_ ++ ++#include ++ ++#include "instrument_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstPass : public InstrumentPass { ++ public: ++ AutoInstPass(uint32_t desc_set, uint32_t shader_id, ++ uint32_t reserved_words_count) ++ : InstrumentPass(desc_set, shader_id, kInstValidationIdAuto), ++ reserved_words_count_(reserved_words_count) {} ++ ++ ~AutoInstPass() override = default; ++ ++ // See optimizer.hpp for pass user documentation. ++ Status Process() override; ++ ++ const char* name() const override { return "auto-inst-pass"; } ++ ++ private: ++ // Track whether any instrumentation occurred ++ bool has_added_instrumentation_ = false; ++ ++ // Track the function ids created to support thread/subgroup update ++ // accepting different numbers of parameters ++ std::unordered_map param_cnt2thread_update_func_id_; ++ std::unordered_map param_cnt2subgroup_update_func_id_; ++ ++ // Track the function id for creating unique subgroup ids. ++ uint32_t unique_subgroup_id_function_id_ = 0; ++ ++ // Instructions whose semantics are destroyed by having instrumentation ++ // inserted directly before them. ++ const std::set uninstrumentable_pre_ops = {SpvOpPhi, ++ SpvOpUnreachable}; ++ ++ // Instructions whose semantics are destroyed by having instrumentation ++ // inserted directly after them. 
++ const std::set uninstrumentable_post_ops = { ++ SpvOpSelectionMerge, // Both merge instructions must precede a branch ++ SpvOpLoopMerge, ++ SpvOpBranch, // Branch instructions end a basic block which would put the ++ // instrumentation in limbo ++ SpvOpBranchConditional, ++ SpvOpSwitch, ++ SpvOpReturn, // Instrumenting after returns/unreachable would never ++ // execute ++ SpvOpReturnValue, ++ SpvOpUnreachable, ++ }; ++ ++ // Add a function to the module that appends ++ // an entry to the buffer containing |num_params| words ++ // for every active thread invoking the function. ++ uint32_t GetThreadUpdateFuncId(uint32_t num_params); ++ ++ // Generate a sequence of instructions in |builder| in function |function| ++ // that write the value corresponding to the ids in |element_ids|. The ++ // output buffer will be written to once by each thread. ++ // The sequence of instructions will be inserted starting with |curr_block|. ++ // |curr_block| will be consumed by this function call due to control flow. ++ // ++ // |element_ids| is a vector of uint32_t which contains the ++ // id's of values that will be written to the output buffer. ++ std::unique_ptr GenThreadUpdate( ++ InstructionBuilder* builder, std::unique_ptr curr_block, ++ std::vector param_ids); ++ ++ // Add a function to the module that appends ++ // an entry to the buffer containing |num_params| words ++ // for every subgroup invoking the function. ++ uint32_t GetSubgroupUpdateFuncId(uint32_t num_params); ++ ++ // Create function containing functionality for generating a ++ // unique subgroup (or subgroup) id. This function should only ++ // be called at the beginning of a shader in uniform control flow. 
++ // ++ // The buffer entry created will be of the following form: ++ // word 0: ++ // word 1: ++ // word 2: ++ // ++ // This information can be used by the analysis to create a mapping ++ // from flattened thread id (available anywhere in the rt pipeline) ++ // to subgroup id which allows for inter-shader subgroup tracking. ++ // Furthermore, the intra-subgroup-id allows for attribution of subgroup-level ++ // instrumentation to individual threads (i.e. for heatmap visualizations). ++ uint32_t GetUniqueSubgroupIdFunctionId(uint32_t stage_idx); ++ ++ // Generate a sequence of instructions in |builder| in function |function| ++ // that write the value corresponding to the ids in |element_ids|. The ++ // output buffer will only be written to by the subgroup leader. ++ // The sequence of instructions will be inserted starting with |curr_block|. ++ // |curr_block| will be consumed by this function call due to control flow. ++ // ++ // |element_ids| is a vector of uint32_t which contains the ++ // id's of values that will be written to the output buffer. ++ std::unique_ptr GenSubgroupUpdate( ++ InstructionBuilder* builder, std::unique_ptr curr_block, ++ std::vector param_ids); ++ ++ protected: ++ // In this class it can be very confusing differentiating ++ // between instruction SSA ids and ids for instrumentation. ++ // This type is designed to make it explicit which type of id ++ // it is when mixing and matching. ++ using AutoInstId = uint32_t; ++ ++ // For some analyses keeping track of which threads belong ++ // to which subgroups and also how many subgroups executed the shader stage ++ // is interesting. Since saving 1 word is not important, ++ // the default for this value is set to 1 so CreateUniquesubgroupIdCall ++ // works out of the box. ++ static const int kDefaultReservedWordsCnt = 1; ++ ++ // The number of lowers words in the instrumentation buffer that are reserved ++ // for fixed functions (i.e. 
not dynamically appended runtime entries) ++ // NOTE: this does not include the buffer size which is always tracked ++ const uint32_t reserved_words_count_; ++ ++ // Track which basic blocks and instructions the pass has ++ // given an opportunity to instrument to prevent reinstrumenting. ++ std::set instrumented_bb_ids; ++ std::set instrumented_inst_ids; ++ ++ // Apply GenDebugPrintfCode to every instruction in module. ++ Pass::Status ProcessImpl(); ++ ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ virtual void InitializeInstrumentation() = 0; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ virtual void FinalizeInstrumentation() = 0; ++ ++ // Any instructions added via |builder| will appear before |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ virtual bool PreInstructionInstrument(Instruction* inst, ++ InstructionBuilder* builder, ++ uint32_t stage_idx) = 0; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ virtual bool PostInstructionInstrument(Instruction* inst, ++ InstructionBuilder* builder, ++ uint32_t stage_idx) = 0; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. This function is expected to return true if it added instructions to ++ // builder, otherwise false. 
++ virtual bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) = 0; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ virtual bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) = 0; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ virtual bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) = 0; ++ ++ // If |ref_inst_itr| is selected as an instrumentation location, ++ // return in |new_blocks| the result of adding instrumentation. ++ // The instructions write a record to the output buffer stream ++ // The block at |ref_block_itr| can just be replaced with the ++ // block in |new_blocks|. Besides the buffer writes, this ++ // block will comprise all instructions preceding and following ++ // |ref_inst_itr|. ++ // ++ // This function is designed to be passed to ++ // InstrumentPass::InstProcessEntryPointCallTree(), which applies the ++ // function to each instruction in a module and replaces the instruction ++ // if warranted. ++ // ++ void GenInstProgrammableCode(BasicBlock::iterator ref_inst_itr, ++ UptrVectorIterator ref_block_itr, ++ uint32_t stage_idx); ++ ++ // Generate a sequence of instructions in |builder| which ++ // materialize the lower and upper 32 bits of from ++ // OpReadClock. 
++ // ++ // Returns std::pair ++ std::pair GenReadClockIds(InstructionBuilder* builder); ++ ++ // Generate a sequence of instructions in |builder| which ++ // materialize a 32-bit thread mask where each bit ++ // is true if the thread is active and false otherwise. ++ // The uint32_t value returned corresponds to the thread_mask. ++ uint32_t GenSubgroupBallotId(InstructionBuilder* builder, uint32_t pred_id); ++ ++ // Generate a sequence of instructions in |builder| which ++ // materialize the value of SpvBuiltinSubgroupLocalInvocationId. ++ uint32_t GenSubgroupLocalInvocationId(InstructionBuilder* builder); ++ ++ // Returns an id corresponding to a uint created in |builder| ++ // which contains a flattened thread id calculated from the ++ // unique work id of the shader stage. ++ // ++ // This is primarily useful for tracking threads' execution between ++ // shaders in the ray tracing pipeline. ++ uint32_t GenFlatRtThreadId(InstructionBuilder* builder, uint32_t stage_idx); ++ ++ // Returns an id corresponding to a uint created in |builder| ++ // which contains a flattened thread id calculated from the ++ // GlobalSize and GlobalLaunchID ++ // ++ // This is primarily useful for tracking threads' execution between ++ // different compute pipelines. ++ uint32_t GenFlatComputeThreadId(InstructionBuilder* builder, ++ uint32_t stage_idx); ++ ++ // Returns a vector of ids corresponding to a uint created in |builder| ++ // which contains a unique work id of the shader stage. ++ // ++ // This is primarily useful for tracking threads' execution behaviour over ++ // time. ++ std::vector GenThreadId(InstructionBuilder* builder, ++ uint32_t stage_idx); ++ ++ // Returns an identifier ++ // for an instrumentation callsite which is unique across the ++ // whole ray-tracing pipeline. 
++ uint32_t GenInstCallsiteId(Instruction* inst); ++ ++ // Generate a sequence of instructions in function |function| that ++ // create an if statement where the body is executed iff the value ++ // corresponding to |condition_id| evaluates to true at runtime. ++ // |old_block| will be closed by an OpBranchConditional ++ // ++ // The callback |inside_if_callback| will be invoked in the body ++ // of the if statement. The |inside_if_callback| accepts an ++ // InstructionBuilder |builder| at the beginning of the if body. ++ // As well as a unique_ptr |curr_block| to the BasicBlock of the ++ // if body. The |inside_if_callback| may add more basic blocks ++ // but must return a unique_ptr to the basic block that ends the if ++ // body. ++ std::unique_ptr GenIfStatement( ++ uint32_t condition_id, std::unique_ptr old_block, ++ std::function( ++ InstructionBuilder* builder, std::unique_ptr curr_block)> ++ inside_if_callback); ++ ++ // Generates a seuqence of instructions in |builder| which invoke the ++ // ThreadUpdate function which writes the values that are identifier in ++ // |param_ids| to the StorageBuffer for each thread that invokes the call. ++ void GenThreadUpdateCall(InstructionBuilder* builder, ++ std::vector param_ids); ++ ++ // Generates a seuqence of instructions in |builder| which invoke the ++ // subgroupUpdate function which writes the values that are identifier in ++ // |param_ids| to the StorageBuffer for each subgroup that invokes the call. ++ void GenSubgroupUpdateCall(InstructionBuilder* builder, ++ std::vector param_ids); ++ ++ // Generate a function call in a block which will be appended to |new_blocks| ++ // This function should only bGe called at the beginning of a shader in ++ // uniform control flow. This ensures that every thread in the subgroup ++ // receives the value computed by the leader. ++ // ++ // |inst_offset_id| is used to report instrumentation metadata to validation ++ // layer. |stage_idx| is the current SpvExecutionMode. 
++ void GenUniqueSubgroupIdFuncCall(InstructionBuilder* builder, ++ uint32_t inst_offset_id, uint32_t stage_idx); ++ ++ // Pass the current context in terms of: ++ // 1) instruction in |ref_inst_itr| ++ // 2) BB in |ref_block_itr| ++ // 3) shader stage in |stage_idx| ++ // ++ // This allows the instrumentation hooks to decide what ++ // instrumentation to add to |builder|. ++ // If instrumentation is added then this function returns true ++ // otherwise false. ++ // ++ bool HandleInstrumentHooks(BasicBlock::iterator ref_inst_itr, ++ UptrVectorIterator ref_block_itr, ++ uint32_t stage_idx, InstructionBuilder* builder); ++ ++ // Generate dummy EntryPoints which invoke the PreEntryPointInstrument ++ // and PostEntryPointInstrument hooks around a call to the original entrypoint ++ // function. ++ void GenInstrumentedEntryPoints(); ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_INST_PROGRAMMABLE_PASS_H_ +diff --git a/source/opt/auto_inst_simt_efficiency_pass.cpp b/source/opt/auto_inst_simt_efficiency_pass.cpp +new file mode 100644 +index 00000000..cfb7e38a +--- /dev/null ++++ b/source/opt/auto_inst_simt_efficiency_pass.cpp +@@ -0,0 +1,39 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++ ++#include "auto_inst_simt_efficiency_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++bool AutoInstSimtEfficiencyPass::BasicBlockInstrument( ++ BasicBlock* bb, InstructionBuilder* builder, uint32_t stage_idx) { ++ // Suppress unused parameter warnings ++ (void)bb; ++ (void)stage_idx; ++ ++ Instruction* true_constant_op = builder->GetBoolConstant(true); ++ uint32_t true_constant_id = true_constant_op->result_id(); ++ // Create active thread mask by having all threads vote true ++ uint32_t active_thread_mask_id = ++ GenSubgroupBallotId(builder, true_constant_id); ++ // Write active thread mask ++ GenSubgroupUpdateCall(builder, {active_thread_mask_id}); ++ ++ return true; ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_simt_efficiency_pass.h b/source/opt/auto_inst_simt_efficiency_pass.h +new file mode 100644 +index 00000000..3d2b58cf +--- /dev/null ++++ b/source/opt/auto_inst_simt_efficiency_pass.h +@@ -0,0 +1,101 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_SIMT_EFFICIENCY_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_SIMT_EFFICIENCY_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstSimtEfficiencyPass : public AutoInstPass { ++ public: ++ AutoInstSimtEfficiencyPass(uint32_t desc_set, uint32_t shader_id, ++ uint32_t reserved_words_count) ++ : AutoInstPass(desc_set, shader_id, reserved_words_count) {} ++ ++ const char* name() const override { return "auto-inst-simt-efficiency-pass"; } ++ ++ private: ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override{}; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override{}; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. 
This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ } ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_SIMT_EFFICIENCY_PASS_H_ +diff --git a/source/opt/auto_inst_warp_entry_and_exit_pass.cpp b/source/opt/auto_inst_warp_entry_and_exit_pass.cpp +new file mode 100644 +index 00000000..d985a70d +--- /dev/null ++++ b/source/opt/auto_inst_warp_entry_and_exit_pass.cpp +@@ -0,0 +1,43 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++// ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++// See the License for the specific language governing permissions and ++// limitations under the License. ++ ++#include "auto_inst_warp_entry_and_exit_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++bool AutoInstWarpEntryAndExitPass::PreEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelRayGenerationKHR) return false; ++ ++ // Record every warp that began the pipeline ++ auto prim_id = ++ builder->GetUintConstantId(kAutoInstWarpEntryAndExitBeginPipeline); ++ GenSubgroupUpdateCall(builder, {prim_id}); ++ return true; ++} ++ ++bool AutoInstWarpEntryAndExitPass::PostEntryPointInstrument( ++ InstructionBuilder* builder, uint32_t stage_idx) { ++ if (stage_idx != SpvExecutionModelRayGenerationKHR) return false; ++ ++ // Record every warp that completed the the pipeline ++ auto prim_id = ++ builder->GetUintConstantId(kAutoInstWarpEntryAndExitEndPipeline); ++ GenSubgroupUpdateCall(builder, {prim_id}); ++ return true; ++} ++ ++} // namespace opt ++} // namespace spvtools +diff --git a/source/opt/auto_inst_warp_entry_and_exit_pass.h b/source/opt/auto_inst_warp_entry_and_exit_pass.h +new file mode 100644 +index 00000000..44a5d175 +--- /dev/null ++++ b/source/opt/auto_inst_warp_entry_and_exit_pass.h +@@ -0,0 +1,99 @@ ++// Copyright (c) 2021 The Khronos Group Inc. ++ ++// Licensed under the Apache License, Version 2.0 (the "License"); ++// you may not use this file except in compliance with the License. ++// You may obtain a copy of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. 
++ ++#ifndef LIBSPIRV_OPT_AUTO_INST_WARP_ENTRY_AND_EXIT_PASS_H_ ++#define LIBSPIRV_OPT_AUTO_INST_WARP_ENTRY_AND_EXIT_PASS_H_ ++ ++#include ++ ++#include "auto_inst_pass.h" ++ ++namespace spvtools { ++namespace opt { ++ ++class AutoInstWarpEntryAndExitPass : public AutoInstPass { ++ public: ++ AutoInstWarpEntryAndExitPass(uint32_t desc_set, uint32_t shader_id) ++ : AutoInstPass(desc_set, shader_id, kDefaultReservedWordsCnt) {} ++ ++ const char* name() const override { ++ return "auto-inst-warp-entry-and-exit-pass"; ++ } ++ ++ private: ++ // Allows inheriting classes to initialize their knowledge ++ // of module before beginning instrumentation ++ void InitializeInstrumentation() override{}; ++ ++ // Allows inheriting classes to finalize before ++ // the pass finishes executing. ++ void FinalizeInstrumentation() override{}; ++ ++ // Any instructions added via |builder| will appear before |inst| ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear after |inst|. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostInstructionInstrument(Instruction* inst, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)inst; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will appear before the content of ++ // |bb|. |stage_idx| contains the SpvExecutionModel that builder is operating ++ // in. 
This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool BasicBlockInstrument(BasicBlock* bb, InstructionBuilder* builder, ++ uint32_t stage_idx) override { ++ (void)bb; ++ (void)builder; ++ (void)stage_idx; ++ return false; ++ }; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PreEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++ ++ // Any instructions added via |builder| will execute before the ++ // entrypoint function of the shader. ++ // |stage_idx| contains the SpvExecutionModel that builder is operating in. ++ // This function is expected to return true if it added instructions to ++ // builder, otherwise false. ++ bool PostEntryPointInstrument(InstructionBuilder* builder, ++ uint32_t stage_idx) override; ++}; ++ ++} // namespace opt ++} // namespace spvtools ++ ++#endif // LIBSPIRV_OPT_AUTO_INST_WARP_ENTRY_AND_EXIT_PASS_H_ +diff --git a/source/opt/instrument_pass.cpp b/source/opt/instrument_pass.cpp +index ed34fb02..1a84a002 100644 +--- a/source/opt/instrument_pass.cpp ++++ b/source/opt/instrument_pass.cpp +@@ -288,7 +288,9 @@ void InstrumentPass::GenStageStreamWriteCode(uint32_t stage_idx, + GenDebugOutputFieldCode(base_offset_id, kInstRayTracingOutLaunchIdZ, + z_launch_inst->result_id(), builder); + } break; +- default: { assert(false && "unsupported stage"); } break; ++ default: { ++ assert(false && "unsupported stage"); ++ } break; + } + } + +@@ -435,6 +437,7 @@ uint32_t InstrumentPass::GetOutputBufferBinding() { + case kInstValidationIdBuffAddr: + return kDebugOutputBindingStream; + case kInstValidationIdDebugPrintf: ++ case kInstValidationIdAuto: + return kDebugOutputPrintfStream; + default: + assert(false && 
"unexpected validation id"); +diff --git a/source/opt/instrument_pass.h b/source/opt/instrument_pass.h +index 12b939d4..5fb49f8b 100644 +--- a/source/opt/instrument_pass.h ++++ b/source/opt/instrument_pass.h +@@ -62,6 +62,7 @@ namespace opt { + static const uint32_t kInstValidationIdBindless = 0; + static const uint32_t kInstValidationIdBuffAddr = 1; + static const uint32_t kInstValidationIdDebugPrintf = 2; ++static const uint32_t kInstValidationIdAuto = 3; + + class InstrumentPass : public Pass { + using cbb_ptr = const BasicBlock*; +diff --git a/source/opt/ir_builder.h b/source/opt/ir_builder.h +index fe5feff5..b7c8e0c5 100644 +--- a/source/opt/ir_builder.h ++++ b/source/opt/ir_builder.h +@@ -392,6 +392,10 @@ class InstructionBuilder { + return uint_inst->result_id(); + } + ++ uint32_t GetIntConstantId(int value) { ++ return GetIntConstant(value, true)->result_id(); ++ } ++ + // Adds either a signed or unsigned 32 bit integer constant to the binary + // depedning on the |sign|. If |sign| is true then the value is added as a + // signed constant otherwise as an unsigned constant. If |sign| is false the +@@ -428,6 +432,28 @@ class InstructionBuilder { + return GetContext()->get_constant_mgr()->GetDefiningInstruction(constant); + } + ++ Instruction* GetBoolConstant(bool value) { ++ analysis::Bool bool_type{}; ++ ++ // Get or create the integer type. This rebuilds the type and manages the ++ // memory for the rebuilt type. ++ uint32_t type_id = ++ GetContext()->get_type_mgr()->GetTypeInstruction(&bool_type); ++ ++ // Get the memory managed type so that it is safe to be stored by ++ // GetConstant. ++ analysis::Type* rebuilt_type = ++ GetContext()->get_type_mgr()->GetType(type_id); ++ ++ // Create the constant value. ++ const analysis::Constant* constant = ++ GetContext()->get_constant_mgr()->GetConstant(rebuilt_type, ++ {(uint32_t)value}); ++ ++ // Create the OpConstant instruction using the type and the value. 
++ return GetContext()->get_constant_mgr()->GetDefiningInstruction(constant); ++ } ++ + Instruction* AddCompositeExtract(uint32_t type, uint32_t id_of_composite, + const std::vector& index_list) { + std::vector operands; +diff --git a/source/opt/ir_context.cpp b/source/opt/ir_context.cpp +index 82107b5c..094513c3 100644 +--- a/source/opt/ir_context.cpp ++++ b/source/opt/ir_context.cpp +@@ -805,6 +805,15 @@ uint32_t IRContext::GetBuiltinInputVarId(uint32_t builtin) { + reg_type = type_mgr->GetRegisteredType(&v4float_ty); + break; + } ++ case SpvBuiltInNumSubgroups: ++ case SpvBuiltInWorkgroupId: ++ case SpvBuiltInWarpIDNV: ++ case SpvBuiltInWarpsPerSMNV: ++ case SpvBuiltInSMIDNV: ++ case SpvBuiltInSMCountNV: ++ case SpvBuiltInSubgroupId: ++ case SpvBuiltInLocalInvocationIndex: ++ case SpvBuiltInSubgroupSize: + case SpvBuiltInVertexIndex: + case SpvBuiltInInstanceIndex: + case SpvBuiltInPrimitiveId: +@@ -814,7 +823,11 @@ uint32_t IRContext::GetBuiltinInputVarId(uint32_t builtin) { + reg_type = type_mgr->GetRegisteredType(&uint_ty); + break; + } ++ case SpvBuiltInLocalInvocationId: ++ case SpvBuiltInLaunchSizeNV: + case SpvBuiltInGlobalInvocationId: ++ case SpvBuiltInNumWorkgroups: ++ case SpvBuiltInWorkgroupSize: + case SpvBuiltInLaunchIdNV: { + analysis::Integer uint_ty(32, false); + analysis::Type* reg_uint_ty = type_mgr->GetRegisteredType(&uint_ty); +diff --git a/source/opt/optimizer.cpp b/source/opt/optimizer.cpp +index 8726ff93..5497fab7 100644 +--- a/source/opt/optimizer.cpp ++++ b/source/opt/optimizer.cpp +@@ -908,6 +908,66 @@ Optimizer::PassToken CreateInstDebugPrintfPass(uint32_t desc_set, + MakeUnique(desc_set, shader_id)); + } + ++Optimizer::PassToken CreateAutoInstDebugPass(uint32_t desc_set, ++ uint32_t shader_id, ++ bool test_atomic_ops, ++ bool test_subgroup_ops) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id, test_atomic_ops, ++ test_subgroup_ops)); ++} ++ ++Optimizer::PassToken CreateAutoInstDivergenceCharacterizationPass( ++ uint32_t 
desc_set, uint32_t shader_id, ++ std::function< ++ void(std::unordered_map&& inst_id2prim_id, ++ std::unordered_map&& inst_id2inst_count)> ++ static_data_callback) { ++ return MakeUnique( ++ MakeUnique( ++ desc_set, shader_id, static_data_callback)); ++} ++ ++Optimizer::PassToken CreateAutoInstDynShaderTracePass(uint32_t desc_set, ++ uint32_t shader_id) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id)); ++} ++ ++Optimizer::PassToken CreateAutoInstDynTraceRayTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function&&, ++ std::unordered_map>&&)> ++ static_data_callback) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id, ++ static_data_callback)); ++} ++ ++Optimizer::PassToken CreateAutoInstExecutionTracePass( ++ uint32_t desc_set, uint32_t shader_id, ++ std::function< ++ void(std::unordered_map>&&, ++ std::unordered_map&& inst_id2bb_opcodes)> ++ static_data_callback) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id, ++ static_data_callback)); ++} ++ ++Optimizer::PassToken CreateAutoInstSimtEfficiencyPass( ++ uint32_t desc_set, uint32_t shader_id, uint32_t reserved_words_count) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id, ++ reserved_words_count)); ++} ++ ++Optimizer::PassToken CreateAutoInstWarpEntryAndExitPass(uint32_t desc_set, ++ uint32_t shader_id) { ++ return MakeUnique( ++ MakeUnique(desc_set, shader_id)); ++} ++ + Optimizer::PassToken CreateInstBuffAddrCheckPass(uint32_t desc_set, + uint32_t shader_id) { + return MakeUnique( +diff --git a/source/opt/passes.h b/source/opt/passes.h +index d47cc1ce..9a7c9c22 100644 +--- a/source/opt/passes.h ++++ b/source/opt/passes.h +@@ -19,6 +19,13 @@ + + #include "source/opt/aggressive_dead_code_elim_pass.h" + #include "source/opt/amd_ext_to_khr.h" ++#include "source/opt/auto_inst_debug_pass.h" ++#include "source/opt/auto_inst_divergence_characterization_pass.h" ++#include "source/opt/auto_inst_dyn_trace_ray_trace_pass.h" ++#include 
"source/opt/auto_inst_dyn_shader_trace_pass.h" ++#include "source/opt/auto_inst_execution_trace_pass.h" ++#include "source/opt/auto_inst_simt_efficiency_pass.h" ++#include "source/opt/auto_inst_warp_entry_and_exit_pass.h" + #include "source/opt/block_merge_pass.h" + #include "source/opt/ccp_pass.h" + #include "source/opt/cfg_cleanup_pass.h" +-- +2.29.2.windows.2 + diff --git a/ecosystem_tools/VulkanVision/st-patches/vvision-st.diff b/ecosystem_tools/VulkanVision/st-patches/vvision-st.diff new file mode 100644 index 00000000..5f51cce6 --- /dev/null +++ b/ecosystem_tools/VulkanVision/st-patches/vvision-st.diff @@ -0,0 +1 @@ +0001-spirv-opt-Add-auto-inst-passes.patch diff --git a/ecosystem_tools/VulkanVision/vv-patches/0001-layers-Added-auto-inst-layers.patch b/ecosystem_tools/VulkanVision/vv-patches/0001-layers-Added-auto-inst-layers.patch new file mode 100644 index 00000000..d36ddec7 --- /dev/null +++ b/ecosystem_tools/VulkanVision/vv-patches/0001-layers-Added-auto-inst-layers.patch @@ -0,0 +1,3846 @@ +From 94f9a7c9ea98fa400d47b0a71efbea8792113b08 Mon Sep 17 00:00:00 2001 +From: dpankratz +Date: Wed, 27 Jan 2021 09:17:26 -0700 +Subject: [PATCH] layers: Added auto-inst layers + +--- + CMakeLists.txt | 27 +- + build-android/jni/Android.mk | 7 + + build-android/known_good.json | 2 +- + docs/auto_instrument.md | 213 +++ + layers/CMakeLists.txt | 25 +- + layers/auto_inst.cpp | 1205 +++++++++++++++++ + layers/auto_inst.h | 465 +++++++ + .../auto_inst_divergence_characterization.cpp | 157 +++ + .../auto_inst_divergence_characterization.h | 48 + + layers/auto_inst_dyn_shader_trace.cpp | 177 +++ + layers/auto_inst_dyn_shader_trace.h | 44 + + layers/auto_inst_dyn_trace_ray_trace.cpp | 223 +++ + layers/auto_inst_dyn_trace_ray_trace.h | 55 + + layers/auto_inst_execution_trace.cpp | 174 +++ + layers/auto_inst_execution_trace.h | 56 + + layers/auto_inst_simt_efficiency.cpp | 67 + + layers/auto_inst_simt_efficiency.h | 56 + + layers/auto_inst_warp_entry_and_exit.cpp | 61 + + 
layers/auto_inst_warp_entry_and_exit.h | 52 + + layers/debug_printf.cpp | 2 +- + layers/debug_printf.h | 1 + + layers/generated/chassis.cpp | 78 +- + layers/generated/chassis.h | 15 +- + layers/gpu_utils.h | 7 + + layers/gpu_validation.h | 4 +- + layers/layer_options.cpp | 13 + + layers/layer_options.h | 19 +- + scripts/known_good.json | 17 +- + scripts/layer_chassis_generator.py | 72 +- + 29 files changed, 3305 insertions(+), 37 deletions(-) + create mode 100644 docs/auto_instrument.md + create mode 100644 layers/auto_inst.cpp + create mode 100644 layers/auto_inst.h + create mode 100644 layers/auto_inst_divergence_characterization.cpp + create mode 100644 layers/auto_inst_divergence_characterization.h + create mode 100644 layers/auto_inst_dyn_shader_trace.cpp + create mode 100644 layers/auto_inst_dyn_shader_trace.h + create mode 100644 layers/auto_inst_dyn_trace_ray_trace.cpp + create mode 100644 layers/auto_inst_dyn_trace_ray_trace.h + create mode 100644 layers/auto_inst_execution_trace.cpp + create mode 100644 layers/auto_inst_execution_trace.h + create mode 100644 layers/auto_inst_simt_efficiency.cpp + create mode 100644 layers/auto_inst_simt_efficiency.h + create mode 100644 layers/auto_inst_warp_entry_and_exit.cpp + create mode 100644 layers/auto_inst_warp_entry_and_exit.h + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 994a9ef0..da32e3b0 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -227,6 +227,8 @@ if(BUILD_TESTS OR BUILD_LAYERS) + if (NOT TARGET SPIRV-Tools) + if(NOT SPIRV_TOOLS_INSTALL_DIR) + set(SPIRV_TOOLS_INSTALL_DIR "${GLSLANG_INSTALL_DIR}") ++ else() ++ message(STATUS "Using Spirv-Tools install located at ${SPIRV_TOOLS_INSTALL_DIR}") + endif() + + set(SPIRV_TOOLS_BINARY_ROOT "${SPIRV_TOOLS_INSTALL_DIR}/lib" +@@ -240,9 +242,12 @@ if(BUILD_TESTS OR BUILD_LAYERS) + set(SPIRV_TOOLS_OPT_DEBUG_SEARCH_PATH "${SPIRV_TOOLS_INSTALL_DIR}/lib") + + find_library(SPIRV_TOOLS_LIB NAMES SPIRV-Tools HINTS ${SPIRV_TOOLS_SEARCH_PATH}) ++ if (NOT 
SPIRV_TOOLS_LIB) ++ message(FATAL_ERROR "Could not find SPIRV-Tools") ++ endif() + find_library(SPIRV_TOOLS_OPT_LIB NAMES SPIRV-Tools-opt HINTS ${SPIRV_TOOLS_OPT_SEARCH_PATH}) + +- if(WIN32) ++ if(WIN32 AND NOT SPIRV_TOOLS_LIB) + add_library(SPIRV-Tools-opt STATIC IMPORTED) + add_library(SPIRV-Tools STATIC IMPORTED) + +@@ -269,6 +274,26 @@ if(BUILD_TESTS OR BUILD_LAYERS) + set(SPIRV_TOOLS_INCLUDE_DIR "${spirv-tools_SOURCE_DIR}/include" CACHE PATH "Path to spirv tools headers") + endif() + ++ if (NOT TARGET SPIRV-Cross) ++ if (SPIRV_CROSS_INSTALL_DIR) ++ message(STATUS "Using spirv-cross install located at ${SPIRV_CROSS_INSTALL_DIR}") ++ endif() ++ set(SPIRV_CROSS_INCLUDE_DIR "${SPIRV_CROSS_INSTALL_DIR}/include" CACHE PATH "Path to spirv cross headers") ++ set(SPIRV_CROSS_SEARCH_PATH ${SPIRV_CROSS_INSTALL_DIR}/lib) ++ find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsl HINTS ${SPIRV_CROSS_SEARCH_PATH}) ++ find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-core HINTS ${SPIRV_CROSS_SEARCH_PATH}) ++ if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB) ++ find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsld HINTS ${SPIRV_CROSS_SEARCH_PATH}) ++ find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-cored HINTS ${SPIRV_CROSS_SEARCH_PATH}) ++ if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB) ++ message(FATAL_ERROR "Could not find spirv-cross libs!") ++ else() ++ message("WARNING: using debug config of SPIRV-Cross libs. 
Use <--config release> option of update_deps.py to fix.") ++ endif() ++ endif() ++ set(SPIRV_CROSS_LIBRARIES ${SPIRV_CROSS_GLSL_LIB} ${SPIRV_CROSS_CORE_LIB}) ++ endif() ++ + set(GLSLANG_LIBRARIES ${GLSLANG_LIBRARIES} ${SPIRV_TOOLS_LIBRARIES}) + endif() + +diff --git a/build-android/jni/Android.mk b/build-android/jni/Android.mk +index f0955dd5..75c49def 100644 +--- a/build-android/jni/Android.mk ++++ b/build-android/jni/Android.mk +@@ -45,6 +45,13 @@ LOCAL_SRC_FILES += $(SRC_DIR)/layers/generated/spirv_validation_helper.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_validation.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_utils.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/debug_printf.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_shader_trace.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_trace_ray_trace.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_execution_trace.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_simt_efficiency.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_divergence_characterization.cpp ++LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_warp_entry_and_exit.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/best_practices_utils.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/generated/best_practices.cpp + LOCAL_SRC_FILES += $(SRC_DIR)/layers/synchronization_validation.cpp +diff --git a/build-android/known_good.json b/build-android/known_good.json +index 1a77e5ae..d8635c37 100755 +--- a/build-android/known_good.json ++++ b/build-android/known_good.json +@@ -1,5 +1,5 @@ + { +- "repos" : [ ++ "repos": [ + { + "name" : "shaderc", + "url" : "https://github.com/google/shaderc.git", +diff --git a/docs/auto_instrument.md b/docs/auto_instrument.md +new file mode 100644 +index 00000000..30d376ce +--- /dev/null ++++ b/docs/auto_instrument.md +@@ -0,0 +1,213 @@ ++ ++ ++[![Khronos Vulkan][1]][2] ++ ++[1]: https://vulkan.lunarg.com/img/Vulkan_100px_Dec16.png 
"https://www.khronos.org/vulkan/" ++[2]: https://www.khronos.org/vulkan/ ++ ++# Auto-Instrument ++ ++[![Creative Commons][3]][4] ++ ++[3]: https://i.creativecommons.org/l/by-nd/4.0/88x31.png "Creative Commons License" ++[4]: https://creativecommons.org/licenses/by-nd/4.0/ ++ ++Auto-Instrument is implemented in the SPIR-V Tools optimizer and the `VK_LAYER_KHRONOS_validation` layer. ++It allows provides boilerplate for developers to implement custom instrumentation and analyses. ++This document covers the operation of the layer portion of the implementation and subsequently the specific sublayers that perform auto-instrumentation. ++ ++## Limitations ++ ++Auto-Instrument shares the same limitation as Debug Printf and GPU-assisted validation, an additional bound descriptor set. Currently, Auto-Instrument only allows 1 class of pipeline to be instrumented at once (i.e. 1 of graphics, compute, ray-tracing). ++ ++## Basic Operation ++ ++The basic operation of Auto-Instrument is to offer the following hooks for subclasses to perform a custom analysis: ++* **InitializeDeviceLayerSettings** provides the opportunity to set layer settings when the Vulkan logical device is created. This is useful to check for active extensions or to check `vk_layer_settings.txt` for sublayer specific settings. ++* **InitializeInstrumentationBuffer** provides the opportunity for subclasses to change the default values in the instrumentation buffer. This is useful for communicating with the instrumentation code. For example, the instrumentation could support a sampling based approach where not all frames collect results. The instrumentation could contain a check for a specific location in the instrumentation buffer and this function would allow the subclass to populate that location. ++* **RegisterPasses** allows the specific subclass to choose which auto-instrumentation pass from SPIRV-Opt to use. 
++* **AnalyzeRayTracing** is provided with the data collection from a ray-tracing pipeline invocation as well as the width, height, and depth of the invocation. ++* **AnalyzeGraphics** receives the data from a graphics pipeline invocation. ++* **AnalyzeCompute** receives the data from a compute pipeline invocation and the x, y, and z of the invocation. ++ ++By overriding these functions, a subclass is able to implement custom analyses of instrumentation data. ++ ++## Enabling Auto-Instrument in Vulkan-ValidationLayers ++ ++Auto-Instrument is an object in the KHRONOS_validation layer, so the VK_LAYER_KHRONOS_validation layer must be loaded. ++See the LAYER_CONFIGURATION document for information on enabling the VK_LAYER_KHRONOS_validation layer. ++Validation itself is not necessary for Auto-Instrument and can be disabled without affecting Auto-Instrument functionality. ++ ++Auto-Instrument can be enabled through *vk_layer_settings.txt* file that must be in the program's working directory. ++Within the settings file, specify: ++khronos_validation.enables = `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST__EXT` where `` is one of the auto_inst subclasses. ++ ++Auto-Instrument has been implemented as a state tracker validation object, as a peer to GPU Assisted Validation and Debug Printf. ++Because of this, and coupled with the philosophy that validation objects will not communicate with each other, one should never enable any pair of Auto-Instrument, GPU Assisted Validation and Debug Printf at the same time. ++Auto-Instrument will be disabled if GPU Assisted Validation or Debug Printf is enabled. ++ ++When using Auto-Instrument, it is recommended to disable validation, as the debug level of INFO or DEBUG causes the validation layers to produce many messages unrelated to Auto-Instrument, making it difficult to find the desired output. 
++ ++### Auto-Instrument Requirements ++ ++* Validation Layers version: 1.2.135.0 ++* Vulkan API version 1.1 or greater ++* VkPhysicalDevice features: fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics ++ ++### Auto-Instrument Settings ++ ++* `khronos_validation.auto_inst_buffer_size` = `` ++ ++This setting allows you to specify the size of the per-call buffer, in bytes of device memory, for returning instrumentation data. ++The default is 1024 bytes. If the buffer size is too small, Auto-Instrument will report the size that the buffer should be to collect all the instrumentation data. In subsequent executions, the history file will be used to size the instrumentation buffer. ++ ++* `khronos_validation.auto_inst_pipeline_to_instrument` = `Graphics` or `Compute` or `RayTracing` ++ ++This setting controls the pipeline type that is instrumented. For example, if `Compute` is chosen then compute shaders are instrumented and instrumentation buffers are created for all VkComputePiplines. ++ ++* `khronos_validation.auto_inst_to_stdout` = 'false' or 'true' ++ ++By default, Auto-Instrument messages are sent to the stdout, but this setting will instead send Auto-Instrument to the debug callback. ++ ++* `khronos_validation.auto_inst_base_file_name` = `` ++ ++Auto-Instrument analysis file names can optionally have a base file name prepended. By default there is no common prefix. ++ ++* `khronos_validation.auto_inst_create_reference_heatmap` = `false` or `true` ++ ++Many of the analysis emit a heatmap. This option allows a reference scale **ReferenceScale\[.bmp\.ppm\]** to be generated where the leftmost pixels correspond to 0.0 and rightmost to 1.0. The pixels in between are interpolated between 0.0 and 1.0. ++ ++* `khronos_validation.auto_inst_debug_mode` = `atomics` or `subgroup` or `arraylength` ++ ++This is a debug setting designed to help isolate any issues that may be occurring with auto-instrumentation. 
When present, this setting switches the operation of Auto-Instrument to disregard the current instrumentation mode. Instead, it runs an extremely simple instrumentation pass and analysis to check whether the atomic, subgroup, or arraylength instructions work correctly in isolation. ++ ++* `khronos_validation.auto_inst_dump_shaders` = `false` or `true` ++ ++When set to true, this setting instructs Auto-Instrument to dump the instrumented shader modules. ++ ++* `khronos_validation.auto_inst_shaders_to_instrument` = `stageM, stageN, ...` E.g. `Miss1, ClosestHit2` ++ ++By default, all shaders that correspond to the `khronos_validation.auto_inst_pipeline_to_instrument` setting are instrumented. When this setting is activated, only the shaders that match the specified stage and index are instrumented. For example, if `Miss1, ClosestHit2` is passed, then the 1st Miss shader that pass via `VkCreateShaderModule` will be instrumented, and similarly the 2nd Closest Hit shader. ++ ++### Auto-Instrument Resources ++ ++Analogous to GPU Assisted Validation and Debug Printf, Auto-Instrument uses device memory and a descriptor set to allow the shader instrumentation code to return values to the layer. ++See the gpu_validation document for more information ++ ++Auto-Instrument also generates a file containing the runtime instrumentation buffer utilization of previous executions of the application. In subsequent executions, the instrumentation buffers are sized according to this history. This allows many more calls to be instrumented in cases where some calls generate significantly more data. ++ ++Auto-Instrument analyses generate output files of the form `_frame_`. For example `rt0_frame0_simt_efficiency.csv` is the SIMT Efficiency measurement for the 1st ray-tracing call in the 1st frame of the application. ++ ++### Auto-Instrument Subclasses ++ ++This section outlines the classes the implement the Auto-Instrument interface to create detailed execution trace profiling. 
++
++### Limitation
++
++Many of the analyses require tracking warp execution over time. Obtaining knowledge of which threads belong to a warp requires digging past the abstraction level of SPIR-V and a custom solution for graphics, compute and ray-tracing pipelines. Currently this is only implemented for ray-tracing pipelines so many of the analyses are limited for graphics and compute.
++
++Some applications invoke the RayTracing pipeline with a z-dimension size of >1 which complicates the creation of heatmaps. In future more modes for transforming these higher dimensional calls into heatmaps will be added and exposed as options.
++
++## Divergence Characterization
++
++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT` and is currently supported only for Ray Tracing.
++
++### Analysis
++
++This analysis breaks down the effect of indirect-function calls, early thread-exits and control-flow on divergence in terms of number of instructions affected:
++* **Indirect-function call** divergence is caused by thread-varying values for the address passed to an indirect function call. This occurs frequently in ray-tracing when the threads in a warp hit multiple objects. This metric is currently only supported for ray-tracing.
++* **Early thread-exits** divergence occurs when some threads in a warp complete the pipeline while other threads still have work to perform. This also occurs frequently in ray-tracing when some rays miss geometry and others bounce many times requiring many traversals and intersections.
++* **Control-flow** divergence is caused by, for example, thread-varying values for if statements or thread-varying loop trip counts.
++
++### Output
++
++This analysis outputs a file **divergence_characterization.csv** which contains the respective counts of inactive instruction execution slots caused by the different divergence causes for each pipeline invocation.
++
++## Dynamic Shader Trace
++
++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT` and is currently supported only for Ray Tracing.
++
++### Analysis
++
++This analysis reconstructs the number of dynamic shader executions for each pipeline invocation. These values are visualized with thread and warp heatmaps.
++
++### Output
++
++* **dyn_shader_counts.csv** is generated per pipeline invocation and contains the shader execution counts observed at runtime.
++* **shader_execution_heatmap\[.bmp\.ppm\]** is generated per pipeline invocation and contains a heatmap created for each pipeline invocation visualizing each pixel's dynamic shader execution count normalized to the maximum shader execution count.
++* **subgroup_shader_execution_heatmap\[.bmp\.ppm\]** is generated per pipeline invocation and contains a heatmap created for each pipeline invocation visualizing each subgroup's dynamic shader execution count normalized to the maximum shader execution count for a single subgroup.
++
++## Dynamic TraceRay Trace
++
++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT` and is currently supported only for Ray Tracing.
++
++### Analysis
++
++This analysis simulates the effect of thread compaction on the execution of a ray-tracing pipeline. Given a particular runtime traceRay invocation, the active threads are repacked into warps and then the number of warp executions required to perform the new traceRay calls is calculated. The analysis further simulates only repacking consecutive `2^k` warps to simulate different hardware buffer sizes.
++
++### Output
++
++* **thread_paths.csv** is generated per pipeline and contains bitmasks representing whether a given thread was active for a runtime invocation of a TraceRay call. For example, if a TraceRay call is contained in an if statement, some threads would have 0 to indicate they skipped the call. 
For each unique bitmask, the total count of threads that took the same path is totalled. ++* **thread_compaction.csv** is generated per pipeline and contains the results of performing thread compaction. The data is output as follows: ++ ``` ++ for each TraceRay callsite: ++ for each compaction window size: ++ for each runtime execution: ++ output active threads/threads required after compaction ++ ``` ++ ++ For example: ++ ``` ++ 354| ++ 1 0 896173/911296 ++ ``` ++ For callsite with id=354, compaction window size=1, visit count = 0, the number of active threads is 896173 and the number of required threads after compaction is 911296. ++ ++## Execution Trace ++ ++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT` ++ ++### Analysis ++ ++This analysis determines the pipeline hotspots, and dynamic SPIR-V instruction execution count. This is performed by tracking each runtime execution of a basicblock along with the number of active threads. ++ ++### Output ++ ++* **dyn_opcode_counts.csv** is generated per pipeline invocation and contains the respective runtime execution counts for each SPIR-V opcode. ++* **shader_stage_dyn_executions.glsl** is generated per pipeline invocation for each shader in the pipeline. These files present the dynamic instruction execution count of each basic block as inline comments as follows: ++ ``` ++ if (gl_LaunchIDNV.z != 0u) ++ { ++ /*thread_executions=460800. SIMT Efficiency=1.000*/ ++ _1509(2416u, subgroupBallot(true).x); ++ ipos.x += (_265.global_ubo.width / 2); ++ } ++ /*thread_executions=921600. SIMT Efficiency=1.000*/ ++ _1509(2426u, subgroupBallot(true).x); ++ ``` ++ The function `_1509` is the instrumentation inserted to capture the execution trace. The comment that immediately proceeds it correpsonds to the data collected from that instrumentation callsite. The first argument to `_1509` (in this example, `2416 and 2426`) is the unique id of the basic block. 
++* **hotspots.csv** is generated per pipeline invocation and contains the dynamic execution count of each instruction id in the pipeline. The first argument passed to the instrumentation in the annotated shaders is the instruction id. This can be searched for in the hotspots file or vice-versa. ++ ++## SIMT Efficiency ++ ++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT`. ++ ++### Analysis ++ ++This analysis computes the SIMT efficiency which is a measure of the utilization of a SIMD architecture. In this case it is calculated as the average fraction of active threads for each basic block execution. ++ ++### Output ++ ++* **simt_efficiency.csv** is generated per frame with the respective SIMT efficiencies of each pipeline invocation. ++ ++## Warp Entry and Exit ++ ++This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT`. ++ ++### Analysis ++ ++This analysis counts the number of times the entry and exit of the ray-tracing pipeline is executed. This is designed to measure the effect of independent thread scheduling(ITS) on NVIDIA's Turing and Ampere architectures. This instrumentation measures to what degree the warp has diverged due to ITS. ++ ++### Output ++ ++* **warp_exits_vs_entires.csv** is generated per frame with the respective exits count/entry count for each pipeline invocation. 
+diff --git a/layers/CMakeLists.txt b/layers/CMakeLists.txt +index 851b98d5..93758bab 100644 +--- a/layers/CMakeLists.txt ++++ b/layers/CMakeLists.txt +@@ -234,9 +234,27 @@ set(GPU_UTILITY_LIBRARY_FILES + gpu_utils.cpp + gpu_utils.h) + ++set(AUTO_INST_LIBRARY_FILES ++ auto_inst.cpp ++ auto_inst_divergence_characterization.cpp ++ auto_inst_dyn_shader_trace.cpp ++ auto_inst_dyn_trace_ray_trace.cpp ++ auto_inst_execution_trace.cpp ++ auto_inst_simt_efficiency.cpp ++ auto_inst_warp_entry_and_exit.cpp ++ auto_inst.h ++ auto_inst_divergence_characterization.h ++ auto_inst_dyn_shader_trace.h ++ auto_inst_dyn_trace_ray_trace.h ++ auto_inst_execution_trace.h ++ auto_inst_simt_efficiency.h ++ auto_inst_warp_entry_and_exit.h ++ ) ++ + set(SYNC_VALIDATION_LIBRARY_FILES + synchronization_validation.cpp +- synchronization_validation.h) ++ synchronization_validation.h ++ ) + + # Validation Layer performance instrumentation support using Optick. + # https://optick.dev/ https://github.com/bombomby/optick +@@ -268,6 +286,7 @@ if(INSTRUMENT_OPTICK) + endif() + endif() + ++ + if(BUILD_LAYERS) + AddVkLayer(khronos_validation "${KHRONOS_LAYER_COMPILE_DEFINITIONS}" + ${CHASSIS_LIBRARY_FILES} +@@ -279,6 +298,7 @@ if(BUILD_LAYERS) + ${GPU_UTILITY_LIBRARY_FILES} + ${GPU_ASSISTED_LIBRARY_FILES} + ${DEBUG_PRINTF_LIBRARY_FILES} ++ ${AUTO_INST_LIBRARY_FILES} + ${SYNC_VALIDATION_LIBRARY_FILES} + ${OPTICK_SOURCE_FILES}) + +@@ -290,14 +310,17 @@ if(BUILD_LAYERS) + target_link_libraries(VkLayer_khronos_validation PRIVATE "$<$:-DEBUG:FULL>") + endif() + ++ + # Khronos validation additional dependencies + target_include_directories(VkLayer_khronos_validation PRIVATE ${GLSLANG_SPIRV_INCLUDE_DIR}) + target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_INCLUDE_DIR}) ++ target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_INCLUDE_DIR}) + target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_HEADERS_INCLUDE_DIR}) + if(INSTRUMENT_OPTICK) + 
target_include_directories(VkLayer_khronos_validation PRIVATE ${OPTICK_SOURCE_DIR}) + endif() + target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_LIBRARIES}) ++ target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_LIBRARIES}) + + # The output file needs Unix "/" separators or Windows "\" separators On top of that, Windows separators actually need to be doubled + # because the json format uses backslash escapes +diff --git a/layers/auto_inst.cpp b/layers/auto_inst.cpp +new file mode 100644 +index 00000000..8f3669d3 +--- /dev/null ++++ b/layers/auto_inst.cpp +@@ -0,0 +1,1205 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst.h" ++#include "spirv-tools/optimizer.hpp" ++#include "spirv-tools/instrument.hpp" ++#if !defined(__ANDROID__) ++#include "spirv_cross/spirv_glsl.hpp" ++#endif ++#include ++#include ++#include "layer_chassis_dispatch.h" ++#include ++#include ++#include ++ ++static const VkShaderStageFlags kShaderStageAllRayTracing = ++ VK_SHADER_STAGE_ANY_HIT_BIT_NV | VK_SHADER_STAGE_CALLABLE_BIT_NV | VK_SHADER_STAGE_CLOSEST_HIT_BIT_NV | ++ VK_SHADER_STAGE_INTERSECTION_BIT_NV | VK_SHADER_STAGE_MISS_BIT_NV | VK_SHADER_STAGE_RAYGEN_BIT_NV; ++ ++static const VkShaderStageFlags kShaderStageAllGraphics = ++ VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | ++ VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT | VK_SHADER_STAGE_GEOMETRY_BIT; ++ ++static const VkShaderStageFlags kShaderStageAllCompute = VK_SHADER_STAGE_COMPUTE_BIT; ++ ++// String literal was determined by VkShaderStageFlags spelling. I.e. VK_SHADER_STAGE_RAYGEN_BIT_KHR => RayGen ++static const std::unordered_map ShaderStageFlagLookup = { ++ {"RayGen", VK_SHADER_STAGE_RAYGEN_BIT_KHR}, ++ {"ClosestHit", VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR}, ++ {"Callable", VK_SHADER_STAGE_CALLABLE_BIT_KHR}, ++ {"Miss", VK_SHADER_STAGE_MISS_BIT_KHR}, ++ {"AnyHit", VK_SHADER_STAGE_ANY_HIT_BIT_KHR}, ++ {"Intersection", VK_SHADER_STAGE_INTERSECTION_BIT_KHR}, ++ {"Geometry", VK_SHADER_STAGE_GEOMETRY_BIT}, ++ {"Fragment", VK_SHADER_STAGE_FRAGMENT_BIT}, ++ {"Compute", VK_SHADER_STAGE_COMPUTE_BIT}, ++ {"TessellationControl", VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT}, ++ {"TessellationEvaluation", VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT}, ++ {"Vertex", VK_SHADER_STAGE_VERTEX_BIT}}; ++ ++// Convenience function for reporting problems. 
++template ++void AutoInst::ReportSetupProblem(T object, std::string specific_message) const { ++ if (use_stdout) ++ std::cerr << specific_message; ++ else ++ LogError(object, "UNASSIGNED-AUTO-INST ", "Detail: (%s)", specific_message.c_str()); ++} ++ ++template ++void AutoInst::ReportInfo(T object, std::string specific_message) const { ++ if (use_stdout) ++ std::cout << specific_message; ++ else ++ LogInfo(object, "UNASSIGNED-AUTO-inst", "%s", specific_message.c_str()); ++} ++ ++void AutoInst::CreateImage(uint32_t width, uint32_t height, std::vector &colors, std::string file_name) const { ++#if defined(_WIN32) ++ std::ofstream ofs; ++ ofs.open(file_name + ".bmp", std::ios_base::binary); ++ ++ ReportInfo(device, "Creating BMP with dim=" + std::to_string(width) + "x" + std::to_string(height) + " from " + ++ std::to_string(colors.size()) + "\n"); ++ ++ const int BYTES_PER_PIXEL = 3; ++ ++ BITMAPFILEHEADER tWBFH; ++ tWBFH.bfType = 0x4d42; ++ tWBFH.bfSize = 14 + 40 + (width * height * BYTES_PER_PIXEL); ++ tWBFH.bfReserved1 = 0; ++ tWBFH.bfReserved2 = 0; ++ tWBFH.bfOffBits = 14 + 40; ++ ++ BITMAPINFOHEADER tW2BH; ++ tW2BH.biSize = 40; ++ tW2BH.biWidth = width; ++ tW2BH.biHeight = height; ++ tW2BH.biPlanes = 1; ++ tW2BH.biBitCount = BYTES_PER_PIXEL * 8; ++ tW2BH.biCompression = 0; ++ ++ ofs.write((char *)(&tWBFH), 14); ++ ofs.write((char *)(&tW2BH), 40); ++ ++ for (int y = height - 1; y >= 0; y--) { ++ uint32_t x = 0; ++ for (x = 0; x < width; x++) { ++ auto thread_id = BYTES_PER_PIXEL * (y * width + x); ++ ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2]; ++ } ++ while (x % 4 != 0) { ++ ofs << (char)0; ++ x++; ++ } ++ } ++ ++ ofs.close(); ++#else ++ ++ const int BYTES_PER_PIXEL = 3; ++ std::ofstream ofs(file_name + ".ppm", std::ios_base::out | std::ios_base::binary); ++ ofs << "P6" << std::endl << width << ' ' << height << std::endl << "255" << std::endl; ++ ++ for (uint32_t j = 0; j < height; j++) { ++ for (auto i = 0u; i < width; i++) { ++ 
auto thread_id = BYTES_PER_PIXEL * (j * width + i); ++ ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2]; ++ } ++ } ++ ++ ofs.close(); ++#endif ++} ++ ++std::tuple AutoInst::UnitIntervalToRGB(float val) const { ++ if (val < 0 || val > 1) { ++ ReportSetupProblem(device, "Cannot convert a value outside of interval [0,1] to heatmap colour!"); ++ } ++ ++ float red, green, blue; ++ ++ if (val < 0.2) ++ red = 1.0f - 5.0f * val; ++ else if (val >= 0.2 && val < 0.6) ++ red = 0; ++ else if (val >= 0.6 && val < 0.8) ++ red = 5.0f * (val - 0.6f); ++ else ++ red = 1.0f; ++ ++ if (val < 0.4) ++ green = 1; ++ else if (val >= 0.4 && val < 0.6) ++ green = 5.0f * (0.6f - val); ++ else ++ green = 0; ++ ++ if (val < 0.2) ++ blue = 0; ++ else if (val >= 0.2 && val < 0.4) ++ blue = 5.0f * (val - 0.2f); ++ else if (val >= 0.4 && val < 0.8) ++ blue = 1; ++ else ++ blue = 5.0f * (1.0f - val); ++ ++ return std::make_tuple((char)(red * 255), (char)(green * 255), (char)(blue * 255)); ++} ++ ++bool AutoInst::CreateUniqueSubgroupIdMappings(uint32_t *const debug_output_buffer, ++ PrimitiveIdToPrimitiveSizeMap &primitive_id2primitive_size, ++ ThreadIdToSubgroupIdMap &thread_id2subgroup_id_map, ThreadIdSwizzleMap &thread_id_swizzle_map, ++ std::function inst_id2prim_id) const { ++ // Sanity check for unique subgroup primitive size ++ if (primitive_id2primitive_size.count(spvtools::kAutoInstUniqueSubgroupId) == 0) return false; ++ ++ uint32_t j = 0; ++ uint32_t num_words_written = debug_output_buffer[WORDS_WRITTEN_INDEX]; ++ while (j < num_words_written) { ++ auto inst_id = debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]; ++ auto prim_id = inst_id2prim_id(inst_id); ++ if (primitive_id2primitive_size.count(prim_id) == 0) { ++ ReportSetupProblem(device, ++ "Unknown prim_id=" + std::to_string(prim_id) + " encountered in CreateUniqueSubgroupIdMappings\n."); ++ return false; ++ } ++ ++ if (prim_id == spvtools::kAutoInstUniqueSubgroupId) { ++ auto unique_id_record = 
reinterpret_cast(&debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]); ++ auto subgroup_id = unique_id_record->SubgroupId(); ++ auto flat_thread_id = unique_id_record->flat_thread_id; ++ thread_id2subgroup_id_map[flat_thread_id] = subgroup_id; ++ thread_id_swizzle_map[subgroup_id * SUBGROUP_SIZE + unique_id_record->IntraSubgroupId()] = flat_thread_id; ++ } ++ j += primitive_id2primitive_size[prim_id]; ++ } ++ ++ return true; ++} ++ ++void AutoInst::TryReadRuntimeSizeCache(AutoInst *device_auto_inst) { ++ if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) { ++ ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. Aborting\n"); ++ aborted = true; ++ } ++ ++ std::ifstream cache_file; ++ cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary); ++ if (!cache_file) { ++ ReportInfo(device, "Runtime instrumentation buffer requirements cache not found. Defaulting to output_buffer_size.\n"); ++ return; ++ } ++ ++ size_t num_pipeline_invocations = 0; ++ cache_file.read((char *)&num_pipeline_invocations, sizeof(size_t)); ++ while (!cache_file.eof() && device_auto_inst->BufferSizeRequirementsLookup.size() < num_pipeline_invocations) { ++ uint32_t buffer_size = 0; ++ cache_file.read((char *)&buffer_size, sizeof(uint32_t)); ++ device_auto_inst->BufferSizeRequirementsLookup.push_back(buffer_size); ++ } ++ ++ if (num_pipeline_invocations != device_auto_inst->BufferSizeRequirementsLookup.size()) { ++ ReportSetupProblem(device, "Warning incomplete cache file detected.\n"); ++ } ++ ++ cache_file.close(); ++} ++ ++void AutoInst::WriteRuntimeSizeCache() { ++ if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) { ++ ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. 
Aborting\n"); ++ aborted = true; ++ } ++ ++ std::ofstream cache_file; ++ cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary); ++ auto pipelines_observed = BufferSizeRequirementsLookup.size(); ++ cache_file.write((char *)&pipelines_observed, sizeof(size_t)); ++ for (auto size : BufferSizeRequirementsLookup) { ++ cache_file.write((char *)&size, 4); ++ } ++ cache_file.close(); ++} ++ ++uint32_t AutoInst::FindShaderStage(std::vector pgm) const { ++ uint32_t stage_flag = 0; ++ SHADER_MODULE_STATE shader; ++ shader.words = pgm; ++ if (shader.words.size() > 0) { ++ for (auto insn : shader) { ++ if (insn.opcode() == spv::OpEntryPoint) { ++ if (stage_flag != 0) { ++ // This means there are multiple entrypoints which is not ++ // supported by the downstream SPIRV-Opt instrumentation passes. ++ ReportSetupProblem( ++ device, "Multiple EntryPoints in single shader module encountered! Module will not be instrumented!\n"); ++ return 0; ++ } ++ uint32_t offset = insn.offset(); ++ spv::ExecutionModel ex_model = (spv::ExecutionModel)pgm[offset + 1]; ++ switch (ex_model) { ++ case spv::ExecutionModel::ExecutionModelAnyHitKHR: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_ANY_HIT_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelCallableKHR: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CALLABLE_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelClosestHitKHR: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelFragment: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_FRAGMENT_BIT; ++ break; ++ case spv::ExecutionModel::ExecutionModelGeometry: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_GEOMETRY_BIT; ++ break; ++ case spv::ExecutionModel::ExecutionModelGLCompute: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_COMPUTE_BIT; ++ break; ++ case spv::ExecutionModel::ExecutionModelIntersectionKHR: ++ stage_flag = 
VkShaderStageFlagBits::VK_SHADER_STAGE_INTERSECTION_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelMissKHR: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_MISS_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelRayGenerationKHR: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_RAYGEN_BIT_KHR; ++ break; ++ case spv::ExecutionModel::ExecutionModelTessellationControl: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; ++ break; ++ case spv::ExecutionModel::ExecutionModelTessellationEvaluation: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; ++ break; ++ case spv::ExecutionModel::ExecutionModelVertex: ++ stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_VERTEX_BIT; ++ break; ++ default: ++ ReportSetupProblem(device, "Unsupported Shader Stage encountered! Shader will not be instrumented!\n"); ++ return 0; ++ } ++ } ++ } ++ } ++ ++ return stage_flag; ++} ++ ++std::tuple AutoInst::FindComputeLocalSize(std::vector pgm) const { ++ SHADER_MODULE_STATE shader; ++ shader.words = pgm; ++ ++ if (shader.words.size() > 0) { ++ for (auto insn : shader) { ++ if (insn.opcode() == spv::OpExecutionMode) { ++ uint32_t offset = insn.offset(); ++ if ((spv::ExecutionMode)pgm[offset + 2] != spv::ExecutionModeLocalSize) { ++ ReportSetupProblem(device, "Unable to determine compute LocalSize!\n"); ++ return std::make_tuple(0, 0, 0); ++ } ++ return std::make_tuple(pgm[offset + 3], pgm[offset + 4], pgm[offset + 5]); ++ } ++ } ++ } ++ ReportSetupProblem(device, "Unable to find OpExecutionMode.\n"); ++ return std::make_tuple(0, 0, 0); ++} ++ ++std::string AutoInst::AnnotateModuleStr(std::string &shader, std::unordered_map &inst_id2str) const { ++ std::regex pattern("%[0-9]+? = OpFunctionCall %void %[0-9]+? 
%uint_(.*) *"); ++ ++ std::istringstream ss{shader}; ++ std::ostringstream out; ++ int i = 0; ++ ++ for (std::string line; std::getline(ss, line, '\n');) { ++ std::smatch sm; ++ std::regex_search(line, sm, pattern); ++ if (sm.size() > 0) { ++ uint32_t offset = (uint32_t)atoi(sm[1].str().c_str()); ++ if (inst_id2str.count(offset) > 0) { ++ auto result_id = "%str" + std::to_string(offset) + "_" + std::to_string(i); ++ out << result_id << " = OpString " ++ << "\"" << inst_id2str[offset] << "\"\n"; ++ out << "OpLine " << result_id << " " ++ << "0 0 " ++ << "\n"; ++ i++; ++ } ++ } else if (line.find("OpLine") != std::string::npos) { ++ // Clear any other OpLines to avoid interference ++ continue; ++ } ++ ++ out << line << "\n"; ++ } ++ return out.str(); ++} ++ ++void AutoInst::TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const { ++#if defined(__ANDROID__) ++ ReportInfo(device, "SPIRV-Cross not included on Android. Shader " + file_name + " generated without cross compiling.\n"); ++ std::ofstream file; ++ file.open(file_name + ".spv"); ++ file << shader; ++ file.close(); ++#else ++ try { ++ using namespace spvtools; ++ std::ofstream temp; ++ SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2); ++ std::vector binary; ++ (void)spirvTools.Assemble(shader, &binary, SPV_TEXT_TO_BINARY_OPTION_NONE); ++ using namespace spirv_cross; ++ CompilerGLSL compiler(binary); ++ auto options = compiler.get_common_options(); ++ options.emit_line_directives = true; ++ options.vulkan_semantics = true; ++ compiler.set_common_options(options); ++ std::string glsl; ++ glsl = compiler.compile(); ++ if (glsl.size() == 0) { ++ ReportSetupProblem(device, "Spirv-cross failed. 
Shader " + file_name + " generated without cross compiling.\n"); ++ std::ofstream file; ++ file.open(file_name + ".glsl"); ++ file << shader; ++ file.close(); ++ } else { ++ // post process line annotations into comments ++ std::regex re("#line [0-9]* \"([^\"]*)\""); ++ std::ofstream file; ++ file.open(file_name + ".glsl"); ++ file << std::regex_replace(glsl, re, "/*$1*/"); ++ file.close(); ++ } ++ } catch (...) { ++ ReportSetupProblem(device, "Spirv-cross crashed. Shader " + file_name + " generated without cross compiling.\n"); ++ std::ofstream file; ++ file.open(file_name + ".glsl"); ++ file << shader; ++ file.close(); ++ } ++#endif ++} ++ ++// Turn on necessary device features. ++void AutoInst::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *create_info, ++ const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_create_info) { ++ DispatchGetPhysicalDeviceFeatures(gpu, &supported_features); ++ VkPhysicalDeviceFeatures features = {}; ++ features.vertexPipelineStoresAndAtomics = true; ++ features.fragmentStoresAndAtomics = true; ++ UtilPreCallRecordCreateDevice(gpu, reinterpret_cast(modified_create_info), supported_features, ++ features); ++} ++ ++// Perform initializations that can be done at Create Device time. 
++void AutoInst::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, VkResult result) { ++ ValidationStateTracker::PostCallRecordCreateDevice(physicalDevice, pCreateInfo, pAllocator, pDevice, result); ++ ++ ValidationObject *device_object = GetLayerDataPtr(get_dispatch_key(*pDevice), layer_data_map); ++ ValidationObject *validation_data = GetValidationObject(device_object->object_dispatch, this->container_type); ++ AutoInst *device_auto_inst = static_cast(validation_data); ++ device_auto_inst->physicalDevice = physicalDevice; ++ device_auto_inst->device = *pDevice; ++ ++ if (device_auto_inst->phys_dev_props.apiVersion < VK_API_VERSION_1_1) { ++ ReportSetupProblem(device, "Auto Inst requires Vulkan 1.1 or later. Auto Inst disabled."); ++ device_auto_inst->aborted = true; ++ return; ++ } ++ ++ if (!supported_features.fragmentStoresAndAtomics || !supported_features.vertexPipelineStoresAndAtomics) { ++ ReportSetupProblem(device, ++ "Auto Inst requires fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics. " ++ "Auto Inst disabled."); ++ device_auto_inst->aborted = true; ++ return; ++ } ++ ++ if (enabled[gpu_validation] || enabled[debug_printf]) { ++ ReportSetupProblem(device, ++ "Auto inst cannot be enabled when gpu assisted validation or debug printf are enabled. " ++ "Auto inst disabled."); ++ device_auto_inst->aborted = true; ++ return; ++ } ++ ++ const char *size_string = getLayerOption("khronos_validation.auto_inst_buffer_size"); ++ device_auto_inst->output_buffer_size = *size_string ? atoi(size_string) : 1024; ++ if (device_auto_inst->output_buffer_size <= 16) { ++ ReportSetupProblem(device, "The instrumentation buffer size must be at least 16 bytes"); ++ device_auto_inst->aborted = true; ++ } ++ const char *stdout_string = getLayerOption("khronos_validation.auto_inst_to_stdout"); ++ device_auto_inst->use_stdout = *stdout_string ? 
!strcmp(stdout_string, "false") : true; ++ use_stdout = device_auto_inst->use_stdout; ++ ++ const char *base_file_name = getLayerOption("khronos_validation.auto_inst_base_file_name"); ++ device_auto_inst->base_file_name = *base_file_name ? base_file_name : ""; ++ ++ const char *pti = getLayerOption("khronos_validation.auto_inst_pipeline_to_instrument"); ++ if (!strcmp(pti, "") || !strcmp(pti, "RayTracing")) { ++ device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_RAY_TRACING_NV; ++ if (!device_extensions.vk_nv_ray_tracing && !device_extensions.vk_khr_ray_tracing_pipeline) { ++ ReportSetupProblem(device, "Cannot instrument ray-tracing pipeline since ray-tracing is not enabled.\n"); ++ device_auto_inst->aborted = true; ++ return; ++ } ++ ++ ReportInfo(device, "Instrumenting Ray-Tracing Pipeline!\n"); ++ } else if (!strcmp(pti, "Graphics")) { ++ device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_GRAPHICS; ++ ReportInfo(device, "Instrumenting Graphics Pipeline!\n"); ++ } else if (!strcmp(pti, "Compute")) { ++ device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_COMPUTE; ++ ReportInfo(device, "Instrumenting Compute Pipeline!\n"); ++ } ++ ++ pipeline_to_instrument = device_auto_inst->pipeline_to_instrument; ++ ++ const char *create_reference_heatmap = getLayerOption("khronos_validation.auto_inst_create_reference_heatmap"); ++ if (!strcmp(create_reference_heatmap, "true")) { ++ ReportInfo(device, "Creating reference heatmap!\n"); ++ std::vector scale_colors; ++ ++ const uint32_t scale_width = 256; ++ const uint32_t scale_height = 30; ++ ++ for (int height = 0; height < scale_height; height++) { ++ for (float i = 0; i < scale_width; i++) { ++ auto rgb = UnitIntervalToRGB(i / 255.0f); ++ ++ scale_colors.push_back(std::get<0>(rgb)); ++ scale_colors.push_back(std::get<1>(rgb)); ++ scale_colors.push_back(std::get<2>(rgb)); ++ } ++ } ++ ++ CreateImage(scale_width, scale_height, scale_colors, "ReferenceScale"); ++ } ++ ++ const char 
*debug_mode = getLayerOption("khronos_validation.auto_inst_debug_mode"); ++ if (!strcmp(debug_mode, "atomics")) { ++ device_auto_inst->is_debugging_atomic_ops = true; ++ } else if (!strcmp(debug_mode, "subgroup")) { ++ device_auto_inst->is_debugging_subgroup_ops = true; ++ } else if (!strcmp(debug_mode, "arraylength")) { ++ device_auto_inst->is_debugging_array_length_op = true; ++ } ++ ++ const char *dump_shaders = getLayerOption("khronos_validation.auto_inst_dump_shaders"); ++ device_auto_inst->dump_shaders = *dump_shaders ? !strcmp(dump_shaders, "true") : false; ++ ++ if (device_auto_inst->is_debugging_atomic_ops || device_auto_inst->is_debugging_atomic_ops) ++ ReportInfo(device, "Running Auto-Inst in debug mode, normal auto-instrumentation is disabled.\n"); ++ ++ const char *shaders_to_instrument = getLayerOption("khronos_validation.auto_inst_shaders_to_instrument"); ++ if (shaders_to_instrument) { ++ // Format of the option is stageN, stageM, stageL where stage is ++ // defined in ShaderStageFlagLookup and N,M,L are integer literals ++ std::string shader_list(shaders_to_instrument); ++ size_t pos = 0; ++ std::string token; ++ while (shader_list.length() != 0) { ++ while (shader_list[0] == ' ') shader_list.erase(0, 1); ++ pos = shader_list.find(','); ++ if (pos != std::string::npos) { ++ token = shader_list.substr(0, pos); ++ } else { ++ pos = shader_list.length() - 1; ++ token = shader_list; ++ } ++ ++ size_t i = 0; ++ while (token[i] < '0' || token[i] > '9') { ++ i++; ++ } ++ ++ std::string stage_str = token.substr(0, i); ++ auto shader_index = std::atoi(token.substr(i, pos).c_str()); ++ if (shader_index == 0) { ++ ReportSetupProblem(device, "Shader index must be greater than 0. 
Aborting\n."); ++ device_auto_inst->aborted = true; ++ } ++ if (ShaderStageFlagLookup.count(stage_str) > 0) { ++ uint32_t shader_stage = ShaderStageFlagLookup.find(stage_str)->second; ++ device_auto_inst->StageToInstIndices[shader_stage].insert(shader_index - 1); ++ } else { ++ // Assume that if user is specifying this setting, they care about it being correct rather than ++ // a more general default. ++ ReportSetupProblem(device, "Did not recognize stage " + stage_str + ". Aborting\n."); ++ device_auto_inst->aborted = true; ++ } ++ shader_list.erase(0, pos + 1); ++ } ++ } ++ ++ InitializeLayerDeviceSettings(device_auto_inst); ++ ++ TryReadRuntimeSizeCache(device_auto_inst); ++ ++ std::vector bindings; ++ VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, ++ VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT | kShaderStageAllRayTracing, ++ NULL}; ++ bindings.push_back(binding); ++ UtilPostCallRecordCreateDevice(pCreateInfo, bindings, device_auto_inst, device_auto_inst->phys_dev_props); ++} ++ ++void AutoInst::PreCallRecordDestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator) { ++ UtilPreCallRecordDestroyDevice(this); ++ ValidationStateTracker::PreCallRecordDestroyDevice(device, pAllocator); ++ // State Tracker can end up making vma calls through callbacks - don't destroy allocator until ST is done ++ if (vmaAllocator) { ++ vmaDestroyAllocator(vmaAllocator); ++ } ++ desc_set_manager.reset(); ++} ++ ++// Modify the pipeline layout to include our debug descriptor set and any needed padding with the dummy descriptor set. 
++void AutoInst::PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout, ++ void *cpl_state_data) { ++ if (aborted) { ++ return; ++ } ++ ++ create_pipeline_layout_api_state *cpl_state = reinterpret_cast(cpl_state_data); ++ ++ if (cpl_state->modified_create_info.setLayoutCount >= adjusted_max_desc_sets) { ++ std::ostringstream strm; ++ strm << "Pipeline Layout conflict with validation's descriptor set at slot " << desc_set_bind_index << ". " ++ << "Application has too many descriptor sets in the pipeline layout to continue with debug printf. " ++ << "Not modifying the pipeline layout. " ++ << "Instrumented shaders are replaced with non-instrumented shaders."; ++ ReportSetupProblem(device, strm.str().c_str()); ++ } else { ++ UtilPreCallRecordCreatePipelineLayout(cpl_state, this, pCreateInfo); ++ } ++} ++ ++void AutoInst::PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout, ++ VkResult result) { ++ ValidationStateTracker::PostCallRecordCreatePipelineLayout(device, pCreateInfo, pAllocator, pPipelineLayout, result); ++ if (result != VK_SUCCESS) { ++ ReportSetupProblem(device, "Unable to create pipeline layout. Device could become unstable."); ++ aborted = true; ++ } ++} ++ ++// Free the device memory and descriptor set associated with a command buffer. 
++void AutoInst::ResetCommandBuffer(VkCommandBuffer commandBuffer) { ++ if (aborted) { ++ return; ++ } ++ auto auto_inst_buffer_list = GetBufferInfo(commandBuffer); ++ for (auto buffer_info : auto_inst_buffer_list) { ++ vmaDestroyBuffer(vmaAllocator, buffer_info.output_mem_block.buffer, buffer_info.output_mem_block.allocation); ++ if (buffer_info.desc_set != VK_NULL_HANDLE) { ++ desc_set_manager->PutBackDescriptorSet(buffer_info.desc_pool, buffer_info.desc_set); ++ } ++ } ++ command_buffer_map.erase(commandBuffer); ++} ++ ++// Just gives a warning about a possible deadlock. ++bool AutoInst::PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents, ++ VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, ++ uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers, ++ uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier *pBufferMemoryBarriers, ++ uint32_t imageMemoryBarrierCount, ++ const VkImageMemoryBarrier *pImageMemoryBarriers) const { ++ if (srcStageMask & VK_PIPELINE_STAGE_HOST_BIT) { ++ ReportSetupProblem(commandBuffer, ++ "CmdWaitEvents recorded with VK_PIPELINE_STAGE_HOST_BIT set. " ++ "Auto inst waits on queue completion. 
" ++ "This wait could block the host's signaling of this event, resulting in deadlock."); ++ } ++ return false; ++} ++ ++void AutoInst::PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkGraphicsPipelineCreateInfo *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ void *cgpl_state_data) { ++ std::vector new_pipeline_create_infos; ++ create_graphics_pipeline_api_state *cgpl_state = reinterpret_cast(cgpl_state_data); ++ UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, cgpl_state->pipe_state, ++ &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_GRAPHICS, this); ++ cgpl_state->printf_create_infos = new_pipeline_create_infos; ++ cgpl_state->pCreateInfos = reinterpret_cast(cgpl_state->printf_create_infos.data()); ++} ++ ++void AutoInst::PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkComputePipelineCreateInfo *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ void *ccpl_state_data) { ++ std::vector new_pipeline_create_infos; ++ auto *ccpl_state = reinterpret_cast(ccpl_state_data); ++ UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, ccpl_state->pipe_state, ++ &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_COMPUTE, this); ++ ccpl_state->printf_create_infos = new_pipeline_create_infos; ++ ccpl_state->pCreateInfos = reinterpret_cast(ccpl_state->gpu_create_infos.data()); ++} ++ ++void AutoInst::PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoNV *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ void *crtpl_state_data) { ++ std::vector new_pipeline_create_infos; ++ auto *crtpl_state = reinterpret_cast(crtpl_state_data); ++ UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, 
pPipelines, crtpl_state->pipe_state, ++ &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this); ++ crtpl_state->printf_create_infos = new_pipeline_create_infos; ++ crtpl_state->pCreateInfos = reinterpret_cast(crtpl_state->gpu_create_infos.data()); ++} ++ ++void AutoInst::PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation, ++ VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoKHR *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ void *crtpl_state_data) { ++ std::vector new_pipeline_create_infos; ++ auto *crtpl_state = reinterpret_cast(crtpl_state_data); ++ UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, crtpl_state->pipe_state, ++ &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this); ++ crtpl_state->printf_create_infos = new_pipeline_create_infos; ++ crtpl_state->pCreateInfos = reinterpret_cast(crtpl_state->printf_create_infos.data()); ++} ++ ++void AutoInst::PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkGraphicsPipelineCreateInfo *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ VkResult result, void *cgpl_state_data) { ++ ValidationStateTracker::PostCallRecordCreateGraphicsPipelines(device, pipelineCache, count, pCreateInfos, pAllocator, ++ pPipelines, result, cgpl_state_data); ++ if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_GRAPHICS) return; ++ UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_GRAPHICS, this); ++} ++ ++void AutoInst::PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkComputePipelineCreateInfo *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ VkResult result, void *ccpl_state_data) { ++ 
ValidationStateTracker::PostCallRecordCreateComputePipelines(device, pipelineCache, count, pCreateInfos, pAllocator, pPipelines, ++ result, ccpl_state_data); ++ if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_COMPUTE) return; ++ UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_COMPUTE, this); ++} ++ ++void AutoInst::PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoNV *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ VkResult result, void *crtpl_state_data) { ++ ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesNV(device, pipelineCache, count, pCreateInfos, pAllocator, ++ pPipelines, result, crtpl_state_data); ++ if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return; ++ UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this); ++} ++ ++void AutoInst::PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation, ++ VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoKHR *pCreateInfos, ++ const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines, ++ VkResult result, void *crtpl_state_data) { ++ ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesKHR( ++ device, deferredOperation, pipelineCache, count, pCreateInfos, pAllocator, pPipelines, result, crtpl_state_data); ++ if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return; ++ UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this); ++} ++ ++// Remove all the shader trackers associated with this destroyed pipeline. 
++void AutoInst::PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) { ++ for (auto it = shader_map.begin(); it != shader_map.end();) { ++ if (it->second.pipeline == pipeline) { ++ it = shader_map.erase(it); ++ } else { ++ ++it; ++ } ++ } ++ ValidationStateTracker::PreCallRecordDestroyPipeline(device, pipeline, pAllocator); ++} ++// Call the SPIR-V Optimizer to run the instrumentation pass on the shader. ++bool AutoInst::InstrumentShader(const VkShaderModuleCreateInfo *pCreateInfo, std::vector &new_pgm, ++ uint32_t *unique_shader_id) { ++ if (aborted) return false; ++ if (pCreateInfo->pCode[0] != spv::MagicNumber) return false; ++ ++ // Load original shader SPIR-V ++ uint32_t num_words = static_cast(pCreateInfo->codeSize / 4); ++ new_pgm.clear(); ++ new_pgm.reserve(num_words); ++ new_pgm.insert(new_pgm.end(), &pCreateInfo->pCode[0], &pCreateInfo->pCode[num_words]); ++ ++ auto stage = FindShaderStage(new_pgm); ++ if (stage == 0) return false; ++ ++ // Check against pipeline_to_instrument setting ++ switch (pipeline_to_instrument) { ++ case VK_PIPELINE_BIND_POINT_COMPUTE: ++ if ((kShaderStageAllCompute & stage) == 0) return false; ++ break; ++ case VK_PIPELINE_BIND_POINT_GRAPHICS: ++ if ((kShaderStageAllGraphics & stage) == 0) return false; ++ break; ++ case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: ++ if ((kShaderStageAllRayTracing & stage) == 0) return false; ++ break; ++ default: ++ break; ++ } ++ ++ // Check again shaders_to_instrument setting ++ if (StageToInstIndices.size() > 0) { ++ auto stage_index = Stage2SeenCount[stage]; ++ Stage2SeenCount[stage]++; ++ if (StageToInstIndices[stage].count(stage_index) == 0) { ++ // The shader stage and index was not found in the user provided setting so skip instrumenting. ++ return false; ++ } ++ } ++ ++ // Call the optimizer to instrument the shader. ++ // Use the unique_shader_module_id as a shader ID so we can look up its handle later in the shader_map. 
++ // If descriptor indexing is enabled, enable length checks and updated descriptor checks ++ using namespace spvtools; ++ spv_target_env target_env = SPV_ENV_VULKAN_1_2; ++ ++ const spvtools::MessageConsumer auto_inst_console_message_consumer = ++ [this](spv_message_level_t level, const char *, const spv_position_t &position, const char *message) -> void { ++ switch (level) { ++ case SPV_MSG_FATAL: ++ case SPV_MSG_INTERNAL_ERROR: ++ case SPV_MSG_ERROR: ++ this->LogError(this->device, "UNASSIGNED-Debug-Printf", "Error during shader instrumentation: line %zu: %s", ++ position.index, message); ++ break; ++ default: ++ break; ++ } ++ }; ++ ++ Optimizer optimizer(target_env); ++ optimizer.SetMessageConsumer(auto_inst_console_message_consumer); ++ if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) { ++ optimizer.RegisterPass(spvtools::CreateAutoInstDebugPass(desc_set_bind_index, unique_shader_module_id, ++ is_debugging_atomic_ops, is_debugging_subgroup_ops)); ++ } else { ++ RegisterPasses(&optimizer, desc_set_bind_index, unique_shader_module_id); ++ } ++ if (optimizer.GetPassNames().size() == 0) return false; ++ bool pass = optimizer.Run(new_pgm.data(), new_pgm.size(), &new_pgm); ++ if (!pass) { ++ ReportSetupProblem( ++ device, "Failure to instrument shader " + ShaderStageToString(stage) + ". Proceeding with non-instrumented shader.\n"); ++ } ++ ++ if (dump_shaders) { ++ std::ofstream shader_dump_file; ++ shader_dump_file.open(ShaderStageToString(stage) + std::to_string(unique_shader_module_id) + ".spv", ++ std::ios_base::binary | std::ios_base::out); ++ shader_dump_file.write((char *)new_pgm.data(), new_pgm.size() * sizeof(uint32_t)); ++ shader_dump_file.close(); ++ } ++ ++ instrumentation_map[unique_shader_module_id] = std::vector(new_pgm); ++ *unique_shader_id = unique_shader_module_id++; ++ return pass; ++} ++// Create the instrumented shader data to provide to the driver. 
++void AutoInst::PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo, ++ const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule, ++ void *csm_state_data) { ++ create_shader_module_api_state *csm_state = reinterpret_cast(csm_state_data); ++ ++ bool pass = InstrumentShader(pCreateInfo, csm_state->instrumented_pgm, &csm_state->unique_shader_id); ++ if (pass) { ++ csm_state->instrumented_create_info.pCode = csm_state->instrumented_pgm.data(); ++ csm_state->instrumented_create_info.codeSize = csm_state->instrumented_pgm.size() * sizeof(unsigned int); ++ } ++} ++ ++void AutoInst::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point, ++ uint32_t operation_index, uint32_t *const debug_output_buffer) { ++ if (pipeline_bind_point != pipeline_to_instrument) return; ++ if (aborted) return; ++ ++ // debug mode tests ++ if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) { ++ if (is_debugging_atomic_ops) { ++ auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0 ? "Atomics instrumentation did not produce a result!\n" ++ : "Atomics instrumentation produced a result!\n"; ++ ReportInfo(device, message); ++ } ++ ++ if (is_debugging_subgroup_ops) { ++ auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0 ++ ? "Subgroup instrumentation did not produce a result!\n" ++ : "Subgroup instrumentation produced a result" + ++ std::bitset(debug_output_buffer[BUFFER_DEBUG_LOCATION]).to_string() + "!\n"; ++ ReportInfo(device, message); ++ } ++ ++ if (is_debugging_array_length_op) { ++ auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0 ++ ? 
"ArrayLength instrumentation did not produce a result!\n" ++ : "ArrayLength instrumentation produced a result=" + ++ std::to_string(4 * debug_output_buffer[BUFFER_DEBUG_LOCATION]) + " bytes !\n"; ++ ReportInfo(device, message); ++ } ++ ++ return; ++ } ++ ++ auto words_written = debug_output_buffer[WORDS_WRITTEN_INDEX]; ++ ++ uint32_t bytes_consumed = sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS + sizeof(uint32_t) * words_written; ++ ++ VmaAllocationInfo alloc_info; ++ (void)vmaGetAllocationInfo(vmaAllocator, GetBufferInfo(command_buffer)[operation_index].output_mem_block.allocation, ++ &alloc_info); ++ auto buffer_size = alloc_info.size; ++ auto creation_index = GetBufferInfo(command_buffer)[operation_index].output_mem_block.creation_index; ++ ++ auto overflowed = bytes_consumed > buffer_size; ++ if (overflowed) { ++ auto message = ++ "Output buffer size is " + std::to_string(buffer_size) + " bytes which is less than the " + ++ std::to_string(bytes_consumed) + ++ " bytes that the instrumentation could have written. Please rerun the application to get analysis results.\n"; ++ ReportInfo(device, message.c_str()); ++ } ++ ++ auto bytes_consumed_for_vma = [](uint32_t raw_bytes) { ++ // set to next highest multiple of 1024 ++ return (raw_bytes & (~1023)) + 1024; ++ }; ++ ++ if (BufferSizeRequirementsLookup.size() <= creation_index) { ++ BufferSizeRequirementsLookup.resize(creation_index + 1, sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS); ++ BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed); ++ WriteRuntimeSizeCache(); ++ } else if (BufferSizeRequirementsLookup[creation_index] == output_buffer_size) { ++ // Assume that if the value is the default, we're safe to lower the instrumentation buffer size ++ // to save on device memory usage. 
++ BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed); ++ WriteRuntimeSizeCache(); ++ } else if (bytes_consumed > BufferSizeRequirementsLookup[creation_index]) { ++ // If the value is not the default, that implies it has already been set by a runtime observation ++ // and therefore it should never decrease. ++ BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed); ++ WriteRuntimeSizeCache(); ++ } ++ ++ switch (pipeline_bind_point) { ++ case VK_PIPELINE_BIND_POINT_COMPUTE: { ++ auto cb_state = GetCBState(command_buffer); ++ LAST_BOUND_STATE &last_bound = cb_state->lastBound[pipeline_bind_point]; ++ std::tuple localsize_xyz = std::make_tuple(0, 0, 0); ++ if (last_bound.pipeline_state) { ++ PIPELINE_STATE *p_state = last_bound.pipeline_state; ++ auto shader_state = GetShaderModuleState(p_state->computePipelineCI.stage.module); ++ if (shader_state != NULL) { ++ localsize_xyz = FindComputeLocalSize(shader_state->words); ++ } ++ } ++ if (analysis_index >= compute_launch_records.size()) { ++ ReportSetupProblem(device, "Insufficient launch records to support compute analysis."); ++ break; ++ } ++ auto launch_dims3d = compute_launch_records[analysis_index]; ++ uint32_t localsize_x = std::get<0>(localsize_xyz); ++ uint32_t localsize_y = std::get<1>(localsize_xyz); ++ uint32_t localsize_z = std::get<2>(localsize_xyz); ++ if (localsize_x == 0 || localsize_y == 0 || localsize_x == 0) { ++ ReportSetupProblem(device, "Could not determine compute shader local size.\n"); ++ } ++ is_analyzing_compute = true; ++ AnalyzeCompute(debug_output_buffer, overflowed, launch_dims3d.x_dim * localsize_x, launch_dims3d.y_dim * localsize_y, ++ launch_dims3d.z_dim * localsize_z); ++ is_analyzing_compute = false; ++ analysis_index++; ++ break; ++ } ++ case VK_PIPELINE_BIND_POINT_GRAPHICS: { ++ analysis_index++; ++ is_analyzing_draw = true; ++ AnalyzeGraphics(debug_output_buffer, overflowed); ++ is_analyzing_draw = false; ++ break; 
++ } ++ case VK_PIPELINE_BIND_POINT_RAY_TRACING_NV: { ++ if (analysis_index >= rt_launch_records.size()) { ++ ReportSetupProblem(device, "Insufficient launch records to support ray tracing analysis."); ++ break; ++ } ++ auto launch_dims3d = rt_launch_records[analysis_index]; ++ is_analyzing_rt = true; ++ AnalyzeRayTracing(debug_output_buffer, overflowed, launch_dims3d.x_dim, launch_dims3d.y_dim, launch_dims3d.z_dim); ++ is_analyzing_rt = false; ++ analysis_index++; ++ break; ++ } ++ default: ++ ReportSetupProblem(device, "Unsupported pipeline type cannot be analyzed."); ++ break; ++ } ++ ++ memset(debug_output_buffer, 0, buffer_size); ++} ++ ++// Issue a memory barrier to make GPU-written data available to host. ++// Wait for the queue to complete execution. ++// Check the debug buffers for all the command buffers that were submitted. ++void AutoInst::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence, ++ VkResult result) { ++ ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result); ++ ++ if (aborted || (result != VK_SUCCESS)) return; ++ bool buffers_present = false; ++ // Don't QueueWaitIdle if there's nothing to process ++ for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) { ++ const VkSubmitInfo *submit = &pSubmits[submit_idx]; ++ for (uint32_t i = 0; i < submit->commandBufferCount; i++) { ++ auto cb_node = GetCBState(submit->pCommandBuffers[i]); ++ if (GetBufferInfo(cb_node->commandBuffer).size()) buffers_present = true; ++ for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) { ++ if (GetBufferInfo(secondaryCmdBuffer->commandBuffer).size()) buffers_present = true; ++ } ++ } ++ } ++ if (!buffers_present) return; ++ ++ UtilSubmitBarrier(queue, this); ++ ++ DispatchQueueWaitIdle(queue); ++ ++ for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) { ++ const VkSubmitInfo *submit = &pSubmits[submit_idx]; ++ for (uint32_t i = 0; i 
< submit->commandBufferCount; i++) { ++ auto cb_node = GetCBState(submit->pCommandBuffers[i]); ++ UtilProcessInstrumentationBuffer(queue, cb_node, this); ++ for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) { ++ UtilProcessInstrumentationBuffer(queue, secondaryCmdBuffer, this); ++ } ++ } ++ } ++} ++ ++void AutoInst::PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, ++ uint32_t firstVertex, uint32_t firstInstance) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS); ++} ++ ++void AutoInst::PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, ++ uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS); ++} ++ ++void AutoInst::PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count, ++ uint32_t stride) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS); ++} ++ ++void AutoInst::PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, ++ uint32_t count, uint32_t stride) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS); ++} ++ ++void AutoInst::PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE); ++ if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_COMPUTE) { ++ compute_launch_records.push_back({x, y, z}); ++ } ++} ++ ++void AutoInst::PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE); ++} ++ ++void AutoInst::PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer, ++ VkDeviceSize 
raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer, ++ VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride, ++ VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset, ++ VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer, ++ VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride, ++ uint32_t width, uint32_t height, uint32_t depth) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV); ++} ++ ++void AutoInst::PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer, ++ VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer, ++ VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride, ++ VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset, ++ VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer, ++ VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride, ++ uint32_t width, uint32_t height, uint32_t depth) { ++ CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer); ++ cb_state->hasTraceRaysCmd = true; ++ ++ rt_launch_records.push_back({width, height, depth}); ++} ++ ++void AutoInst::PreCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width, ++ uint32_t height, uint32_t depth) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); ++} ++ ++void AutoInst::PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, ++ const 
VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width, ++ uint32_t height, uint32_t depth) { ++ CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer); ++ cb_state->hasTraceRaysCmd = true; ++ ++ rt_launch_records.push_back({width, height, depth}); ++} ++ ++void AutoInst::PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, ++ VkDeviceAddress indirectDeviceAddress) { ++ AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR); ++} ++ ++void AutoInst::PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, ++ VkDeviceAddress indirectDeviceAddress) { ++ CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer); ++ cb_state->hasTraceRaysCmd = true; ++} ++ ++void AutoInst::PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR *pPresentInfo, VkResult result) { ++ if (aborted) return; ++ // helper variables for recording file names generated by analyses ++ frame_number++; ++ analysis_index = 0; ++ pipeline_creation_index = 0; ++} ++ ++void AutoInst::AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point) { ++ if (bind_point != VK_PIPELINE_BIND_POINT_GRAPHICS && bind_point != VK_PIPELINE_BIND_POINT_COMPUTE && ++ bind_point != VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) { ++ return; ++ } ++ ++ if (pipeline_to_instrument != bind_point) { ++ return; ++ } 
++ ++ VkResult result; ++ ++ if (aborted) return; ++ ++ std::vector desc_sets; ++ VkDescriptorPool desc_pool = VK_NULL_HANDLE; ++ result = desc_set_manager->GetDescriptorSets(1, &desc_pool, debug_desc_layout, &desc_sets); ++ assert(result == VK_SUCCESS); ++ if (result != VK_SUCCESS) { ++ ReportSetupProblem(device, "Unable to allocate descriptor sets. Device could become unstable."); ++ aborted = true; ++ return; ++ } ++ ++ auto buffer_size = (BufferSizeRequirementsLookup.size() <= pipeline_creation_index) ++ ? output_buffer_size ++ : (uint32_t)(BufferSizeRequirementsLookup[pipeline_creation_index]); ++ ++ VkDescriptorBufferInfo output_desc_buffer_info = {}; ++ output_desc_buffer_info.range = buffer_size; ++ ++ auto cb_node = GetCBState(cmd_buffer); ++ if (!cb_node) { ++ ReportSetupProblem(device, "Unrecognized command buffer"); ++ aborted = true; ++ return; ++ } ++ ++ // Allocate memory for the output block that the gpu will use to return values for instrumentation ++ AIDeviceMemoryBlock output_block = {}; ++ VkBufferCreateInfo bufferInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO}; ++ bufferInfo.size = buffer_size; ++ bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; ++ VmaAllocationCreateInfo allocInfo = {}; ++ allocInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU; ++ result = vmaCreateBuffer(vmaAllocator, &bufferInfo, &allocInfo, &output_block.buffer, &output_block.allocation, nullptr); ++ ++ output_block.creation_index = pipeline_creation_index; ++ pipeline_creation_index++; ++ if (result != VK_SUCCESS) { ++ ReportSetupProblem(device, "Unable to allocate device memory. 
Device could become unstable."); ++ aborted = true; ++ return; ++ } ++ ++ // Clear the output block to zeros so that only values from the gpu will be present ++ uint32_t *pData; ++ result = vmaMapMemory(vmaAllocator, output_block.allocation, (void **)&pData); ++ if (result == VK_SUCCESS) { ++ memset(pData, 0, buffer_size); ++ InitializeInstrumentationBuffer(pData); ++ vmaUnmapMemory(vmaAllocator, output_block.allocation); ++ } ++ ++ VkWriteDescriptorSet desc_writes[1] = {}; ++ const uint32_t desc_count = 1; ++ ++ // Write the descriptor ++ output_desc_buffer_info.buffer = output_block.buffer; ++ output_desc_buffer_info.offset = 0; ++ ++ desc_writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; ++ desc_writes[0].descriptorCount = 1; ++ desc_writes[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; ++ desc_writes[0].pBufferInfo = &output_desc_buffer_info; ++ desc_writes[0].dstSet = desc_sets[0]; ++ desc_writes[0].dstBinding = 3; ++ DispatchUpdateDescriptorSets(device, desc_count, desc_writes, 0, NULL); ++ ++ const auto lv_bind_point = ConvertToLvlBindPoint(bind_point); ++ const auto *pipeline_state = cb_node->lastBound[lv_bind_point].pipeline_state; ++ if (pipeline_state) { ++ if (pipeline_state->pipeline_layout->set_layouts.size() <= desc_set_bind_index) { ++ DispatchCmdBindDescriptorSets(cmd_buffer, bind_point, pipeline_state->pipeline_layout->layout, desc_set_bind_index, 1, ++ desc_sets.data(), 0, nullptr); ++ } ++ // Record buffer and memory info in CB state tracking ++ GetBufferInfo(cmd_buffer).emplace_back(output_block, desc_sets[0], desc_pool, bind_point); ++ } else { ++ ReportSetupProblem(device, "Unable to find pipeline state"); ++ vmaDestroyBuffer(vmaAllocator, output_block.buffer, output_block.allocation); ++ aborted = true; ++ return; ++ } ++} +diff --git a/layers/auto_inst.h b/layers/auto_inst.h +new file mode 100644 +index 00000000..dd5dbbd9 +--- /dev/null ++++ b/layers/auto_inst.h +@@ -0,0 +1,465 @@ ++/* Copyright (c) 2020 The Khronos Group 
Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "chassis.h" ++#include "vk_mem_alloc.h" ++#include "state_tracker.h" ++#include "gpu_utils.h" ++#include "spirv-tools/instrument.hpp" ++#include ++ ++class AutoInst; ++ ++struct AIDeviceMemoryBlock { ++ uint32_t creation_index; ++ VkBuffer buffer; ++ VmaAllocation allocation; ++}; ++ ++struct AIBufferInfo { ++ AIDeviceMemoryBlock output_mem_block; ++ VkDescriptorSet desc_set; ++ VkDescriptorPool desc_pool; ++ VkPipelineBindPoint pipeline_bind_point; ++ AIBufferInfo(AIDeviceMemoryBlock output_mem_block, VkDescriptorSet desc_set, VkDescriptorPool desc_pool, ++ VkPipelineBindPoint pipeline_bind_point) ++ : output_mem_block(output_mem_block), desc_set(desc_set), desc_pool(desc_pool), pipeline_bind_point(pipeline_bind_point){}; ++}; ++ ++struct AIShaderTracker { ++ VkPipeline pipeline; ++ VkShaderModule shader_module; ++ std::vector pgm; ++ VkShaderStageFlagBits stage; ++}; ++ ++struct AIUniqueSubgroupIdEntry { ++ uint32_t inst_id; ++ uint32_t flat_thread_id; ++ uint32_t subgroup_ids; // Combined subgroup id and intra subgroup id. 
++ ++ inline uint32_t SubgroupId() const { return subgroup_ids & 0x07FFFFFF; } ++ inline uint32_t IntraSubgroupId() const { return (subgroup_ids & 0xF8000000) >> 27; } ++}; ++ ++struct LaunchDims3D { ++ uint32_t x_dim; ++ uint32_t y_dim; ++ uint32_t z_dim; ++}; ++ ++class AutoInst : public ValidationStateTracker { ++ VkPhysicalDeviceFeatures supported_features; ++ ++ uint32_t unique_shader_module_id = 0; ++ std::unordered_map> command_buffer_map; ++ uint32_t output_buffer_size; ++ ++ protected: ++ static const uint32_t SUBGROUP_SIZE = 32; ++ ++ // Reserved words in the buffer ++ static const uint32_t WORDS_WRITTEN_INDEX = 0; ++ static const uint32_t NUM_SUBGROUP_IDS_INDEX = 1; ++ ++ static const uint32_t NUM_BUFFER_RESERVED_WORDS = 2; ++ ++ // Reserved word in debug mode ++ static const uint32_t BUFFER_DEBUG_LOCATION = 1; ++ ++ public: ++ using ThreadIdToSubgroupIdMap = std::unordered_map; ++ ++ // Map from subgroup_id * SUBGROUP_SIZE + thread_offset to runtime ++ // thread_id ++ using ThreadIdSwizzleMap = std::unordered_map; ++ ++ // Map from primtive id to the number of words that the primitive wrote ++ // to the StorageBuffer. ++ using PrimitiveIdToPrimitiveSizeMap = std::unordered_map; ++ ++ // Record pipeline invocations launch dim parameters; ++ using LaunchDimRecords = std::vector; ++ ++ AutoInst() { container_type = LayerObjectTypeAutoInst; } ++ ++ // The pipeline type to instrument ++ VkPipelineBindPoint pipeline_to_instrument = VK_PIPELINE_BIND_POINT_MAX_ENUM; ++ ++ // Records of the rt and compute launch sizes, this is useful for rebuilding ++ // the frames for visualization ++ LaunchDimRecords rt_launch_records; ++ LaunchDimRecords compute_launch_records; ++ ++ // Helper variables for properly naming files output by the analysis ++ std::string base_file_name; ++ // index of next instrumented pipeline to analyze ++ uint32_t analysis_index = 0; ++ // index of next instrumented pipeline to create. 
Used for determining runtime ++ // buffer size requirements in the case that previous runs were recorded. ++ uint32_t pipeline_creation_index = 0; ++ uint32_t frame_number = 0; ++ bool is_analyzing_rt = false; ++ bool is_analyzing_draw = false; ++ bool is_analyzing_compute = false; ++ ++ // Variables for debug modes ++ // This framework relies on atomic operations in SPIR-V for writing ++ // instrumentation results to the StorageBuffer and Subgroup ++ // operations for determining the active thread mask. ++ bool is_debugging_atomic_ops = false; ++ bool is_debugging_subgroup_ops = false; ++ bool is_debugging_array_length_op = false; ++ ++ // if true, dump instrumented shaders ++ // if false, do nothing. ++ bool dump_shaders = false; ++ ++ // This map specifies the index of the shader stage to instrument. I.e. ++ // if the option is Miss2 then the 2nd Miss shader that is created ++ // will be instrumented. ++ // ++ // If this map is uninitialized (size == 0) then it is assumed that ++ // all shaders should be instrumented. ++ std::unordered_map> StageToInstIndices; ++ ++ // Track how many of each ShaderStage has been created. ++ std::unordered_map Stage2SeenCount; ++ ++ // Track how many bytes were required by previous invocations of a given pipeline. ++ // This data is written to a cache file that is read for subsequent executions ++ // of the application. 
++ std::vector BufferSizeRequirementsLookup; ++ ++ bool aborted = false; ++ bool use_stdout = false; ++ VkDevice device; ++ VkPhysicalDevice physicalDevice; ++ uint32_t adjusted_max_desc_sets; ++ uint32_t desc_set_bind_index; ++ VkDescriptorSetLayout debug_desc_layout = VK_NULL_HANDLE; ++ VkDescriptorSetLayout dummy_desc_layout = VK_NULL_HANDLE; ++ std::unique_ptr desc_set_manager; ++ std::unordered_map shader_map; ++ std::unordered_map> instrumentation_map; ++ PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData; ++ VmaAllocator vmaAllocator = {}; ++ std::map queue_barrier_command_infos; ++ std::vector& GetBufferInfo(const VkCommandBuffer command_buffer) { ++ auto buffer_list = command_buffer_map.find(command_buffer); ++ if (buffer_list == command_buffer_map.end()) { ++ std::vector new_list{}; ++ command_buffer_map[command_buffer] = new_list; ++ return command_buffer_map[command_buffer]; ++ } ++ return buffer_list->second; ++ } ++ ++ // Subclass Hooks ++ ++ // Opportunity for inheriting classes to initialize ++ // and parse vk_settings_file.txt settings. ++ virtual void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) = 0; ++ ++ // Opportunity for inheriting classes to set the ++ // buffer to nonzero values for use-cases like PGO. ++ virtual void InitializeInstrumentationBuffer(uint32_t* buffer) = 0; ++ ++ // Opportunity for inheriting class to register auto-inst pass ++ // as well as other passes of interest (e.g. 
performance) ++ virtual void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) = 0; ++ ++ virtual void AnalyzeRayTracing(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) = 0; ++ ++ virtual void AnalyzeGraphics(uint32_t* const output_buffer, bool buffer_overflowed) = 0; ++ ++ virtual void AnalyzeCompute(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) = 0; ++ ++ // Helper functions ++ template ++ void ReportSetupProblem(T object, std::string specific_message) const; ++ template ++ void ReportInfo(T object, std::string specific_message) const; ++ ++ // This function takes a disassembled SPIR-V module in |shader| ++ // and adds the strings in |inst_id2str| immediately before ++ // the instrumentation callsites with a given inst_id. ++ // ++ // After the function has finished it will return the shader ++ // with annotations. This is designed to be used in conjunction ++ // with SPIRV-cross. ++ std::string AnnotateModuleStr(std::string& shader, std::unordered_map& inst_id2str) const; ++ ++ // This function takes an annotated spir-v |shader| module as a string and ++ // attempts to cross-compile it using SPIRV-cross to the corresponding glsl. ++ // ++ // After cross-compiling, a post-processing step changes the ++ // #line directives that are added to valid GLSL comments ++ // ++ // SPIR-Cross fails frequently due to unsupported builtins and the fall-back ++ // path is to emit the module as .spv not .glsl. Either the .spv or .glsl will be ++ // written to |file_name| ++ void TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const; ++ ++ // Returns a file name that includes the base_file_name, analysis type, ++ // frame number and finally the |analysis_specific_suffix|. 
++ inline std::string FrameAnalysisFileName(std::string analysis_specific_suffix) const { ++ std::string analysis_type; ++ if (is_analyzing_compute) { ++ analysis_type = "compute"; ++ } else if (is_analyzing_draw) { ++ analysis_type = "draw"; ++ } else if (is_analyzing_rt) { ++ analysis_type = "rt"; ++ } else { ++ analysis_type = "unknown"; ++ } ++ return base_file_name + analysis_type + "_frame" + std::to_string(frame_number) + "_" + analysis_specific_suffix; ++ } ++ ++ // Returns a file name that includes the base_file_name, analysis type, analysis specific pipeline invocation index ++ // frame number and finally the |analysis_specific_suffix|. ++ inline std::string PipelineAnalysisFileName(std::string analysis_specific_suffix) const { ++ std::string analysis_type; ++ if (is_analyzing_compute) { ++ analysis_type = "compute"; ++ } else if (is_analyzing_draw) { ++ analysis_type = "draw"; ++ } else if (is_analyzing_rt) { ++ analysis_type = "rt"; ++ } else { ++ analysis_type = "unknown"; ++ } ++ return base_file_name + analysis_type + std::to_string(analysis_index) + "_frame" + std::to_string(frame_number) + "_" + ++ analysis_specific_suffix; ++ } ++ ++ static inline std::string ShaderStageToString(uint32_t stage) { ++ switch (stage) { ++ case VK_SHADER_STAGE_RAYGEN_BIT_KHR: ++ return "RayGen"; ++ case VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR: ++ return "ClosestHit"; ++ case VK_SHADER_STAGE_CALLABLE_BIT_KHR: ++ return "Callable"; ++ case VK_SHADER_STAGE_MISS_BIT_KHR: ++ return "Miss"; ++ case VK_SHADER_STAGE_ANY_HIT_BIT_KHR: ++ return "AnyHit"; ++ case VK_SHADER_STAGE_INTERSECTION_BIT_KHR: ++ return "Intersection"; ++ case VK_SHADER_STAGE_GEOMETRY_BIT: ++ return "Geometry"; ++ case VK_SHADER_STAGE_FRAGMENT_BIT: ++ return "Fragment"; ++ case VK_SHADER_STAGE_COMPUTE_BIT: ++ return "Compute"; ++ case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT: ++ return "TessellationControl"; ++ case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT: ++ return "TessellationEvaluation"; ++ case 
VK_SHADER_STAGE_VERTEX_BIT: ++ return "Vertex"; ++ default: ++ return "Unknown" + std::to_string(stage); ++ } ++ } ++ ++ // Create a PPM file with size |width| * |height| by writing the values in |colors| in row-major order ++ void CreateImage(uint32_t width, uint32_t height, std::vector& colors, std::string file_name) const; ++ // Return a color represented as RBG from a value in the unit interval [0,1]. ++ std::tuple UnitIntervalToRGB(float val) const; ++ ++ // Analyze the SPIR-V module binary |pgm| to determine which execution model it ++ // implements and the corresponding VkShaderStageFlag. ++ // ++ // If the shader stage is not supported, or the shader module implements more than ++ // one execution model then this function returns 0. ++ // Otherwise it returns the single bit representation the shader stage. ++ uint32_t FindShaderStage(std::vector pgm) const; ++ ++ // Analyze the SPIR-V module binary |pgm| of a compute shader to determine the ++ // localsize that it implements. ++ // If the shader stage is not supported this function returns 0,0,0. ++ // Otherwise it returns the x,y,z values of the localsize. ++ std::tuple FindComputeLocalSize(std::vector pgm) const; ++ ++ // File name of cache file containing runtime instrumentation buffer size requirements. ++ inline std::string RuntimeSizeCachePath(VkPipelineBindPoint bind_point) const { ++ std::string pipeline_type; ++ switch (bind_point) { ++ case VK_PIPELINE_BIND_POINT_COMPUTE: ++ pipeline_type = "compute"; ++ break; ++ case VK_PIPELINE_BIND_POINT_GRAPHICS: ++ pipeline_type = "graphics"; ++ break; ++ case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: ++ pipeline_type = "ray_tracing"; ++ break; ++ default: ++ pipeline_type = "Unknown"; ++ break; ++ } ++ return ".ai_runtime_size_cache_" + pipeline_type + ".bin"; ++ } ++ ++ // Attempts to read the cache file containing information about this application's ++ // runtime buffer size requirements. 
++ // ++ // This function populates the variable BufferSizeRequirementsLookup ++ void TryReadRuntimeSizeCache(AutoInst* device_auto_inst); ++ ++ // Writes the current knowledge of the runtime instrumentation buffer requirements ++ // to the cache file. ++ void WriteRuntimeSizeCache(); ++ ++ // Creates mappings from the data output by the 'UniqueSubgroupId' primitive. ++ // |primitive_id2primitive_size| map determines how many words in the buffer ++ // belong to a given primitive type. ++ // |thread_id2subgroup_id_map| allows the lookup from thread id to subgroup id. ++ // |thread_id_swizzle_map| allows for lookup of the original flat thread id ++ // from the unique subgroup id and intra subgroup id. ++ // |inst_id2_prim_id| is invoked with the first word of every entry ++ // this allows an analysis to specify custom inst id's and still ++ // relate them to prim ids. ++ // Returns true if creating mappings is successful, false otherwise. ++ bool CreateUniqueSubgroupIdMappings( ++ uint32_t* const debug_output_buffer, PrimitiveIdToPrimitiveSizeMap& primitive_id2primitive_size, ++ ThreadIdToSubgroupIdMap& thread_id2subgroup_id_map, ThreadIdSwizzleMap& thread_id_swizzle_map, ++ std::function inst_id2prim_id = [](uint32_t x) { return x; }) const; ++ ++ // Core auto-inst functionality ++ bool InstrumentShader(const VkShaderModuleCreateInfo* pCreateInfo, std::vector& new_pgm, ++ uint32_t* unique_shader_id); ++ void AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point); ++ ++ // Validation Layer hooks ++ void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo, ++ const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_create_info) override; ++ void PostCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo, ++ const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, VkResult result) override; ++ void PreCallRecordDestroyDevice(VkDevice device, 
const VkAllocationCallbacks* pAllocator) override; ++ void PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo, ++ const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout, ++ void* cpl_state_data) override; ++ void PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo, ++ const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout, ++ VkResult result) override; ++ void ResetCommandBuffer(VkCommandBuffer commandBuffer); ++ bool PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent* pEvents, ++ VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, ++ uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers, ++ uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers, ++ uint32_t imageMemoryBarrierCount, ++ const VkImageMemoryBarrier* pImageMemoryBarriers) const override; ++ void PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkGraphicsPipelineCreateInfo* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, ++ void* cgpl_state_data) override; ++ void PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkComputePipelineCreateInfo* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, ++ void* ccpl_state_data) override; ++ void PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoNV* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, ++ void* crtpl_state_data) override; ++ void PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation, ++ VkPipelineCache pipelineCache, uint32_t count, ++ const 
VkRayTracingPipelineCreateInfoKHR* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, ++ void* crtpl_state_data) override; ++ void PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkGraphicsPipelineCreateInfo* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result, ++ void* cgpl_state_data) override; ++ void PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkComputePipelineCreateInfo* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result, ++ void* ccpl_state_data) override; ++ void PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoNV* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result, ++ void* crtpl_state_data) override; ++ void PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation, ++ VkPipelineCache pipelineCache, uint32_t count, ++ const VkRayTracingPipelineCreateInfoKHR* pCreateInfos, ++ const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, ++ VkResult result, void* crtpl_state_data) override; ++ ++ void PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks* pAllocator) override; ++ void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo, ++ const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule, ++ void* csm_state_data) override; ++ void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point, ++ uint32_t operation_index, uint32_t* const debug_output_buffer); ++ void PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t 
firstVertex, ++ uint32_t firstInstance) override; ++ void PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, ++ uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) override; ++ void PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count, ++ uint32_t stride) override; ++ void PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count, ++ uint32_t stride) override; ++ void PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) override; ++ void PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) override; ++ void PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer, ++ VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer, ++ VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride, ++ VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset, ++ VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer, ++ VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride, ++ uint32_t width, uint32_t height, uint32_t depth) override; ++ void PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer, ++ VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer, ++ VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride, ++ VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset, ++ VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer, ++ VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride, ++ uint32_t width, uint32_t height, uint32_t depth) override; ++ void PreCallRecordCmdTraceRaysKHR(VkCommandBuffer 
commandBuffer, ++ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width, ++ uint32_t height, uint32_t depth) override; ++ void PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width, ++ uint32_t height, uint32_t depth) override; ++ void PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, ++ VkDeviceAddress indirectDeviceAddress) override; ++ void PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer, ++ const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable, ++ const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, ++ VkDeviceAddress indirectDeviceAddress) override; ++ void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence, ++ VkResult result) override; ++ void PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* pPresentInfo, VkResult result) override; ++}; +diff --git a/layers/auto_inst_divergence_characterization.cpp b/layers/auto_inst_divergence_characterization.cpp +new file mode 100644 +index 00000000..adf2fd18 +--- 
/dev/null ++++ b/layers/auto_inst_divergence_characterization.cpp +@@ -0,0 +1,157 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_divergence_characterization.h" ++#include ++#include ++ ++namespace { ++ ++struct DivCharRecord { ++ uint32_t inst_id; ++ uint32_t flat_thread_id; ++ uint32_t active_thread_mask; ++}; ++ ++} // namespace ++ ++void AutoInstDivergenceCharacterization::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {} ++ ++void AutoInstDivergenceCharacterization::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, ++ uint32_t shader_module_id) { ++ auto static_data_callback = [&](std::unordered_map&& inst_id2prim_id, ++ std::unordered_map&& inst_id2inst_count) { ++ inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end()); ++ inst_id2inst_count_.insert(inst_id2inst_count.begin(), inst_id2inst_count.end()); ++ }; ++ ++ optimizer->RegisterPass( ++ spvtools::CreateAutoInstDivergenceCharacterizationPass(desc_bind_index, shader_module_id, static_data_callback)); ++} ++ ++void AutoInstDivergenceCharacterization::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, ++ uint32_t width, uint32_t height, uint32_t depth) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, "Divergence characterization requires a complete execution trace. 
Aborting.\n"); ++ return; ++ } ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ReportInfo(device, "Analyzing divergence characterization for " + std::to_string(runtime_words_written * 4) + " bytes! \n"); ++ ++ auto num_threads = width * height * depth; ++ ++ // Create mapping from inst_id to inst_size to determine stride ++ AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = { ++ {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDivCharPreTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDivCharPostTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDivCharQuitPipeline, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDivCharShaderEntryPoint, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDivCharActiveThreads, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))}, ++ }; ++ ++ // Create subgroup id mapping to be populated ++ AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id; ++ AutoInst::ThreadIdSwizzleMap thread_id_swizzle; ++ auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id, thread_id_swizzle, ++ [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; }); ++ ++ if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) { ++ ReportSetupProblem(device, "Ray tracing pipeline timing analysis failed to acquire unique warp id maps. 
Aborting.\n"); ++ return; ++ } ++ ++ std::unordered_map> subgroup_id2records; ++ ++ // Process the runtime timing data ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ auto inst_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]; ++ auto prim_id = inst_id2prim_id_[inst_id]; ++ ++ if (prim_id >= spvtools::kAutoInstDivCharPreTraceRay && prim_id <= spvtools::kAutoInstDivCharQuitPipeline) { ++ auto subgroup_id = thread_id2subgroup_id[device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS + 1]]; ++ auto record = *reinterpret_cast(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]); ++ subgroup_id2records[subgroup_id].push_back(record); ++ } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) { ++ ReportSetupProblem(device, "Analysis received unrecognized primitive identifier. Aborting.\n"); ++ return; ++ } ++ ++ j += prim_id2_prim_size[prim_id]; ++ } ++ ++ uint64_t return_divergence = 0; ++ uint64_t control_flow_divergence = 0; ++ uint64_t indirect_function_call_divergence = 0; ++ ++ for (auto warp_itr = subgroup_id2records.begin(); warp_itr != subgroup_id2records.end(); warp_itr++) { ++ for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) { ++ std::vector recurse_thread_status; ++ bool is_returned = false; ++ bool is_indirect_func_killed = false; ++ for (const auto& offset_itr : warp_itr->second) { ++ auto prim_id = inst_id2prim_id_[offset_itr.inst_id]; ++ bool is_thread_active = (offset_itr.active_thread_mask & n) != 0; ++ ++ if (prim_id == spvtools::kAutoInstDivCharQuitPipeline) { ++ is_returned = is_thread_active; ++ continue; ++ } else if (prim_id == spvtools::kAutoInstDivCharPreTraceRay) { ++ recurse_thread_status.push_back(is_thread_active); ++ continue; ++ } else if (prim_id == spvtools::kAutoInstDivCharPostTraceRay) { ++ ++ recurse_thread_status.pop_back(); ++ if (is_thread_active) { ++ is_indirect_func_killed = false; ++ } ++ continue; ++ } else if (prim_id == spvtools::kAutoInstDivCharShaderEntryPoint) { ++ is_indirect_func_killed 
= recurse_thread_status.back() && !is_thread_active; ++ continue; ++ } ++ ++ if (!is_thread_active) { ++ if (inst_id2inst_count_.count(offset_itr.inst_id) == 0) { ++ ReportSetupProblem(device, "Missing static instruction count data. Aborting.\n"); ++ return; ++ } ++ uint32_t num_insts = inst_id2inst_count_[offset_itr.inst_id]; ++ ++ if (is_returned) { ++ return_divergence += num_insts; ++ } else if (is_indirect_func_killed) { ++ // Thread was active at indirect function callsite but not here ++ indirect_function_call_divergence += num_insts; ++ } else { ++ control_flow_divergence += num_insts; ++ } ++ } ++ } ++ } ++ } ++ ++ ReportInfo(device, "Finished analyzing buffer!\n"); ++ ++ std::ofstream csv_file; ++ csv_file.open(FrameAnalysisFileName("divergence_characterization.csv"), std::ios_base::app); ++ csv_file << "inst count, indirect func, early exit, control flow,\n"; ++ csv_file << "," << indirect_function_call_divergence << "," << return_divergence << "," << control_flow_divergence << ",\n"; ++ csv_file.close(); ++} +\ No newline at end of file +diff --git a/layers/auto_inst_divergence_characterization.h b/layers/auto_inst_divergence_characterization.h +new file mode 100644 +index 00000000..c0226d11 +--- /dev/null ++++ b/layers/auto_inst_divergence_characterization.h +@@ -0,0 +1,48 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstDivergenceCharacterization; ++ ++class AutoInstDivergenceCharacterization : public AutoInst { ++ private: ++ std::unordered_map inst_id2prim_id_; ++ std::unordered_map inst_id2inst_count_; ++ ++ public: ++ AutoInstDivergenceCharacterization() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override; ++ ++ void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ ReportSetupProblem(device, "Divergence analysis is not compatible with draw commands."); ++ } ++ ++ void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override { ++ ReportSetupProblem(device, "Divergence analysis is not compatible with compute commands."); ++ } ++}; +diff --git a/layers/auto_inst_dyn_shader_trace.cpp b/layers/auto_inst_dyn_shader_trace.cpp +new file mode 100644 +index 00000000..7b11cc0b +--- /dev/null ++++ b/layers/auto_inst_dyn_shader_trace.cpp +@@ -0,0 +1,177 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_dyn_shader_trace.h" ++#include ++#include ++#include ++ ++namespace { ++struct ShaderExecutionRecord { ++ uint32_t prim_id; ++ uint32_t flat_thread_id; ++ uint32_t shader_id; ++ uint32_t active_thread_mask; ++}; ++ ++} // namespace ++ ++void AutoInstDynShaderTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {} ++ ++void AutoInstDynShaderTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) { ++ optimizer->RegisterPass(spvtools::CreateAutoInstDynShaderTracePass(desc_bind_index, shader_module_id)); ++} ++ ++void AutoInstDynShaderTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width, ++ uint32_t height, uint32_t depth) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, "Ray tracing dynamic shader trace analysis requires a complete execution trace. Aborting.\n"); ++ return; ++ } ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ReportInfo(device, "Analyzing " + std::to_string(runtime_words_written * 4) + " bytes! 
\n"); ++ ++ auto num_threads = width * height * depth; ++ ++ // Create mapping from inst_id to inst_size to determine stride ++ AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = { ++ {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstDynShaderTraceEntryPoint, (uint32_t)(sizeof(ShaderExecutionRecord) / sizeof(uint32_t))}, ++ }; ++ ++ // Create warp id mapping to be populated ++ AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id; ++ AutoInst::ThreadIdSwizzleMap thread_id_swizzle; ++ auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, ++ thread_id2subgroup_id, thread_id_swizzle); ++ ++ if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) { ++ ReportSetupProblem(device, "Ray tracing pipeline timing analysis failed to acquire unique warp id maps. Aborting.\n"); ++ return; ++ } ++ ++ // For heatmap ++ uint32_t max_thread_exe_count = 0; ++ uint32_t max_subgroup_exe_count = 0; ++ ++ std::unordered_map thread_id2dyn_count; ++ std::unordered_map subgroup_id2dyn_count; ++ // For CSV, ordered map for sensible output ++ std::map shader_id2dyn_count; ++ ++ // Process the runtime timing data ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ auto prim_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]; ++ ++ if (prim_id == spvtools::kAutoInstDynShaderTraceEntryPoint) { ++ auto shader_exe_record = reinterpret_cast(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]); ++ auto subgroup_id = thread_id2subgroup_id[shader_exe_record->flat_thread_id]; ++ for (uint32_t i = 0; i < SUBGROUP_SIZE; i++) { ++ if (shader_exe_record->active_thread_mask & (1 << i)) { ++ auto shader_id = shader_exe_record->shader_id; ++ // For every active thread ending the pipeline, compute its complete timing ++ auto swizzled_id = thread_id_swizzle[subgroup_id * SUBGROUP_SIZE + i]; ++ thread_id2dyn_count[swizzled_id]++; ++ 
max_thread_exe_count = std::max(max_thread_exe_count, thread_id2dyn_count[swizzled_id]); ++ shader_id2dyn_count[shader_id]++; ++ } ++ } ++ subgroup_id2dyn_count[subgroup_id]++; ++ max_subgroup_exe_count = std::max(max_subgroup_exe_count, subgroup_id2dyn_count[subgroup_id]); ++ ++ } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) { ++ ReportSetupProblem(device, "Encountered unsupported primtive type in Ray tracing thread timing analysis. Aborting."); ++ return; ++ } ++ ++ j += prim_id2_prim_size[prim_id]; ++ } ++ ++ // Generate csv ++ ++ { ++ // Output dyn opcode count ++ std::stringstream line0, line1; ++ line0 << "shader,"; ++ line1 << "dyn exe count,"; ++ for (auto& entry : shader_id2dyn_count) { ++ auto shader_stage_name = ShaderStageToString(shader_map[entry.first].stage); ++ line0 << shader_stage_name << "(" << entry.first << ")" ++ << ","; ++ line1 << entry.second << ","; ++ } ++ line0 << "\n"; ++ line1 << "\n"; ++ ++ std::ofstream csv_file; ++ csv_file.open(PipelineAnalysisFileName("dyn_shader_counts.csv")); ++ csv_file << line0.str() << line1.str(); ++ csv_file.close(); ++ } ++ ++ { ++ std::vector colors(num_threads * 3); ++ for (uint32_t y = 0; y < height; y++) { ++ for (uint32_t x = 0; x < width; x++) { ++ for (uint32_t z = 0; z < depth; z++) { ++ auto thread_id = z * (width * height) + y * width + x; ++ ++ auto rgb = UnitIntervalToRGB((float)(thread_id2dyn_count[thread_id] / (float)max_thread_exe_count)); ++ uint32_t out_index = 0; ++ ++ if (depth > 1) // This is Quake II RTX specific ++ out_index = y * (width * depth) + x * 2 + z; ++ else ++ out_index = y * width + x; ++ ++ colors[3 * out_index + 0] = std::get<0>(rgb); ++ colors[3 * out_index + 1] = std::get<1>(rgb); ++ colors[3 * out_index + 2] = std::get<2>(rgb); ++ } ++ } ++ } ++ ++ CreateImage(width * depth, height, colors, PipelineAnalysisFileName("shader_execution_heatmap")); ++ } ++ ++ { ++ std::vector colors(num_threads * 3); ++ for (uint32_t y = 0; y < height; y++) { ++ for (uint32_t x 
= 0; x < width; x++) { ++ for (uint32_t z = 0; z < depth; z++) { ++ auto thread_id = z * (width * height) + y * width + x; ++ auto subgroup_id = thread_id2subgroup_id[thread_id]; ++ auto rgb = UnitIntervalToRGB((float)(subgroup_id2dyn_count[subgroup_id] / (float)max_subgroup_exe_count)); ++ uint32_t out_index = 0; ++ ++ if (depth > 1) // This is Quake II RTX specific ++ out_index = y * (width * depth) + x * 2 + z; ++ else ++ out_index = y * width + x; ++ ++ colors[3 * out_index + 0] = std::get<0>(rgb); ++ colors[3 * out_index + 1] = std::get<1>(rgb); ++ colors[3 * out_index + 2] = std::get<2>(rgb); ++ } ++ } ++ } ++ ++ CreateImage(width * depth, height, colors, PipelineAnalysisFileName("subgroup_shader_execution_heatmap")); ++ } ++} +\ No newline at end of file +diff --git a/layers/auto_inst_dyn_shader_trace.h b/layers/auto_inst_dyn_shader_trace.h +new file mode 100644 +index 00000000..02e8b99a +--- /dev/null ++++ b/layers/auto_inst_dyn_shader_trace.h +@@ -0,0 +1,44 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstDynShaderTrace; ++ ++class AutoInstDynShaderTrace : public AutoInst { ++ public: ++ AutoInstDynShaderTrace() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override; ++ ++ void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ ReportSetupProblem(device, "Dynamic shader trace for graphics is not yet implemented!\n"); ++ } ++ ++ void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override { ++ ReportSetupProblem(device, "Dynamic shader trace for compute is not yet implemented!\n"); ++ } ++}; +diff --git a/layers/auto_inst_dyn_trace_ray_trace.cpp b/layers/auto_inst_dyn_trace_ray_trace.cpp +new file mode 100644 +index 00000000..cec184b0 +--- /dev/null ++++ b/layers/auto_inst_dyn_trace_ray_trace.cpp +@@ -0,0 +1,223 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_dyn_trace_ray_trace.h" ++#include ++#include ++ ++namespace {} // namespace ++ ++void AutoInstDynTraceRayTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) { ++ inst_id2prim_id_.clear(); ++ merge_id2div_ids_.clear(); ++} ++ ++void AutoInstDynTraceRayTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) { ++ auto static_data_callback = [&](std::unordered_map&& inst_id2prim_id, ++ std::unordered_map>&& merge_id2div_ids) { ++ inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end()); ++ merge_id2div_ids_.insert(merge_id2div_ids.begin(), merge_id2div_ids.end()); ++ }; ++ optimizer->RegisterPass(spvtools::CreateAutoInstDynTraceRayTracePass(desc_bind_index, shader_module_id, static_data_callback)); ++} ++ ++void AutoInstDynTraceRayTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width, ++ uint32_t height, uint32_t depth) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, ++ "Dynamic traceRay trace analysis cannot produce a valid result without a complete execution trace.\n"); ++ return; ++ } ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ReportInfo(device, "Analyzing dynamic traceRay trace for " + std::to_string(runtime_words_written * 4) + " bytes! 
\n"); ++ ++ auto num_subgroup_ids = device_output_buffer[NUM_SUBGROUP_IDS_INDEX]; ++ auto num_threads = width * height * depth; ++ ++ // Create mapping from inst_id to inst_size to determine stride ++ AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = { ++ {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstTraceRayTracePreTraceRay, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))}, ++ {spvtools::kAutoInstTraceRayTraceMergePoint, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))}, ++ }; ++ ++ // Create subgroup id mapping to be populated ++ AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id; ++ AutoInst::ThreadIdSwizzleMap thread_id_swizzle; ++ auto res = ++ CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id, ++ thread_id_swizzle, [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; }); ++ ++ if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) { ++ ReportSetupProblem(device, "Failed to acquire unique subgroup id maps. 
Aborting.\n"); ++ return; ++ } ++ ++ { ++ // For thread compaction ++ std::unordered_map>> thread_paths; ++ std::unordered_map> merge_visit_count; ++ std::unordered_map max_visit_count; ++ std::set points_of_interest; ++ ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ auto inst_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]; ++ auto prim_id = inst_id2prim_id_[inst_id]; ++ if (prim_id == spvtools::kAutoInstTraceRayTracePreTraceRay) { ++ // Record a positive result (thread executed traceRay) ++ auto entry = reinterpret_cast(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]); ++ auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id]; ++ ++ for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) { ++ auto bit = entry->active_thread_mask & n; ++ if (bit) { ++ auto thread_id = m + SUBGROUP_SIZE * subgroup_id; ++ thread_paths[inst_id][thread_id].push_back(true); ++ max_visit_count[inst_id] = (thread_paths[inst_id][thread_id].size() > max_visit_count[inst_id]) ++ ? 
(uint32_t)thread_paths[inst_id][thread_id].size() ++ : max_visit_count[inst_id]; ++ } ++ } ++ if (points_of_interest.count(inst_id) == 0) { ++ points_of_interest.insert(inst_id); ++ } ++ ++ } else if (prim_id == spvtools::kAutoInstTraceRayTraceMergePoint) { ++ auto entry = reinterpret_cast(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]); ++ auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id]; ++ // Record negative result if necessary (thread skipped traceRay) ++ for (auto& label_it : merge_id2div_ids_[inst_id]) { ++ if (points_of_interest.count(label_it) == 0) continue; ++ for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) { ++ auto thread_id = m + SUBGROUP_SIZE * subgroup_id; ++ if ((entry->active_thread_mask & n) == 0) continue; ++ merge_visit_count[label_it][thread_id]++; ++ ++ if (thread_paths[label_it][thread_id].size() >= merge_visit_count[label_it][thread_id]) { ++ merge_visit_count[label_it][thread_id] = (uint32_t)thread_paths[label_it][thread_id].size(); ++ continue; // Has been set due to active thread taking branch ++ } ++ ++ thread_paths[label_it][thread_id].push_back(false); ++ max_visit_count[label_it] = (thread_paths[label_it][thread_id].size() > max_visit_count[label_it]) ++ ? (uint32_t)thread_paths[label_it][thread_id].size() ++ : max_visit_count[label_it]; ++ } ++ } ++ } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) { ++ ReportSetupProblem(device, "Unrecognized primitive. 
Aborting.\n"); ++ return; ++ } ++ ++ j += prim_id2_prim_size[prim_id]; ++ } ++ ++ // Done analyzing StorageBuffer ++ const int MAX_PATH_LEN = 1024; ++ ++ // Flatten the thread paths according to the maximum dynamic invocation count ++ // of each traceRay callsite ++ // Consider thread A that executed a inner loop once for 3 iterations of an outer loop ++ // vs thread B that executed in the inner loop 3 times for 3 iterations of an outer loop ++ // Before flattening: ++ // thread A: 111 ++ // thread B: 111111111 ++ // After flattening ++ // thread A: 001001001 ++ // thread B: 111111111 ++ std::unordered_map, uint32_t> flat_path_count; ++ std::bitset flat_thread_path; ++ for (uint32_t thread_id = 0; thread_id < num_threads; thread_id++) { ++ std::size_t k = 0; ++ flat_thread_path.reset(); ++ for (auto& label_id : points_of_interest) { ++ j = 0; ++ for (j = 0; j < thread_paths[label_id][thread_id].size(); j++) { ++ if (thread_paths[label_id][thread_id][j]) { ++ flat_thread_path.set(k, 1); ++ } ++ k += 1; ++ } ++ if (j > max_visit_count[label_id]) { ++ ReportSetupProblem(device, "Max visit count not set correctly. 
Aborting\n"); ++ return; ++ } ++ k += max_visit_count[label_id] - j; ++ if (k > MAX_PATH_LEN) ++ ReportSetupProblem(device, ("Encountered more than " + std::to_string(MAX_PATH_LEN) + " branches!").c_str()); ++ } ++ flat_path_count[flat_thread_path]++; ++ } ++ ++ // Record thread paths and their respective counts ++ std::ofstream csv_file; ++ csv_file.open(PipelineAnalysisFileName("thread_paths.csv")); ++ csv_file << "path,count,\n"; ++ for (auto& path_it : flat_path_count) csv_file << path_it.first << "," << path_it.second << "\n"; ++ csv_file.close(); ++ ++ csv_file.open(PipelineAnalysisFileName("thread_compaction.csv")); ++ ++ ReportInfo(device, "Done simulated threads\n"); ++ for (auto& poi_label : points_of_interest) { ++ csv_file << poi_label << "\n"; ++ std::vector active_threads; ++ std::vector active_threads_per_window; ++ std::vector total_threads; ++ for (uint32_t window_size = 1; window_size < num_subgroup_ids * 2; window_size <<= 1) { ++ active_threads.clear(); ++ total_threads.clear(); ++ // window size unit is subgroups ++ for (uint32_t window_base = 0; window_base < num_subgroup_ids; window_base += window_size) { ++ active_threads_per_window.clear(); ++ for (uint32_t window_offset = 0; window_offset < window_size; window_offset++) { ++ if (window_base + window_offset >= num_subgroup_ids) continue; ++ for (uint32_t thread_offset = 0; thread_offset < SUBGROUP_SIZE; thread_offset++) { ++ auto thread_id = (window_base + window_offset) * SUBGROUP_SIZE + thread_offset; ++ auto num_visits = thread_paths[poi_label][thread_id].size(); ++ if (active_threads_per_window.size() < num_visits) active_threads_per_window.resize(num_visits); ++ for (uint32_t visit_count = 0; visit_count < num_visits; visit_count++) { ++ if (thread_paths[poi_label][thread_id][visit_count]) active_threads_per_window[visit_count]++; ++ } ++ } ++ } ++ ++ if (active_threads.size() < active_threads_per_window.size()) { ++ active_threads.resize(active_threads_per_window.size()); ++ 
total_threads.resize(active_threads.size()); ++ } ++ ++ for (uint32_t visit_count = 0; visit_count < active_threads_per_window.size(); visit_count++) { ++ active_threads[visit_count] += active_threads_per_window[visit_count]; ++ total_threads[visit_count] += ((active_threads_per_window[visit_count] / SUBGROUP_SIZE) + ++ ((active_threads_per_window[visit_count] % SUBGROUP_SIZE != 0) ? 1 : 0)) * ++ SUBGROUP_SIZE; ++ } ++ } ++ for (uint32_t visit_count = 0; visit_count < active_threads.size(); visit_count++) { ++ if (active_threads[visit_count] == 0 && total_threads[visit_count] == 0) continue; ++ ++ csv_file << "," << window_size << "," << visit_count << "," << active_threads[visit_count] << "/" ++ << total_threads[visit_count] << "\n"; ++ } ++ } ++ } ++ csv_file.close(); ++ } ++} +\ No newline at end of file +diff --git a/layers/auto_inst_dyn_trace_ray_trace.h b/layers/auto_inst_dyn_trace_ray_trace.h +new file mode 100644 +index 00000000..769b0cc4 +--- /dev/null ++++ b/layers/auto_inst_dyn_trace_ray_trace.h +@@ -0,0 +1,55 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstDynTraceRayTrace; ++ ++struct DynTraceRayTraceRecord { ++ uint32_t inst_id; ++ uint32_t flat_thread_id; ++ uint32_t active_thread_mask; ++}; ++ ++class AutoInstDynTraceRayTrace : public AutoInst { ++ private: ++ std::unordered_map inst_id2prim_id_; ++ std::unordered_map> merge_id2div_ids_; ++ ++ public: ++ AutoInstDynTraceRayTrace() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override; ++ ; ++ ++ void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support graphics.\n"); ++ }; ++ ++ void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override { ++ ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support compute.\n"); ++ }; ++}; +diff --git a/layers/auto_inst_execution_trace.cpp b/layers/auto_inst_execution_trace.cpp +new file mode 100644 +index 00000000..ec4c876b +--- /dev/null ++++ b/layers/auto_inst_execution_trace.cpp +@@ -0,0 +1,174 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_execution_trace.h" ++#include ++#include ++ ++namespace { ++ ++struct ExecutionTraceRecord { ++ uint32_t inst_id; ++ uint32_t active_thread_mask; ++}; ++ ++static inline uint32_t shader_id(uint32_t inst_id) { return (inst_id & 0xFFF00000) >> 20; } ++ ++} // namespace ++ ++void AutoInstExecutionTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {} ++ ++void AutoInstExecutionTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) { ++ // In order to compute the dynamic instruction execution count of the pipeline, it is necessary to know all the other ++ // instructions in the same basic block as the instrumentation callsite. This callback allows the auto-inst pass to ++ // populate such a mapping. 
++ auto static_data_callback = [&](std::unordered_map>&& inst_id2bb_inst_ids, ++ std::unordered_map&& inst_id2opcode) { ++ inst_id2bb_inst_ids_.insert(inst_id2bb_inst_ids.begin(), inst_id2bb_inst_ids.end()); ++ inst_id2opcode_.insert(inst_id2opcode.begin(), inst_id2opcode.end()); ++ }; ++ optimizer->RegisterPass(spvtools::CreateAutoInstExecutionTracePass(desc_bind_index, shader_module_id, static_data_callback)); ++} ++ ++void AutoInstExecutionTrace::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, "Execution trace analysis cannot produce a valid result without a complete execution trace.\n"); ++ return; ++ } ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ReportInfo(device, "Analyzing execution trace for " + std::to_string(runtime_words_written * 4) + " bytes! \n"); ++ ++ struct ActiveTotalThreadCounts { ++ uint32_t active_count; ++ uint32_t total_count; ++ ++ float inline SimtEfficiency() const { return (float)active_count / (float)total_count; } ++ }; ++ ++ std::map opcode2dyn_execution_count; ++ ++ // For annotated shaders ++ std::map inst_id2dyn_execution_count; ++ std::map inst_id2active_and_total_thread_counts; ++ ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ const auto output_record = reinterpret_cast(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]); ++ uint32_t active_thread_count = (uint32_t)std::bitset(output_record->active_thread_mask).count(); ++ ++ if (inst_id2bb_inst_ids_.count(output_record->inst_id) == 0) { ++ ReportSetupProblem(device, "Execution trace was unable to locate instrumentation id=" + ++ std::to_string(output_record->inst_id) + "in static mapping. 
Aborting.\n"); ++ return; ++ } ++ ++ // Add to the opcode totals based on how many threads were active ++ for (const auto& inst_id : inst_id2bb_inst_ids_[output_record->inst_id]) { ++ inst_id2dyn_execution_count[inst_id] += active_thread_count; ++ inst_id2active_and_total_thread_counts[inst_id].active_count += active_thread_count; ++ inst_id2active_and_total_thread_counts[inst_id].total_count += SUBGROUP_SIZE; ++ auto opcode = inst_id2opcode_[inst_id]; ++ if (inst_id2opcode_.count(inst_id) == 0) { ++ ReportSetupProblem(device, "Encountered instruction id without a corresponding Opcode. Aborting.\n"); ++ return; ++ } ++ opcode2dyn_execution_count[opcode] += active_thread_count; ++ } ++ ++ j += sizeof(ExecutionTraceRecord) / sizeof(uint32_t); ++ } ++ ++ { ++ // Output dyn opcode count ++ std::stringstream line0, line1; ++ line0 << "opcode,"; ++ line1 << "dyn exe count,"; ++ for (auto entry : opcode2dyn_execution_count) { ++ line0 << entry.first << ","; ++ line1 << entry.second << ","; ++ } ++ line0 << "\n"; ++ line1 << "\n"; ++ ++ std::ofstream csv_file; ++ csv_file.open(PipelineAnalysisFileName("dyn_opcode_counts.csv")); ++ csv_file << line0.str() << line1.str(); ++ csv_file.close(); ++ } ++ { ++ // Output hotspots ++ std::stringstream line0, line1, line2; ++ line0 << "pc,"; ++ line1 << "dyn exe count,"; ++ line2 << "simt efficiency,"; ++ for (auto entry : inst_id2dyn_execution_count) { ++ line0 << entry.first << ","; ++ line1 << entry.second << ","; ++ line2 << inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency() << ","; ++ } ++ line0 << "\n"; ++ line1 << "\n"; ++ line2 << "\n"; ++ ++ ActiveTotalThreadCounts combined = {0, 0}; ++ for (auto entry : inst_id2active_and_total_thread_counts) { ++ combined.active_count += entry.second.active_count; ++ combined.total_count += entry.second.total_count; ++ } ++ ++ std::ofstream csv_file; ++ csv_file.open(PipelineAnalysisFileName("hotspots.csv")); ++ csv_file << line0.str() << line1.str() << line2.str() << 
"Overall SIMT efficiency=" << combined.SimtEfficiency() << "\n"; ++ csv_file.close(); ++ } ++ { ++ std::set shaders_with_data; ++ // Output annotated shaders ++ std::unordered_map annotations; ++ for (const auto& entry : inst_id2bb_inst_ids_) { ++ auto instrumentation_id = *entry.second.begin(); ++ auto visits = inst_id2dyn_execution_count[instrumentation_id]; ++ if (visits > 0) { ++ auto simt_efficiency = inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency(); ++ shaders_with_data.insert(shader_id(entry.first)); ++ annotations[instrumentation_id] = ++ "thread_executions=" + std::to_string(visits) + ". SIMT Efficiency=" + std::to_string(simt_efficiency); ++ } ++ } ++ ++ for (auto entry : instrumentation_map) { ++ if (shaders_with_data.count(entry.first) == 0) continue; ++ using namespace spvtools; ++ SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2); ++ std::string program; ++ spirvTools.SetMessageConsumer([this](spv_message_level_t level, const char* source, const spv_position_t& pos, ++ const char* message) { ReportSetupProblem(this->device, message); }); ++ bool res = spirvTools.Disassemble(entry.second, &program, SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES); ++ if (res) { ++ program = AnnotateModuleStr(program, annotations); ++ } else { ++ ReportSetupProblem(device, "Could not disassemble shader with id=" + std::to_string(entry.first) + ". Skipping.\n"); ++ continue; ++ } ++ std::ofstream spv_file; ++ auto file_name = ShaderStageToString(shader_map[entry.first].stage) + std::to_string(entry.first) + "_dyn_executions"; ++ ++ TryCompileModuleStrToGlsl(program, PipelineAnalysisFileName(file_name)); ++ } ++ } ++} +\ No newline at end of file +diff --git a/layers/auto_inst_execution_trace.h b/layers/auto_inst_execution_trace.h +new file mode 100644 +index 00000000..fb5b4eb0 +--- /dev/null ++++ b/layers/auto_inst_execution_trace.h +@@ -0,0 +1,56 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. 
++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstExecutionTrace; ++ ++class AutoInstExecutionTrace : public AutoInst { ++ public: ++ // Mapping from instrumented instruction id to the ids of other instructions in the basic block ++ std::unordered_map> inst_id2bb_inst_ids_; ++ ++ // Mapping from instruction id to instruction opcode. Used for calculating dynamic instruction mix. 
++ std::unordered_map inst_id2opcode_; ++ ++ AutoInstExecutionTrace() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed); ++ ++ virtual void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ // TODO: What are useful dimensions to pass to graphics pipeline analysis ++ virtual void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ virtual void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, ++ uint32_t z) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++}; +diff --git a/layers/auto_inst_simt_efficiency.cpp b/layers/auto_inst_simt_efficiency.cpp +new file mode 100644 +index 00000000..0e100509 +--- /dev/null ++++ b/layers/auto_inst_simt_efficiency.cpp +@@ -0,0 +1,67 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_simt_efficiency.h" ++#include ++#include ++void AutoInstSimtEfficiency::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {} ++ ++void AutoInstSimtEfficiency::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) { ++ optimizer->RegisterPass(spvtools::CreateAutoInstSimtEfficiencyPass(desc_bind_index, shader_module_id, 1)); ++} ++ ++void AutoInstSimtEfficiency::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, "SIMT Efficiency analysis cannot produce a valid result without a complete execution trace.\n"); ++ return; ++ } ++ ++ uint32_t active_thread_count = 0; ++ uint32_t possible_thread_count = 0; ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ++ if (runtime_words_written == 0) { ++ ReportInfo(device, "No data found. Skipping Analysis.\n"); ++ return; ++ } ++ ++ ReportInfo(device, "Analyzing SIMT Efficiency for " + std::to_string(runtime_words_written * 4) + " bytes! \n"); ++ ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ uint32_t active_thread_mask = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]; ++ auto active_threads = (uint32_t)std::bitset(active_thread_mask).count(); ++ ++ if (active_threads == 0) { ++ ReportSetupProblem(device, "Invalid active thread count encountered. 
Quitting Analysis!\n"); ++ return; ++ } ++ active_thread_count += active_threads; ++ possible_thread_count += SUBGROUP_SIZE; ++ j += sizeof(SimtEfficiencyRecord) / sizeof(uint32_t); ++ } ++ ++ float simt_efficiency = (float)active_thread_count / (float)possible_thread_count; ++ ++ std::ofstream simt_eff_file; ++ simt_eff_file.open(FrameAnalysisFileName("simt_efficiency.csv"), std::ios_base::app); ++ simt_eff_file << simt_efficiency << "\n"; ++ simt_eff_file.close(); ++ ++ ReportInfo(device, "SIMT Efficiency = " + std::to_string(simt_efficiency * 100.0) + "%\n"); ++} +\ No newline at end of file +diff --git a/layers/auto_inst_simt_efficiency.h b/layers/auto_inst_simt_efficiency.h +new file mode 100644 +index 00000000..d6c5e2ce +--- /dev/null ++++ b/layers/auto_inst_simt_efficiency.h +@@ -0,0 +1,56 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstSimtEfficiency; ++ ++struct SimtEfficiencyRecord { ++ uint32_t active_thread_mask; ++}; ++ ++class AutoInstSimtEfficiency : public AutoInst { ++ public: ++ AutoInstSimtEfficiency() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ // Opportunity for inheriting class to perform hybrid analysis using ++ // 1) static_data ++ // 2) runtime_data ++ // 3) shader_map ++ void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed); ++ ++ void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++}; +diff --git a/layers/auto_inst_warp_entry_and_exit.cpp b/layers/auto_inst_warp_entry_and_exit.cpp +new file mode 100644 +index 00000000..9c19ce3d +--- /dev/null ++++ b/layers/auto_inst_warp_entry_and_exit.cpp +@@ -0,0 +1,61 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. 
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ++ * Author: David Pankratz ++ */ ++ ++#include "auto_inst_warp_entry_and_exit.h" ++#include ++ ++void AutoInstWarpEntryAndExit::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {} ++ ++void AutoInstWarpEntryAndExit::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) { ++ optimizer->RegisterPass(spvtools::CreateAutoInstWarpEntryAndExitPass(desc_bind_index, shader_module_id)); ++} ++ ++void AutoInstWarpEntryAndExit::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) { ++ if (buffer_overflowed) { ++ ReportSetupProblem(device, "Analysis cannot produce a valid result without a complete execution trace.\n"); ++ return; ++ } ++ ++ uint32_t entry_count = 0; ++ uint32_t exit_count = 0; ++ ++ auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX]; ++ ReportInfo(device, "Analyzing Warp Entries vs Exits in " + std::to_string(runtime_words_written * 4) + " bytes! \n"); ++ ++ uint32_t j = 0; ++ while (j < runtime_words_written) { ++ uint32_t prim_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]; ++ if (prim_id == spvtools::kAutoInstWarpEntryAndExitBeginPipeline) { ++ entry_count++; ++ } else if (prim_id == spvtools::kAutoInstWarpEntryAndExitEndPipeline) { ++ exit_count++; ++ } else { ++ ReportSetupProblem(device, "Received unexpected primitive id. 
Aborting!\n"); ++ return; ++ } ++ j++; ++ } ++ ++ float divergence_factor = (float)exit_count / (float)entry_count; ++ ++ std::ofstream csv_file; ++ csv_file.open(FrameAnalysisFileName("exits_vs_entries.csv"), std::ios_base::app); ++ csv_file << divergence_factor << "\n"; ++ csv_file.close(); ++ ++ ReportInfo(device, "Exits/entries= " + std::to_string(divergence_factor) + "\n"); ++} +\ No newline at end of file +diff --git a/layers/auto_inst_warp_entry_and_exit.h b/layers/auto_inst_warp_entry_and_exit.h +new file mode 100644 +index 00000000..17adfdfb +--- /dev/null ++++ b/layers/auto_inst_warp_entry_and_exit.h +@@ -0,0 +1,52 @@ ++/* Copyright (c) 2020 The Khronos Group Inc. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ++ * Author: David Pankratz ++ */ ++ ++#pragma once ++ ++#include "auto_inst.h" ++ ++class AutoInstWarpEntryAndExit; ++ ++class AutoInstWarpEntryAndExit : public AutoInst { ++ public: ++ AutoInstWarpEntryAndExit() : AutoInst() {} ++ ++ void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override; ++ ++ void InitializeInstrumentationBuffer(uint32_t* buffer) override{}; ++ ++ void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override; ++ ++ // Opportunity for inheriting class to perform hybrid analysis using ++ // 1) static_data ++ // 2) runtime_data ++ // 3) shader_map ++ void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed); ++ ++ void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height, ++ uint32_t depth) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++ ++ void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override { ++ Analyze(debug_output_buffer, buffer_overflowed); ++ }; ++}; +diff --git a/layers/debug_printf.cpp b/layers/debug_printf.cpp +index f04f4852..417341b5 100644 +--- a/layers/debug_printf.cpp ++++ b/layers/debug_printf.cpp +@@ -80,7 +80,7 @@ void DebugPrintf::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, co + + if (enabled[gpu_validation]) { + ReportSetupProblem(device, +- "Debug Printf cannot be enabled when gpu assisted validation is enabled. " ++ "Debug Printf cannot be enabled when gpu assisted validation or auto-inst are enabled. 
" + "Debug Printf disabled."); + device_debug_printf->aborted = true; + return; +diff --git a/layers/debug_printf.h b/layers/debug_printf.h +index 915d5a6d..deb85031 100644 +--- a/layers/debug_printf.h ++++ b/layers/debug_printf.h +@@ -45,6 +45,7 @@ struct DPFShaderTracker { + VkPipeline pipeline; + VkShaderModule shader_module; + std::vector pgm; ++ VkShaderStageFlagBits stage; + }; + + enum vartype { varsigned, varunsigned, varfloat }; +diff --git a/layers/generated/chassis.cpp b/layers/generated/chassis.cpp +index 75a4ebfa..6548ddc9 100644 +--- a/layers/generated/chassis.cpp ++++ b/layers/generated/chassis.cpp +@@ -50,6 +50,12 @@ bool wrap_handles = true; + #include "gpu_validation.h" + #include "object_lifetime_validation.h" + #include "debug_printf.h" ++#include "auto_inst_dyn_shader_trace.h" ++#include "auto_inst_dyn_trace_ray_trace.h" ++#include "auto_inst_execution_trace.h" ++#include "auto_inst_simt_efficiency.h" ++#include "auto_inst_divergence_characterization.h" ++#include "auto_inst_warp_entry_and_exit.h" + #include "stateless_validation.h" + #include "synchronization_validation.h" + #include "thread_safety.h" +@@ -306,6 +312,24 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + auto sync_validation_obj = new SyncValidator; + sync_validation_obj->RegisterValidationObject(local_enables[sync_validation], api_version, report_data, local_object_dispatch); + ++ auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency; ++ auto_inst_simt_efficiency_obj->RegisterValidationObject(local_enables[auto_inst_simt_efficiency], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace; ++ auto_inst_execution_trace_obj->RegisterValidationObject(local_enables[auto_inst_execution_trace], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace; ++ 
auto_inst_dyn_trace_ray_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_trace_ray_trace], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization; ++ auto_inst_divergence_characterization_obj->RegisterValidationObject(local_enables[auto_inst_divergence_characterization], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit; ++ auto_inst_warp_entry_and_exit_obj->RegisterValidationObject(local_enables[auto_inst_warp_entry_and_exit], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace; ++ auto_inst_dyn_shader_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_shader_trace], api_version, report_data, local_object_dispatch); ++ + // If handle wrapping is disabled via the ValidationFeatures extension, override build flag + if (local_disables[handle_wrapping]) { + wrap_handles = false; +@@ -338,7 +362,7 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + framework->report_data = report_data; + framework->api_version = api_version; + framework->instance_extensions.InitFromInstanceCreateInfo(specified_version, pCreateInfo); +- ++ + OutputLayerStatusInfo(framework); + + thread_checker_obj->FinalizeInstanceValidationObject(framework); +@@ -348,9 +372,15 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + core_checks_obj->instance = *pInstance; + core_checks_obj->instance_state = core_checks_obj; + best_practices_obj->FinalizeInstanceValidationObject(framework); +- gpu_assisted_obj->FinalizeInstanceValidationObject(framework); ++ gpu_assisted_obj->FinalizeInstanceValidationObject(framework); + debug_printf_obj->FinalizeInstanceValidationObject(framework); + sync_validation_obj->FinalizeInstanceValidationObject(framework); ++ 
auto_inst_simt_efficiency_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_execution_trace_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_dyn_trace_ray_trace_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_divergence_characterization_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_warp_entry_and_exit_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_dyn_shader_trace_obj->FinalizeInstanceValidationObject(framework); + + for (auto intercept : framework->object_dispatch) { + auto lock = intercept->write_lock(); +@@ -360,8 +390,9 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + // Delete unused validation objects to avoid memory leak. + std::vector local_objs = { + thread_checker_obj, object_tracker_obj, parameter_validation_obj, +- core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, +- sync_validation_obj, ++ core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, sync_validation_obj, ++ auto_inst_simt_efficiency_obj, auto_inst_execution_trace_obj, auto_inst_dyn_trace_ray_trace_obj, ++ auto_inst_divergence_characterization_obj, auto_inst_warp_entry_and_exit_obj, auto_inst_dyn_shader_trace_obj + }; + for (auto obj : local_objs) { + if (std::find(local_object_dispatch.begin(), local_object_dispatch.end(), obj) == local_object_dispatch.end()) { +@@ -493,22 +524,27 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(VkPhysicalDevice gpu, const VkDevice + + auto debug_printf_obj = new DebugPrintf; + debug_printf_obj->InitDeviceValidationObject(enables[debug_printf], instance_interceptor, device_interceptor); +- ++ + auto sync_validation_obj = new SyncValidator; + sync_validation_obj->InitDeviceValidationObject(enables[sync_validation], instance_interceptor, device_interceptor); + +- // Delete unused validation objects to avoid memory leak. 
+- std::vector local_objs = { +- thread_safety_obj, stateless_validation_obj, object_tracker_obj, +- core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, +- sync_validation_obj, +- }; +- for (auto obj : local_objs) { +- if (std::find(device_interceptor->object_dispatch.begin(), device_interceptor->object_dispatch.end(), obj) == +- device_interceptor->object_dispatch.end()) { +- delete obj; +- } +- } ++ auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency; ++ auto_inst_simt_efficiency_obj->InitDeviceValidationObject(enables[auto_inst_simt_efficiency], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace; ++ auto_inst_execution_trace_obj->InitDeviceValidationObject(enables[auto_inst_execution_trace], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace; ++ auto_inst_dyn_trace_ray_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_trace_ray_trace], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization; ++ auto_inst_divergence_characterization_obj->InitDeviceValidationObject(enables[auto_inst_divergence_characterization], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit; ++ auto_inst_warp_entry_and_exit_obj->InitDeviceValidationObject(enables[auto_inst_warp_entry_and_exit], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace; ++ auto_inst_dyn_shader_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_shader_trace], instance_interceptor, device_interceptor); + + for (auto intercept : instance_interceptor->object_dispatch) { + auto lock = intercept->write_lock(); +@@ -574,7 +610,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateGraphicsPipelines( + } + + auto usepCreateInfos = 
(!cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos; +- if (cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos; ++ if (cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos; ++ else if (cgpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeAutoInst].pCreateInfos; + + VkResult result = DispatchCreateGraphicsPipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines); + +@@ -610,7 +647,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateComputePipelines( + } + + auto usepCreateInfos = (!ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos; +- if (ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos; ++ if (ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos; ++ else if (ccpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeAutoInst].pCreateInfos; + + VkResult result = DispatchCreateComputePipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines); + +@@ -654,6 +692,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesNV( + pPipelines, result, &(crtpl_state[intercept->container_type])); + } + return result; ++ ++ + } + + VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesKHR( +diff --git a/layers/generated/chassis.h b/layers/generated/chassis.h +index 7f67fbe3..269aa727 100644 +--- a/layers/generated/chassis.h ++++ b/layers/generated/chassis.h +@@ -52,6 +52,12 @@ + #include "vk_safe_struct.h" + #include "vk_typemap_helper.h" + ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT 
(VkValidationFeatureEnableEXT)5 ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT (VkValidationFeatureEnableEXT)6 ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT (VkValidationFeatureEnableEXT)7 ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT (VkValidationFeatureEnableEXT)8 ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT (VkValidationFeatureEnableEXT)9 ++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT (VkValidationFeatureEnableEXT)10 + + extern std::atomic global_unique_id; + +@@ -2834,6 +2840,7 @@ enum LayerObjectTypeId { + LayerObjectTypeBestPractices, // Instance or device best practices layer object + LayerObjectTypeGpuAssisted, // Instance or device gpu assisted validation layer object + LayerObjectTypeDebugPrintf, // Instance or device shader debug printf layer object ++ LayerObjectTypeAutoInst, // Instance or device shader auto instrumentation layer object + LayerObjectTypeCommandCounter, // Command Counter validation object, child of corechecks + LayerObjectTypeSyncValidation, // Instance or device synchronization validation layer object + LayerObjectTypeMaxEnum, // Max enum count +@@ -2897,8 +2904,14 @@ typedef enum EnableFlags { + gpu_validation_reserve_binding_slot, + best_practices, + vendor_specific_arm, +- debug_printf, ++ debug_printf, + sync_validation, ++ auto_inst_simt_efficiency, ++ auto_inst_execution_trace, ++ auto_inst_dyn_trace_ray_trace, ++ auto_inst_divergence_characterization, ++ auto_inst_warp_entry_and_exit, ++ auto_inst_dyn_shader_trace, + // Insert new enables above this line + kMaxEnableFlags, + } EnableFlags; +diff --git a/layers/gpu_utils.h b/layers/gpu_utils.h +index 01197b94..e04ee285 100644 +--- a/layers/gpu_utils.h ++++ b/layers/gpu_utils.h +@@ -314,14 +314,21 @@ void UtilPostCallRecordPipelineCreations(const uint32_t count, const CreateInfo + VkShaderModule shader_module = VK_NULL_HANDLE; + if (bind_point == 
VK_PIPELINE_BIND_POINT_GRAPHICS) { + shader_module = pipeline_state->graphicsPipelineCI.pStages[stage].module; ++ object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage = ++ pipeline_state->graphicsPipelineCI.pStages[stage].stage; + } else if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) { + assert(stage == 0); + shader_module = pipeline_state->computePipelineCI.stage.module; ++ object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage = ++ pipeline_state->computePipelineCI.stage.stage; + } else if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) { + shader_module = pipeline_state->raytracingPipelineCI.pStages[stage].module; ++ object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage = ++ pipeline_state->raytracingPipelineCI.pStages[stage].stage; + } else { + assert(false); + } ++ + object_ptr->shader_map[shader_state->gpu_validation_shader_id].shader_module = shader_module; + object_ptr->shader_map[shader_state->gpu_validation_shader_id].pgm = std::move(code); + } +diff --git a/layers/gpu_validation.h b/layers/gpu_validation.h +index 706d3fb7..b48b84ab 100644 +--- a/layers/gpu_validation.h ++++ b/layers/gpu_validation.h +@@ -54,6 +54,7 @@ struct GpuAssistedShaderTracker { + VkPipeline pipeline; + VkShaderModule shader_module; + std::vector pgm; ++ VkShaderStageFlagBits stage; + }; + + struct GpuAssistedAccelerationStructureBuildValidationBufferInfo { +@@ -149,7 +150,8 @@ class GpuAssisted : public ValidationStateTracker { + VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask, + uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers, +- uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier* pImageMemoryBarriers) const override; ++ uint32_t imageMemoryBarrierCount, ++ const VkImageMemoryBarrier* pImageMemoryBarriers) const override; + void PreCallRecordCreateBuffer(VkDevice device, const 
VkBufferCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator, + VkBuffer* pBuffer, void* cb_state_data) override; + void CreateAccelerationStructureBuildValidationState(GpuAssisted* device_GpuAssisted); +diff --git a/layers/layer_options.cpp b/layers/layer_options.cpp +index 3c6f5dfe..0b6ec389 100644 +--- a/layers/layer_options.cpp ++++ b/layers/layer_options.cpp +@@ -92,6 +92,19 @@ void SetValidationEnable(CHECK_ENABLED &enable_data, const ValidationCheckEnable + + // Set the local enable flag for a single VK_VALIDATION_FEATURE_ENABLE_* flag + void SetValidationFeatureEnable(CHECK_ENABLED &enable_data, const VkValidationFeatureEnableEXT feature_enable) { ++ if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT) { ++ enable_data[auto_inst_simt_efficiency] = true; ++ } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT) { ++ enable_data[auto_inst_execution_trace] = true; ++ } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT) { ++ enable_data[auto_inst_dyn_trace_ray_trace] = true; ++ } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT) { ++ enable_data[auto_inst_divergence_characterization] = true; ++ } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT) { ++ enable_data[auto_inst_warp_entry_and_exit] = true; ++ } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT) { ++ enable_data[auto_inst_dyn_shader_trace] = true; ++ } + switch (feature_enable) { + case VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT: + enable_data[gpu_validation] = true; +diff --git a/layers/layer_options.h b/layers/layer_options.h +index 861b9abe..d73768cd 100644 +--- a/layers/layer_options.h ++++ b/layers/layer_options.h +@@ -45,11 +45,16 @@ static const std::unordered_map VkVa + + static const std::unordered_map VkValFeatureEnableLookup = { + 
{"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT", VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT}, +- {"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT", +- VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT", VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT}, + {"VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT}, + {"VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT}, + {"VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT", VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT}, ++ {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT}, + }; + + static const std::unordered_map VkValFeatureEnableLookup2 = { +@@ -93,7 +98,13 @@ static const std::vector EnableFlagNameHelper = { + "VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", // best_practices, + "VALIDATION_CHECK_ENABLE_VENDOR_SPECIFIC_ARM", // vendor_specific_arm, + "VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", // debug_printf, +- "VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION" // sync_validation, ++ 
"VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION", // sync_validation, ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", // auto_inst_simt_efficiency ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", // auto_inst_execution_trace ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", // auto_inst_dyn_trace_ray_trace ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", // auto_inst_divergence_characterization ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", // auto_inst_warp_entry_and_exit ++ "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", // auto_inst_dyn_shader_trace + }; +- ++ + void ProcessConfigAndEnvSettings(ConfigAndEnvSettings *settings_data); +diff --git a/scripts/known_good.json b/scripts/known_good.json +index 9a53e452..b4fef368 100755 +--- a/scripts/known_good.json ++++ b/scripts/known_good.json +@@ -1,6 +1,7 @@ + { +- "repos" : [ ++ "repos": [ + { ++ + "name" : "glslang", + "url" : "https://github.com/KhronosGroup/glslang.git", + "sub_dir" : "glslang", +@@ -10,7 +11,7 @@ + "prebuild" : [ + "python update_glslang_sources.py" + ], +- "cmake_options" : [ ++ "cmake_options": [ + "-DUSE_CCACHE=ON" + ] + }, +@@ -28,12 +29,22 @@ + "sub_dir": "SPIRV-Headers", + "build_dir": "SPIRV-Headers/build", + "install_dir": "SPIRV-Headers/build/install", ++ + "commit": "f027d53ded7e230e008d37c8b47ede7cd308e19d" ++ }, ++ { ++ "name": "SPIRV-Cross", ++ "url": "https://github.com/KhronosGroup/SPIRV-Cross.git", ++ "sub_dir": "spirv-cross", ++ "build_dir": "spirv-cross/build", ++ "install_dir": "spirv-cross/build/install", ++ "commit": "e50f7d1ce8e162d0c826e84168cfa234e4de4ec9" + } + ], + "install_names" : { + "glslang" : "GLSLANG_INSTALL_DIR", + "Vulkan-Headers" : "VULKAN_HEADERS_INSTALL_DIR", +- "SPIRV-Headers" : "SPIRV_HEADERS_INSTALL_DIR" ++ "SPIRV-Headers": "SPIRV_HEADERS_INSTALL_DIR", ++ "SPIRV-Cross" : "SPIRV_CROSS_INSTALL_DIR" + } + } +diff --git 
a/scripts/layer_chassis_generator.py b/scripts/layer_chassis_generator.py
+index 8c4a4c4d..f53055c7 100644
+--- a/scripts/layer_chassis_generator.py
++++ b/scripts/layer_chassis_generator.py
+@@ -241,6 +241,13 @@ class LayerChassisOutputGenerator(OutputGenerator):
+ #include "vk_safe_struct.h"
+ #include "vk_typemap_helper.h"
+
++// Define here as a placeholder during development.
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT (VkValidationFeatureEnableEXT)5
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT (VkValidationFeatureEnableEXT)6
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT (VkValidationFeatureEnableEXT)7
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT (VkValidationFeatureEnableEXT)8
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT (VkValidationFeatureEnableEXT)9
++#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT (VkValidationFeatureEnableEXT)10
+
+ extern std::atomic global_unique_id;
+
+@@ -343,6 +350,12 @@ typedef enum EnableFlags {
+ vendor_specific_arm,
+ debug_printf,
+ sync_validation,
++ auto_inst_simt_efficiency,
++ auto_inst_execution_trace,
++ auto_inst_dyn_trace_ray_trace,
++ auto_inst_divergence_characterization,
++ auto_inst_warp_entry_and_exit,
++ auto_inst_dyn_shader_trace,
+ // Insert new enables above this line
+ kMaxEnableFlags,
+ } EnableFlags;
+@@ -685,6 +698,12 @@ bool wrap_handles = true;
+ #include "gpu_validation.h"
+ #include "object_lifetime_validation.h"
+ #include "debug_printf.h"
++#include "auto_inst_dyn_shader_trace.h"
++#include "auto_inst_dyn_trace_ray_trace.h"
++#include "auto_inst_execution_trace.h"
++#include "auto_inst_simt_efficiency.h"
++#include "auto_inst_divergence_characterization.h"
++#include "auto_inst_warp_entry_and_exit.h"
+ #include "stateless_validation.h"
+ #include "synchronization_validation.h"
+ #include "thread_safety.h"
+@@ -941,6 +960,24 @@ VKAPI_ATTR
VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + auto sync_validation_obj = new SyncValidator; + sync_validation_obj->RegisterValidationObject(local_enables[sync_validation], api_version, report_data, local_object_dispatch); + ++ auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency; ++ auto_inst_simt_efficiency_obj->RegisterValidationObject(local_enables[auto_inst_simt_efficiency], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace; ++ auto_inst_execution_trace_obj->RegisterValidationObject(local_enables[auto_inst_execution_trace], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace; ++ auto_inst_dyn_trace_ray_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_trace_ray_trace], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization; ++ auto_inst_divergence_characterization_obj->RegisterValidationObject(local_enables[auto_inst_divergence_characterization], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit; ++ auto_inst_warp_entry_and_exit_obj->RegisterValidationObject(local_enables[auto_inst_warp_entry_and_exit], api_version, report_data, local_object_dispatch); ++ ++ auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace; ++ auto_inst_dyn_shader_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_shader_trace], api_version, report_data, local_object_dispatch); ++ + // If handle wrapping is disabled via the ValidationFeatures extension, override build flag + if (local_disables[handle_wrapping]) { + wrap_handles = false; +@@ -986,7 +1023,13 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat + gpu_assisted_obj->FinalizeInstanceValidationObject(framework); + 
debug_printf_obj->FinalizeInstanceValidationObject(framework); + sync_validation_obj->FinalizeInstanceValidationObject(framework); +- ++ auto_inst_simt_efficiency_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_execution_trace_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_dyn_trace_ray_trace_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_divergence_characterization_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_warp_entry_and_exit_obj->FinalizeInstanceValidationObject(framework); ++ auto_inst_dyn_shader_trace_obj->FinalizeInstanceValidationObject(framework); ++ + for (auto intercept : framework->object_dispatch) { + auto lock = intercept->write_lock(); + intercept->PostCallRecordCreateInstance(pCreateInfo, pAllocator, pInstance, result); +@@ -1132,11 +1175,30 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(VkPhysicalDevice gpu, const VkDevice + auto sync_validation_obj = new SyncValidator; + sync_validation_obj->InitDeviceValidationObject(enables[sync_validation], instance_interceptor, device_interceptor); + ++ auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency; ++ auto_inst_simt_efficiency_obj->InitDeviceValidationObject(enables[auto_inst_simt_efficiency], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace; ++ auto_inst_execution_trace_obj->InitDeviceValidationObject(enables[auto_inst_execution_trace], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace; ++ auto_inst_dyn_trace_ray_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_trace_ray_trace], instance_interceptor, device_interceptor); ++ ++ auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization; ++ auto_inst_divergence_characterization_obj->InitDeviceValidationObject(enables[auto_inst_divergence_characterization], instance_interceptor, device_interceptor); ++ ++ 
auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
++ auto_inst_warp_entry_and_exit_obj->InitDeviceValidationObject(enables[auto_inst_warp_entry_and_exit], instance_interceptor, device_interceptor);
++
++ auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
++ auto_inst_dyn_shader_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_shader_trace], instance_interceptor, device_interceptor);
++
+ // Delete unused validation objects to avoid memory leak.
+- std::vector local_objs = {
+- thread_safety_obj, stateless_validation_obj, object_tracker_obj,
+- core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj,
+- sync_validation_obj,
++ std::vector local_objs = {
++ thread_safety_obj, stateless_validation_obj, object_tracker_obj,
++ core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, sync_validation_obj,
++ auto_inst_simt_efficiency_obj, auto_inst_execution_trace_obj, auto_inst_dyn_trace_ray_trace_obj,
++ auto_inst_divergence_characterization_obj, auto_inst_warp_entry_and_exit_obj, auto_inst_dyn_shader_trace_obj
+ };
+ for (auto obj : local_objs) {
+ if (std::find(device_interceptor->object_dispatch.begin(), device_interceptor->object_dispatch.end(), obj) ==
+--
+2.29.2.windows.2
+
diff --git a/ecosystem_tools/VulkanVision/vv-patches/vvision-vv.diff b/ecosystem_tools/VulkanVision/vv-patches/vvision-vv.diff
new file mode 100644
index 00000000..2caedf95
--- /dev/null
+++ b/ecosystem_tools/VulkanVision/vv-patches/vvision-vv.diff
@@ -0,0 +1 @@
+0001-layers-Added-auto-inst-layers.patch