|
- From 94f9a7c9ea98fa400d47b0a71efbea8792113b08 Mon Sep 17 00:00:00 2001
- From: dpankratz <pankratz@ualberta.ca>
- Date: Wed, 27 Jan 2021 09:17:26 -0700
- Subject: [PATCH] layers: Added auto-inst layers
-
- ---
- CMakeLists.txt | 27 +-
- build-android/jni/Android.mk | 7 +
- build-android/known_good.json | 2 +-
- docs/auto_instrument.md | 213 +++
- layers/CMakeLists.txt | 25 +-
- layers/auto_inst.cpp | 1205 +++++++++++++++++
- layers/auto_inst.h | 465 +++++++
- .../auto_inst_divergence_characterization.cpp | 157 +++
- .../auto_inst_divergence_characterization.h | 48 +
- layers/auto_inst_dyn_shader_trace.cpp | 177 +++
- layers/auto_inst_dyn_shader_trace.h | 44 +
- layers/auto_inst_dyn_trace_ray_trace.cpp | 223 +++
- layers/auto_inst_dyn_trace_ray_trace.h | 55 +
- layers/auto_inst_execution_trace.cpp | 174 +++
- layers/auto_inst_execution_trace.h | 56 +
- layers/auto_inst_simt_efficiency.cpp | 67 +
- layers/auto_inst_simt_efficiency.h | 56 +
- layers/auto_inst_warp_entry_and_exit.cpp | 61 +
- layers/auto_inst_warp_entry_and_exit.h | 52 +
- layers/debug_printf.cpp | 2 +-
- layers/debug_printf.h | 1 +
- layers/generated/chassis.cpp | 78 +-
- layers/generated/chassis.h | 15 +-
- layers/gpu_utils.h | 7 +
- layers/gpu_validation.h | 4 +-
- layers/layer_options.cpp | 13 +
- layers/layer_options.h | 19 +-
- scripts/known_good.json | 17 +-
- scripts/layer_chassis_generator.py | 72 +-
- 29 files changed, 3305 insertions(+), 37 deletions(-)
- create mode 100644 docs/auto_instrument.md
- create mode 100644 layers/auto_inst.cpp
- create mode 100644 layers/auto_inst.h
- create mode 100644 layers/auto_inst_divergence_characterization.cpp
- create mode 100644 layers/auto_inst_divergence_characterization.h
- create mode 100644 layers/auto_inst_dyn_shader_trace.cpp
- create mode 100644 layers/auto_inst_dyn_shader_trace.h
- create mode 100644 layers/auto_inst_dyn_trace_ray_trace.cpp
- create mode 100644 layers/auto_inst_dyn_trace_ray_trace.h
- create mode 100644 layers/auto_inst_execution_trace.cpp
- create mode 100644 layers/auto_inst_execution_trace.h
- create mode 100644 layers/auto_inst_simt_efficiency.cpp
- create mode 100644 layers/auto_inst_simt_efficiency.h
- create mode 100644 layers/auto_inst_warp_entry_and_exit.cpp
- create mode 100644 layers/auto_inst_warp_entry_and_exit.h
-
- diff --git a/CMakeLists.txt b/CMakeLists.txt
- index 994a9ef0..da32e3b0 100644
- --- a/CMakeLists.txt
- +++ b/CMakeLists.txt
- @@ -227,6 +227,8 @@ if(BUILD_TESTS OR BUILD_LAYERS)
- if (NOT TARGET SPIRV-Tools)
- if(NOT SPIRV_TOOLS_INSTALL_DIR)
- set(SPIRV_TOOLS_INSTALL_DIR "${GLSLANG_INSTALL_DIR}")
- + else()
- + message(STATUS "Using Spirv-Tools install located at ${SPIRV_TOOLS_INSTALL_DIR}")
- endif()
-
- set(SPIRV_TOOLS_BINARY_ROOT "${SPIRV_TOOLS_INSTALL_DIR}/lib"
- @@ -240,9 +242,12 @@ if(BUILD_TESTS OR BUILD_LAYERS)
- set(SPIRV_TOOLS_OPT_DEBUG_SEARCH_PATH "${SPIRV_TOOLS_INSTALL_DIR}/lib")
-
- find_library(SPIRV_TOOLS_LIB NAMES SPIRV-Tools HINTS ${SPIRV_TOOLS_SEARCH_PATH})
- + if (NOT SPIRV_TOOLS_LIB)
- + message(FATAL_ERROR "Could not find SPIRV-Tools")
- + endif()
- find_library(SPIRV_TOOLS_OPT_LIB NAMES SPIRV-Tools-opt HINTS ${SPIRV_TOOLS_OPT_SEARCH_PATH})
-
- - if(WIN32)
- + if(WIN32 AND NOT SPIRV_TOOLS_LIB)
- add_library(SPIRV-Tools-opt STATIC IMPORTED)
- add_library(SPIRV-Tools STATIC IMPORTED)
-
- @@ -269,6 +274,26 @@ if(BUILD_TESTS OR BUILD_LAYERS)
- set(SPIRV_TOOLS_INCLUDE_DIR "${spirv-tools_SOURCE_DIR}/include" CACHE PATH "Path to spirv tools headers")
- endif()
-
- + if (NOT TARGET SPIRV-Cross)
- + if (SPIRV_CROSS_INSTALL_DIR)
- + message(STATUS "Using spirv-cross install located at ${SPIRV_CROSS_INSTALL_DIR}")
- + endif()
- + set(SPIRV_CROSS_INCLUDE_DIR "${SPIRV_CROSS_INSTALL_DIR}/include" CACHE PATH "Path to spirv cross headers")
- + set(SPIRV_CROSS_SEARCH_PATH ${SPIRV_CROSS_INSTALL_DIR}/lib)
- + find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsl HINTS ${SPIRV_CROSS_SEARCH_PATH})
- + find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-core HINTS ${SPIRV_CROSS_SEARCH_PATH})
- + if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB)
- + find_library(SPIRV_CROSS_GLSL_LIB NAMES spirv-cross-glsld HINTS ${SPIRV_CROSS_SEARCH_PATH})
- + find_library(SPIRV_CROSS_CORE_LIB NAMES spirv-cross-cored HINTS ${SPIRV_CROSS_SEARCH_PATH})
- + if (NOT SPIRV_CROSS_GLSL_LIB OR NOT SPIRV_CROSS_CORE_LIB)
- + message(FATAL_ERROR "Could not find spirv-cross libs!")
- + else()
- + message("WARNING: using debug config of SPIRV-Cross libs. Use <--config release> option of update_deps.py to fix.")
- + endif()
- + endif()
- + set(SPIRV_CROSS_LIBRARIES ${SPIRV_CROSS_GLSL_LIB} ${SPIRV_CROSS_CORE_LIB})
- + endif()
- +
- set(GLSLANG_LIBRARIES ${GLSLANG_LIBRARIES} ${SPIRV_TOOLS_LIBRARIES})
- endif()
-
- diff --git a/build-android/jni/Android.mk b/build-android/jni/Android.mk
- index f0955dd5..75c49def 100644
- --- a/build-android/jni/Android.mk
- +++ b/build-android/jni/Android.mk
- @@ -45,6 +45,13 @@ LOCAL_SRC_FILES += $(SRC_DIR)/layers/generated/spirv_validation_helper.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_validation.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/gpu_utils.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/debug_printf.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_shader_trace.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_dyn_trace_ray_trace.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_execution_trace.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_simt_efficiency.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_divergence_characterization.cpp
- +LOCAL_SRC_FILES += $(SRC_DIR)/layers/auto_inst_warp_entry_and_exit.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/best_practices_utils.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/generated/best_practices.cpp
- LOCAL_SRC_FILES += $(SRC_DIR)/layers/synchronization_validation.cpp
- diff --git a/build-android/known_good.json b/build-android/known_good.json
- index 1a77e5ae..d8635c37 100755
- --- a/build-android/known_good.json
- +++ b/build-android/known_good.json
- @@ -1,5 +1,5 @@
- {
- - "repos" : [
- + "repos": [
- {
- "name" : "shaderc",
- "url" : "https://github.com/google/shaderc.git",
- diff --git a/docs/auto_instrument.md b/docs/auto_instrument.md
- new file mode 100644
- index 00000000..30d376ce
- --- /dev/null
- +++ b/docs/auto_instrument.md
- @@ -0,0 +1,213 @@
- +<!-- markdownlint-disable MD041 -->
- +
- +[![Khronos Vulkan][1]][2]
- +
- +[1]: https://vulkan.lunarg.com/img/Vulkan_100px_Dec16.png "https://www.khronos.org/vulkan/"
- +[2]: https://www.khronos.org/vulkan/
- +
- +# Auto-Instrument
- +
- +[![Creative Commons][3]][4]
- +
- +[3]: https://i.creativecommons.org/l/by-nd/4.0/88x31.png "Creative Commons License"
- +[4]: https://creativecommons.org/licenses/by-nd/4.0/
- +
- +Auto-Instrument is implemented in the SPIR-V Tools optimizer and the `VK_LAYER_KHRONOS_validation` layer.
- +It provides boilerplate for developers to implement custom instrumentation and analyses.
- +This document covers the operation of the layer portion of the implementation and then describes the specific sublayers that perform auto-instrumentation.
- +
- +## Limitations
- +
- +Auto-Instrument shares the same limitation as Debug Printf and GPU-Assisted Validation: it requires an additional bound descriptor set. Currently, Auto-Instrument allows only one class of pipeline to be instrumented at a time (i.e. one of graphics, compute, or ray-tracing).
- +
- +## Basic Operation
- +
- +The basic operation of Auto-Instrument is to offer the following hooks for subclasses to perform a custom analysis:
- +* **InitializeDeviceLayerSettings** provides the opportunity to set layer settings when the Vulkan logical device is created. This is useful to check for active extensions or to check `vk_layer_settings.txt` for sublayer-specific settings.
- +* **InitializeInstrumentationBuffer** provides the opportunity for subclasses to change the default values in the instrumentation buffer. This is useful for communicating with the instrumentation code. For example, the instrumentation could support a sampling-based approach where not all frames collect results: the instrumentation could contain a check for a specific location in the instrumentation buffer, and this function would allow the subclass to populate that location.
- +* **RegisterPasses** allows the specific subclass to choose which auto-instrumentation pass from SPIRV-Opt to use.
- +* **AnalyzeRayTracing** is provided with the data collected from a ray-tracing pipeline invocation as well as the width, height, and depth of the invocation.
- +* **AnalyzeGraphics** receives the data from a graphics pipeline invocation.
- +* **AnalyzeCompute** receives the data from a compute pipeline invocation and the x, y, and z dimensions of the invocation.
- +
- +By overriding these functions, a subclass is able to implement custom analyses of instrumentation data.
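- +
- +As a rough illustration, a custom sublayer might look like the sketch below. This is a hypothetical outline only: the real hook signatures are declared in `layers/auto_inst.h` and may differ from what is assumed here.
- +
- +```
- +// Hypothetical sketch of an Auto-Instrument sublayer; the signatures shown are
- +// assumptions, see layers/auto_inst.h for the real interface.
- +class MyCustomAnalysis : public AutoInst {
- +    // Choose which SPIRV-Opt auto-instrumentation pass runs on each shader.
- +    void RegisterPasses(spvtools::Optimizer *optimizer, uint32_t desc_set_bind_index, uint32_t shader_id) override {
- +        // Register the desired auto-instrumentation pass here (pass factory names are assumptions).
- +    }
- +
- +    // Consume the instrumentation buffer returned by a ray-tracing invocation.
- +    void AnalyzeRayTracing(uint32_t *data, uint32_t width, uint32_t height, uint32_t depth) override {
- +        // Walk the per-call records in `data` and write the analysis output files.
- +    }
- +};
- +```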
- +
- +## Enabling Auto-Instrument in Vulkan-ValidationLayers
- +
- +Auto-Instrument is an object in the KHRONOS_validation layer, so the VK_LAYER_KHRONOS_validation layer must be loaded.
- +See the LAYER_CONFIGURATION document for information on enabling the VK_LAYER_KHRONOS_validation layer.
- +Validation itself is not necessary for Auto-Instrument and can be disabled without affecting Auto-Instrument functionality.
- +
- +Auto-Instrument can be enabled through a *vk_layer_settings.txt* file in the program's working directory.
- +Within the settings file, specify:
- +`khronos_validation.enables = VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_<specific analysis>_EXT`, where `<specific analysis>` is one of the Auto-Instrument subclasses described below.
- +
- +Auto-Instrument has been implemented as a state tracker validation object, as a peer to GPU Assisted Validation and Debug Printf.
- +Because of this, and coupled with the philosophy that validation objects will not communicate with each other, one should never enable any two of Auto-Instrument, GPU-Assisted Validation, and Debug Printf at the same time.
- +Auto-Instrument will be disabled if GPU Assisted Validation or Debug Printf is enabled.
- +
- +When using Auto-Instrument, it is recommended to disable validation, as the debug level of INFO or DEBUG causes the validation layers to produce many messages unrelated to Auto-Instrument, making it difficult to find the desired output.
- +
- +### Auto-Instrument Requirements
- +
- +* Validation Layers version: 1.2.135.0
- +* Vulkan API version 1.1 or greater
- +* VkPhysicalDevice features: fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics
- +
- +### Auto-Instrument Settings
- +
- +* `khronos_validation.auto_inst_buffer_size` = `<size in bytes>`
- +
- +This setting allows you to specify the size of the per-call buffer, in bytes of device memory, for returning instrumentation data.
- +The default is 1024 bytes. If the buffer size is too small, Auto-Instrument will report the size that the buffer should be to collect all the instrumentation data. In subsequent executions, the history file will be used to size the instrumentation buffer.
- +
- +* `khronos_validation.auto_inst_pipeline_to_instrument` = `Graphics` or `Compute` or `RayTracing`
- +
- +This setting controls the pipeline type that is instrumented. For example, if `Compute` is chosen, then compute shaders are instrumented and instrumentation buffers are created for every compute pipeline.
- +
- +* `khronos_validation.auto_inst_to_stdout` = `false` or `true`
- +
- +By default, Auto-Instrument messages are sent to stdout; setting this to `false` sends Auto-Instrument output to the debug callback instead.
- +
- +* `khronos_validation.auto_inst_base_file_name` = `<base name>`
- +
- +Auto-Instrument analysis file names can optionally have a base file name prepended. By default there is no common prefix.
- +
- +* `khronos_validation.auto_inst_create_reference_heatmap` = `false` or `true`
- +
- +Many of the analyses emit a heatmap. This option allows a reference scale **ReferenceScale\[.bmp/.ppm\]** to be generated, where the leftmost pixels correspond to 0.0, the rightmost to 1.0, and the pixels in between are interpolated between 0.0 and 1.0.
- +
- +* `khronos_validation.auto_inst_debug_mode` = `atomics` or `subgroup` or `arraylength`
- +
- +This is a debug setting designed to help isolate any issues that may be occurring with auto-instrumentation. When present, this setting switches the operation of Auto-Instrument to disregard the current instrumentation mode. Instead, it runs an extremely simple instrumentation pass and analysis to check whether the atomic, subgroup, or arraylength instructions work correctly in isolation.
- +
- +* `khronos_validation.auto_inst_dump_shaders` = `false` or `true`
- +
- +When set to true, this setting instructs Auto-Instrument to dump the instrumented shader modules.
- +
- +* `khronos_validation.auto_inst_shaders_to_instrument` = `stageM, stageN, ...` E.g. `Miss1, ClosestHit2`
- +
- +By default, all shaders that correspond to the `khronos_validation.auto_inst_pipeline_to_instrument` setting are instrumented. When this setting is activated, only the shaders that match the specified stage and index are instrumented. For example, if `Miss1, ClosestHit2` is passed, then the 1st Miss shader that passes through `vkCreateShaderModule` will be instrumented, and similarly the 2nd Closest Hit shader.
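- +
- +Putting these settings together, a minimal *vk_layer_settings.txt* that enables the SIMT Efficiency analysis for the compute pipeline might look like the following (the values shown are illustrative only):
- +
- +```
- +khronos_validation.enables = VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT
- +khronos_validation.auto_inst_pipeline_to_instrument = Compute
- +khronos_validation.auto_inst_buffer_size = 4096
- +khronos_validation.auto_inst_base_file_name = myapp_
- +```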
- +
- +### Auto-Instrument Resources
- +
- +Analogous to GPU Assisted Validation and Debug Printf, Auto-Instrument uses device memory and a descriptor set to allow the shader instrumentation code to return values to the layer.
- +See the gpu_validation document for more information.
- +
- +Auto-Instrument also generates a file containing the runtime instrumentation buffer utilization of previous executions of the application. In subsequent executions, the instrumentation buffers are sized according to this history. This allows many more calls to be instrumented in cases where some calls generate significantly more data.
- +
- +Auto-Instrument analyses generate output files of the form `<base file name><pipeline type><pipeline invocation index>_frame<frame number>_<analysis specific suffix>`. For example `rt0_frame0_simt_efficiency.csv` is the SIMT Efficiency measurement for the 1st ray-tracing call in the 1st frame of the application.
- +
- +### Auto-Instrument Subclasses
- +
- +This section outlines the classes that implement the Auto-Instrument interface to provide detailed execution-trace profiling.
- +
- +### Limitations
- +
- +Many of the analyses require tracking warp execution over time. Obtaining knowledge of which threads belong to a warp requires digging below the abstraction level of SPIR-V and a custom solution for each of the graphics, compute, and ray-tracing pipelines. Currently this is only implemented for ray-tracing pipelines, so many of the analyses are limited for graphics and compute.
- +
- +Some applications invoke the ray-tracing pipeline with a z-dimension greater than 1, which complicates the creation of heatmaps. In the future, more modes for transforming these higher-dimensional calls into heatmaps will be added and exposed as options.
- +
- +## Divergence Characterization
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT` and is currently supported only for Ray Tracing.
- +
- +### Analysis
- +
- +This analysis breaks down the effect of indirect-function calls, early thread-exits, and control-flow on divergence in terms of the number of instructions affected:
- +* **Indirect-function call** divergence is caused by thread-varying values for the address passed to an indirect function call. This occurs frequently in ray-tracing when the threads in a warp hit multiple objects. This metric is currently only supported for ray-tracing.
- +* **Early thread-exit** divergence occurs when some threads in a warp complete the pipeline while other threads still have work to perform. This also occurs frequently in ray-tracing when some rays miss geometry and others bounce many times, requiring many traversals and intersections.
- +* **Control-flow** divergence is caused by, for example, thread-varying values for if statements or thread-varying loop trip counts.
- +
- +### Output
- +
- +This analysis outputs a file **divergence_characterization.csv** which contains, for each pipeline invocation, the counts of inactive instruction-execution slots attributed to each divergence cause.
- +
- +## Dynamic Shader Trace
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT` and is currently supported only for Ray Tracing.
- +
- +### Analysis
- +
- +This analysis reconstructs the number of dynamic shader executions for each pipeline invocation. These values are visualized with thread and warp heatmaps.
- +
- +### Output
- +
- +* **dyn_shader_counts.csv** is generated per pipeline invocation and contains the shader execution counts observed at runtime.
- +* **shader_execution_heatmap\[.bmp/.ppm\]** is generated per pipeline invocation and contains a heatmap visualizing each pixel's dynamic shader execution count normalized to the maximum shader execution count.
- +* **subgroup_shader_execution_heatmap\[.bmp/.ppm\]** is generated per pipeline invocation and contains a heatmap visualizing each subgroup's dynamic shader execution count normalized to the maximum shader execution count for a single subgroup.
- +
- +## Dynamic TraceRay Trace
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT` and is currently supported only for Ray Tracing.
- +
- +### Analysis
- +
- +This analysis simulates the effect of thread compaction on the execution of a ray-tracing pipeline. Given a particular runtime traceRay invocation, the active threads are repacked into warps and then the number of warp executions required to perform the new traceRay calls is calculated. The analysis further simulates repacking only consecutive `2^k` warps to model different hardware buffer sizes.
- +
- +### Output
- +
- +* **thread_paths.csv** is generated per pipeline and contains bitmasks representing whether a given thread was active for a runtime invocation of a TraceRay call. For example, if a TraceRay call is contained in an if statement, some threads would have 0 to indicate they skipped the call. For each unique bitmask, the total number of threads that took the same path is recorded.
- +* **thread_compaction.csv** is generated per pipeline and contains the results of performing thread compaction. The data is output as follows:
- + ```
- + for each TraceRay callsite:
- + for each compaction window size:
- + for each runtime execution:
- + output active threads/threads required after compaction
- + ```
- +
- + For example:
- + ```
- + 354|
- + 1 0 896173/911296
- + ```
- + For callsite with id=354, compaction window size=1, visit count = 0, the number of active threads is 896173 and the number of required threads after compaction is 911296.
- +
- +## Execution Trace
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT`.
- +
- +### Analysis
- +
- +This analysis determines the pipeline hotspots and the dynamic SPIR-V instruction execution counts. This is performed by tracking each runtime execution of a basic block along with the number of active threads.
- +
- +### Output
- +
- +* **dyn_opcode_counts.csv** is generated per pipeline invocation and contains the respective runtime execution counts for each SPIR-V opcode.
- +* **shader_stage_dyn_executions.glsl** is generated per pipeline invocation for each shader in the pipeline. These files present the dynamic instruction execution count of each basic block as inline comments as follows:
- + ```
- + if (gl_LaunchIDNV.z != 0u)
- + {
- + /*thread_executions=460800. SIMT Efficiency=1.000*/
- + _1509(2416u, subgroupBallot(true).x);
- + ipos.x += (_265.global_ubo.width / 2);
- + }
- + /*thread_executions=921600. SIMT Efficiency=1.000*/
- + _1509(2426u, subgroupBallot(true).x);
- + ```
- + The function `_1509` is the instrumentation inserted to capture the execution trace. The comment that immediately precedes it corresponds to the data collected from that instrumentation callsite. The first argument to `_1509` (in this example, `2416` and `2426`) is the unique id of the basic block.
- +* **hotspots.csv** is generated per pipeline invocation and contains the dynamic execution count of each instruction id in the pipeline. The first argument passed to the instrumentation in the annotated shaders is the instruction id. This can be searched for in the hotspots file or vice-versa.
- +
- +## SIMT Efficiency
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT`.
- +
- +### Analysis
- +
- +This analysis computes the SIMT efficiency, which is a measure of the utilization of a SIMD architecture. In this case it is calculated as the average fraction of active threads for each basic block execution.
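- +
- +Concretely, for a pipeline invocation with `N` recorded basic block executions this corresponds to the following (a sketch assuming a fixed subgroup/warp size):
- +
- +```
- +SIMT efficiency = (1 / N) * sum_i (active_threads_i / subgroup_size)
- +```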
- +
- +### Output
- +
- +* **simt_efficiency.csv** is generated per frame with the respective SIMT efficiencies of each pipeline invocation.
- +
- +## Warp Entry and Exit
- +
- +This analysis can be activated using `VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT`.
- +
- +### Analysis
- +
- +This analysis counts the number of times the entry and exit of the ray-tracing pipeline are executed. This is designed to measure the effect of independent thread scheduling (ITS) on NVIDIA's Turing and Ampere architectures. This instrumentation measures to what degree the warp has diverged due to ITS.
- +
- +### Output
- +
- +* **warp_exits_vs_entires.csv** is generated per frame with the respective exit count/entry count for each pipeline invocation.
- diff --git a/layers/CMakeLists.txt b/layers/CMakeLists.txt
- index 851b98d5..93758bab 100644
- --- a/layers/CMakeLists.txt
- +++ b/layers/CMakeLists.txt
- @@ -234,9 +234,27 @@ set(GPU_UTILITY_LIBRARY_FILES
- gpu_utils.cpp
- gpu_utils.h)
-
- +set(AUTO_INST_LIBRARY_FILES
- + auto_inst.cpp
- + auto_inst_divergence_characterization.cpp
- + auto_inst_dyn_shader_trace.cpp
- + auto_inst_dyn_trace_ray_trace.cpp
- + auto_inst_execution_trace.cpp
- + auto_inst_simt_efficiency.cpp
- + auto_inst_warp_entry_and_exit.cpp
- + auto_inst.h
- + auto_inst_divergence_characterization.h
- + auto_inst_dyn_shader_trace.h
- + auto_inst_dyn_trace_ray_trace.h
- + auto_inst_execution_trace.h
- + auto_inst_simt_efficiency.h
- + auto_inst_warp_entry_and_exit.h
- + )
- +
- set(SYNC_VALIDATION_LIBRARY_FILES
- synchronization_validation.cpp
- - synchronization_validation.h)
- + synchronization_validation.h
- + )
-
- # Validation Layer performance instrumentation support using Optick.
- # https://optick.dev/ https://github.com/bombomby/optick
- @@ -268,6 +286,7 @@ if(INSTRUMENT_OPTICK)
- endif()
- endif()
-
- +
- if(BUILD_LAYERS)
- AddVkLayer(khronos_validation "${KHRONOS_LAYER_COMPILE_DEFINITIONS}"
- ${CHASSIS_LIBRARY_FILES}
- @@ -279,6 +298,7 @@ if(BUILD_LAYERS)
- ${GPU_UTILITY_LIBRARY_FILES}
- ${GPU_ASSISTED_LIBRARY_FILES}
- ${DEBUG_PRINTF_LIBRARY_FILES}
- + ${AUTO_INST_LIBRARY_FILES}
- ${SYNC_VALIDATION_LIBRARY_FILES}
- ${OPTICK_SOURCE_FILES})
-
- @@ -290,14 +310,17 @@ if(BUILD_LAYERS)
- target_link_libraries(VkLayer_khronos_validation PRIVATE "$<$<CONFIG:Release>:-DEBUG:FULL>")
- endif()
-
- +
- # Khronos validation additional dependencies
- target_include_directories(VkLayer_khronos_validation PRIVATE ${GLSLANG_SPIRV_INCLUDE_DIR})
- target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_INCLUDE_DIR})
- + target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_INCLUDE_DIR})
- target_include_directories(VkLayer_khronos_validation PRIVATE ${SPIRV_HEADERS_INCLUDE_DIR})
- if(INSTRUMENT_OPTICK)
- target_include_directories(VkLayer_khronos_validation PRIVATE ${OPTICK_SOURCE_DIR})
- endif()
- target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_TOOLS_LIBRARIES})
- + target_link_libraries(VkLayer_khronos_validation PRIVATE ${SPIRV_CROSS_LIBRARIES})
-
- # The output file needs Unix "/" separators or Windows "\" separators On top of that, Windows separators actually need to be doubled
- # because the json format uses backslash escapes
- diff --git a/layers/auto_inst.cpp b/layers/auto_inst.cpp
- new file mode 100644
- index 00000000..8f3669d3
- --- /dev/null
- +++ b/layers/auto_inst.cpp
- @@ -0,0 +1,1205 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst.h"
- +#include "spirv-tools/optimizer.hpp"
- +#include "spirv-tools/instrument.hpp"
- +#if !defined(__ANDROID__)
- +#include "spirv_cross/spirv_glsl.hpp"
- +#endif
- +#include <iostream>
- +#include <fstream>
- +#include "layer_chassis_dispatch.h"
- +#include <regex>
- +#include <iostream>
- +#include <bitset>
- +
- +static const VkShaderStageFlags kShaderStageAllRayTracing =
- + VK_SHADER_STAGE_ANY_HIT_BIT_NV | VK_SHADER_STAGE_CALLABLE_BIT_NV | VK_SHADER_STAGE_CLOSEST_HIT_BIT_NV |
- + VK_SHADER_STAGE_INTERSECTION_BIT_NV | VK_SHADER_STAGE_MISS_BIT_NV | VK_SHADER_STAGE_RAYGEN_BIT_NV;
- +
- +static const VkShaderStageFlags kShaderStageAllGraphics =
- + VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT |
- + VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT | VK_SHADER_STAGE_GEOMETRY_BIT;
- +
- +static const VkShaderStageFlags kShaderStageAllCompute = VK_SHADER_STAGE_COMPUTE_BIT;
- +
- +// String literal was determined by VkShaderStageFlags spelling. I.e. VK_SHADER_STAGE_RAYGEN_BIT_KHR => RayGen
- +static const std::unordered_map<std::string, uint32_t> ShaderStageFlagLookup = {
- + {"RayGen", VK_SHADER_STAGE_RAYGEN_BIT_KHR},
- + {"ClosestHit", VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR},
- + {"Callable", VK_SHADER_STAGE_CALLABLE_BIT_KHR},
- + {"Miss", VK_SHADER_STAGE_MISS_BIT_KHR},
- + {"AnyHit", VK_SHADER_STAGE_ANY_HIT_BIT_KHR},
- + {"Intersection", VK_SHADER_STAGE_INTERSECTION_BIT_KHR},
- + {"Geometry", VK_SHADER_STAGE_GEOMETRY_BIT},
- + {"Fragment", VK_SHADER_STAGE_FRAGMENT_BIT},
- + {"Compute", VK_SHADER_STAGE_COMPUTE_BIT},
- + {"TessellationControl", VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT},
- + {"TessellationEvaluation", VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT},
- + {"Vertex", VK_SHADER_STAGE_VERTEX_BIT}};
- +
- +// Convenience function for reporting problems.
- +template <typename T>
- +void AutoInst::ReportSetupProblem(T object, std::string specific_message) const {
- + if (use_stdout)
- + std::cerr << specific_message;
- + else
- + LogError(object, "UNASSIGNED-AUTO-INST ", "Detail: (%s)", specific_message.c_str());
- +}
- +
- +template <typename T>
- +void AutoInst::ReportInfo(T object, std::string specific_message) const {
- + if (use_stdout)
- + std::cout << specific_message;
- + else
- + LogInfo(object, "UNASSIGNED-AUTO-inst", "%s", specific_message.c_str());
- +}
- +
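- +// Write a width x height RGB image from the packed per-pixel colour bytes: a .bmp on Windows, a binary .ppm elsewhere.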
- +void AutoInst::CreateImage(uint32_t width, uint32_t height, std::vector<char> &colors, std::string file_name) const {
- +#if defined(_WIN32)
- + std::ofstream ofs;
- + ofs.open(file_name + ".bmp", std::ios_base::binary);
- +
- + ReportInfo(device, "Creating BMP with dim=" + std::to_string(width) + "x" + std::to_string(height) + " from " +
- + std::to_string(colors.size()) + "\n");
- +
- + const int BYTES_PER_PIXEL = 3;
- +
- + BITMAPFILEHEADER tWBFH;
- + tWBFH.bfType = 0x4d42;
- + tWBFH.bfSize = 14 + 40 + (width * height * BYTES_PER_PIXEL);
- + tWBFH.bfReserved1 = 0;
- + tWBFH.bfReserved2 = 0;
- + tWBFH.bfOffBits = 14 + 40;
- +
- + BITMAPINFOHEADER tW2BH;
- + tW2BH.biSize = 40;
- + tW2BH.biWidth = width;
- + tW2BH.biHeight = height;
- + tW2BH.biPlanes = 1;
- + tW2BH.biBitCount = BYTES_PER_PIXEL * 8;
- + tW2BH.biCompression = 0;
- +
- + ofs.write((char *)(&tWBFH), 14);
- + ofs.write((char *)(&tW2BH), 40);
- +
- + for (int y = height - 1; y >= 0; y--) {
- + uint32_t x = 0;
- + for (x = 0; x < width; x++) {
- + auto thread_id = BYTES_PER_PIXEL * (y * width + x);
- + ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2];
- + }
- + while (x % 4 != 0) {
- + ofs << (char)0;
- + x++;
- + }
- + }
- +
- + ofs.close();
- +#else
- +
- + const int BYTES_PER_PIXEL = 3;
- + std::ofstream ofs(file_name + ".ppm", std::ios_base::out | std::ios_base::binary);
- + ofs << "P6" << std::endl << width << ' ' << height << std::endl << "255" << std::endl;
- +
- + for (uint32_t j = 0; j < height; j++) {
- + for (auto i = 0u; i < width; i++) {
- + auto thread_id = BYTES_PER_PIXEL * (j * width + i);
- + ofs << colors[thread_id] << colors[thread_id + 1] << colors[thread_id + 2];
- + }
- + }
- +
- + ofs.close();
- +#endif
- +}
- +
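- +// Piecewise-linear mapping of a value in [0,1] to an RGB heatmap colour, used for the generated heatmaps and the reference scale.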
- +std::tuple<char, char, char> AutoInst::UnitIntervalToRGB(float val) const {
- + if (val < 0 || val > 1) {
- + ReportSetupProblem(device, "Cannot convert a value outside of interval [0,1] to heatmap colour!");
- + }
- +
- + float red, green, blue;
- +
- + if (val < 0.2)
- + red = 1.0f - 5.0f * val;
- + else if (val >= 0.2 && val < 0.6)
- + red = 0;
- + else if (val >= 0.6 && val < 0.8)
- + red = 5.0f * (val - 0.6f);
- + else
- + red = 1.0f;
- +
- + if (val < 0.4)
- + green = 1;
- + else if (val >= 0.4 && val < 0.6)
- + green = 5.0f * (0.6f - val);
- + else
- + green = 0;
- +
- + if (val < 0.2)
- + blue = 0;
- + else if (val >= 0.2 && val < 0.4)
- + blue = 5.0f * (val - 0.2f);
- + else if (val >= 0.4 && val < 0.8)
- + blue = 1;
- + else
- + blue = 5.0f * (1.0f - val);
- +
- + return std::make_tuple((char)(red * 255), (char)(green * 255), (char)(blue * 255));
- +}
- +
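- +// Scan the instrumentation output buffer for unique-subgroup-id records and build the flat-thread-id <-> subgroup-id
- +// mappings used by the subgroup-level analyses.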
- +bool AutoInst::CreateUniqueSubgroupIdMappings(uint32_t *const debug_output_buffer,
- + PrimitiveIdToPrimitiveSizeMap &primitive_id2primitive_size,
- + ThreadIdToSubgroupIdMap &thread_id2subgroup_id_map, ThreadIdSwizzleMap &thread_id_swizzle_map,
- + std::function<uint32_t(uint32_t inst_id)> inst_id2prim_id) const {
- + // Sanity check for unique subgroup primitive size
- + if (primitive_id2primitive_size.count(spvtools::kAutoInstUniqueSubgroupId) == 0) return false;
- +
- + uint32_t j = 0;
- + uint32_t num_words_written = debug_output_buffer[WORDS_WRITTEN_INDEX];
- + while (j < num_words_written) {
- + auto inst_id = debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
- + auto prim_id = inst_id2prim_id(inst_id);
- + if (primitive_id2primitive_size.count(prim_id) == 0) {
- + ReportSetupProblem(device,
- + "Unknown prim_id=" + std::to_string(prim_id) + " encountered in CreateUniqueSubgroupIdMappings\n.");
- + return false;
- + }
- +
- + if (prim_id == spvtools::kAutoInstUniqueSubgroupId) {
- + auto unique_id_record = reinterpret_cast<AIUniqueSubgroupIdEntry *>(&debug_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
- + auto subgroup_id = unique_id_record->SubgroupId();
- + auto flat_thread_id = unique_id_record->flat_thread_id;
- + thread_id2subgroup_id_map[flat_thread_id] = subgroup_id;
- + thread_id_swizzle_map[subgroup_id * SUBGROUP_SIZE + unique_id_record->IntraSubgroupId()] = flat_thread_id;
- + }
- + j += primitive_id2primitive_size[prim_id];
- + }
- +
- + return true;
- +}
- +
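- +// Read the instrumentation buffer size history written by WriteRuntimeSizeCache in a previous run so per-invocation
- +// buffers can be sized from observed requirements.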
- +void AutoInst::TryReadRuntimeSizeCache(AutoInst *device_auto_inst) {
- + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) {
- + ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. Aborting\n");
- + aborted = true;
- + }
- +
- + std::ifstream cache_file;
- + cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary);
- + if (!cache_file) {
- + ReportInfo(device, "Runtime instrumentation buffer requirements cache not found. Defaulting to output_buffer_size.\n");
- + return;
- + }
- +
- + size_t num_pipeline_invocations = 0;
- + cache_file.read((char *)&num_pipeline_invocations, sizeof(size_t));
- + while (!cache_file.eof() && device_auto_inst->BufferSizeRequirementsLookup.size() < num_pipeline_invocations) {
- + uint32_t buffer_size = 0;
- + cache_file.read((char *)&buffer_size, sizeof(uint32_t));
- + device_auto_inst->BufferSizeRequirementsLookup.push_back(buffer_size);
- + }
- +
- + if (num_pipeline_invocations != device_auto_inst->BufferSizeRequirementsLookup.size()) {
- + ReportSetupProblem(device, "Warning incomplete cache file detected.\n");
- + }
- +
- + cache_file.close();
- +}
- +
- +void AutoInst::WriteRuntimeSizeCache() {
- + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_MAX_ENUM) {
- + ReportSetupProblem(device, "Pipeline to instrument setting was not initialized. Aborting\n");
- + aborted = true;
- + }
- +
- + std::ofstream cache_file;
- + cache_file.open(RuntimeSizeCachePath(pipeline_to_instrument), std::ios_base::binary);
- + auto pipelines_observed = BufferSizeRequirementsLookup.size();
- + cache_file.write((char *)&pipelines_observed, sizeof(size_t));
- + for (auto size : BufferSizeRequirementsLookup) {
- + cache_file.write((char *)&size, 4);
- + }
- + cache_file.close();
- +}
- +
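- +// Determine the shader stage from the module's single OpEntryPoint execution model; returns 0 if the module has
- +// multiple entry points or an unsupported stage.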
- +uint32_t AutoInst::FindShaderStage(std::vector<unsigned int> pgm) const {
- + uint32_t stage_flag = 0;
- + SHADER_MODULE_STATE shader;
- + shader.words = pgm;
- + if (shader.words.size() > 0) {
- + for (auto insn : shader) {
- + if (insn.opcode() == spv::OpEntryPoint) {
- + if (stage_flag != 0) {
- + // This means there are multiple entrypoints which is not
- + // supported by the downstream SPIRV-Opt instrumentation passes.
- + ReportSetupProblem(
- + device, "Multiple EntryPoints in single shader module encountered! Module will not be instrumented!\n");
- + return 0;
- + }
- + uint32_t offset = insn.offset();
- + spv::ExecutionModel ex_model = (spv::ExecutionModel)pgm[offset + 1];
- + switch (ex_model) {
- + case spv::ExecutionModel::ExecutionModelAnyHitKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_ANY_HIT_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelCallableKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CALLABLE_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelClosestHitKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelFragment:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_FRAGMENT_BIT;
- + break;
- + case spv::ExecutionModel::ExecutionModelGeometry:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_GEOMETRY_BIT;
- + break;
- + case spv::ExecutionModel::ExecutionModelGLCompute:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_COMPUTE_BIT;
- + break;
- + case spv::ExecutionModel::ExecutionModelIntersectionKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_INTERSECTION_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelMissKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_MISS_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelRayGenerationKHR:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_RAYGEN_BIT_KHR;
- + break;
- + case spv::ExecutionModel::ExecutionModelTessellationControl:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
- + break;
- + case spv::ExecutionModel::ExecutionModelTessellationEvaluation:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT;
- + break;
- + case spv::ExecutionModel::ExecutionModelVertex:
- + stage_flag = VkShaderStageFlagBits::VK_SHADER_STAGE_VERTEX_BIT;
- + break;
- + default:
- + ReportSetupProblem(device, "Unsupported Shader Stage encountered! Shader will not be instrumented!\n");
- + return 0;
- + }
- + }
- + }
- + }
- +
- + return stage_flag;
- +}
- +
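- +// Extract the compute workgroup dimensions from the module's OpExecutionMode LocalSize; returns (0, 0, 0) on failure.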
- +std::tuple<uint32_t, uint32_t, uint32_t> AutoInst::FindComputeLocalSize(std::vector<unsigned int> pgm) const {
- + SHADER_MODULE_STATE shader;
- + shader.words = pgm;
- +
- + if (shader.words.size() > 0) {
- + for (auto insn : shader) {
- + if (insn.opcode() == spv::OpExecutionMode) {
- + uint32_t offset = insn.offset();
- + if ((spv::ExecutionMode)pgm[offset + 2] != spv::ExecutionModeLocalSize) {
- + ReportSetupProblem(device, "Unable to determine compute LocalSize!\n");
- + return std::make_tuple(0, 0, 0);
- + }
- + return std::make_tuple(pgm[offset + 3], pgm[offset + 4], pgm[offset + 5]);
- + }
- + }
- + }
- + ReportSetupProblem(device, "Unable to find OpExecutionMode.\n");
- + return std::make_tuple(0, 0, 0);
- +}
- +
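- +// Insert OpString/OpLine annotations ahead of the matched instrumentation calls so the collected data survives
- +// cross-compilation as comments; pre-existing OpLine instructions are stripped to avoid interference.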
- +std::string AutoInst::AnnotateModuleStr(std::string &shader, std::unordered_map<uint32_t, std::string> &inst_id2str) const {
- + std::regex pattern("%[0-9]+? = OpFunctionCall %void %[0-9]+? %uint_(.*) *");
- +
- + std::istringstream ss{shader};
- + std::ostringstream out;
- + int i = 0;
- +
- + for (std::string line; std::getline(ss, line, '\n');) {
- + std::smatch sm;
- + std::regex_search(line, sm, pattern);
- + if (sm.size() > 0) {
- + uint32_t offset = (uint32_t)atoi(sm[1].str().c_str());
- + if (inst_id2str.count(offset) > 0) {
- + auto result_id = "%str" + std::to_string(offset) + "_" + std::to_string(i);
- + out << result_id << " = OpString "
- + << "\"" << inst_id2str[offset] << "\"\n";
- + out << "OpLine " << result_id << " "
- + << "0 0 "
- + << "\n";
- + i++;
- + }
- + } else if (line.find("OpLine") != std::string::npos) {
- + // Clear any other OpLines to avoid interference
- + continue;
- + }
- +
- + out << line << "\n";
- + }
- + return out.str();
- +}
- +
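- +// Assemble the annotated SPIR-V text and cross-compile it to GLSL via SPIRV-Cross; on Android, or if cross-compilation
- +// fails, the raw text is written out instead.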
- +void AutoInst::TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const {
- +#if defined(__ANDROID__)
- + ReportInfo(device, "SPIRV-Cross not included on Android. Shader " + file_name + " generated without cross compiling.\n");
- + std::ofstream file;
- + file.open(file_name + ".spv");
- + file << shader;
- + file.close();
- +#else
- + try {
- + using namespace spvtools;
- + std::ofstream temp;
- + SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2);
- + std::vector<uint32_t> binary;
- + (void)spirvTools.Assemble(shader, &binary, SPV_TEXT_TO_BINARY_OPTION_NONE);
- + using namespace spirv_cross;
- + CompilerGLSL compiler(binary);
- + auto options = compiler.get_common_options();
- + options.emit_line_directives = true;
- + options.vulkan_semantics = true;
- + compiler.set_common_options(options);
- + std::string glsl;
- + glsl = compiler.compile();
- + if (glsl.size() == 0) {
- + ReportSetupProblem(device, "Spirv-cross failed. Shader " + file_name + " generated without cross compiling.\n");
- + std::ofstream file;
- + file.open(file_name + ".glsl");
- + file << shader;
- + file.close();
- + } else {
- + // post process line annotations into comments
- + std::regex re("#line [0-9]* \"([^\"]*)\"");
- + std::ofstream file;
- + file.open(file_name + ".glsl");
- + file << std::regex_replace(glsl, re, "/*$1*/");
- + file.close();
- + }
- + } catch (...) {
- + ReportSetupProblem(device, "Spirv-cross crashed. Shader " + file_name + " generated without cross compiling.\n");
- + std::ofstream file;
- + file.open(file_name + ".glsl");
- + file << shader;
- + file.close();
- + }
- +#endif
- +}
- +
- +// Turn on necessary device features.
- +void AutoInst::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *create_info,
- + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_create_info) {
- + DispatchGetPhysicalDeviceFeatures(gpu, &supported_features);
- + VkPhysicalDeviceFeatures features = {};
- + features.vertexPipelineStoresAndAtomics = true;
- + features.fragmentStoresAndAtomics = true;
- + UtilPreCallRecordCreateDevice(gpu, reinterpret_cast<safe_VkDeviceCreateInfo *>(modified_create_info), supported_features,
- + features);
- +}
- +
- +// Perform initializations that can be done at Create Device time.
- +void AutoInst::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
- + const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, VkResult result) {
- + ValidationStateTracker::PostCallRecordCreateDevice(physicalDevice, pCreateInfo, pAllocator, pDevice, result);
- +
- + ValidationObject *device_object = GetLayerDataPtr(get_dispatch_key(*pDevice), layer_data_map);
- + ValidationObject *validation_data = GetValidationObject(device_object->object_dispatch, this->container_type);
- + AutoInst *device_auto_inst = static_cast<AutoInst *>(validation_data);
- + device_auto_inst->physicalDevice = physicalDevice;
- + device_auto_inst->device = *pDevice;
- +
- + if (device_auto_inst->phys_dev_props.apiVersion < VK_API_VERSION_1_1) {
- + ReportSetupProblem(device, "Auto Inst requires Vulkan 1.1 or later. Auto Inst disabled.");
- + device_auto_inst->aborted = true;
- + return;
- + }
- +
- + if (!supported_features.fragmentStoresAndAtomics || !supported_features.vertexPipelineStoresAndAtomics) {
- + ReportSetupProblem(device,
- + "Auto Inst requires fragmentStoresAndAtomics and vertexPipelineStoresAndAtomics. "
- + "Auto Inst disabled.");
- + device_auto_inst->aborted = true;
- + return;
- + }
- +
- + if (enabled[gpu_validation] || enabled[debug_printf]) {
- + ReportSetupProblem(device,
- + "Auto inst cannot be enabled when gpu assisted validation or debug printf are enabled. "
- + "Auto inst disabled.");
- + device_auto_inst->aborted = true;
- + return;
- + }
- +
- + const char *size_string = getLayerOption("khronos_validation.auto_inst_buffer_size");
- + device_auto_inst->output_buffer_size = *size_string ? atoi(size_string) : 1024;
- + if (device_auto_inst->output_buffer_size <= 16) {
- + ReportSetupProblem(device, "The instrumentation buffer size must be at least 16 bytes");
- + device_auto_inst->aborted = true;
- + }
- + const char *stdout_string = getLayerOption("khronos_validation.auto_inst_to_stdout");
- +    device_auto_inst->use_stdout = *stdout_string ? strcmp(stdout_string, "false") != 0 : true;
- + use_stdout = device_auto_inst->use_stdout;
- +
- + const char *base_file_name = getLayerOption("khronos_validation.auto_inst_base_file_name");
- + device_auto_inst->base_file_name = *base_file_name ? base_file_name : "";
- +
- + const char *pti = getLayerOption("khronos_validation.auto_inst_pipeline_to_instrument");
- + if (!strcmp(pti, "") || !strcmp(pti, "RayTracing")) {
- + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_RAY_TRACING_NV;
- + if (!device_extensions.vk_nv_ray_tracing && !device_extensions.vk_khr_ray_tracing_pipeline) {
- + ReportSetupProblem(device, "Cannot instrument ray-tracing pipeline since ray-tracing is not enabled.\n");
- + device_auto_inst->aborted = true;
- + return;
- + }
- +
- + ReportInfo(device, "Instrumenting Ray-Tracing Pipeline!\n");
- + } else if (!strcmp(pti, "Graphics")) {
- + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_GRAPHICS;
- + ReportInfo(device, "Instrumenting Graphics Pipeline!\n");
- + } else if (!strcmp(pti, "Compute")) {
- + device_auto_inst->pipeline_to_instrument = VK_PIPELINE_BIND_POINT_COMPUTE;
- + ReportInfo(device, "Instrumenting Compute Pipeline!\n");
- + }
- +
- + pipeline_to_instrument = device_auto_inst->pipeline_to_instrument;
- +
- + const char *create_reference_heatmap = getLayerOption("khronos_validation.auto_inst_create_reference_heatmap");
- + if (!strcmp(create_reference_heatmap, "true")) {
- + ReportInfo(device, "Creating reference heatmap!\n");
- + std::vector<char> scale_colors;
- +
- + const uint32_t scale_width = 256;
- + const uint32_t scale_height = 30;
- +
- + for (int height = 0; height < scale_height; height++) {
- + for (float i = 0; i < scale_width; i++) {
- + auto rgb = UnitIntervalToRGB(i / 255.0f);
- +
- + scale_colors.push_back(std::get<0>(rgb));
- + scale_colors.push_back(std::get<1>(rgb));
- + scale_colors.push_back(std::get<2>(rgb));
- + }
- + }
- +
- + CreateImage(scale_width, scale_height, scale_colors, "ReferenceScale");
- + }
- +
- + const char *debug_mode = getLayerOption("khronos_validation.auto_inst_debug_mode");
- + if (!strcmp(debug_mode, "atomics")) {
- + device_auto_inst->is_debugging_atomic_ops = true;
- + } else if (!strcmp(debug_mode, "subgroup")) {
- + device_auto_inst->is_debugging_subgroup_ops = true;
- + } else if (!strcmp(debug_mode, "arraylength")) {
- + device_auto_inst->is_debugging_array_length_op = true;
- + }
- +
- + const char *dump_shaders = getLayerOption("khronos_validation.auto_inst_dump_shaders");
- + device_auto_inst->dump_shaders = *dump_shaders ? !strcmp(dump_shaders, "true") : false;
- +
- +    if (device_auto_inst->is_debugging_atomic_ops || device_auto_inst->is_debugging_subgroup_ops ||
- +        device_auto_inst->is_debugging_array_length_op)
- + ReportInfo(device, "Running Auto-Inst in debug mode, normal auto-instrumentation is disabled.\n");
- +
- + const char *shaders_to_instrument = getLayerOption("khronos_validation.auto_inst_shaders_to_instrument");
- + if (shaders_to_instrument) {
- + // Format of the option is stageN, stageM, stageL where stage is
- + // defined in ShaderStageFlagLookup and N,M,L are integer literals
- + std::string shader_list(shaders_to_instrument);
- + size_t pos = 0;
- + std::string token;
- + while (shader_list.length() != 0) {
- + while (shader_list[0] == ' ') shader_list.erase(0, 1);
- + pos = shader_list.find(',');
- + if (pos != std::string::npos) {
- + token = shader_list.substr(0, pos);
- + } else {
- + pos = shader_list.length() - 1;
- + token = shader_list;
- + }
- +
- + size_t i = 0;
- + while (token[i] < '0' || token[i] > '9') {
- + i++;
- + }
- +
- + std::string stage_str = token.substr(0, i);
- + auto shader_index = std::atoi(token.substr(i, pos).c_str());
- + if (shader_index == 0) {
- + ReportSetupProblem(device, "Shader index must be greater than 0. Aborting\n.");
- + device_auto_inst->aborted = true;
- + }
- + if (ShaderStageFlagLookup.count(stage_str) > 0) {
- + uint32_t shader_stage = ShaderStageFlagLookup.find(stage_str)->second;
- + device_auto_inst->StageToInstIndices[shader_stage].insert(shader_index - 1);
- + } else {
- + // Assume that if user is specifying this setting, they care about it being correct rather than
- + // a more general default.
- + ReportSetupProblem(device, "Did not recognize stage " + stage_str + ". Aborting\n.");
- + device_auto_inst->aborted = true;
- + }
- + shader_list.erase(0, pos + 1);
- + }
- + }
- +
- + InitializeLayerDeviceSettings(device_auto_inst);
- +
- + TryReadRuntimeSizeCache(device_auto_inst);
- +
- + std::vector<VkDescriptorSetLayoutBinding> bindings;
- + VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
- + VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_COMPUTE_BIT | kShaderStageAllRayTracing,
- + NULL};
- + bindings.push_back(binding);
- + UtilPostCallRecordCreateDevice(pCreateInfo, bindings, device_auto_inst, device_auto_inst->phys_dev_props);
- +}
- +
- +void AutoInst::PreCallRecordDestroyDevice(VkDevice device, const VkAllocationCallbacks *pAllocator) {
- + UtilPreCallRecordDestroyDevice(this);
- + ValidationStateTracker::PreCallRecordDestroyDevice(device, pAllocator);
- + // State Tracker can end up making vma calls through callbacks - don't destroy allocator until ST is done
- + if (vmaAllocator) {
- + vmaDestroyAllocator(vmaAllocator);
- + }
- + desc_set_manager.reset();
- +}
- +
- +// Modify the pipeline layout to include our debug descriptor set and any needed padding with the dummy descriptor set.
- +void AutoInst::PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
- + const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout,
- + void *cpl_state_data) {
- + if (aborted) {
- + return;
- + }
- +
- + create_pipeline_layout_api_state *cpl_state = reinterpret_cast<create_pipeline_layout_api_state *>(cpl_state_data);
- +
- + if (cpl_state->modified_create_info.setLayoutCount >= adjusted_max_desc_sets) {
- + std::ostringstream strm;
- + strm << "Pipeline Layout conflict with validation's descriptor set at slot " << desc_set_bind_index << ". "
- +             << "Application has too many descriptor sets in the pipeline layout to continue with auto instrumentation. "
- + << "Not modifying the pipeline layout. "
- + << "Instrumented shaders are replaced with non-instrumented shaders.";
- + ReportSetupProblem(device, strm.str().c_str());
- + } else {
- + UtilPreCallRecordCreatePipelineLayout(cpl_state, this, pCreateInfo);
- + }
- +}
- +
- +void AutoInst::PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo *pCreateInfo,
- + const VkAllocationCallbacks *pAllocator, VkPipelineLayout *pPipelineLayout,
- + VkResult result) {
- + ValidationStateTracker::PostCallRecordCreatePipelineLayout(device, pCreateInfo, pAllocator, pPipelineLayout, result);
- + if (result != VK_SUCCESS) {
- + ReportSetupProblem(device, "Unable to create pipeline layout. Device could become unstable.");
- + aborted = true;
- + }
- +}
- +
- +// Free the device memory and descriptor set associated with a command buffer.
- +void AutoInst::ResetCommandBuffer(VkCommandBuffer commandBuffer) {
- + if (aborted) {
- + return;
- + }
- + auto auto_inst_buffer_list = GetBufferInfo(commandBuffer);
- + for (auto buffer_info : auto_inst_buffer_list) {
- + vmaDestroyBuffer(vmaAllocator, buffer_info.output_mem_block.buffer, buffer_info.output_mem_block.allocation);
- + if (buffer_info.desc_set != VK_NULL_HANDLE) {
- + desc_set_manager->PutBackDescriptorSet(buffer_info.desc_pool, buffer_info.desc_set);
- + }
- + }
- + command_buffer_map.erase(commandBuffer);
- +}
- +
- +// Just gives a warning about a possible deadlock.
- +bool AutoInst::PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
- + VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
- + uint32_t memoryBarrierCount, const VkMemoryBarrier *pMemoryBarriers,
- + uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier *pBufferMemoryBarriers,
- + uint32_t imageMemoryBarrierCount,
- + const VkImageMemoryBarrier *pImageMemoryBarriers) const {
- + if (srcStageMask & VK_PIPELINE_STAGE_HOST_BIT) {
- + ReportSetupProblem(commandBuffer,
- + "CmdWaitEvents recorded with VK_PIPELINE_STAGE_HOST_BIT set. "
- + "Auto inst waits on queue completion. "
- + "This wait could block the host's signaling of this event, resulting in deadlock.");
- + }
- + return false;
- +}
- +
- +void AutoInst::PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkGraphicsPipelineCreateInfo *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + void *cgpl_state_data) {
- + std::vector<safe_VkGraphicsPipelineCreateInfo> new_pipeline_create_infos;
- + create_graphics_pipeline_api_state *cgpl_state = reinterpret_cast<create_graphics_pipeline_api_state *>(cgpl_state_data);
- + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, cgpl_state->pipe_state,
- + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_GRAPHICS, this);
- + cgpl_state->printf_create_infos = new_pipeline_create_infos;
- + cgpl_state->pCreateInfos = reinterpret_cast<VkGraphicsPipelineCreateInfo *>(cgpl_state->printf_create_infos.data());
- +}
- +
- +void AutoInst::PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkComputePipelineCreateInfo *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + void *ccpl_state_data) {
- + std::vector<safe_VkComputePipelineCreateInfo> new_pipeline_create_infos;
- + auto *ccpl_state = reinterpret_cast<create_compute_pipeline_api_state *>(ccpl_state_data);
- + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, ccpl_state->pipe_state,
- + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_COMPUTE, this);
- + ccpl_state->printf_create_infos = new_pipeline_create_infos;
- +    ccpl_state->pCreateInfos = reinterpret_cast<VkComputePipelineCreateInfo *>(ccpl_state->printf_create_infos.data());
- +}
- +
- +void AutoInst::PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoNV *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + void *crtpl_state_data) {
- + std::vector<safe_VkRayTracingPipelineCreateInfoCommon> new_pipeline_create_infos;
- + auto *crtpl_state = reinterpret_cast<create_ray_tracing_pipeline_api_state *>(crtpl_state_data);
- + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, crtpl_state->pipe_state,
- + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this);
- + crtpl_state->printf_create_infos = new_pipeline_create_infos;
- +    crtpl_state->pCreateInfos = reinterpret_cast<VkRayTracingPipelineCreateInfoNV *>(crtpl_state->printf_create_infos.data());
- +}
- +
- +void AutoInst::PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
- + VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoKHR *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + void *crtpl_state_data) {
- + std::vector<safe_VkRayTracingPipelineCreateInfoCommon> new_pipeline_create_infos;
- + auto *crtpl_state = reinterpret_cast<create_ray_tracing_pipeline_khr_api_state *>(crtpl_state_data);
- + UtilPreCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, crtpl_state->pipe_state,
- + &new_pipeline_create_infos, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this);
- + crtpl_state->printf_create_infos = new_pipeline_create_infos;
- + crtpl_state->pCreateInfos = reinterpret_cast<VkRayTracingPipelineCreateInfoKHR *>(crtpl_state->printf_create_infos.data());
- +}
- +
- +void AutoInst::PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkGraphicsPipelineCreateInfo *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + VkResult result, void *cgpl_state_data) {
- + ValidationStateTracker::PostCallRecordCreateGraphicsPipelines(device, pipelineCache, count, pCreateInfos, pAllocator,
- + pPipelines, result, cgpl_state_data);
- + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_GRAPHICS) return;
- + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_GRAPHICS, this);
- +}
- +
- +void AutoInst::PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkComputePipelineCreateInfo *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + VkResult result, void *ccpl_state_data) {
- + ValidationStateTracker::PostCallRecordCreateComputePipelines(device, pipelineCache, count, pCreateInfos, pAllocator, pPipelines,
- + result, ccpl_state_data);
- + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_COMPUTE) return;
- + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_COMPUTE, this);
- +}
- +
- +void AutoInst::PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoNV *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + VkResult result, void *crtpl_state_data) {
- + ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesNV(device, pipelineCache, count, pCreateInfos, pAllocator,
- + pPipelines, result, crtpl_state_data);
- + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return;
- + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV, this);
- +}
- +
- +void AutoInst::PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
- + VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoKHR *pCreateInfos,
- + const VkAllocationCallbacks *pAllocator, VkPipeline *pPipelines,
- + VkResult result, void *crtpl_state_data) {
- + ValidationStateTracker::PostCallRecordCreateRayTracingPipelinesKHR(
- + device, deferredOperation, pipelineCache, count, pCreateInfos, pAllocator, pPipelines, result, crtpl_state_data);
- + if (pipeline_to_instrument != VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR) return;
- + UtilPostCallRecordPipelineCreations(count, pCreateInfos, pAllocator, pPipelines, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR, this);
- +}
- +
- +// Remove all the shader trackers associated with this destroyed pipeline.
- +void AutoInst::PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks *pAllocator) {
- + for (auto it = shader_map.begin(); it != shader_map.end();) {
- + if (it->second.pipeline == pipeline) {
- + it = shader_map.erase(it);
- + } else {
- + ++it;
- + }
- + }
- + ValidationStateTracker::PreCallRecordDestroyPipeline(device, pipeline, pAllocator);
- +}
- +// Call the SPIR-V Optimizer to run the instrumentation pass on the shader.
- +bool AutoInst::InstrumentShader(const VkShaderModuleCreateInfo *pCreateInfo, std::vector<unsigned int> &new_pgm,
- + uint32_t *unique_shader_id) {
- + if (aborted) return false;
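- +    // pCode[0] must be the SPIR-V magic number (0x07230203); anything else is not a SPIR-V module, so skip it.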
- + if (pCreateInfo->pCode[0] != spv::MagicNumber) return false;
- +
- + // Load original shader SPIR-V
- + uint32_t num_words = static_cast<uint32_t>(pCreateInfo->codeSize / 4);
- + new_pgm.clear();
- + new_pgm.reserve(num_words);
- + new_pgm.insert(new_pgm.end(), &pCreateInfo->pCode[0], &pCreateInfo->pCode[num_words]);
- +
- + auto stage = FindShaderStage(new_pgm);
- + if (stage == 0) return false;
- +
- + // Check against pipeline_to_instrument setting
- + switch (pipeline_to_instrument) {
- + case VK_PIPELINE_BIND_POINT_COMPUTE:
- + if ((kShaderStageAllCompute & stage) == 0) return false;
- + break;
- + case VK_PIPELINE_BIND_POINT_GRAPHICS:
- + if ((kShaderStageAllGraphics & stage) == 0) return false;
- + break;
- + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- + if ((kShaderStageAllRayTracing & stage) == 0) return false;
- + break;
- + default:
- + break;
- + }
- +
- +    // Check against the shaders_to_instrument setting
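- +    // Stage2SeenCount tracks, per stage, how many shader modules of that stage have been seen so far;
- +    // the current shader is instrumented only if its per-stage index appears in StageToInstIndices.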
- + if (StageToInstIndices.size() > 0) {
- + auto stage_index = Stage2SeenCount[stage];
- + Stage2SeenCount[stage]++;
- + if (StageToInstIndices[stage].count(stage_index) == 0) {
- +            // This shader stage and index were not found in the user-provided setting, so skip instrumentation.
- + return false;
- + }
- + }
- +
- +    // Call the optimizer to run the instrumentation pass(es) on the shader.
- +    // Use the unique_shader_module_id as a shader ID so we can look up its handle later in the shader_map.
- + using namespace spvtools;
- + spv_target_env target_env = SPV_ENV_VULKAN_1_2;
- +
- + const spvtools::MessageConsumer auto_inst_console_message_consumer =
- + [this](spv_message_level_t level, const char *, const spv_position_t &position, const char *message) -> void {
- + switch (level) {
- + case SPV_MSG_FATAL:
- + case SPV_MSG_INTERNAL_ERROR:
- + case SPV_MSG_ERROR:
- + this->LogError(this->device, "UNASSIGNED-Debug-Printf", "Error during shader instrumentation: line %zu: %s",
- + position.index, message);
- + break;
- + default:
- + break;
- + }
- + };
- +
- + Optimizer optimizer(target_env);
- + optimizer.SetMessageConsumer(auto_inst_console_message_consumer);
- + if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) {
- + optimizer.RegisterPass(spvtools::CreateAutoInstDebugPass(desc_set_bind_index, unique_shader_module_id,
- + is_debugging_atomic_ops, is_debugging_subgroup_ops));
- + } else {
- + RegisterPasses(&optimizer, desc_set_bind_index, unique_shader_module_id);
- + }
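- +    // If the subclass registered no passes there is nothing to do, so leave the shader untouched.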
- + if (optimizer.GetPassNames().size() == 0) return false;
- + bool pass = optimizer.Run(new_pgm.data(), new_pgm.size(), &new_pgm);
- + if (!pass) {
- + ReportSetupProblem(
- + device, "Failure to instrument shader " + ShaderStageToString(stage) + ". Proceeding with non-instrumented shader.\n");
- + }
- +
- + if (dump_shaders) {
- + std::ofstream shader_dump_file;
- + shader_dump_file.open(ShaderStageToString(stage) + std::to_string(unique_shader_module_id) + ".spv",
- + std::ios_base::binary | std::ios_base::out);
- + shader_dump_file.write((char *)new_pgm.data(), new_pgm.size() * sizeof(uint32_t));
- + shader_dump_file.close();
- + }
- +
- + instrumentation_map[unique_shader_module_id] = std::vector<unsigned int>(new_pgm);
- + *unique_shader_id = unique_shader_module_id++;
- + return pass;
- +}
- +// Create the instrumented shader data to provide to the driver.
- +void AutoInst::PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo *pCreateInfo,
- + const VkAllocationCallbacks *pAllocator, VkShaderModule *pShaderModule,
- + void *csm_state_data) {
- + create_shader_module_api_state *csm_state = reinterpret_cast<create_shader_module_api_state *>(csm_state_data);
- +
- + bool pass = InstrumentShader(pCreateInfo, csm_state->instrumented_pgm, &csm_state->unique_shader_id);
- + if (pass) {
- + csm_state->instrumented_create_info.pCode = csm_state->instrumented_pgm.data();
- + csm_state->instrumented_create_info.codeSize = csm_state->instrumented_pgm.size() * sizeof(unsigned int);
- + }
- +}
- +
- +void AutoInst::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point,
- + uint32_t operation_index, uint32_t *const debug_output_buffer) {
- + if (pipeline_bind_point != pipeline_to_instrument) return;
- + if (aborted) return;
- +
- + // debug mode tests
- + if (is_debugging_atomic_ops || is_debugging_subgroup_ops || is_debugging_array_length_op) {
- + if (is_debugging_atomic_ops) {
- + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0 ? "Atomics instrumentation did not produce a result!\n"
- + : "Atomics instrumentation produced a result!\n";
- + ReportInfo(device, message);
- + }
- +
- + if (is_debugging_subgroup_ops) {
- + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0
- + ? "Subgroup instrumentation did not produce a result!\n"
- + : "Subgroup instrumentation produced a result" +
- + std::bitset<SUBGROUP_SIZE>(debug_output_buffer[BUFFER_DEBUG_LOCATION]).to_string() + "!\n";
- + ReportInfo(device, message);
- + }
- +
- + if (is_debugging_array_length_op) {
- + auto message = debug_output_buffer[BUFFER_DEBUG_LOCATION] == 0
- + ? "ArrayLength instrumentation did not produce a result!\n"
- + : "ArrayLength instrumentation produced a result=" +
- + std::to_string(4 * debug_output_buffer[BUFFER_DEBUG_LOCATION]) + " bytes !\n";
- + ReportInfo(device, message);
- + }
- +
- + return;
- + }
- +
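- +    // The reserved word at WORDS_WRITTEN_INDEX holds how many payload words the instrumented shaders
- +    // wrote after the NUM_BUFFER_RESERVED_WORDS header words.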
- + auto words_written = debug_output_buffer[WORDS_WRITTEN_INDEX];
- +
- + uint32_t bytes_consumed = sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS + sizeof(uint32_t) * words_written;
- +
- + VmaAllocationInfo alloc_info;
- + (void)vmaGetAllocationInfo(vmaAllocator, GetBufferInfo(command_buffer)[operation_index].output_mem_block.allocation,
- + &alloc_info);
- + auto buffer_size = alloc_info.size;
- + auto creation_index = GetBufferInfo(command_buffer)[operation_index].output_mem_block.creation_index;
- +
- + auto overflowed = bytes_consumed > buffer_size;
- + if (overflowed) {
- + auto message =
- + "Output buffer size is " + std::to_string(buffer_size) + " bytes which is less than the " +
- + std::to_string(bytes_consumed) +
- + " bytes that the instrumentation could have written. Please rerun the application to get analysis results.\n";
- + ReportInfo(device, message.c_str());
- + }
- +
- + auto bytes_consumed_for_vma = [](uint32_t raw_bytes) {
- +        // Round up to the next multiple of 1024; an exact multiple is still bumped to the following one.
- + return (raw_bytes & (~1023)) + 1024;
- + };
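- +    // E.g. bytes_consumed_for_vma(100) == 1024 and bytes_consumed_for_vma(1024) == 2048.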
- +
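- +    // Grow the per-pipeline size requirement monotonically once a runtime observation exists, and persist it
- +    // so that later runs can allocate right-sized output buffers in AllocateAutoInstResources.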
- + if (BufferSizeRequirementsLookup.size() <= creation_index) {
- + BufferSizeRequirementsLookup.resize(creation_index + 1, sizeof(uint32_t) * NUM_BUFFER_RESERVED_WORDS);
- + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
- + WriteRuntimeSizeCache();
- + } else if (BufferSizeRequirementsLookup[creation_index] == output_buffer_size) {
- + // Assume that if the value is the default, we're safe to lower the instrumentation buffer size
- + // to save on device memory usage.
- + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
- + WriteRuntimeSizeCache();
- + } else if (bytes_consumed > BufferSizeRequirementsLookup[creation_index]) {
- + // If the value is not the default, that implies it has already been set by a runtime observation
- + // and therefore it should never decrease.
- + BufferSizeRequirementsLookup[creation_index] = bytes_consumed_for_vma(bytes_consumed);
- + WriteRuntimeSizeCache();
- + }
- +
- + switch (pipeline_bind_point) {
- + case VK_PIPELINE_BIND_POINT_COMPUTE: {
- + auto cb_state = GetCBState(command_buffer);
- + LAST_BOUND_STATE &last_bound = cb_state->lastBound[pipeline_bind_point];
- + std::tuple<uint32_t, uint32_t, uint32_t> localsize_xyz = std::make_tuple(0, 0, 0);
- + if (last_bound.pipeline_state) {
- + PIPELINE_STATE *p_state = last_bound.pipeline_state;
- + auto shader_state = GetShaderModuleState(p_state->computePipelineCI.stage.module);
- + if (shader_state != NULL) {
- + localsize_xyz = FindComputeLocalSize(shader_state->words);
- + }
- + }
- + if (analysis_index >= compute_launch_records.size()) {
- + ReportSetupProblem(device, "Insufficient launch records to support compute analysis.");
- + break;
- + }
- + auto launch_dims3d = compute_launch_records[analysis_index];
- + uint32_t localsize_x = std::get<0>(localsize_xyz);
- + uint32_t localsize_y = std::get<1>(localsize_xyz);
- + uint32_t localsize_z = std::get<2>(localsize_xyz);
- +            if (localsize_x == 0 || localsize_y == 0 || localsize_z == 0) {
- + ReportSetupProblem(device, "Could not determine compute shader local size.\n");
- + }
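- +            // The dispatch records store workgroup counts, so the per-axis thread dimensions are group count * local size.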
- + is_analyzing_compute = true;
- + AnalyzeCompute(debug_output_buffer, overflowed, launch_dims3d.x_dim * localsize_x, launch_dims3d.y_dim * localsize_y,
- + launch_dims3d.z_dim * localsize_z);
- + is_analyzing_compute = false;
- + analysis_index++;
- + break;
- + }
- + case VK_PIPELINE_BIND_POINT_GRAPHICS: {
- + analysis_index++;
- + is_analyzing_draw = true;
- + AnalyzeGraphics(debug_output_buffer, overflowed);
- + is_analyzing_draw = false;
- + break;
- + }
- + case VK_PIPELINE_BIND_POINT_RAY_TRACING_NV: {
- + if (analysis_index >= rt_launch_records.size()) {
- + ReportSetupProblem(device, "Insufficient launch records to support ray tracing analysis.");
- + break;
- + }
- + auto launch_dims3d = rt_launch_records[analysis_index];
- + is_analyzing_rt = true;
- + AnalyzeRayTracing(debug_output_buffer, overflowed, launch_dims3d.x_dim, launch_dims3d.y_dim, launch_dims3d.z_dim);
- + is_analyzing_rt = false;
- + analysis_index++;
- + break;
- + }
- + default:
- + ReportSetupProblem(device, "Unsupported pipeline type cannot be analyzed.");
- + break;
- + }
- +
- + memset(debug_output_buffer, 0, buffer_size);
- +}
- +
- +// Issue a memory barrier to make GPU-written data available to host.
- +// Wait for the queue to complete execution.
- +// Check the debug buffers for all the command buffers that were submitted.
- +void AutoInst::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
- + VkResult result) {
- + ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);
- +
- + if (aborted || (result != VK_SUCCESS)) return;
- + bool buffers_present = false;
- + // Don't QueueWaitIdle if there's nothing to process
- + for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
- + const VkSubmitInfo *submit = &pSubmits[submit_idx];
- + for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
- + auto cb_node = GetCBState(submit->pCommandBuffers[i]);
- + if (GetBufferInfo(cb_node->commandBuffer).size()) buffers_present = true;
- + for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) {
- + if (GetBufferInfo(secondaryCmdBuffer->commandBuffer).size()) buffers_present = true;
- + }
- + }
- + }
- + if (!buffers_present) return;
- +
- + UtilSubmitBarrier(queue, this);
- +
- + DispatchQueueWaitIdle(queue);
- +
- + for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
- + const VkSubmitInfo *submit = &pSubmits[submit_idx];
- + for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
- + auto cb_node = GetCBState(submit->pCommandBuffers[i]);
- + UtilProcessInstrumentationBuffer(queue, cb_node, this);
- + for (auto secondaryCmdBuffer : cb_node->linkedCommandBuffers) {
- + UtilProcessInstrumentationBuffer(queue, secondaryCmdBuffer, this);
- + }
- + }
- + }
- +}
- +
- +void AutoInst::PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount,
- + uint32_t firstVertex, uint32_t firstInstance) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
- +}
- +
- +void AutoInst::PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
- + uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
- +}
- +
- +void AutoInst::PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
- + uint32_t stride) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
- +}
- +
- +void AutoInst::PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset,
- + uint32_t count, uint32_t stride) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
- +}
- +
- +void AutoInst::PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE);
- + if (pipeline_to_instrument == VK_PIPELINE_BIND_POINT_COMPUTE) {
- + compute_launch_records.push_back({x, y, z});
- + }
- +}
- +
- +void AutoInst::PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE);
- +}
- +
- +void AutoInst::PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
- + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
- + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
- + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
- + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
- + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
- + uint32_t width, uint32_t height, uint32_t depth) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_NV);
- +}
- +
- +void AutoInst::PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
- + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
- + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
- + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
- + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
- + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
- + uint32_t width, uint32_t height, uint32_t depth) {
- + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
- + cb_state->hasTraceRaysCmd = true;
- +
- + rt_launch_records.push_back({width, height, depth});
- +}
- +
- +void AutoInst::PreCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
- + uint32_t height, uint32_t depth) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
- +}
- +
- +void AutoInst::PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
- + uint32_t height, uint32_t depth) {
- + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
- + cb_state->hasTraceRaysCmd = true;
- +
- + rt_launch_records.push_back({width, height, depth});
- +}
- +
- +void AutoInst::PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
- + VkDeviceAddress indirectDeviceAddress) {
- + AllocateAutoInstResources(commandBuffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
- +}
- +
- +void AutoInst::PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
- + VkDeviceAddress indirectDeviceAddress) {
- + CMD_BUFFER_STATE *cb_state = GetCBState(commandBuffer);
- + cb_state->hasTraceRaysCmd = true;
- +}
- +
- +void AutoInst::PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR *pPresentInfo, VkResult result) {
- + if (aborted) return;
- +    // Advance the frame counter and reset the per-frame indices used for analysis file naming and buffer sizing.
- + frame_number++;
- + analysis_index = 0;
- + pipeline_creation_index = 0;
- +}
- +
- +void AutoInst::AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point) {
- + if (bind_point != VK_PIPELINE_BIND_POINT_GRAPHICS && bind_point != VK_PIPELINE_BIND_POINT_COMPUTE &&
- + bind_point != VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) {
- + return;
- + }
- +
- + if (pipeline_to_instrument != bind_point) {
- + return;
- + }
- +
- + VkResult result;
- +
- + if (aborted) return;
- +
- + std::vector<VkDescriptorSet> desc_sets;
- + VkDescriptorPool desc_pool = VK_NULL_HANDLE;
- + result = desc_set_manager->GetDescriptorSets(1, &desc_pool, debug_desc_layout, &desc_sets);
- + assert(result == VK_SUCCESS);
- + if (result != VK_SUCCESS) {
- + ReportSetupProblem(device, "Unable to allocate descriptor sets. Device could become unstable.");
- + aborted = true;
- + return;
- + }
- +
- + auto buffer_size = (BufferSizeRequirementsLookup.size() <= pipeline_creation_index)
- + ? output_buffer_size
- + : (uint32_t)(BufferSizeRequirementsLookup[pipeline_creation_index]);
- +
- + VkDescriptorBufferInfo output_desc_buffer_info = {};
- + output_desc_buffer_info.range = buffer_size;
- +
- + auto cb_node = GetCBState(cmd_buffer);
- + if (!cb_node) {
- + ReportSetupProblem(device, "Unrecognized command buffer");
- + aborted = true;
- + return;
- + }
- +
- +    // Allocate memory for the output block that the GPU will use to return values for instrumentation
- + AIDeviceMemoryBlock output_block = {};
- + VkBufferCreateInfo bufferInfo = {VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO};
- + bufferInfo.size = buffer_size;
- + bufferInfo.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
- + VmaAllocationCreateInfo allocInfo = {};
- + allocInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
- + result = vmaCreateBuffer(vmaAllocator, &bufferInfo, &allocInfo, &output_block.buffer, &output_block.allocation, nullptr);
- +
- + output_block.creation_index = pipeline_creation_index;
- + pipeline_creation_index++;
- + if (result != VK_SUCCESS) {
- + ReportSetupProblem(device, "Unable to allocate device memory. Device could become unstable.");
- + aborted = true;
- + return;
- + }
- +
- +    // Clear the output block to zeros so that only values from the GPU will be present
- + uint32_t *pData;
- + result = vmaMapMemory(vmaAllocator, output_block.allocation, (void **)&pData);
- + if (result == VK_SUCCESS) {
- + memset(pData, 0, buffer_size);
- + InitializeInstrumentationBuffer(pData);
- + vmaUnmapMemory(vmaAllocator, output_block.allocation);
- + }
- +
- + VkWriteDescriptorSet desc_writes[1] = {};
- + const uint32_t desc_count = 1;
- +
- + // Write the descriptor
- + output_desc_buffer_info.buffer = output_block.buffer;
- + output_desc_buffer_info.offset = 0;
- +
- + desc_writes[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
- + desc_writes[0].descriptorCount = 1;
- + desc_writes[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
- + desc_writes[0].pBufferInfo = &output_desc_buffer_info;
- + desc_writes[0].dstSet = desc_sets[0];
- + desc_writes[0].dstBinding = 3;
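- +    // Binding 3 within the reserved descriptor set is presumably where the auto-inst passes write their
- +    // output (it mirrors the binding used by the debug-printf output stream).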
- + DispatchUpdateDescriptorSets(device, desc_count, desc_writes, 0, NULL);
- +
- + const auto lv_bind_point = ConvertToLvlBindPoint(bind_point);
- + const auto *pipeline_state = cb_node->lastBound[lv_bind_point].pipeline_state;
- + if (pipeline_state) {
- + if (pipeline_state->pipeline_layout->set_layouts.size() <= desc_set_bind_index) {
- + DispatchCmdBindDescriptorSets(cmd_buffer, bind_point, pipeline_state->pipeline_layout->layout, desc_set_bind_index, 1,
- + desc_sets.data(), 0, nullptr);
- + }
- + // Record buffer and memory info in CB state tracking
- + GetBufferInfo(cmd_buffer).emplace_back(output_block, desc_sets[0], desc_pool, bind_point);
- + } else {
- + ReportSetupProblem(device, "Unable to find pipeline state");
- + vmaDestroyBuffer(vmaAllocator, output_block.buffer, output_block.allocation);
- + aborted = true;
- + return;
- + }
- +}
- diff --git a/layers/auto_inst.h b/layers/auto_inst.h
- new file mode 100644
- index 00000000..dd5dbbd9
- --- /dev/null
- +++ b/layers/auto_inst.h
- @@ -0,0 +1,465 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "chassis.h"
- +#include "vk_mem_alloc.h"
- +#include "state_tracker.h"
- +#include "gpu_utils.h"
- +#include "spirv-tools/instrument.hpp"
- +#include <functional>
- +#include <map>
- +#include <set>
- +
- +class AutoInst;
- +
- +struct AIDeviceMemoryBlock {
- + uint32_t creation_index;
- + VkBuffer buffer;
- + VmaAllocation allocation;
- +};
- +
- +struct AIBufferInfo {
- + AIDeviceMemoryBlock output_mem_block;
- + VkDescriptorSet desc_set;
- + VkDescriptorPool desc_pool;
- + VkPipelineBindPoint pipeline_bind_point;
- + AIBufferInfo(AIDeviceMemoryBlock output_mem_block, VkDescriptorSet desc_set, VkDescriptorPool desc_pool,
- + VkPipelineBindPoint pipeline_bind_point)
- + : output_mem_block(output_mem_block), desc_set(desc_set), desc_pool(desc_pool), pipeline_bind_point(pipeline_bind_point){};
- +};
- +
- +struct AIShaderTracker {
- + VkPipeline pipeline;
- + VkShaderModule shader_module;
- + std::vector<unsigned int> pgm;
- + VkShaderStageFlagBits stage;
- +};
- +
- +struct AIUniqueSubgroupIdEntry {
- + uint32_t inst_id;
- + uint32_t flat_thread_id;
- + uint32_t subgroup_ids; // Combined subgroup id and intra subgroup id.
- +
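- +    // Bit layout: bits 0-26 hold the subgroup id and bits 27-31 hold the intra-subgroup (lane) id,
- +    // consistent with SUBGROUP_SIZE == 32.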
- + inline uint32_t SubgroupId() const { return subgroup_ids & 0x07FFFFFF; }
- + inline uint32_t IntraSubgroupId() const { return (subgroup_ids & 0xF8000000) >> 27; }
- +};
- +
- +struct LaunchDims3D {
- + uint32_t x_dim;
- + uint32_t y_dim;
- + uint32_t z_dim;
- +};
- +
- +class AutoInst : public ValidationStateTracker {
- + VkPhysicalDeviceFeatures supported_features;
- +
- + uint32_t unique_shader_module_id = 0;
- + std::unordered_map<VkCommandBuffer, std::vector<AIBufferInfo>> command_buffer_map;
- + uint32_t output_buffer_size;
- +
- + protected:
- + static const uint32_t SUBGROUP_SIZE = 32;
- +
- + // Reserved words in the buffer
- + static const uint32_t WORDS_WRITTEN_INDEX = 0;
- + static const uint32_t NUM_SUBGROUP_IDS_INDEX = 1;
- +
- + static const uint32_t NUM_BUFFER_RESERVED_WORDS = 2;
- +
- + // Reserved word in debug mode
- + static const uint32_t BUFFER_DEBUG_LOCATION = 1;
- +
- + public:
- + using ThreadIdToSubgroupIdMap = std::unordered_map<uint32_t, uint32_t>;
- +
- + // Map from subgroup_id * SUBGROUP_SIZE + thread_offset to runtime
- + // thread_id
- + using ThreadIdSwizzleMap = std::unordered_map<uint32_t, uint32_t>;
- +
- +    // Map from primitive id to the number of words that the primitive wrote
- + // to the StorageBuffer.
- + using PrimitiveIdToPrimitiveSizeMap = std::unordered_map<uint32_t, uint32_t>;
- +
- +    // Records of pipeline invocation launch dimension parameters.
- + using LaunchDimRecords = std::vector<LaunchDims3D>;
- +
- + AutoInst() { container_type = LayerObjectTypeAutoInst; }
- +
- + // The pipeline type to instrument
- + VkPipelineBindPoint pipeline_to_instrument = VK_PIPELINE_BIND_POINT_MAX_ENUM;
- +
- +    // Records of the ray tracing and compute launch sizes; useful for rebuilding
- +    // the frames for visualization.
- + LaunchDimRecords rt_launch_records;
- + LaunchDimRecords compute_launch_records;
- +
- + // Helper variables for properly naming files output by the analysis
- + std::string base_file_name;
- + // index of next instrumented pipeline to analyze
- + uint32_t analysis_index = 0;
- + // index of next instrumented pipeline to create. Used for determining runtime
- + // buffer size requirements in the case that previous runs were recorded.
- + uint32_t pipeline_creation_index = 0;
- + uint32_t frame_number = 0;
- + bool is_analyzing_rt = false;
- + bool is_analyzing_draw = false;
- + bool is_analyzing_compute = false;
- +
- + // Variables for debug modes
- + // This framework relies on atomic operations in SPIR-V for writing
- + // instrumentation results to the StorageBuffer and Subgroup
- + // operations for determining the active thread mask.
- + bool is_debugging_atomic_ops = false;
- + bool is_debugging_subgroup_ops = false;
- + bool is_debugging_array_length_op = false;
- +
- + // if true, dump instrumented shaders
- + // if false, do nothing.
- + bool dump_shaders = false;
- +
- +    // This map specifies, per shader stage, which creation indices to instrument. E.g.,
- +    // if the option is Miss2 then the 2nd Miss shader that is created
- +    // will be instrumented.
- + //
- + // If this map is uninitialized (size == 0) then it is assumed that
- + // all shaders should be instrumented.
- + std::unordered_map<uint32_t, std::set<uint32_t>> StageToInstIndices;
- +
- +    // Track how many shaders of each stage have been created.
- + std::unordered_map<uint32_t, uint32_t> Stage2SeenCount;
- +
- + // Track how many bytes were required by previous invocations of a given pipeline.
- + // This data is written to a cache file that is read for subsequent executions
- + // of the application.
- + std::vector<uint32_t> BufferSizeRequirementsLookup;
- +
- + bool aborted = false;
- + bool use_stdout = false;
- + VkDevice device;
- + VkPhysicalDevice physicalDevice;
- + uint32_t adjusted_max_desc_sets;
- + uint32_t desc_set_bind_index;
- + VkDescriptorSetLayout debug_desc_layout = VK_NULL_HANDLE;
- + VkDescriptorSetLayout dummy_desc_layout = VK_NULL_HANDLE;
- + std::unique_ptr<UtilDescriptorSetManager> desc_set_manager;
- + std::unordered_map<uint32_t, AIShaderTracker> shader_map;
- + std::unordered_map<uint32_t, std::vector<unsigned int>> instrumentation_map;
- + PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
- + VmaAllocator vmaAllocator = {};
- + std::map<VkQueue, UtilQueueBarrierCommandInfo> queue_barrier_command_infos;
- + std::vector<AIBufferInfo>& GetBufferInfo(const VkCommandBuffer command_buffer) {
- + auto buffer_list = command_buffer_map.find(command_buffer);
- + if (buffer_list == command_buffer_map.end()) {
- + std::vector<AIBufferInfo> new_list{};
- + command_buffer_map[command_buffer] = new_list;
- + return command_buffer_map[command_buffer];
- + }
- + return buffer_list->second;
- + }
- +
- + // Subclass Hooks
- +
- + // Opportunity for inheriting classes to initialize
- + // and parse vk_settings_file.txt settings.
- + virtual void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) = 0;
- +
- + // Opportunity for inheriting classes to set the
- + // buffer to nonzero values for use-cases like PGO.
- + virtual void InitializeInstrumentationBuffer(uint32_t* buffer) = 0;
- +
- + // Opportunity for inheriting class to register auto-inst pass
- + // as well as other passes of interest (e.g. performance)
- + virtual void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) = 0;
- +
- + virtual void AnalyzeRayTracing(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) = 0;
- +
- + virtual void AnalyzeGraphics(uint32_t* const output_buffer, bool buffer_overflowed) = 0;
- +
- + virtual void AnalyzeCompute(uint32_t* const output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) = 0;
- +
- + // Helper functions
- + template <typename T>
- + void ReportSetupProblem(T object, std::string specific_message) const;
- + template <typename T>
- + void ReportInfo(T object, std::string specific_message) const;
- +
- + // This function takes a disassembled SPIR-V module in |shader|
- + // and adds the strings in |inst_id2str| immediately before
- + // the instrumentation callsites with a given inst_id.
- + //
- + // After the function has finished it will return the shader
- + // with annotations. This is designed to be used in conjunction
- +    // with SPIRV-Cross.
- + std::string AnnotateModuleStr(std::string& shader, std::unordered_map<uint32_t, std::string>& inst_id2str) const;
- +
- +    // This function takes an annotated SPIR-V |shader| module as a string and
- +    // attempts to cross-compile it to the corresponding GLSL using SPIRV-Cross.
- +    //
- +    // After cross-compiling, a post-processing step converts the
- +    // #line directives that were added into valid GLSL comments.
- +    //
- +    // SPIRV-Cross fails frequently due to unsupported builtins; the fallback
- +    // path is to emit the module as .spv rather than .glsl. Either the .spv or the .glsl
- +    // will be written to |file_name|.
- + void TryCompileModuleStrToGlsl(const std::string shader, std::string file_name) const;
- +
- + // Returns a file name that includes the base_file_name, analysis type,
- + // frame number and finally the |analysis_specific_suffix|.
- + inline std::string FrameAnalysisFileName(std::string analysis_specific_suffix) const {
- + std::string analysis_type;
- + if (is_analyzing_compute) {
- + analysis_type = "compute";
- + } else if (is_analyzing_draw) {
- + analysis_type = "draw";
- + } else if (is_analyzing_rt) {
- + analysis_type = "rt";
- + } else {
- + analysis_type = "unknown";
- + }
- + return base_file_name + analysis_type + "_frame" + std::to_string(frame_number) + "_" + analysis_specific_suffix;
- + }
- +
- +    // Returns a file name that includes the base_file_name, analysis type, analysis-specific pipeline invocation index,
- +    // frame number, and finally the |analysis_specific_suffix|.
- + inline std::string PipelineAnalysisFileName(std::string analysis_specific_suffix) const {
- + std::string analysis_type;
- + if (is_analyzing_compute) {
- + analysis_type = "compute";
- + } else if (is_analyzing_draw) {
- + analysis_type = "draw";
- + } else if (is_analyzing_rt) {
- + analysis_type = "rt";
- + } else {
- + analysis_type = "unknown";
- + }
- + return base_file_name + analysis_type + std::to_string(analysis_index) + "_frame" + std::to_string(frame_number) + "_" +
- + analysis_specific_suffix;
- + }
- +
- + static inline std::string ShaderStageToString(uint32_t stage) {
- + switch (stage) {
- + case VK_SHADER_STAGE_RAYGEN_BIT_KHR:
- + return "RayGen";
- + case VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR:
- + return "ClosestHit";
- + case VK_SHADER_STAGE_CALLABLE_BIT_KHR:
- + return "Callable";
- + case VK_SHADER_STAGE_MISS_BIT_KHR:
- + return "Miss";
- + case VK_SHADER_STAGE_ANY_HIT_BIT_KHR:
- + return "AnyHit";
- + case VK_SHADER_STAGE_INTERSECTION_BIT_KHR:
- + return "Intersection";
- + case VK_SHADER_STAGE_GEOMETRY_BIT:
- + return "Geometry";
- + case VK_SHADER_STAGE_FRAGMENT_BIT:
- + return "Fragment";
- + case VK_SHADER_STAGE_COMPUTE_BIT:
- + return "Compute";
- + case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
- + return "TessellationControl";
- + case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
- + return "TessellationEvaluation";
- + case VK_SHADER_STAGE_VERTEX_BIT:
- + return "Vertex";
- + default:
- + return "Unknown" + std::to_string(stage);
- + }
- + }
- +
- + // Create a PPM file with size |width| * |height| by writing the values in |colors| in row-major order
- + void CreateImage(uint32_t width, uint32_t height, std::vector<char>& colors, std::string file_name) const;
- +    // Return a color represented as RGB from a value in the unit interval [0,1].
- + std::tuple<char, char, char> UnitIntervalToRGB(float val) const;
- +
- + // Analyze the SPIR-V module binary |pgm| to determine which execution model it
- + // implements and the corresponding VkShaderStageFlag.
- + //
- + // If the shader stage is not supported, or the shader module implements more than
- +    // one execution model, then this function returns 0.
- +    // Otherwise it returns the single-bit representation of the shader stage.
- + uint32_t FindShaderStage(std::vector<unsigned int> pgm) const;
- +
- + // Analyze the SPIR-V module binary |pgm| of a compute shader to determine the
- +    // local size that it declares.
- +    // If the shader stage is not supported, this function returns (0,0,0).
- +    // Otherwise it returns the x,y,z values of the local size.
- + std::tuple<uint32_t, uint32_t, uint32_t> FindComputeLocalSize(std::vector<unsigned int> pgm) const;
- +
- + // File name of cache file containing runtime instrumentation buffer size requirements.
- + inline std::string RuntimeSizeCachePath(VkPipelineBindPoint bind_point) const {
- + std::string pipeline_type;
- + switch (bind_point) {
- + case VK_PIPELINE_BIND_POINT_COMPUTE:
- + pipeline_type = "compute";
- + break;
- + case VK_PIPELINE_BIND_POINT_GRAPHICS:
- + pipeline_type = "graphics";
- + break;
- + case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
- + pipeline_type = "ray_tracing";
- + break;
- + default:
- + pipeline_type = "Unknown";
- + break;
- + }
- + return ".ai_runtime_size_cache_" + pipeline_type + ".bin";
- + }
- +
- + // Attempts to read the cache file containing information about this application's
- + // runtime buffer size requirements.
- + //
- + // This function populates the variable BufferSizeRequirementsLookup
- + void TryReadRuntimeSizeCache(AutoInst* device_auto_inst);
- +
- + // Writes the current knowledge of the runtime instrumentation buffer requirements
- + // to the cache file.
- + void WriteRuntimeSizeCache();
- +
- + // Creates mappings from the data output by the 'UniqueSubgroupId' primitive.
- + // |primitive_id2primitive_size| map determines how many words in the buffer
- + // belong to a given primitive type.
- + // |thread_id2subgroup_id_map| allows the lookup from thread id to subgroup id.
- + // |thread_id_swizzle_map| allows for lookup of the original flat thread id
- + // from the unique subgroup id and intra subgroup id.
- +    // |inst_id2prim_id| is invoked with the first word of every entry;
- +    // this allows an analysis to specify custom inst ids and still
- +    // relate them to prim ids.
- + // Returns true if creating mappings is successful, false otherwise.
- + bool CreateUniqueSubgroupIdMappings(
- + uint32_t* const debug_output_buffer, PrimitiveIdToPrimitiveSizeMap& primitive_id2primitive_size,
- + ThreadIdToSubgroupIdMap& thread_id2subgroup_id_map, ThreadIdSwizzleMap& thread_id_swizzle_map,
- + std::function<uint32_t(uint32_t inst_id)> inst_id2prim_id = [](uint32_t x) { return x; }) const;
- +
- + // Core auto-inst functionality
- + bool InstrumentShader(const VkShaderModuleCreateInfo* pCreateInfo, std::vector<unsigned int>& new_pgm,
- + uint32_t* unique_shader_id);
- + void AllocateAutoInstResources(const VkCommandBuffer cmd_buffer, const VkPipelineBindPoint bind_point);
- +
- + // Validation Layer hooks
- + void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
- + const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_create_info) override;
- + void PostCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
- + const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, VkResult result) override;
- + void PreCallRecordDestroyDevice(VkDevice device, const VkAllocationCallbacks* pAllocator) override;
- + void PreCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo,
- + const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout,
- + void* cpl_state_data) override;
- + void PostCallRecordCreatePipelineLayout(VkDevice device, const VkPipelineLayoutCreateInfo* pCreateInfo,
- + const VkAllocationCallbacks* pAllocator, VkPipelineLayout* pPipelineLayout,
- + VkResult result) override;
- + void ResetCommandBuffer(VkCommandBuffer commandBuffer);
- + bool PreCallValidateCmdWaitEvents(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent* pEvents,
- + VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
- + uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers,
- + uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- + uint32_t imageMemoryBarrierCount,
- + const VkImageMemoryBarrier* pImageMemoryBarriers) const override;
- + void PreCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkGraphicsPipelineCreateInfo* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
- + void* cgpl_state_data) override;
- + void PreCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkComputePipelineCreateInfo* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
- + void* ccpl_state_data) override;
- + void PreCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoNV* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
- + void* crtpl_state_data) override;
- + void PreCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
- + VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
- + void* crtpl_state_data) override;
- + void PostCallRecordCreateGraphicsPipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkGraphicsPipelineCreateInfo* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
- + void* cgpl_state_data) override;
- + void PostCallRecordCreateComputePipelines(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkComputePipelineCreateInfo* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
- + void* ccpl_state_data) override;
- + void PostCallRecordCreateRayTracingPipelinesNV(VkDevice device, VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoNV* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines, VkResult result,
- + void* crtpl_state_data) override;
- + void PostCallRecordCreateRayTracingPipelinesKHR(VkDevice device, VkDeferredOperationKHR deferredOperation,
- + VkPipelineCache pipelineCache, uint32_t count,
- + const VkRayTracingPipelineCreateInfoKHR* pCreateInfos,
- + const VkAllocationCallbacks* pAllocator, VkPipeline* pPipelines,
- + VkResult result, void* crtpl_state_data) override;
- +
- + void PreCallRecordDestroyPipeline(VkDevice device, VkPipeline pipeline, const VkAllocationCallbacks* pAllocator) override;
- + void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
- + const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
- + void* csm_state_data) override;
- + void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, VkPipelineBindPoint pipeline_bind_point,
- + uint32_t operation_index, uint32_t* const debug_output_buffer);
- + void PreCallRecordCmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
- + uint32_t firstInstance) override;
- + void PreCallRecordCmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount,
- + uint32_t firstIndex, int32_t vertexOffset, uint32_t firstInstance) override;
- + void PreCallRecordCmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
- + uint32_t stride) override;
- + void PreCallRecordCmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, uint32_t count,
- + uint32_t stride) override;
- + void PreCallRecordCmdDispatch(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z) override;
- + void PreCallRecordCmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset) override;
- + void PreCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
- + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
- + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
- + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
- + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
- + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
- + uint32_t width, uint32_t height, uint32_t depth) override;
- + void PostCallRecordCmdTraceRaysNV(VkCommandBuffer commandBuffer, VkBuffer raygenShaderBindingTableBuffer,
- + VkDeviceSize raygenShaderBindingOffset, VkBuffer missShaderBindingTableBuffer,
- + VkDeviceSize missShaderBindingOffset, VkDeviceSize missShaderBindingStride,
- + VkBuffer hitShaderBindingTableBuffer, VkDeviceSize hitShaderBindingOffset,
- + VkDeviceSize hitShaderBindingStride, VkBuffer callableShaderBindingTableBuffer,
- + VkDeviceSize callableShaderBindingOffset, VkDeviceSize callableShaderBindingStride,
- + uint32_t width, uint32_t height, uint32_t depth) override;
- + void PreCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width,
- + uint32_t height, uint32_t depth) override;
- + void PostCallRecordCmdTraceRaysKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable, uint32_t width,
- + uint32_t height, uint32_t depth) override;
- + void PreCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- + VkDeviceAddress indirectDeviceAddress) override;
- + void PostCallRecordCmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
- + const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
- + const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
- + VkDeviceAddress indirectDeviceAddress) override;
- + void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits, VkFence fence,
- + VkResult result) override;
- + void PostCallRecordQueuePresentKHR(VkQueue queue, const VkPresentInfoKHR* pPresentInfo, VkResult result) override;
- +};
- diff --git a/layers/auto_inst_divergence_characterization.cpp b/layers/auto_inst_divergence_characterization.cpp
- new file mode 100644
- index 00000000..adf2fd18
- --- /dev/null
- +++ b/layers/auto_inst_divergence_characterization.cpp
- @@ -0,0 +1,157 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_divergence_characterization.h"
- +#include <bitset>
- +#include <fstream>
- +
- +namespace {
- +
- +struct DivCharRecord {
- + uint32_t inst_id;
- + uint32_t flat_thread_id;
- + uint32_t active_thread_mask;
- +};
- +
- +} // namespace
- +
- +void AutoInstDivergenceCharacterization::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
- +
- +void AutoInstDivergenceCharacterization::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index,
- + uint32_t shader_module_id) {
- + auto static_data_callback = [&](std::unordered_map<uint32_t, uint32_t>&& inst_id2prim_id,
- + std::unordered_map<uint32_t, uint32_t>&& inst_id2inst_count) {
- + inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end());
- + inst_id2inst_count_.insert(inst_id2inst_count.begin(), inst_id2inst_count.end());
- + };
- +
- + optimizer->RegisterPass(
- + spvtools::CreateAutoInstDivergenceCharacterizationPass(desc_bind_index, shader_module_id, static_data_callback));
- +}
- +
- +void AutoInstDivergenceCharacterization::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed,
- + uint32_t width, uint32_t height, uint32_t depth) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device, "Divergence characterization requires a complete execution trace. Aborting.\n");
- + return;
- + }
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- + ReportInfo(device, "Analyzing divergence characterization for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + auto num_threads = width * height * depth;
- +
- + // Create mapping from inst_id to inst_size to determine stride
- + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
- + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDivCharPreTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDivCharPostTraceRay, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDivCharQuitPipeline, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDivCharShaderEntryPoint, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDivCharActiveThreads, (uint32_t)(sizeof(DivCharRecord) / sizeof(uint32_t))},
- + };
- +
- + // Create subgroup id mapping to be populated
- + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
- + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
- + auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id, thread_id_swizzle,
- + [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; });
- +
- + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
- +        ReportSetupProblem(device, "Divergence characterization failed to acquire unique subgroup id maps. Aborting.\n");
- + return;
- + }
- +
- + std::unordered_map<uint32_t, std::vector<DivCharRecord>> subgroup_id2records;
- +
- + // Process the runtime timing data
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + auto inst_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
- + auto prim_id = inst_id2prim_id_[inst_id];
- +
- + if (prim_id >= spvtools::kAutoInstDivCharPreTraceRay && prim_id <= spvtools::kAutoInstDivCharQuitPipeline) {
- + auto subgroup_id = thread_id2subgroup_id[device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS + 1]];
- + auto record = *reinterpret_cast<DivCharRecord*>(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
- + subgroup_id2records[subgroup_id].push_back(record);
- + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
- + ReportSetupProblem(device, "Analysis received unrecognized primitive identifier. Aborting.\n");
- + return;
- + }
- +
- + j += prim_id2_prim_size[prim_id];
- + }
- +
- + uint64_t return_divergence = 0;
- + uint64_t control_flow_divergence = 0;
- + uint64_t indirect_function_call_divergence = 0;
- +
- + for (auto warp_itr = subgroup_id2records.begin(); warp_itr != subgroup_id2records.end(); warp_itr++) {
- + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
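- +            // m is the lane index within the subgroup and n == (1u << m) is its bit in the active-thread mask.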
- + std::vector<bool> recurse_thread_status;
- + bool is_returned = false;
- + bool is_indirect_func_killed = false;
- + for (const auto& offset_itr : warp_itr->second) {
- + auto prim_id = inst_id2prim_id_[offset_itr.inst_id];
- + bool is_thread_active = (offset_itr.active_thread_mask & n) != 0;
- +
- + if (prim_id == spvtools::kAutoInstDivCharQuitPipeline) {
- + is_returned = is_thread_active;
- + continue;
- + } else if (prim_id == spvtools::kAutoInstDivCharPreTraceRay) {
- + recurse_thread_status.push_back(is_thread_active);
- + continue;
- +                } else if (prim_id == spvtools::kAutoInstDivCharPostTraceRay) {
- +                    recurse_thread_status.pop_back();
- + if (is_thread_active) {
- + is_indirect_func_killed = false;
- + }
- + continue;
- + } else if (prim_id == spvtools::kAutoInstDivCharShaderEntryPoint) {
- + is_indirect_func_killed = recurse_thread_status.back() && !is_thread_active;
- + continue;
- + }
- +
- + if (!is_thread_active) {
- + if (inst_id2inst_count_.count(offset_itr.inst_id) == 0) {
- + ReportSetupProblem(device, "Missing static instruction count data. Aborting.\n");
- + return;
- + }
- + uint32_t num_insts = inst_id2inst_count_[offset_itr.inst_id];
- +
- + if (is_returned) {
- + return_divergence += num_insts;
- + } else if (is_indirect_func_killed) {
- +                        // Thread was active at the indirect function callsite but not here
- + indirect_function_call_divergence += num_insts;
- + } else {
- + control_flow_divergence += num_insts;
- + }
- + }
- + }
- + }
- + }
- +
- + ReportInfo(device, "Finished analyzing buffer!\n");
- +
- + std::ofstream csv_file;
- + csv_file.open(FrameAnalysisFileName("divergence_characterization.csv"), std::ios_base::app);
- + csv_file << "inst count, indirect func, early exit, control flow,\n";
- + csv_file << "," << indirect_function_call_divergence << "," << return_divergence << "," << control_flow_divergence << ",\n";
- + csv_file.close();
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_divergence_characterization.h b/layers/auto_inst_divergence_characterization.h
- new file mode 100644
- index 00000000..c0226d11
- --- /dev/null
- +++ b/layers/auto_inst_divergence_characterization.h
- @@ -0,0 +1,48 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstDivergenceCharacterization;
- +
- +class AutoInstDivergenceCharacterization : public AutoInst {
- + private:
- + std::unordered_map<uint32_t, uint32_t> inst_id2prim_id_;
- + std::unordered_map<uint32_t, uint32_t> inst_id2inst_count_;
- +
- + public:
- + AutoInstDivergenceCharacterization() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override;
- +
- + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + ReportSetupProblem(device, "Divergence analysis is not compatible with draw commands.");
- + }
- +
- + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
- + ReportSetupProblem(device, "Divergence analysis is not compatible with compute commands.");
- + }
- +};
- diff --git a/layers/auto_inst_dyn_shader_trace.cpp b/layers/auto_inst_dyn_shader_trace.cpp
- new file mode 100644
- index 00000000..7b11cc0b
- --- /dev/null
- +++ b/layers/auto_inst_dyn_shader_trace.cpp
- @@ -0,0 +1,177 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_dyn_shader_trace.h"
- +#include <bitset>
- +#include <fstream>
- +#include <algorithm>
- +#include <sstream>
- +
- +namespace {
- +struct ShaderExecutionRecord {
- + uint32_t prim_id;
- + uint32_t flat_thread_id;
- + uint32_t shader_id;
- + uint32_t active_thread_mask;
- +};
- +
- +} // namespace
- +
- +void AutoInstDynShaderTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
- +
- +void AutoInstDynShaderTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
- + optimizer->RegisterPass(spvtools::CreateAutoInstDynShaderTracePass(desc_bind_index, shader_module_id));
- +}
- +
- +void AutoInstDynShaderTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width,
- + uint32_t height, uint32_t depth) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device, "Ray tracing dynamic shader trace analysis requires a complete execution trace. Aborting.\n");
- + return;
- + }
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- + ReportInfo(device, "Analyzing " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + auto num_threads = width * height * depth;
- +
- + // Create mapping from inst_id to inst_size to determine stride
- + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
- + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
- + {spvtools::kAutoInstDynShaderTraceEntryPoint, (uint32_t)(sizeof(ShaderExecutionRecord) / sizeof(uint32_t))},
- + };
- +
- + // Create warp id mapping to be populated
- + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
- + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
- + auto res = CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size,
- + thread_id2subgroup_id, thread_id_swizzle);
- +
- + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
- + ReportSetupProblem(device, "Ray tracing pipeline timing analysis failed to acquire unique warp id maps. Aborting.\n");
- + return;
- + }
- +
- + // For heatmap
- + uint32_t max_thread_exe_count = 0;
- + uint32_t max_subgroup_exe_count = 0;
- +
- + std::unordered_map<uint32_t, uint32_t> thread_id2dyn_count;
- + std::unordered_map<uint32_t, uint32_t> subgroup_id2dyn_count;
- + // For CSV, ordered map for sensible output
- + std::map<uint32_t, uint32_t> shader_id2dyn_count;
- +
- + // Process the runtime timing data
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + auto prim_id = device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS];
- +
- + if (prim_id == spvtools::kAutoInstDynShaderTraceEntryPoint) {
- + auto shader_exe_record = reinterpret_cast<ShaderExecutionRecord*>(&device_output_buffer[j + NUM_BUFFER_RESERVED_WORDS]);
- + auto subgroup_id = thread_id2subgroup_id[shader_exe_record->flat_thread_id];
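- +            // Worked example with made-up values: an active_thread_mask of 0x0000000B has bits 0, 1
- +            // and 3 set, so only lanes 0, 1 and 3 of this subgroup executed the shader and only their
- +            // per-thread and per-shader counters are incremented by the loop below.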
- + for (uint32_t i = 0; i < SUBGROUP_SIZE; i++) {
- + if (shader_exe_record->active_thread_mask & (1 << i)) {
- + auto shader_id = shader_exe_record->shader_id;
- +                    // For every active thread, increment its per-thread and per-shader execution counts
- + auto swizzled_id = thread_id_swizzle[subgroup_id * SUBGROUP_SIZE + i];
- + thread_id2dyn_count[swizzled_id]++;
- + max_thread_exe_count = std::max(max_thread_exe_count, thread_id2dyn_count[swizzled_id]);
- + shader_id2dyn_count[shader_id]++;
- + }
- + }
- + subgroup_id2dyn_count[subgroup_id]++;
- + max_subgroup_exe_count = std::max(max_subgroup_exe_count, subgroup_id2dyn_count[subgroup_id]);
- +
- + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
- + ReportSetupProblem(device, "Encountered unsupported primtive type in Ray tracing thread timing analysis. Aborting.");
- + return;
- + }
- +
- + j += prim_id2_prim_size[prim_id];
- + }
- +
- + // Generate csv
- +
- + {
- + // Output dyn opcode count
- + std::stringstream line0, line1;
- + line0 << "shader,";
- + line1 << "dyn exe count,";
- + for (auto& entry : shader_id2dyn_count) {
- + auto shader_stage_name = ShaderStageToString(shader_map[entry.first].stage);
- + line0 << shader_stage_name << "(" << entry.first << ")"
- + << ",";
- + line1 << entry.second << ",";
- + }
- + line0 << "\n";
- + line1 << "\n";
- +
- + std::ofstream csv_file;
- + csv_file.open(PipelineAnalysisFileName("dyn_shader_counts.csv"));
- + csv_file << line0.str() << line1.str();
- + csv_file.close();
- + }
- +
- + {
- + std::vector<char> colors(num_threads * 3);
- + for (uint32_t y = 0; y < height; y++) {
- + for (uint32_t x = 0; x < width; x++) {
- + for (uint32_t z = 0; z < depth; z++) {
- + auto thread_id = z * (width * height) + y * width + x;
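- +                    // e.g. for a hypothetical 1920x1080 launch with depth == 1, the invocation at
- +                    // (x=3, y=2, z=0) maps to flat thread_id 2 * 1920 + 3 = 3843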
- +
- + auto rgb = UnitIntervalToRGB((float)(thread_id2dyn_count[thread_id] / (float)max_thread_exe_count));
- + uint32_t out_index = 0;
- +
- +                    if (depth > 1) // Quake II RTX specific layout: interleave the two depth layers side by side (assumes depth == 2)
- + out_index = y * (width * depth) + x * 2 + z;
- + else
- + out_index = y * width + x;
- +
- + colors[3 * out_index + 0] = std::get<0>(rgb);
- + colors[3 * out_index + 1] = std::get<1>(rgb);
- + colors[3 * out_index + 2] = std::get<2>(rgb);
- + }
- + }
- + }
- +
- + CreateImage(width * depth, height, colors, PipelineAnalysisFileName("shader_execution_heatmap"));
- + }
- +
- + {
- + std::vector<char> colors(num_threads * 3);
- + for (uint32_t y = 0; y < height; y++) {
- + for (uint32_t x = 0; x < width; x++) {
- + for (uint32_t z = 0; z < depth; z++) {
- + auto thread_id = z * (width * height) + y * width + x;
- + auto subgroup_id = thread_id2subgroup_id[thread_id];
- + auto rgb = UnitIntervalToRGB((float)(subgroup_id2dyn_count[subgroup_id] / (float)max_subgroup_exe_count));
- + uint32_t out_index = 0;
- +
- +                    if (depth > 1) // Quake II RTX specific layout: interleave the two depth layers side by side (assumes depth == 2)
- + out_index = y * (width * depth) + x * 2 + z;
- + else
- + out_index = y * width + x;
- +
- + colors[3 * out_index + 0] = std::get<0>(rgb);
- + colors[3 * out_index + 1] = std::get<1>(rgb);
- + colors[3 * out_index + 2] = std::get<2>(rgb);
- + }
- + }
- + }
- +
- + CreateImage(width * depth, height, colors, PipelineAnalysisFileName("subgroup_shader_execution_heatmap"));
- + }
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_dyn_shader_trace.h b/layers/auto_inst_dyn_shader_trace.h
- new file mode 100644
- index 00000000..02e8b99a
- --- /dev/null
- +++ b/layers/auto_inst_dyn_shader_trace.h
- @@ -0,0 +1,44 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstDynShaderTrace;
- +
- +class AutoInstDynShaderTrace : public AutoInst {
- + public:
- + AutoInstDynShaderTrace() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override;
- +
- + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + ReportSetupProblem(device, "Dynamic shader trace for graphics is not yet implemented!\n");
- + }
- +
- + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
- + ReportSetupProblem(device, "Dynamic shader trace for compute is not yet implemented!\n");
- + }
- +};
- diff --git a/layers/auto_inst_dyn_trace_ray_trace.cpp b/layers/auto_inst_dyn_trace_ray_trace.cpp
- new file mode 100644
- index 00000000..cec184b0
- --- /dev/null
- +++ b/layers/auto_inst_dyn_trace_ray_trace.cpp
- @@ -0,0 +1,223 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_dyn_trace_ray_trace.h"
- +#include <bitset>
- +#include <fstream>
- +
- +namespace {} // namespace
- +
- +void AutoInstDynTraceRayTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {
- + inst_id2prim_id_.clear();
- + merge_id2div_ids_.clear();
- +}
- +
- +void AutoInstDynTraceRayTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
- + auto static_data_callback = [&](std::unordered_map<uint32_t, uint32_t>&& inst_id2prim_id,
- + std::unordered_map<uint32_t, std::vector<uint32_t>>&& merge_id2div_ids) {
- + inst_id2prim_id_.insert(inst_id2prim_id.begin(), inst_id2prim_id.end());
- + merge_id2div_ids_.insert(merge_id2div_ids.begin(), merge_id2div_ids.end());
- + };
- + optimizer->RegisterPass(spvtools::CreateAutoInstDynTraceRayTracePass(desc_bind_index, shader_module_id, static_data_callback));
- +}
- +
- +void AutoInstDynTraceRayTrace::AnalyzeRayTracing(uint32_t* const device_output_buffer, bool buffer_overflowed, uint32_t width,
- + uint32_t height, uint32_t depth) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device,
- + "Dynamic traceRay trace analysis cannot produce a valid result without a complete execution trace.\n");
- + return;
- + }
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- + ReportInfo(device, "Analyzing dynamic traceRay trace for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + auto num_subgroup_ids = device_output_buffer[NUM_SUBGROUP_IDS_INDEX];
- + auto num_threads = width * height * depth;
- +
- + // Create mapping from inst_id to inst_size to determine stride
- + AutoInst::PrimitiveIdToPrimitiveSizeMap prim_id2_prim_size = {
- + {spvtools::kAutoInstUniqueSubgroupId, (uint32_t)(sizeof(AIUniqueSubgroupIdEntry) / sizeof(uint32_t))},
- + {spvtools::kAutoInstTraceRayTracePreTraceRay, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))},
- + {spvtools::kAutoInstTraceRayTraceMergePoint, (uint32_t)(sizeof(DynTraceRayTraceRecord) / sizeof(uint32_t))},
- + };
- +
- + // Create subgroup id mapping to be populated
- + AutoInst::ThreadIdToSubgroupIdMap thread_id2subgroup_id;
- + AutoInst::ThreadIdSwizzleMap thread_id_swizzle;
- + auto res =
- + CreateUniqueSubgroupIdMappings(device_output_buffer, prim_id2_prim_size, thread_id2subgroup_id,
- + thread_id_swizzle, [&](uint32_t inst_id) { return inst_id2prim_id_[inst_id]; });
- +
- + if (!res || thread_id2subgroup_id.size() != num_threads || thread_id_swizzle.size() != num_threads) {
- + ReportSetupProblem(device, "Failed to acquire unique subgroup id maps. Aborting.\n");
- + return;
- + }
- +
- + {
- + // For thread compaction
- + std::unordered_map<uint32_t, std::unordered_map<uint32_t, std::vector<bool>>> thread_paths;
- + std::unordered_map<uint32_t, std::unordered_map<uint32_t, uint32_t>> merge_visit_count;
- + std::unordered_map<uint32_t, uint32_t> max_visit_count;
- + std::set<uint32_t> points_of_interest;
- +
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + auto inst_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
- + auto prim_id = inst_id2prim_id_[inst_id];
- + if (prim_id == spvtools::kAutoInstTraceRayTracePreTraceRay) {
- + // Record a positive result (thread executed traceRay)
- + auto entry = reinterpret_cast<DynTraceRayTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
- + auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id];
- +
- + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
- + auto bit = entry->active_thread_mask & n;
- + if (bit) {
- + auto thread_id = m + SUBGROUP_SIZE * subgroup_id;
- + thread_paths[inst_id][thread_id].push_back(true);
- + max_visit_count[inst_id] = (thread_paths[inst_id][thread_id].size() > max_visit_count[inst_id])
- + ? (uint32_t)thread_paths[inst_id][thread_id].size()
- + : max_visit_count[inst_id];
- + }
- + }
- + if (points_of_interest.count(inst_id) == 0) {
- + points_of_interest.insert(inst_id);
- + }
- +
- + } else if (prim_id == spvtools::kAutoInstTraceRayTraceMergePoint) {
- + auto entry = reinterpret_cast<DynTraceRayTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
- + auto subgroup_id = thread_id2subgroup_id[entry->flat_thread_id];
- + // Record negative result if necessary (thread skipped traceRay)
- + for (auto& label_it : merge_id2div_ids_[inst_id]) {
- + if (points_of_interest.count(label_it) == 0) continue;
- + for (uint32_t m = 0, n = 1; m < SUBGROUP_SIZE; m++, n *= 2) {
- + auto thread_id = m + SUBGROUP_SIZE * subgroup_id;
- + if ((entry->active_thread_mask & n) == 0) continue;
- + merge_visit_count[label_it][thread_id]++;
- +
- + if (thread_paths[label_it][thread_id].size() >= merge_visit_count[label_it][thread_id]) {
- + merge_visit_count[label_it][thread_id] = (uint32_t)thread_paths[label_it][thread_id].size();
- + continue; // Has been set due to active thread taking branch
- + }
- +
- + thread_paths[label_it][thread_id].push_back(false);
- + max_visit_count[label_it] = (thread_paths[label_it][thread_id].size() > max_visit_count[label_it])
- + ? (uint32_t)thread_paths[label_it][thread_id].size()
- + : max_visit_count[label_it];
- + }
- + }
- + } else if (prim_id != spvtools::kAutoInstUniqueSubgroupId) {
- + ReportSetupProblem(device, "Unrecognized primitive. Aborting.\n");
- + return;
- + }
- +
- + j += prim_id2_prim_size[prim_id];
- + }
- +
- + // Done analyzing StorageBuffer
- + const int MAX_PATH_LEN = 1024;
- +
- + // Flatten the thread paths according to the maximum dynamic invocation count
- + // of each traceRay callsite
- +        // Consider thread A that executed an inner loop once per iteration of a 3-iteration outer loop,
- +        // versus thread B that executed the inner loop 3 times per iteration of the same outer loop.
- + // Before flattening:
- + // thread A: 111
- + // thread B: 111111111
- + // After flattening
- + // thread A: 001001001
- + // thread B: 111111111
- + std::unordered_map<std::bitset<MAX_PATH_LEN>, uint32_t> flat_path_count;
- + std::bitset<MAX_PATH_LEN> flat_thread_path;
- + for (uint32_t thread_id = 0; thread_id < num_threads; thread_id++) {
- + std::size_t k = 0;
- + flat_thread_path.reset();
- + for (auto& label_id : points_of_interest) {
- + j = 0;
- + for (j = 0; j < thread_paths[label_id][thread_id].size(); j++) {
- + if (thread_paths[label_id][thread_id][j]) {
- + flat_thread_path.set(k, 1);
- + }
- + k += 1;
- + }
- + if (j > max_visit_count[label_id]) {
- + ReportSetupProblem(device, "Max visit count not set correctly. Aborting\n");
- + return;
- + }
- + k += max_visit_count[label_id] - j;
- + if (k > MAX_PATH_LEN)
- + ReportSetupProblem(device, ("Encountered more than " + std::to_string(MAX_PATH_LEN) + " branches!").c_str());
- + }
- + flat_path_count[flat_thread_path]++;
- + }
- +
- + // Record thread paths and their respective counts
- + std::ofstream csv_file;
- + csv_file.open(PipelineAnalysisFileName("thread_paths.csv"));
- + csv_file << "path,count,\n";
- + for (auto& path_it : flat_path_count) csv_file << path_it.first << "," << path_it.second << "\n";
- + csv_file.close();
- +
- + csv_file.open(PipelineAnalysisFileName("thread_compaction.csv"));
- +
- + ReportInfo(device, "Done simulated threads\n");
- + for (auto& poi_label : points_of_interest) {
- + csv_file << poi_label << "\n";
- + std::vector<uint32_t> active_threads;
- + std::vector<uint32_t> active_threads_per_window;
- + std::vector<uint32_t> total_threads;
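- +        // How to read the loop below (an interpretation of the code, not a specification): window_size
- +        // is measured in subgroups. For every window of that many adjacent subgroups, the threads that
- +        // reached this traceRay callsite are counted per visit, and total_threads rounds each count up
- +        // to whole subgroups, i.e. the subgroups still required if the active threads in the window
- +        // were compacted together. active/total is then the SIMT efficiency that compaction over
- +        // windows of this size could achieve.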
- + for (uint32_t window_size = 1; window_size < num_subgroup_ids * 2; window_size <<= 1) {
- + active_threads.clear();
- + total_threads.clear();
- + // window size unit is subgroups
- + for (uint32_t window_base = 0; window_base < num_subgroup_ids; window_base += window_size) {
- + active_threads_per_window.clear();
- + for (uint32_t window_offset = 0; window_offset < window_size; window_offset++) {
- + if (window_base + window_offset >= num_subgroup_ids) continue;
- + for (uint32_t thread_offset = 0; thread_offset < SUBGROUP_SIZE; thread_offset++) {
- + auto thread_id = (window_base + window_offset) * SUBGROUP_SIZE + thread_offset;
- + auto num_visits = thread_paths[poi_label][thread_id].size();
- + if (active_threads_per_window.size() < num_visits) active_threads_per_window.resize(num_visits);
- + for (uint32_t visit_count = 0; visit_count < num_visits; visit_count++) {
- + if (thread_paths[poi_label][thread_id][visit_count]) active_threads_per_window[visit_count]++;
- + }
- + }
- + }
- +
- + if (active_threads.size() < active_threads_per_window.size()) {
- + active_threads.resize(active_threads_per_window.size());
- + total_threads.resize(active_threads.size());
- + }
- +
- + for (uint32_t visit_count = 0; visit_count < active_threads_per_window.size(); visit_count++) {
- + active_threads[visit_count] += active_threads_per_window[visit_count];
- + total_threads[visit_count] += ((active_threads_per_window[visit_count] / SUBGROUP_SIZE) +
- + ((active_threads_per_window[visit_count] % SUBGROUP_SIZE != 0) ? 1 : 0)) *
- + SUBGROUP_SIZE;
- + }
- + }
- + for (uint32_t visit_count = 0; visit_count < active_threads.size(); visit_count++) {
- + if (active_threads[visit_count] == 0 && total_threads[visit_count] == 0) continue;
- +
- + csv_file << "," << window_size << "," << visit_count << "," << active_threads[visit_count] << "/"
- + << total_threads[visit_count] << "\n";
- + }
- + }
- + }
- + csv_file.close();
- + }
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_dyn_trace_ray_trace.h b/layers/auto_inst_dyn_trace_ray_trace.h
- new file mode 100644
- index 00000000..769b0cc4
- --- /dev/null
- +++ b/layers/auto_inst_dyn_trace_ray_trace.h
- @@ -0,0 +1,55 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstDynTraceRayTrace;
- +
- +struct DynTraceRayTraceRecord {
- + uint32_t inst_id;
- + uint32_t flat_thread_id;
- + uint32_t active_thread_mask;
- +};
- +
- +class AutoInstDynTraceRayTrace : public AutoInst {
- + private:
- + std::unordered_map<uint32_t, uint32_t> inst_id2prim_id_;
- + std::unordered_map<uint32_t, std::vector<uint32_t>> merge_id2div_ids_;
- +
- + public:
- + AutoInstDynTraceRayTrace() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override;
- +
- + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support graphics.\n");
- + };
- +
- + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
- + ReportSetupProblem(device, "Dynamic TraceRays Trace analysis does not support compute.\n");
- + };
- +};
- diff --git a/layers/auto_inst_execution_trace.cpp b/layers/auto_inst_execution_trace.cpp
- new file mode 100644
- index 00000000..ec4c876b
- --- /dev/null
- +++ b/layers/auto_inst_execution_trace.cpp
- @@ -0,0 +1,174 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_execution_trace.h"
- +#include <bitset>
- +#include <fstream>
- +
- +namespace {
- +
- +struct ExecutionTraceRecord {
- + uint32_t inst_id;
- + uint32_t active_thread_mask;
- +};
- +
- +static inline uint32_t shader_id(uint32_t inst_id) { return (inst_id & 0xFFF00000) >> 20; }
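- +
- +// Illustration with a made-up id: for inst_id == 0x00A00042 the top twelve bits select shader 0x00A,
- +// mirroring the split performed by shader_id() above.
- +static_assert(((0x00A00042u & 0xFFF00000u) >> 20) == 0x00Au,
- +              "the shader id occupies the top 12 bits of an instrumentation id");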
- +
- +} // namespace
- +
- +void AutoInstExecutionTrace::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
- +
- +void AutoInstExecutionTrace::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
- + // In order to compute the dynamic instruction execution count of the pipeline, it is necessary to know all the other
- + // instructions in the same basic block as the instrumentation callsite. This callback allows the auto-inst pass to
- + // populate such a mapping.
- + auto static_data_callback = [&](std::unordered_map<uint32_t, std::set<uint32_t>>&& inst_id2bb_inst_ids,
- + std::unordered_map<uint32_t, uint32_t>&& inst_id2opcode) {
- + inst_id2bb_inst_ids_.insert(inst_id2bb_inst_ids.begin(), inst_id2bb_inst_ids.end());
- + inst_id2opcode_.insert(inst_id2opcode.begin(), inst_id2opcode.end());
- + };
- + optimizer->RegisterPass(spvtools::CreateAutoInstExecutionTracePass(desc_bind_index, shader_module_id, static_data_callback));
- +}
- +
- +void AutoInstExecutionTrace::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device, "Execution trace analysis cannot produce a valid result without a complete execution trace.\n");
- + return;
- + }
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- + ReportInfo(device, "Analyzing execution trace for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + struct ActiveTotalThreadCounts {
- + uint32_t active_count;
- + uint32_t total_count;
- +
- +        inline float SimtEfficiency() const { return (float)active_count / (float)total_count; }
- + };
- +
- + std::map<uint32_t, uint32_t> opcode2dyn_execution_count;
- +
- + // For annotated shaders
- + std::map<uint32_t, uint32_t> inst_id2dyn_execution_count;
- + std::map<uint32_t, ActiveTotalThreadCounts> inst_id2active_and_total_thread_counts;
- +
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + const auto output_record = reinterpret_cast<ExecutionTraceRecord*>(&device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j]);
- + uint32_t active_thread_count = (uint32_t)std::bitset<SUBGROUP_SIZE>(output_record->active_thread_mask).count();
- +
- + if (inst_id2bb_inst_ids_.count(output_record->inst_id) == 0) {
- + ReportSetupProblem(device, "Execution trace was unable to locate instrumentation id=" +
- +                                           std::to_string(output_record->inst_id) + " in static mapping. Aborting.\n");
- + return;
- + }
- +
- + // Add to the opcode totals based on how many threads were active
- + for (const auto& inst_id : inst_id2bb_inst_ids_[output_record->inst_id]) {
- + inst_id2dyn_execution_count[inst_id] += active_thread_count;
- + inst_id2active_and_total_thread_counts[inst_id].active_count += active_thread_count;
- + inst_id2active_and_total_thread_counts[inst_id].total_count += SUBGROUP_SIZE;
- +            if (inst_id2opcode_.count(inst_id) == 0) {
- +                ReportSetupProblem(device, "Encountered instruction id without a corresponding Opcode. Aborting.\n");
- +                return;
- +            }
- +            auto opcode = inst_id2opcode_[inst_id];
- + opcode2dyn_execution_count[opcode] += active_thread_count;
- + }
- +
- + j += sizeof(ExecutionTraceRecord) / sizeof(uint32_t);
- + }
- +
- + {
- + // Output dyn opcode count
- + std::stringstream line0, line1;
- + line0 << "opcode,";
- + line1 << "dyn exe count,";
- + for (auto entry : opcode2dyn_execution_count) {
- + line0 << entry.first << ",";
- + line1 << entry.second << ",";
- + }
- + line0 << "\n";
- + line1 << "\n";
- +
- + std::ofstream csv_file;
- + csv_file.open(PipelineAnalysisFileName("dyn_opcode_counts.csv"));
- + csv_file << line0.str() << line1.str();
- + csv_file.close();
- + }
- + {
- + // Output hotspots
- + std::stringstream line0, line1, line2;
- + line0 << "pc,";
- + line1 << "dyn exe count,";
- + line2 << "simt efficiency,";
- + for (auto entry : inst_id2dyn_execution_count) {
- + line0 << entry.first << ",";
- + line1 << entry.second << ",";
- + line2 << inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency() << ",";
- + }
- + line0 << "\n";
- + line1 << "\n";
- + line2 << "\n";
- +
- + ActiveTotalThreadCounts combined = {0, 0};
- + for (auto entry : inst_id2active_and_total_thread_counts) {
- + combined.active_count += entry.second.active_count;
- + combined.total_count += entry.second.total_count;
- + }
- +
- + std::ofstream csv_file;
- + csv_file.open(PipelineAnalysisFileName("hotspots.csv"));
- + csv_file << line0.str() << line1.str() << line2.str() << "Overall SIMT efficiency=" << combined.SimtEfficiency() << "\n";
- + csv_file.close();
- + }
- + {
- + std::set<uint32_t> shaders_with_data;
- + // Output annotated shaders
- + std::unordered_map<uint32_t, std::string> annotations;
- + for (const auto& entry : inst_id2bb_inst_ids_) {
- + auto instrumentation_id = *entry.second.begin();
- + auto visits = inst_id2dyn_execution_count[instrumentation_id];
- + if (visits > 0) {
- + auto simt_efficiency = inst_id2active_and_total_thread_counts[entry.first].SimtEfficiency();
- + shaders_with_data.insert(shader_id(entry.first));
- + annotations[instrumentation_id] =
- + "thread_executions=" + std::to_string(visits) + ". SIMT Efficiency=" + std::to_string(simt_efficiency);
- + }
- + }
- +
- + for (auto entry : instrumentation_map) {
- + if (shaders_with_data.count(entry.first) == 0) continue;
- + using namespace spvtools;
- + SpirvTools spirvTools(spv_target_env::SPV_ENV_VULKAN_1_2);
- + std::string program;
- + spirvTools.SetMessageConsumer([this](spv_message_level_t level, const char* source, const spv_position_t& pos,
- + const char* message) { ReportSetupProblem(this->device, message); });
- + bool res = spirvTools.Disassemble(entry.second, &program, SPV_BINARY_TO_TEXT_OPTION_FRIENDLY_NAMES);
- + if (res) {
- + program = AnnotateModuleStr(program, annotations);
- + } else {
- + ReportSetupProblem(device, "Could not disassemble shader with id=" + std::to_string(entry.first) + ". Skipping.\n");
- + continue;
- + }
- + auto file_name = ShaderStageToString(shader_map[entry.first].stage) + std::to_string(entry.first) + "_dyn_executions";
- +
- + TryCompileModuleStrToGlsl(program, PipelineAnalysisFileName(file_name));
- + }
- + }
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_execution_trace.h b/layers/auto_inst_execution_trace.h
- new file mode 100644
- index 00000000..fb5b4eb0
- --- /dev/null
- +++ b/layers/auto_inst_execution_trace.h
- @@ -0,0 +1,56 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstExecutionTrace;
- +
- +class AutoInstExecutionTrace : public AutoInst {
- + public:
- + // Mapping from instrumented instruction id to the ids of other instructions in the basic block
- + std::unordered_map<uint32_t, std::set<uint32_t>> inst_id2bb_inst_ids_;
- +
- + // Mapping from instruction id to instruction opcode. Used for calculating dynamic instruction mix.
- + std::unordered_map<uint32_t, uint32_t> inst_id2opcode_;
- +
- + AutoInstExecutionTrace() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);
- +
- + virtual void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + // TODO: What are useful dimensions to pass to graphics pipeline analysis
- + virtual void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + virtual void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y,
- + uint32_t z) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +};
- diff --git a/layers/auto_inst_simt_efficiency.cpp b/layers/auto_inst_simt_efficiency.cpp
- new file mode 100644
- index 00000000..0e100509
- --- /dev/null
- +++ b/layers/auto_inst_simt_efficiency.cpp
- @@ -0,0 +1,67 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_simt_efficiency.h"
- +#include <bitset>
- +#include <fstream>
- +void AutoInstSimtEfficiency::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
- +
- +void AutoInstSimtEfficiency::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
- + optimizer->RegisterPass(spvtools::CreateAutoInstSimtEfficiencyPass(desc_bind_index, shader_module_id, 1));
- +}
- +
- +void AutoInstSimtEfficiency::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device, "SIMT Efficiency analysis cannot produce a valid result without a complete execution trace.\n");
- + return;
- + }
- +
- + uint32_t active_thread_count = 0;
- + uint32_t possible_thread_count = 0;
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- +
- + if (runtime_words_written == 0) {
- + ReportInfo(device, "No data found. Skipping Analysis.\n");
- + return;
- + }
- +
- + ReportInfo(device, "Analyzing SIMT Efficiency for " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + uint32_t active_thread_mask = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
- + auto active_threads = (uint32_t)std::bitset<SUBGROUP_SIZE>(active_thread_mask).count();
- +
- + if (active_threads == 0) {
- + ReportSetupProblem(device, "Invalid active thread count encountered. Quitting Analysis!\n");
- + return;
- + }
- + active_thread_count += active_threads;
- + possible_thread_count += SUBGROUP_SIZE;
- + j += sizeof(SimtEfficiencyRecord) / sizeof(uint32_t);
- + }
- +
- + float simt_efficiency = (float)active_thread_count / (float)possible_thread_count;
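- +    // For example, with SUBGROUP_SIZE == 32, two records whose masks have 32 and 16 bits set give
- +    // (32 + 16) / (2 * 32) = 0.75, i.e. 75% SIMT efficiency for this dispatch.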
- +
- + std::ofstream simt_eff_file;
- + simt_eff_file.open(FrameAnalysisFileName("simt_efficiency.csv"), std::ios_base::app);
- + simt_eff_file << simt_efficiency << "\n";
- + simt_eff_file.close();
- +
- + ReportInfo(device, "SIMT Efficiency = " + std::to_string(simt_efficiency * 100.0) + "%\n");
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_simt_efficiency.h b/layers/auto_inst_simt_efficiency.h
- new file mode 100644
- index 00000000..d6c5e2ce
- --- /dev/null
- +++ b/layers/auto_inst_simt_efficiency.h
- @@ -0,0 +1,56 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstSimtEfficiency;
- +
- +struct SimtEfficiencyRecord {
- + uint32_t active_thread_mask;
- +};
- +
- +class AutoInstSimtEfficiency : public AutoInst {
- + public:
- + AutoInstSimtEfficiency() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + // Opportunity for inheriting class to perform hybrid analysis using
- + // 1) static_data
- + // 2) runtime_data
- + // 3) shader_map
- + void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);
- +
- + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +};
- diff --git a/layers/auto_inst_warp_entry_and_exit.cpp b/layers/auto_inst_warp_entry_and_exit.cpp
- new file mode 100644
- index 00000000..9c19ce3d
- --- /dev/null
- +++ b/layers/auto_inst_warp_entry_and_exit.cpp
- @@ -0,0 +1,61 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#include "auto_inst_warp_entry_and_exit.h"
- +#include <fstream>
- +
- +void AutoInstWarpEntryAndExit::InitializeLayerDeviceSettings(AutoInst* device_auto_inst) {}
- +
- +void AutoInstWarpEntryAndExit::RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_bind_index, uint32_t shader_module_id) {
- + optimizer->RegisterPass(spvtools::CreateAutoInstWarpEntryAndExitPass(desc_bind_index, shader_module_id));
- +}
- +
- +void AutoInstWarpEntryAndExit::Analyze(uint32_t* const device_output_buffer, bool buffer_overflowed) {
- + if (buffer_overflowed) {
- + ReportSetupProblem(device, "Analysis cannot produce a valid result without a complete execution trace.\n");
- + return;
- + }
- +
- + uint32_t entry_count = 0;
- + uint32_t exit_count = 0;
- +
- + auto runtime_words_written = device_output_buffer[WORDS_WRITTEN_INDEX];
- + ReportInfo(device, "Analyzing Warp Entries vs Exits in " + std::to_string(runtime_words_written * 4) + " bytes! \n");
- +
- + uint32_t j = 0;
- + while (j < runtime_words_written) {
- + uint32_t prim_id = device_output_buffer[NUM_BUFFER_RESERVED_WORDS + j];
- + if (prim_id == spvtools::kAutoInstWarpEntryAndExitBeginPipeline) {
- + entry_count++;
- + } else if (prim_id == spvtools::kAutoInstWarpEntryAndExitEndPipeline) {
- + exit_count++;
- + } else {
- + ReportSetupProblem(device, "Received unexpected primitive id. Aborting!\n");
- + return;
- + }
- + j++;
- + }
- +
- +    if (entry_count == 0) {
- +        ReportInfo(device, "No warp entry records found. Skipping analysis.\n");
- +        return;
- +    }
- +    float divergence_factor = (float)exit_count / (float)entry_count;
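- +    // Rough illustration (assuming one record per subgroup begin/end event): 100 entry records and
- +    // 250 exit records give a factor of 2.5, suggesting that on average each subgroup that entered
- +    // the pipeline had split into 2.5 groups by the time it exited.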
- +
- + std::ofstream csv_file;
- + csv_file.open(FrameAnalysisFileName("exits_vs_entries.csv"), std::ios_base::app);
- + csv_file << divergence_factor << "\n";
- + csv_file.close();
- +
- + ReportInfo(device, "Exits/entries= " + std::to_string(divergence_factor) + "\n");
- +}
- \ No newline at end of file
- diff --git a/layers/auto_inst_warp_entry_and_exit.h b/layers/auto_inst_warp_entry_and_exit.h
- new file mode 100644
- index 00000000..17adfdfb
- --- /dev/null
- +++ b/layers/auto_inst_warp_entry_and_exit.h
- @@ -0,0 +1,52 @@
- +/* Copyright (c) 2020 The Khronos Group Inc.
- + *
- + * Licensed under the Apache License, Version 2.0 (the "License");
- + * you may not use this file except in compliance with the License.
- + * You may obtain a copy of the License at
- + *
- + * http://www.apache.org/licenses/LICENSE-2.0
- + *
- + * Unless required by applicable law or agreed to in writing, software
- + * distributed under the License is distributed on an "AS IS" BASIS,
- + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- + * See the License for the specific language governing permissions and
- + * limitations under the License.
- + *
- + * Author: David Pankratz <pankratz@ualberta.ca>
- + */
- +
- +#pragma once
- +
- +#include "auto_inst.h"
- +
- +class AutoInstWarpEntryAndExit;
- +
- +class AutoInstWarpEntryAndExit : public AutoInst {
- + public:
- + AutoInstWarpEntryAndExit() : AutoInst() {}
- +
- + void InitializeLayerDeviceSettings(AutoInst* device_auto_inst) override;
- +
- + void InitializeInstrumentationBuffer(uint32_t* buffer) override{};
- +
- + void RegisterPasses(spvtools::Optimizer* optimizer, uint32_t desc_set_bind_index, uint32_t unique_shader_module_id) override;
- +
- + // Opportunity for inheriting class to perform hybrid analysis using
- + // 1) static_data
- + // 2) runtime_data
- + // 3) shader_map
- + void Analyze(uint32_t* const debug_output_buffer, bool buffer_overflowed);
- +
- + void AnalyzeRayTracing(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t width, uint32_t height,
- + uint32_t depth) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + void AnalyzeGraphics(uint32_t* const debug_output_buffer, bool buffer_overflowed) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +
- + void AnalyzeCompute(uint32_t* const debug_output_buffer, bool buffer_overflowed, uint32_t x, uint32_t y, uint32_t z) override {
- + Analyze(debug_output_buffer, buffer_overflowed);
- + };
- +};
- diff --git a/layers/debug_printf.cpp b/layers/debug_printf.cpp
- index f04f4852..417341b5 100644
- --- a/layers/debug_printf.cpp
- +++ b/layers/debug_printf.cpp
- @@ -80,7 +80,7 @@ void DebugPrintf::PostCallRecordCreateDevice(VkPhysicalDevice physicalDevice, co
-
- if (enabled[gpu_validation]) {
- ReportSetupProblem(device,
- - "Debug Printf cannot be enabled when gpu assisted validation is enabled. "
- +                           "Debug Printf cannot be enabled when gpu assisted validation or an auto-inst layer is enabled. "
- "Debug Printf disabled.");
- device_debug_printf->aborted = true;
- return;
- diff --git a/layers/debug_printf.h b/layers/debug_printf.h
- index 915d5a6d..deb85031 100644
- --- a/layers/debug_printf.h
- +++ b/layers/debug_printf.h
- @@ -45,6 +45,7 @@ struct DPFShaderTracker {
- VkPipeline pipeline;
- VkShaderModule shader_module;
- std::vector<unsigned int> pgm;
- + VkShaderStageFlagBits stage;
- };
-
- enum vartype { varsigned, varunsigned, varfloat };
- diff --git a/layers/generated/chassis.cpp b/layers/generated/chassis.cpp
- index 75a4ebfa..6548ddc9 100644
- --- a/layers/generated/chassis.cpp
- +++ b/layers/generated/chassis.cpp
- @@ -50,6 +50,12 @@ bool wrap_handles = true;
- #include "gpu_validation.h"
- #include "object_lifetime_validation.h"
- #include "debug_printf.h"
- +#include "auto_inst_dyn_shader_trace.h"
- +#include "auto_inst_dyn_trace_ray_trace.h"
- +#include "auto_inst_execution_trace.h"
- +#include "auto_inst_simt_efficiency.h"
- +#include "auto_inst_divergence_characterization.h"
- +#include "auto_inst_warp_entry_and_exit.h"
- #include "stateless_validation.h"
- #include "synchronization_validation.h"
- #include "thread_safety.h"
- @@ -306,6 +312,24 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- auto sync_validation_obj = new SyncValidator;
- sync_validation_obj->RegisterValidationObject(local_enables[sync_validation], api_version, report_data, local_object_dispatch);
-
- + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
- + auto_inst_simt_efficiency_obj->RegisterValidationObject(local_enables[auto_inst_simt_efficiency], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
- + auto_inst_execution_trace_obj->RegisterValidationObject(local_enables[auto_inst_execution_trace], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
- + auto_inst_dyn_trace_ray_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_trace_ray_trace], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
- + auto_inst_divergence_characterization_obj->RegisterValidationObject(local_enables[auto_inst_divergence_characterization], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
- + auto_inst_warp_entry_and_exit_obj->RegisterValidationObject(local_enables[auto_inst_warp_entry_and_exit], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
- + auto_inst_dyn_shader_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_shader_trace], api_version, report_data, local_object_dispatch);
- +
- // If handle wrapping is disabled via the ValidationFeatures extension, override build flag
- if (local_disables[handle_wrapping]) {
- wrap_handles = false;
- @@ -338,7 +362,7 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- framework->report_data = report_data;
- framework->api_version = api_version;
- framework->instance_extensions.InitFromInstanceCreateInfo(specified_version, pCreateInfo);
- -
- +
- OutputLayerStatusInfo(framework);
-
- thread_checker_obj->FinalizeInstanceValidationObject(framework);
- @@ -348,9 +372,15 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- core_checks_obj->instance = *pInstance;
- core_checks_obj->instance_state = core_checks_obj;
- best_practices_obj->FinalizeInstanceValidationObject(framework);
- - gpu_assisted_obj->FinalizeInstanceValidationObject(framework);
- + gpu_assisted_obj->FinalizeInstanceValidationObject(framework);
- debug_printf_obj->FinalizeInstanceValidationObject(framework);
- sync_validation_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_simt_efficiency_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_execution_trace_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_dyn_trace_ray_trace_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_divergence_characterization_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_warp_entry_and_exit_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_dyn_shader_trace_obj->FinalizeInstanceValidationObject(framework);
-
- for (auto intercept : framework->object_dispatch) {
- auto lock = intercept->write_lock();
- @@ -360,8 +390,9 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- // Delete unused validation objects to avoid memory leak.
- std::vector<ValidationObject*> local_objs = {
- thread_checker_obj, object_tracker_obj, parameter_validation_obj,
- - core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj,
- - sync_validation_obj,
- + core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, sync_validation_obj,
- + auto_inst_simt_efficiency_obj, auto_inst_execution_trace_obj, auto_inst_dyn_trace_ray_trace_obj,
- + auto_inst_divergence_characterization_obj, auto_inst_warp_entry_and_exit_obj, auto_inst_dyn_shader_trace_obj
- };
- for (auto obj : local_objs) {
- if (std::find(local_object_dispatch.begin(), local_object_dispatch.end(), obj) == local_object_dispatch.end()) {
- @@ -493,22 +524,27 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(VkPhysicalDevice gpu, const VkDevice
-
- auto debug_printf_obj = new DebugPrintf;
- debug_printf_obj->InitDeviceValidationObject(enables[debug_printf], instance_interceptor, device_interceptor);
- -
- +
- auto sync_validation_obj = new SyncValidator;
- sync_validation_obj->InitDeviceValidationObject(enables[sync_validation], instance_interceptor, device_interceptor);
-
- - // Delete unused validation objects to avoid memory leak.
- - std::vector<ValidationObject *> local_objs = {
- - thread_safety_obj, stateless_validation_obj, object_tracker_obj,
- - core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj,
- - sync_validation_obj,
- - };
- - for (auto obj : local_objs) {
- - if (std::find(device_interceptor->object_dispatch.begin(), device_interceptor->object_dispatch.end(), obj) ==
- - device_interceptor->object_dispatch.end()) {
- - delete obj;
- - }
- - }
- + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
- + auto_inst_simt_efficiency_obj->InitDeviceValidationObject(enables[auto_inst_simt_efficiency], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
- + auto_inst_execution_trace_obj->InitDeviceValidationObject(enables[auto_inst_execution_trace], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
- + auto_inst_dyn_trace_ray_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_trace_ray_trace], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
- + auto_inst_divergence_characterization_obj->InitDeviceValidationObject(enables[auto_inst_divergence_characterization], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
- + auto_inst_warp_entry_and_exit_obj->InitDeviceValidationObject(enables[auto_inst_warp_entry_and_exit], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
- + auto_inst_dyn_shader_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_shader_trace], instance_interceptor, device_interceptor);
-
- for (auto intercept : instance_interceptor->object_dispatch) {
- auto lock = intercept->write_lock();
- @@ -574,7 +610,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateGraphicsPipelines(
- }
-
- auto usepCreateInfos = (!cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : cgpl_state[LayerObjectTypeGpuAssisted].pCreateInfos;
- - if (cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
- + if (cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
- + else if (cgpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = cgpl_state[LayerObjectTypeAutoInst].pCreateInfos;
-
- VkResult result = DispatchCreateGraphicsPipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines);
-
- @@ -610,7 +647,8 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateComputePipelines(
- }
-
- auto usepCreateInfos = (!ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos) ? pCreateInfos : ccpl_state[LayerObjectTypeGpuAssisted].pCreateInfos;
- - if (ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
- + if (ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeDebugPrintf].pCreateInfos;
- + else if (ccpl_state[LayerObjectTypeAutoInst].pCreateInfos) usepCreateInfos = ccpl_state[LayerObjectTypeAutoInst].pCreateInfos;
-
- VkResult result = DispatchCreateComputePipelines(device, pipelineCache, createInfoCount, usepCreateInfos, pAllocator, pPipelines);
-
- @@ -654,6 +692,6 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesNV(
- pPipelines, result, &(crtpl_state[intercept->container_type]));
- }
- return result;
- }
-
- VKAPI_ATTR VkResult VKAPI_CALL CreateRayTracingPipelinesKHR(
- diff --git a/layers/generated/chassis.h b/layers/generated/chassis.h
- index 7f67fbe3..269aa727 100644
- --- a/layers/generated/chassis.h
- +++ b/layers/generated/chassis.h
- @@ -52,6 +52,12 @@
- #include "vk_safe_struct.h"
- #include "vk_typemap_helper.h"
-
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT (VkValidationFeatureEnableEXT)5
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT (VkValidationFeatureEnableEXT)6
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT (VkValidationFeatureEnableEXT)7
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT (VkValidationFeatureEnableEXT)8
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT (VkValidationFeatureEnableEXT)9
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT (VkValidationFeatureEnableEXT)10
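- +
- +// Illustrative usage only (the values above are development placeholders): an application can request
- +// one of the auto-inst analyses at instance creation time through VkValidationFeaturesEXT, e.g.
- +//   VkValidationFeatureEnableEXT enables[] = {VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT};
- +//   VkValidationFeaturesEXT features{VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT};
- +//   features.enabledValidationFeatureCount = 1;
- +//   features.pEnabledValidationFeatures = enables;
- +//   // then chain &features into VkInstanceCreateInfo::pNext before calling vkCreateInstance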
-
- extern std::atomic<uint64_t> global_unique_id;
-
- @@ -2834,6 +2840,7 @@ enum LayerObjectTypeId {
- LayerObjectTypeBestPractices, // Instance or device best practices layer object
- LayerObjectTypeGpuAssisted, // Instance or device gpu assisted validation layer object
- LayerObjectTypeDebugPrintf, // Instance or device shader debug printf layer object
- + LayerObjectTypeAutoInst, // Instance or device shader auto instrumentation layer object
- LayerObjectTypeCommandCounter, // Command Counter validation object, child of corechecks
- LayerObjectTypeSyncValidation, // Instance or device synchronization validation layer object
- LayerObjectTypeMaxEnum, // Max enum count
- @@ -2897,8 +2904,14 @@ typedef enum EnableFlags {
- gpu_validation_reserve_binding_slot,
- best_practices,
- vendor_specific_arm,
- - debug_printf,
- + debug_printf,
- sync_validation,
- + auto_inst_simt_efficiency,
- + auto_inst_execution_trace,
- + auto_inst_dyn_trace_ray_trace,
- + auto_inst_divergence_characterization,
- + auto_inst_warp_entry_and_exit,
- + auto_inst_dyn_shader_trace,
- // Insert new enables above this line
- kMaxEnableFlags,
- } EnableFlags;
- diff --git a/layers/gpu_utils.h b/layers/gpu_utils.h
- index 01197b94..e04ee285 100644
- --- a/layers/gpu_utils.h
- +++ b/layers/gpu_utils.h
- @@ -314,14 +314,21 @@ void UtilPostCallRecordPipelineCreations(const uint32_t count, const CreateInfo
- VkShaderModule shader_module = VK_NULL_HANDLE;
- if (bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
- shader_module = pipeline_state->graphicsPipelineCI.pStages[stage].module;
- + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
- + pipeline_state->graphicsPipelineCI.pStages[stage].stage;
- } else if (bind_point == VK_PIPELINE_BIND_POINT_COMPUTE) {
- assert(stage == 0);
- shader_module = pipeline_state->computePipelineCI.stage.module;
- + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
- + pipeline_state->computePipelineCI.stage.stage;
- } else if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_NV) {
- shader_module = pipeline_state->raytracingPipelineCI.pStages[stage].module;
- + object_ptr->shader_map[shader_state->gpu_validation_shader_id].stage =
- + pipeline_state->raytracingPipelineCI.pStages[stage].stage;
- } else {
- assert(false);
- }
- +
- object_ptr->shader_map[shader_state->gpu_validation_shader_id].shader_module = shader_module;
- object_ptr->shader_map[shader_state->gpu_validation_shader_id].pgm = std::move(code);
- }
- diff --git a/layers/gpu_validation.h b/layers/gpu_validation.h
- index 706d3fb7..b48b84ab 100644
- --- a/layers/gpu_validation.h
- +++ b/layers/gpu_validation.h
- @@ -54,6 +54,7 @@ struct GpuAssistedShaderTracker {
- VkPipeline pipeline;
- VkShaderModule shader_module;
- std::vector<unsigned int> pgm;
- + VkShaderStageFlagBits stage;
- };
-
- struct GpuAssistedAccelerationStructureBuildValidationBufferInfo {
- @@ -149,7 +150,8 @@ class GpuAssisted : public ValidationStateTracker {
- VkPipelineStageFlags srcStageMask, VkPipelineStageFlags dstStageMask,
- uint32_t memoryBarrierCount, const VkMemoryBarrier* pMemoryBarriers,
- uint32_t bufferMemoryBarrierCount, const VkBufferMemoryBarrier* pBufferMemoryBarriers,
- - uint32_t imageMemoryBarrierCount, const VkImageMemoryBarrier* pImageMemoryBarriers) const override;
- + uint32_t imageMemoryBarrierCount,
- + const VkImageMemoryBarrier* pImageMemoryBarriers) const override;
- void PreCallRecordCreateBuffer(VkDevice device, const VkBufferCreateInfo* pCreateInfo, const VkAllocationCallbacks* pAllocator,
- VkBuffer* pBuffer, void* cb_state_data) override;
- void CreateAccelerationStructureBuildValidationState(GpuAssisted* device_GpuAssisted);
- diff --git a/layers/layer_options.cpp b/layers/layer_options.cpp
- index 3c6f5dfe..0b6ec389 100644
- --- a/layers/layer_options.cpp
- +++ b/layers/layer_options.cpp
- @@ -92,6 +92,19 @@ void SetValidationEnable(CHECK_ENABLED &enable_data, const ValidationCheckEnable
-
- // Set the local enable flag for a single VK_VALIDATION_FEATURE_ENABLE_* flag
- void SetValidationFeatureEnable(CHECK_ENABLED &enable_data, const VkValidationFeatureEnableEXT feature_enable) {
- + if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT) {
- + enable_data[auto_inst_simt_efficiency] = true;
- + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT) {
- + enable_data[auto_inst_execution_trace] = true;
- + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT) {
- + enable_data[auto_inst_dyn_trace_ray_trace] = true;
- + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT) {
- + enable_data[auto_inst_divergence_characterization] = true;
- + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT) {
- + enable_data[auto_inst_warp_entry_and_exit] = true;
- + } else if (feature_enable == VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT) {
- + enable_data[auto_inst_dyn_shader_trace] = true;
- + }
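- +    // The auto-inst enables are matched ahead of the switch because their placeholder values are not
- +    // (yet) members of VkValidationFeatureEnableEXT and are therefore not covered by the cases below.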
- switch (feature_enable) {
- case VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT:
- enable_data[gpu_validation] = true;
- diff --git a/layers/layer_options.h b/layers/layer_options.h
- index 861b9abe..d73768cd 100644
- --- a/layers/layer_options.h
- +++ b/layers/layer_options.h
- @@ -45,11 +45,16 @@ static const std::unordered_map<std::string, VkValidationFeatureDisableEXT> VkVa
-
- static const std::unordered_map<std::string, VkValidationFeatureEnableEXT> VkValFeatureEnableLookup = {
- {"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT", VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT},
- - {"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT",
- - VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT", VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT},
- {"VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT},
- {"VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT},
- {"VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT", VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT},
- + {"VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT},
- };
-
- static const std::unordered_map<std::string, VkValidationFeatureEnable> VkValFeatureEnableLookup2 = {
- @@ -93,7 +98,13 @@ static const std::vector<std::string> EnableFlagNameHelper = {
- "VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT", // best_practices,
- "VALIDATION_CHECK_ENABLE_VENDOR_SPECIFIC_ARM", // vendor_specific_arm,
- "VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT", // debug_printf,
- - "VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION" // sync_validation,
- + "VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION", // sync_validation,
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT", // auto_inst_simt_efficiency
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT", // auto_inst_execution_trace
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT", // auto_inst_dyn_trace_ray_trace
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT", // auto_inst_divergence_characterization
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT", // auto_inst_warp_entry_and_exit
- + "VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT", // auto_inst_dyn_shader_trace
- };
- -
- +
- void ProcessConfigAndEnvSettings(ConfigAndEnvSettings *settings_data);
- diff --git a/scripts/known_good.json b/scripts/known_good.json
- index 9a53e452..b4fef368 100755
- --- a/scripts/known_good.json
- +++ b/scripts/known_good.json
- @@ -1,6 +1,7 @@
- {
- - "repos" : [
- + "repos": [
- {
- +
- "name" : "glslang",
- "url" : "https://github.com/KhronosGroup/glslang.git",
- "sub_dir" : "glslang",
- @@ -10,7 +11,7 @@
- "prebuild" : [
- "python update_glslang_sources.py"
- ],
- - "cmake_options" : [
- + "cmake_options": [
- "-DUSE_CCACHE=ON"
- ]
- },
- @@ -28,12 +29,22 @@
- "sub_dir": "SPIRV-Headers",
- "build_dir": "SPIRV-Headers/build",
- "install_dir": "SPIRV-Headers/build/install",
- +
- "commit": "f027d53ded7e230e008d37c8b47ede7cd308e19d"
- + },
- + {
- + "name": "SPIRV-Cross",
- + "url": "https://github.com/KhronosGroup/SPIRV-Cross.git",
- + "sub_dir": "spirv-cross",
- + "build_dir": "spirv-cross/build",
- + "install_dir": "spirv-cross/build/install",
- + "commit": "e50f7d1ce8e162d0c826e84168cfa234e4de4ec9"
- }
- ],
- "install_names" : {
- "glslang" : "GLSLANG_INSTALL_DIR",
- "Vulkan-Headers" : "VULKAN_HEADERS_INSTALL_DIR",
- - "SPIRV-Headers" : "SPIRV_HEADERS_INSTALL_DIR"
- + "SPIRV-Headers": "SPIRV_HEADERS_INSTALL_DIR",
- + "SPIRV-Cross" : "SPIRV_CROSS_INSTALL_DIR"
- }
- }
- diff --git a/scripts/layer_chassis_generator.py b/scripts/layer_chassis_generator.py
- index 8c4a4c4d..f53055c7 100644
- --- a/scripts/layer_chassis_generator.py
- +++ b/scripts/layer_chassis_generator.py
- @@ -241,6 +241,13 @@ class LayerChassisOutputGenerator(OutputGenerator):
- #include "vk_safe_struct.h"
- #include "vk_typemap_helper.h"
-
- +// Define here as a placeholder during development.
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_SIMT_EFFICIENCY_EXT (VkValidationFeatureEnableEXT)5
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_EXECUTION_TRACE_EXT (VkValidationFeatureEnableEXT)7
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_TRACE_RAY_TRACE_EXT (VkValidationFeatureEnableEXT)8
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DIVERGENCE_CHARACTERIZATION_EXT (VkValidationFeatureEnableEXT)9
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_WARP_ENTRY_AND_EXIT_EXT (VkValidationFeatureEnableEXT)10
- +#define VK_VALIDATION_FEATURE_ENABLE_AUTO_INST_DYN_SHADER_TRACE_EXT (VkValidationFeatureEnableEXT)11
-
- extern std::atomic<uint64_t> global_unique_id;
-
- @@ -343,6 +350,12 @@ typedef enum EnableFlags {
- vendor_specific_arm,
- debug_printf,
- sync_validation,
- + auto_inst_simt_efficiency,
- + auto_inst_execution_trace,
- + auto_inst_dyn_trace_ray_trace,
- + auto_inst_divergence_characterization,
- + auto_inst_warp_entry_and_exit,
- + auto_inst_dyn_shader_trace,
- // Insert new enables above this line
- kMaxEnableFlags,
- } EnableFlags;
- @@ -685,6 +698,12 @@ bool wrap_handles = true;
- #include "gpu_validation.h"
- #include "object_lifetime_validation.h"
- #include "debug_printf.h"
- +#include "auto_inst_dyn_shader_trace.h"
- +#include "auto_inst_dyn_trace_ray_trace.h"
- +#include "auto_inst_execution_trace.h"
- +#include "auto_inst_simt_efficiency.h"
- +#include "auto_inst_divergence_characterization.h"
- +#include "auto_inst_warp_entry_and_exit.h"
- #include "stateless_validation.h"
- #include "synchronization_validation.h"
- #include "thread_safety.h"
- @@ -941,6 +960,24 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- auto sync_validation_obj = new SyncValidator;
- sync_validation_obj->RegisterValidationObject(local_enables[sync_validation], api_version, report_data, local_object_dispatch);
-
- + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
- + auto_inst_simt_efficiency_obj->RegisterValidationObject(local_enables[auto_inst_simt_efficiency], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
- + auto_inst_execution_trace_obj->RegisterValidationObject(local_enables[auto_inst_execution_trace], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
- + auto_inst_dyn_trace_ray_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_trace_ray_trace], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
- + auto_inst_divergence_characterization_obj->RegisterValidationObject(local_enables[auto_inst_divergence_characterization], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
- + auto_inst_warp_entry_and_exit_obj->RegisterValidationObject(local_enables[auto_inst_warp_entry_and_exit], api_version, report_data, local_object_dispatch);
- +
- + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
- + auto_inst_dyn_shader_trace_obj->RegisterValidationObject(local_enables[auto_inst_dyn_shader_trace], api_version, report_data, local_object_dispatch);
- +
- // If handle wrapping is disabled via the ValidationFeatures extension, override build flag
- if (local_disables[handle_wrapping]) {
- wrap_handles = false;
- @@ -986,7 +1023,13 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateInstance(const VkInstanceCreateInfo *pCreat
- gpu_assisted_obj->FinalizeInstanceValidationObject(framework);
- debug_printf_obj->FinalizeInstanceValidationObject(framework);
- sync_validation_obj->FinalizeInstanceValidationObject(framework);
- -
- + auto_inst_simt_efficiency_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_execution_trace_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_dyn_trace_ray_trace_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_divergence_characterization_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_warp_entry_and_exit_obj->FinalizeInstanceValidationObject(framework);
- + auto_inst_dyn_shader_trace_obj->FinalizeInstanceValidationObject(framework);
- +
- for (auto intercept : framework->object_dispatch) {
- auto lock = intercept->write_lock();
- intercept->PostCallRecordCreateInstance(pCreateInfo, pAllocator, pInstance, result);
- @@ -1132,11 +1175,30 @@ VKAPI_ATTR VkResult VKAPI_CALL CreateDevice(VkPhysicalDevice gpu, const VkDevice
- auto sync_validation_obj = new SyncValidator;
- sync_validation_obj->InitDeviceValidationObject(enables[sync_validation], instance_interceptor, device_interceptor);
-
- + auto auto_inst_simt_efficiency_obj = new AutoInstSimtEfficiency;
- + auto_inst_simt_efficiency_obj->InitDeviceValidationObject(enables[auto_inst_simt_efficiency], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_execution_trace_obj = new AutoInstExecutionTrace;
- + auto_inst_execution_trace_obj->InitDeviceValidationObject(enables[auto_inst_execution_trace], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_dyn_trace_ray_trace_obj = new AutoInstDynTraceRayTrace;
- + auto_inst_dyn_trace_ray_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_trace_ray_trace], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_divergence_characterization_obj = new AutoInstDivergenceCharacterization;
- + auto_inst_divergence_characterization_obj->InitDeviceValidationObject(enables[auto_inst_divergence_characterization], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_warp_entry_and_exit_obj = new AutoInstWarpEntryAndExit;
- + auto_inst_warp_entry_and_exit_obj->InitDeviceValidationObject(enables[auto_inst_warp_entry_and_exit], instance_interceptor, device_interceptor);
- +
- + auto auto_inst_dyn_shader_trace_obj = new AutoInstDynShaderTrace;
- + auto_inst_dyn_shader_trace_obj->InitDeviceValidationObject(enables[auto_inst_dyn_shader_trace], instance_interceptor, device_interceptor);
- +
- // Delete unused validation objects to avoid memory leak.
- - std::vector<ValidationObject *> local_objs = {
- - thread_safety_obj, stateless_validation_obj, object_tracker_obj,
- - core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj,
- - sync_validation_obj,
- + std::vector<ValidationObject*> local_objs = {
- + thread_safety_obj, stateless_validation_obj, object_tracker_obj,
- + core_checks_obj, best_practices_obj, gpu_assisted_obj, debug_printf_obj, sync_validation_obj,
- + auto_inst_simt_efficiency_obj, auto_inst_execution_trace_obj, auto_inst_dyn_trace_ray_trace_obj,
- + auto_inst_divergence_characterization_obj, auto_inst_warp_entry_and_exit_obj, auto_inst_dyn_shader_trace_obj
- };
- for (auto obj : local_objs) {
- if (std::find(device_interceptor->object_dispatch.begin(), device_interceptor->object_dispatch.end(), obj) ==
- --
- 2.29.2.windows.2
|