From 00eaec1619ca734ad76a802339d89df187f8a487 Mon Sep 17 00:00:00 2001
From: Philip Rebohle <philip.rebohle@tu-dortmund.de>
Date: Sun, 31 Jul 2022 00:59:08 +0200
Subject: [PATCH] [dxvk] Use normalized state to look up optimized graphics
 pipelines

We can't normalize all state at the time it is bound; for example,
unused blend state cannot be disabled before render targets are known.
By looking up pipelines using normalized state, we ensure that our
VkPipelines are actually unique.

Based on my testing this only affects a small number of pipelines in
most games (anywhere from 0 to a couple dozen), with some outliers
like The Witcher 1, where a third of the pipelines are redundant due
to stale render state.
---
 src/dxvk/dxvk_graphics.cpp | 83 ++++++++++++++++++++++----------------
 src/dxvk/dxvk_graphics.h   | 70 ++++++++++++++++++++++++++++++--
 2 files changed, 116 insertions(+), 37 deletions(-)

diff --git a/src/dxvk/dxvk_graphics.cpp b/src/dxvk/dxvk_graphics.cpp
index fe653264b..86633ba1a 100644
--- a/src/dxvk/dxvk_graphics.cpp
+++ b/src/dxvk/dxvk_graphics.cpp
@@ -868,8 +868,8 @@ namespace dxvk {
   
   
   DxvkGraphicsPipeline::~DxvkGraphicsPipeline() {
-    for (const auto& instance : m_pipelines)
-      this->destroyPipeline(instance.fastHandle.load());
+    for (const auto& instance : m_fastPipelines)
+      this->destroyPipeline(instance.second);
 
     for (const auto& instance : m_basePipelines)
       this->destroyPipeline(instance.second);
@@ -962,7 +962,7 @@ namespace dxvk {
      || instance->isCompiling.exchange(VK_TRUE, std::memory_order_acquire))
       return;
 
-    VkPipeline pipeline = this->createOptimizedPipeline(state, 0);
+    VkPipeline pipeline = this->getOptimizedPipeline(state, 0);
     instance->fastHandle.store(pipeline, std::memory_order_release);
 
     // Log pipeline state on error
@@ -981,7 +981,7 @@ namespace dxvk {
       // Try to create an optimized pipeline from the cache
       // first, since this is expected to be the fastest path.
       if (m_device->canUsePipelineCacheControl()) {
-        fastHandle = this->createOptimizedPipeline(state,
+        fastHandle = this->getOptimizedPipeline(state,
           VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT);
       }
 
@@ -991,7 +991,7 @@ namespace dxvk {
         baseHandle = this->getBasePipeline(state);
     } else {
       // Create optimized variant right away, no choice
-      fastHandle = this->createOptimizedPipeline(state, 0);
+      fastHandle = this->getOptimizedPipeline(state, 0);
     }
 
     // Log pipeline state if requested, or on failure
@@ -1108,59 +1108,74 @@ namespace dxvk {
     return pipeline;
   }
 
-  
-  VkPipeline DxvkGraphicsPipeline::createOptimizedPipeline(
+
+  VkPipeline DxvkGraphicsPipeline::getOptimizedPipeline(
     const DxvkGraphicsPipelineStateInfo& state,
+          VkPipelineCreateFlags          flags) {
+    DxvkGraphicsPipelineFastInstanceKey key(m_device,
+      m_shaders, state, m_flags, m_specConstantMask);
+
+    std::lock_guard lock(m_fastMutex);
+
+    auto entry = m_fastPipelines.find(key);
+    if (entry != m_fastPipelines.end())
+      return entry->second;
+
+    // Hold m_fastMutex while compiling so that no two threads compile
+    // identical Vulkan pipelines. This should be rare, but has been
+    // buggy on some drivers in the past, so just don't allow it.
+    VkPipeline handle = createOptimizedPipeline(key, flags);
+
+    if (handle)
+      m_fastPipelines.insert({ key, handle }); // null handles are not cached
+
+    return handle;
+  }
+
+
+  VkPipeline DxvkGraphicsPipeline::createOptimizedPipeline(
+    const DxvkGraphicsPipelineFastInstanceKey& key,
           VkPipelineCreateFlags          flags) const {
     auto vk = m_device->vkd();
 
-    // Set up pipeline state
-    DxvkGraphicsPipelineShaderState           shState(m_shaders, state);
-    DxvkGraphicsPipelineDynamicState          dyState(m_device, state, m_flags);
-    DxvkGraphicsPipelineVertexInputState      viState(m_device, state, m_shaders.vs.ptr());
-    DxvkGraphicsPipelinePreRasterizationState prState(m_device, state, m_shaders.gs.ptr());
-    DxvkGraphicsPipelineFragmentShaderState   fsState(m_device, state);
-    DxvkGraphicsPipelineFragmentOutputState   foState(m_device, state, m_shaders.fs.ptr());
-    DxvkPipelineSpecConstantState             scState(m_specConstantMask, state.sc);
-
     // Build stage infos for all provided shaders
     DxvkShaderStageInfo stageInfo(m_device);
 
     if (flags & VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
-      stageInfo.addStage(VK_SHADER_STAGE_VERTEX_BIT, m_vsLibrary->getModuleIdentifier(), &scState.scInfo);
+      stageInfo.addStage(VK_SHADER_STAGE_VERTEX_BIT, m_vsLibrary->getModuleIdentifier(), &key.scState.scInfo);
 
       if (m_shaders.fs != nullptr)
-        stageInfo.addStage(VK_SHADER_STAGE_FRAGMENT_BIT, m_fsLibrary->getModuleIdentifier(), &scState.scInfo);
+        stageInfo.addStage(VK_SHADER_STAGE_FRAGMENT_BIT, m_fsLibrary->getModuleIdentifier(), &key.scState.scInfo);
     } else {
-      stageInfo.addStage(VK_SHADER_STAGE_VERTEX_BIT, getShaderCode(m_shaders.vs, shState.vsInfo), &scState.scInfo);
+      stageInfo.addStage(VK_SHADER_STAGE_VERTEX_BIT, getShaderCode(m_shaders.vs, key.shState.vsInfo), &key.scState.scInfo);
 
       if (m_shaders.tcs != nullptr)
-        stageInfo.addStage(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, getShaderCode(m_shaders.tcs, shState.tcsInfo), &scState.scInfo);
+        stageInfo.addStage(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT, getShaderCode(m_shaders.tcs, key.shState.tcsInfo), &key.scState.scInfo);
       if (m_shaders.tes != nullptr)
-        stageInfo.addStage(VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, getShaderCode(m_shaders.tes, shState.tesInfo), &scState.scInfo);
+        stageInfo.addStage(VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, getShaderCode(m_shaders.tes, key.shState.tesInfo), &key.scState.scInfo);
       if (m_shaders.gs != nullptr)
-        stageInfo.addStage(VK_SHADER_STAGE_GEOMETRY_BIT, getShaderCode(m_shaders.gs, shState.gsInfo), &scState.scInfo);
+        stageInfo.addStage(VK_SHADER_STAGE_GEOMETRY_BIT, getShaderCode(m_shaders.gs, key.shState.gsInfo), &key.scState.scInfo);
       if (m_shaders.fs != nullptr)
-        stageInfo.addStage(VK_SHADER_STAGE_FRAGMENT_BIT, getShaderCode(m_shaders.fs, shState.fsInfo), &scState.scInfo);
+        stageInfo.addStage(VK_SHADER_STAGE_FRAGMENT_BIT, getShaderCode(m_shaders.fs, key.shState.fsInfo), &key.scState.scInfo);
     }
 
-    VkGraphicsPipelineCreateInfo info = { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, &foState.rtInfo };
+    VkGraphicsPipelineCreateInfo info = { VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, &key.foState.rtInfo };
     info.flags                    = flags;
     info.stageCount               = stageInfo.getStageCount();
     info.pStages                  = stageInfo.getStageInfos();
-    info.pVertexInputState        = &viState.viInfo;
-    info.pInputAssemblyState      = &viState.iaInfo;
-    info.pTessellationState       = &prState.tsInfo;
-    info.pViewportState           = &prState.vpInfo;
-    info.pRasterizationState      = &prState.rsInfo;
-    info.pMultisampleState        = &foState.msInfo;
-    info.pDepthStencilState       = &fsState.dsInfo;
-    info.pColorBlendState         = &foState.cbInfo;
-    info.pDynamicState            = &dyState.dyInfo;
+    info.pVertexInputState        = &key.viState.viInfo;
+    info.pInputAssemblyState      = &key.viState.iaInfo;
+    info.pTessellationState       = &key.prState.tsInfo;
+    info.pViewportState           = &key.prState.vpInfo;
+    info.pRasterizationState      = &key.prState.rsInfo;
+    info.pMultisampleState        = &key.foState.msInfo;
+    info.pDepthStencilState       = &key.fsState.dsInfo;
+    info.pColorBlendState         = &key.foState.cbInfo;
+    info.pDynamicState            = &key.dyState.dyInfo;
     info.layout                   = m_bindings->getPipelineLayout(false);
     info.basePipelineIndex        = -1;
     
-    if (!prState.tsInfo.patchControlPoints)
+    if (!key.prState.tsInfo.patchControlPoints)
       info.pTessellationState = nullptr;
     
     VkPipeline pipeline = VK_NULL_HANDLE;
diff --git a/src/dxvk/dxvk_graphics.h b/src/dxvk/dxvk_graphics.h
index 31a130495..a0c0bfbaa 100644
--- a/src/dxvk/dxvk_graphics.h
+++ b/src/dxvk/dxvk_graphics.h
@@ -385,6 +385,61 @@ namespace dxvk {
   };
 
 
+  /**
+   * \brief Fast instance key
+   *
+   * Stores the state objects that an optimized pipeline is compiled
+   * from, and is the key type for looking up optimized pipelines.
+   */
+  struct DxvkGraphicsPipelineFastInstanceKey {
+    DxvkGraphicsPipelineFastInstanceKey() { }
+
+    DxvkGraphicsPipelineFastInstanceKey(
+            DxvkDevice*                       device,
+      const DxvkGraphicsPipelineShaders&      shaders,
+      const DxvkGraphicsPipelineStateInfo&    state,
+            DxvkGraphicsPipelineFlags         flags,
+            uint32_t                          specConstantMask)
+    : shState(shaders, state),
+      dyState(device, state, flags),
+      viState(device, state, shaders.vs.ptr()),
+      prState(device, state, shaders.gs.ptr()),
+      fsState(device, state),
+      foState(device, state, shaders.fs.ptr()),
+      scState(specConstantMask, state.sc) { }
+
+    DxvkGraphicsPipelineShaderState           shState;
+    DxvkGraphicsPipelineDynamicState          dyState;
+    DxvkGraphicsPipelineVertexInputState      viState;
+    DxvkGraphicsPipelinePreRasterizationState prState;
+    DxvkGraphicsPipelineFragmentShaderState   fsState;
+    DxvkGraphicsPipelineFragmentOutputState   foState;
+    DxvkPipelineSpecConstantState             scState;
+
+    bool eq(const DxvkGraphicsPipelineFastInstanceKey& other) const {
+      return shState.eq(other.shState)
+          && dyState.eq(other.dyState)
+          && viState.eq(other.viState)
+          && prState.eq(other.prState)
+          && fsState.eq(other.fsState)
+          && foState.eq(other.foState)
+          && scState.eq(other.scState);
+    }
+
+    size_t hash() const {
+      DxvkHashState hash;
+      hash.add(shState.hash());
+      hash.add(dyState.hash());
+      hash.add(viState.hash());
+      hash.add(prState.hash());
+      hash.add(fsState.hash());
+      hash.add(foState.hash());
+      hash.add(scState.hash());
+      return hash;
+    }
+  };
+
+
   /**
    * \brief Graphics pipeline
    * 
@@ -500,7 +555,6 @@ namespace dxvk {
 
     uint32_t m_specConstantMask = 0;
 
-    // List of pipeline instances, shared between threads
     alignas(CACHE_LINE_SIZE)
     dxvk::mutex                                   m_mutex;
     sync::List<DxvkGraphicsPipelineInstance>      m_pipelines;
@@ -509,6 +563,12 @@ namespace dxvk {
       DxvkGraphicsPipelineBaseInstanceKey,
       VkPipeline, DxvkHash, DxvkEq>               m_basePipelines;
     
+    alignas(CACHE_LINE_SIZE)
+    dxvk::mutex                                   m_fastMutex;
+    std::unordered_map<
+      DxvkGraphicsPipelineFastInstanceKey,
+      VkPipeline, DxvkHash, DxvkEq>               m_fastPipelines;
+    
     DxvkGraphicsPipelineInstance* createInstance(
       const DxvkGraphicsPipelineStateInfo& state,
             bool                           doCreateBasePipeline);
@@ -525,10 +585,14 @@ namespace dxvk {
     VkPipeline createBasePipeline(
       const DxvkGraphicsPipelineBaseInstanceKey& key) const;
     
-    VkPipeline createOptimizedPipeline(
+    VkPipeline getOptimizedPipeline(
       const DxvkGraphicsPipelineStateInfo& state,
+            VkPipelineCreateFlags          flags);
+
+    VkPipeline createOptimizedPipeline(
+      const DxvkGraphicsPipelineFastInstanceKey& key,
             VkPipelineCreateFlags          flags) const;
-    
+
     void destroyPipeline(
             VkPipeline                     pipeline) const;