From c3a53127d746a597d2cfab68d833d134976d6fbd Mon Sep 17 00:00:00 2001
From: Philip Rebohle <philip.rebohle@tu-dortmund.de>
Date: Thu, 11 Aug 2022 02:37:36 +0200
Subject: [PATCH] [dxvk] Add high-priority queue for shader compiles

As well as an API to queue shaders as high priority.
---
 src/dxvk/dxvk_device.cpp      |   6 ++
 src/dxvk/dxvk_device.h        |   7 +++
 src/dxvk/dxvk_pipemanager.cpp | 109 ++++++++++++++++++++++++++++------
 src/dxvk/dxvk_pipemanager.h   |  34 +++++++++--
 4 files changed, 133 insertions(+), 23 deletions(-)
diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp
index d8bd98a0..08289cee 100644
--- a/src/dxvk/dxvk_device.cpp
+++ b/src/dxvk/dxvk_device.cpp
@@ -213,6 +213,12 @@ namespace dxvk {
   }
   
   
+  void DxvkDevice::requestCompileShader(
+    const Rc<DxvkShader>&           shader) {
+    m_objects.pipelineManager().requestCompileShader(shader);
+  }
+
+
   void DxvkDevice::presentImage(
     const Rc<vk::Presenter>&        presenter,
           DxvkSubmitStatus*         status) {
diff --git a/src/dxvk/dxvk_device.h b/src/dxvk/dxvk_device.h
index f4785010..e8f44ee6 100644
--- a/src/dxvk/dxvk_device.h
+++ b/src/dxvk/dxvk_device.h
@@ -380,6 +380,13 @@ namespace dxvk {
     void registerShader(
       const Rc<DxvkShader>&         shader);
     
+    /**
+     * \brief Prioritizes compilation of a given shader
+     * \param [in] shader Shader to start compiling
+     */
+    void requestCompileShader(
+      const Rc<DxvkShader>&         shader);
+
     /**
      * \brief Presents a swap chain image
      * 
diff --git a/src/dxvk/dxvk_pipemanager.cpp b/src/dxvk/dxvk_pipemanager.cpp
index 426443d7..ffd52f2b 100644
--- a/src/dxvk/dxvk_pipemanager.cpp
+++ b/src/dxvk/dxvk_pipemanager.cpp
@@ -7,17 +7,9 @@
 namespace dxvk {
   
   DxvkPipelineWorkers::DxvkPipelineWorkers(
-          DxvkDevice*                     device) {
-    // Use a reasonably large number of threads for compiling, but
-    // leave some cores to the application to avoid excessive stutter
-    uint32_t numCpuCores = dxvk::thread::hardware_concurrency();
-    m_workerCount = ((std::max(1u, numCpuCores) - 1) * 5) / 7;
+          DxvkDevice*                     device)
+  : m_device(device) {
 
-    if (m_workerCount <  1) m_workerCount =  1;
-    if (m_workerCount > 32) m_workerCount = 32;
-
-    if (device->config().numCompilerThreads > 0)
-      m_workerCount = device->config().numCompilerThreads;
   }
 
 
@@ -27,7 +19,8 @@ namespace dxvk {
 
 
   void DxvkPipelineWorkers::compilePipelineLibrary(
-          DxvkShaderPipelineLibrary*      library) {
+          DxvkShaderPipelineLibrary*      library,
+          DxvkPipelinePriority            priority) {
     std::unique_lock lock(m_queueLock);
     this->startWorkers();
 
@@ -36,7 +29,13 @@ namespace dxvk {
     PipelineLibraryEntry e = { };
     e.pipelineLibrary = library;
 
-    m_queuedLibraries.push(e);
+    if (priority == DxvkPipelinePriority::High) {
+      m_queuedLibrariesPrioritized.push(e);
+      m_queueCondPrioritized.notify_one();
+    } else {
+      m_queuedLibraries.push(e);
+    }
+
     m_queueCond.notify_one();
   }
 
@@ -100,14 +99,37 @@ namespace dxvk {
 
   void DxvkPipelineWorkers::startWorkers() {
     if (!m_workersRunning) {
+      // Use all available cores by default
+      uint32_t workerCount = dxvk::thread::hardware_concurrency();
+
+      if (workerCount <  1) workerCount =  1;
+      if (workerCount > 64) workerCount = 64;
+
+      // Reduce worker count on 32-bit to save address space
+      if (env::is32BitHostPlatform())
+        workerCount = std::min(workerCount, 16u);
+
+      if (m_device->config().numCompilerThreads > 0)
+        workerCount = m_device->config().numCompilerThreads;
+
+      // Number of workers that can process pipelines with normal
+      // priority. Any other workers can only build high-priority pipelines.
+      uint32_t npWorkerCount = m_device->canUseGraphicsPipelineLibrary()
+        ? std::max(((workerCount - 1) * 5) / 7, 1u)
+        : workerCount;
+      uint32_t hpWorkerCount = workerCount - npWorkerCount;
+
+      Logger::info(str::format("DXVK: Using ", npWorkerCount, " + ", hpWorkerCount, " compiler threads"));
+      m_workers.resize(npWorkerCount + hpWorkerCount);
+
+      // Set worker flag so that they don't exit immediately
       m_workersRunning = true;
 
-      Logger::info(str::format("DXVK: Using ", m_workerCount, " compiler threads"));
-      m_workers.resize(m_workerCount);
-
-      for (auto& worker : m_workers) {
-        worker = dxvk::thread([this] { runWorker(); });
-        worker.set_priority(ThreadPriority::Lowest);
+      for (size_t i = 0; i < m_workers.size(); i++) {
+        m_workers[i] = i >= npWorkerCount
+          ? dxvk::thread([this] { runWorkerPrioritized(); })
+          : dxvk::thread([this] { runWorker(); });
+        m_workers[i].set_priority(ThreadPriority::Lowest);
       }
     }
   }
@@ -124,6 +146,7 @@ namespace dxvk {
 
         m_queueCond.wait(lock, [this] {
           return !m_workersRunning
+              || !m_queuedLibrariesPrioritized.empty()
               || !m_queuedLibraries.empty()
               || !m_queuedPipelines.empty();
         });
@@ -132,6 +155,9 @@ namespace dxvk {
           // Skip pending work, exiting early is
           // more important in this case.
           break;
+        } else if (!m_queuedLibrariesPrioritized.empty()) {
+          l = m_queuedLibrariesPrioritized.front();
+          m_queuedLibrariesPrioritized.pop();
         } else if (!m_queuedLibraries.empty()) {
           l = m_queuedLibraries.front();
           m_queuedLibraries.pop();
@@ -162,6 +188,34 @@ namespace dxvk {
   }
 
 
+  void DxvkPipelineWorkers::runWorkerPrioritized() {
+    env::setThreadName("dxvk-shader-p");
+
+    while (true) {
+      PipelineLibraryEntry l = { };
+
+      { std::unique_lock lock(m_queueLock);
+
+        m_queueCondPrioritized.wait(lock, [this] {
+          return !m_workersRunning
+              || !m_queuedLibrariesPrioritized.empty();
+        });
+
+        if (!m_workersRunning)
+          break;
+
+        l = m_queuedLibrariesPrioritized.front();
+        m_queuedLibrariesPrioritized.pop();
+      }
+
+      if (l.pipelineLibrary)
+        l.pipelineLibrary->compilePipeline();
+
+      m_pendingTasks -= 1;
+    }
+  }
+
+
   DxvkPipelineManager::DxvkPipelineManager(
           DxvkDevice*         device)
   : m_device    (device),
@@ -285,13 +339,30 @@ namespace dxvk {
     const Rc<DxvkShader>&         shader) {
     if (canPrecompileShader(shader)) {
       auto library = createPipelineLibrary(shader);
-      m_workers.compilePipelineLibrary(library);
+      m_workers.compilePipelineLibrary(library, DxvkPipelinePriority::Normal);
     }
 
     m_stateCache.registerShader(shader);
   }
 
 
+  void DxvkPipelineManager::requestCompileShader(
+    const Rc<DxvkShader>&         shader) {
+    if (!shader->needsLibraryCompile())
+      return;
+
+    // Dispatch high-priority compile job
+    auto library = findPipelineLibrary(shader);
+
+    if (library)
+      m_workers.compilePipelineLibrary(library, DxvkPipelinePriority::High);
+
+    // Notify immediately so that this only gets called
+    // once, even if compilation does not start immediately
+    shader->notifyLibraryCompile();
+  }
+
+
   DxvkPipelineCount DxvkPipelineManager::getPipelineCount() const {
     DxvkPipelineCount result;
     result.numGraphicsPipelines = m_stats.numGraphicsPipelines.load();
diff --git a/src/dxvk/dxvk_pipemanager.h b/src/dxvk/dxvk_pipemanager.h
index 5661dd86..b71751dd 100644
--- a/src/dxvk/dxvk_pipemanager.h
+++ b/src/dxvk/dxvk_pipemanager.h
@@ -34,6 +34,14 @@ namespace dxvk {
     std::atomic<uint32_t> numComputePipelines   = { 0u };
   };
 
+  /**
+   * \brief Pipeline priority
+   */
+  enum class DxvkPipelinePriority : uint32_t {
+    Normal  = 0,
+    High    = 1,
+  };
+
   /**
    * \brief Pipeline manager worker threads
    *
@@ -56,9 +64,11 @@ namespace dxvk {
      * the pipeline with default compile arguments.
      * Note that pipeline libraries are high priority.
      * \param [in] library The pipeline library
+     * \param [in] priority Pipeline priority
      */
     void compilePipelineLibrary(
-            DxvkShaderPipelineLibrary*      library);
+            DxvkShaderPipelineLibrary*      library,
+            DxvkPipelinePriority            priority);
 
     /**
      * \brief Compiles an optimized compute pipeline
@@ -107,15 +117,18 @@ namespace dxvk {
       DxvkShaderPipelineLibrary*    pipelineLibrary;
     };
 
+    DxvkDevice*                       m_device;
+
     std::atomic<uint64_t>             m_pendingTasks = { 0ull };
 
     dxvk::mutex                       m_queueLock;
     dxvk::condition_variable          m_queueCond;
+    dxvk::condition_variable          m_queueCondPrioritized;
 
+    std::queue<PipelineLibraryEntry>  m_queuedLibrariesPrioritized;
     std::queue<PipelineLibraryEntry>  m_queuedLibraries;
     std::queue<PipelineEntry>         m_queuedPipelines;
 
-    uint32_t                          m_workerCount = 0;
     bool                              m_workersRunning = false;
     std::vector<dxvk::thread>         m_workers;
 
@@ -123,6 +136,8 @@ namespace dxvk {
 
     void runWorker();
 
+    void runWorkerPrioritized();
+
   };
 
   
@@ -188,7 +203,7 @@ namespace dxvk {
     DxvkGraphicsPipelineFragmentOutputLibrary* createFragmentOutputLibrary(
       const DxvkGraphicsPipelineFragmentOutputState& state);
 
-    /*
+    /**
      * \brief Registers a shader
      * 
      * Starts compiling pipelines asynchronously
@@ -198,7 +213,18 @@ namespace dxvk {
      */
     void registerShader(
       const Rc<DxvkShader>&         shader);
-    
+
+    /**
+     * \brief Prioritizes compilation of a given shader
+     *
+     * Adds the pipeline library for the given shader
+     * to the high-priority queue of the background
+     * workers to make sure it gets compiled quickly.
+     * \param [in] shader Shader to start compiling
+     */
+    void requestCompileShader(
+      const Rc<DxvkShader>&         shader);
+
     /**
      * \brief Retrieves total pipeline count
      * \returns Number of compute/graphics pipelines