From 13ae0d218b81fd276f135decf1ebf6871b5be1fc Mon Sep 17 00:00:00 2001
From: netborg <137700136+netborg-afps@users.noreply.github.com>
Date: Tue, 18 Feb 2025 21:01:19 +0100
Subject: [PATCH 1/5] [dxvk] Add config variables for frame pacing

---
 dxvk.conf                 | 52 ++++++++++++++++++++++++++++++++++++++-
 src/dxvk/dxvk_options.cpp |  6 ++++-
 src/dxvk/dxvk_options.h   | 16 +++++++++++-
 3 files changed, 71 insertions(+), 3 deletions(-)
diff --git a/dxvk.conf b/dxvk.conf
index 799991eee..0b8deba8c 100644
--- a/dxvk.conf
+++ b/dxvk.conf
@@ -18,6 +18,51 @@
 # dxgi.enableHDR = True
 
 
+# Frame pacing mode managing CPU-GPU synchronization.
+# Defaults to "low-latency" in the draft-PR for demonstration purposes.
+#
+# "max-frame-latency" provides stable latency in the GPU-limit as long as
+# GPU render times are stable. Latency generally is higher but offers great
+# visual smoothness.
+#
+# "low-latency" provides lower latency in the GPU-limit and can be fine-tuned
+# via dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap.
+#
+# "min-latency" possibly provides the lowest latency (low-latency can be
+# quicker in some situations), and offers less fps in the GPU-limit
+# due to stalling the GPU between frames. Generally not recommended,
+# but helpful to get insights to fine-tune the low-latency mode and
+# possibly is useful for running games in the CPU-limit.
+#
+# "low/min-latency" also supports its own fps-limiting enabled via common
+# variables.
+#
+# Supported values: "max-frame-latency", "low-latency", "min-latency"
+
+# dxvk.framePace = ""
+
+
+# Allows fine-tuning the low-latency frame pacing mode.
+# Positive values make a frame begin later which might improve responsiveness,
+# although only very slightly, but may be relevant for edge cases.
+# Negative values make a frame begin earlier which might improve fps.
+# Values are given in microseconds. Defaults to 0.
+#
+# Supported values: -10000 to 10000
+
+# dxvk.lowLatencyOffset = 0
+
+
+# Determines whether a frame is allowed to begin before finishing processing
+# the cpu-part of the previous one, when low-latency frame pacing is used.
+# Snappiness may be improved when disallowing overlap. On the other hand, this
+# might also decrease fps in certain cases. Defaults to True.
+#
+# Supported values: True, False
+
+# dxvk.lowLatencyAllowCpuFramesOverlap = True
+
+
 # Expose support for dcomp swap chains with a dummy window.
 #
 # This is not a valid implementation of DirectComposition swapchains,
@@ -104,8 +149,13 @@
 #         The implementation will either use VK_NV_low_latency2 if supported
 #         by the driver, or a custom algorithm.
 # - False: Disable Reflex support as well as built-in latency reduction.
+#         This build defaults to False to enable dxvk.framePace. You need to
+#         enable Reflex manually (Auto) until we support switching back and
+#         forth between Reflex and the low-latency frame pacing - for example
+#         via the ingame options - and more critically we want to enable
+#         low-latency frame pacing if the game doesn't support Reflex.
   
-# dxvk.latencySleep = Auto
+# dxvk.latencySleep = False
 
 
 # Tolerance for the latency sleep heuristic, in microseconds. Higher values
diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp
index d2d455c33..85fc3ec3e 100644
--- a/src/dxvk/dxvk_options.cpp
+++ b/src/dxvk/dxvk_options.cpp
@@ -12,12 +12,16 @@ namespace dxvk {
     useRawSsbo            = config.getOption<Tristate>("dxvk.useRawSsbo",             Tristate::Auto);
     hud                   = config.getOption<std::string>("dxvk.hud", "");
     tearFree              = config.getOption<Tristate>("dxvk.tearFree",               Tristate::Auto);
-    latencySleep          = config.getOption<Tristate>("dxvk.latencySleep",           Tristate::Auto);
+    latencySleep          = config.getOption<Tristate>("dxvk.latencySleep",           Tristate::False);
     latencyTolerance      = config.getOption<int32_t> ("dxvk.latencyTolerance",       1000);
     disableNvLowLatency2  = config.getOption<Tristate>("dxvk.disableNvLowLatency2",   Tristate::Auto);
     hideIntegratedGraphics = config.getOption<bool>   ("dxvk.hideIntegratedGraphics", false);
     zeroMappedMemory      = config.getOption<bool>    ("dxvk.zeroMappedMemory",       false);
     allowFse              = config.getOption<bool>    ("dxvk.allowFse",               false);
+    framePace             = config.getOption<std::string>("dxvk.framePace",           "");
+    lowLatencyOffset      = config.getOption<int32_t> ("dxvk.lowLatencyOffset",       0);
+    lowLatencyAllowCpuFramesOverlap
+                          = config.getOption<bool>    ("dxvk.lowLatencyAllowCpuFramesOverlap", true);
     deviceFilter          = config.getOption<std::string>("dxvk.deviceFilter",        "");
     tilerMode             = config.getOption<Tristate>("dxvk.tilerMode",              Tristate::Auto);
   }
diff --git a/src/dxvk/dxvk_options.h b/src/dxvk/dxvk_options.h
index 5351ac68b..fd2977143 100644
--- a/src/dxvk/dxvk_options.h
+++ b/src/dxvk/dxvk_options.h
@@ -38,7 +38,9 @@ namespace dxvk {
     Tristate tearFree = Tristate::Auto;
 
     /// Enables latency sleep
-    Tristate latencySleep = Tristate::Auto;
+    /// Defaults to false in this build to activate the FramePacer,
+    /// especially for the case when the game doesn't support Reflex
+    Tristate latencySleep = Tristate::False;
 
     /// Latency tolerance, in microseconds
     int32_t latencyTolerance = 0u;
@@ -61,6 +63,18 @@ namespace dxvk {
     /// Whether to enable tiler optimizations
     Tristate tilerMode = Tristate::Auto;
 
+    /// Frame pacing
+    std::string framePace;
+
+    /// A value in microseconds to fine-tune the low-latency frame pacing.
+    /// Positive values make a frame begin later which might improve responsiveness.
+    /// Negative values make a frame begin earlier which might improve fps.
+    int32_t lowLatencyOffset;
+
+    /// Determines whether a frame is allowed to begin before finishing processing
+    /// the cpu-part of the previous one, when low-latency frame pacing is used.
+    bool lowLatencyAllowCpuFramesOverlap;
+
     // Device name
     std::string deviceFilter;
   };

From 01ccd1e7763bbf9a2ccedc479bfe236abafa3a23 Mon Sep 17 00:00:00 2001
From: netborg <137700136+netborg-afps@users.noreply.github.com>
Date: Tue, 18 Feb 2025 21:18:22 +0100
Subject: [PATCH 2/5] [dxvk] Add low-latency frame pacing

---
 src/dxvk/dxvk_latency.h                       |   7 +
 src/dxvk/framepacer/dxvk_framepacer.cpp       |  64 +++++
 src/dxvk/framepacer/dxvk_framepacer.h         | 191 +++++++++++++
 src/dxvk/framepacer/dxvk_framepacer_mode.h    | 117 ++++++++
 .../dxvk_framepacer_mode_low_latency.cpp      |  43 +++
 .../dxvk_framepacer_mode_low_latency.h        | 255 ++++++++++++++++++
 .../dxvk_framepacer_mode_min_latency.h        |  45 ++++
 src/dxvk/framepacer/dxvk_latency_markers.h    | 148 ++++++++++
 src/dxvk/meson.build                          |   3 +
 9 files changed, 873 insertions(+)
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer.cpp
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer.h
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer_mode.h
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
 create mode 100644 src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
 create mode 100644 src/dxvk/framepacer/dxvk_latency_markers.h

diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h
index c9ac93c5d..f4e74a7ce 100644
--- a/src/dxvk/dxvk_latency.h
+++ b/src/dxvk/dxvk_latency.h
@@ -128,6 +128,10 @@ namespace dxvk {
     virtual void notifyCpuPresentEnd(
             uint64_t                  frameId) = 0;
 
+    virtual void notifySubmit() { }
+    virtual void notifyPresent(
+            uint64_t                  frameId) { }
+
     /**
      * \brief Called when a command list is submitted to the GPU
      *
@@ -174,6 +178,9 @@ namespace dxvk {
     virtual void notifyGpuExecutionEnd(
             uint64_t                  frameId) = 0;
 
+    virtual void notifyGpuPresentBegin(
+            uint64_t                  frameId) { }
+
     /**
      * \brief Called when presentation of a given frame finishes on the GPU
      *
diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp
new file mode 100644
index 000000000..63803f1ba
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer.cpp
@@ -0,0 +1,64 @@
+#include "dxvk_framepacer.h"
+#include "dxvk_framepacer_mode_low_latency.h"
+#include "dxvk_framepacer_mode_min_latency.h"
+#include "dxvk_options.h"
+#include "../../util/util_env.h"
+#include "../../util/log/log.h"
+
+namespace dxvk {
+
+
+  FramePacer::FramePacer( const DxvkOptions& options ) {
+    // Draft-PR default is LOW_LATENCY, chosen for demonstration purposes: it shows
+    // the generally much better input lag and medium-term time consistency.
+    // MAX_FRAME_LATENCY is likely the better long-term default, given its higher
+    // fps throughput and lower susceptibility to short-term time inconsistencies.
+    // Which mode feels smoother depends on the game.
+    FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY;
+
+    // Maps a setting string onto a mode; leaves 'mode' untouched if nothing matches.
+    auto selectMode = [&mode] (const std::string& setting) {
+      if (setting.find("max-frame-latency") != std::string::npos)
+        mode = FramePacerMode::MAX_FRAME_LATENCY;
+      else if (setting.find("low-latency") != std::string::npos)
+        mode = FramePacerMode::LOW_LATENCY;
+      else if (setting.find("min-latency") != std::string::npos)
+        mode = FramePacerMode::MIN_LATENCY;
+      else
+        return false;
+      return true;
+    };
+
+    // The environment variable wins over the config option.
+    if (!selectMode(env::getEnvVar("DXVK_FRAME_PACE")))
+      selectMode(options.framePace);
+
+    switch (mode) {
+      case FramePacerMode::MAX_FRAME_LATENCY:
+        Logger::info( "Frame pace: max-frame-latency" );
+        m_mode = std::make_unique<FramePacerMode>(mode, &m_latencyMarkersStorage);
+        break;
+
+      case FramePacerMode::LOW_LATENCY:
+        Logger::info( "Frame pace: low-latency" );
+        m_mode = std::make_unique<LowLatencyMode>(mode, &m_latencyMarkersStorage, options);
+        break;
+
+      case FramePacerMode::MIN_LATENCY:
+        Logger::info( "Frame pace: min-latency" );
+        m_mode = std::make_unique<MinLatencyMode>(mode, &m_latencyMarkersStorage);
+        break;
+    }
+
+    for (size_t i = 0; i < m_gpuStarts.size(); ++i)
+      m_gpuStarts[i].store(0);
+
+    // seed a gpuReady event so that every frame has one from the previous frame
+    LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1);
+    m->gpuReady.push_back(high_resolution_clock::now());
+  }
+
+
+  FramePacer::~FramePacer() = default;
+
+}
diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h
new file mode 100644
index 000000000..264dcff57
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+#include "dxvk_latency_markers.h"
+#include "../dxvk_latency.h"
+#include "../../util/util_time.h"
+#include <dxgi.h>
+
+
+namespace dxvk {
+
+  struct DxvkOptions;
+
+  /* \brief Frame pacer interface managing the CPU - GPU synchronization.
+   *
+   * GPUs render frames asynchronously to the game's and dxvk's CPU-side work
+   * in order to improve fps-throughput. Aligning the cpu work to chosen time-
+   * points allows to tune certain characteristics of the video presentation,
+   * like smoothness and latency.
+   */
+
+  class FramePacer : public DxvkLatencyTracker {
+    using microseconds = std::chrono::microseconds;
+  public:
+
+    FramePacer( const DxvkOptions& options );
+    ~FramePacer();
+
+    void sleepAndBeginFrame(
+            uint64_t                  frameId,
+            double                    maxFrameRate) override {  // maxFrameRate is not read here
+      // wait for finished rendering of a previous frame, typically the one before last
+      m_mode->waitRenderFinished(frameId);
+      // potentially wait some more if the cpu gets too much ahead
+      m_mode->startFrame(frameId);
+      m_latencyMarkersStorage.registerFrameStart(frameId);
+      m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0);  // clear both gpu-start bits of this frame's slot
+    }
+
+    void notifyGpuPresentEnd( uint64_t frameId ) override {
+      // the frame has been displayed to the screen
+      m_latencyMarkersStorage.registerFrameEnd(frameId);
+      m_mode->endFrame(frameId);
+    }
+
+    void notifyCsRenderBegin( uint64_t frameId ) override {
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->csStart = std::chrono::duration_cast<microseconds>(now - m->start).count();  // us since frame start
+    }
+
+    void notifyCsRenderEnd( uint64_t frameId ) override {
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->csFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();  // us since frame start
+      m_mode->signalCsFinished( frameId );
+    }
+
+    void notifySubmit() override {
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1);  // frame after the last presented one
+      m->gpuSubmit.push_back(high_resolution_clock::now());
+    }
+
+    void notifyPresent( uint64_t frameId ) override {
+      // dx to vk translation is finished; frameId == 0 is ignored
+      if (frameId != 0) {
+        auto now = high_resolution_clock::now();
+        m_lastSubmitFrameId = frameId;
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuSubmit.push_back(now);  // the present counts as this frame's final submit
+        m->cpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
+        next->gpuSubmit.clear();  // start the next frame's submit list from scratch
+
+        m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId);
+      }
+    }
+
+    void notifyQueueSubmit( uint64_t frameId ) override {
+      assert( frameId == m_lastQueueSubmitFrameId + 1 );
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->gpuQueueSubmit.push_back(now);
+      queueSubmitCheckGpuStart(frameId, m, now);
+    }
+
+    void notifyQueuePresentBegin( uint64_t frameId ) override {
+      if (frameId != 0) {
+        auto now = high_resolution_clock::now();
+        m_lastQueueSubmitFrameId = frameId;
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuQueueSubmit.push_back(now);
+        next->gpuQueueSubmit.clear();  // start the next frame's queue-submit list from scratch
+        queueSubmitCheckGpuStart(frameId, m, now);
+      }
+    }
+
+    void notifyGpuExecutionBegin( uint64_t frameId ) override {
+      assert( frameId == m_lastFinishedFrameId+1 );
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
+      gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now());
+    }
+
+    void notifyGpuExecutionEnd( uint64_t frameId ) override {  // frameId is not read; m_lastFinishedFrameId+1 is used
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
+      m->gpuReady.push_back(now);
+    }
+
+    virtual void notifyGpuPresentBegin( uint64_t frameId ) override {
+      // we get frameId == 0 for repeated presents (SyncInterval)
+      if (frameId != 0) {
+        m_lastFinishedFrameId = frameId;
+        auto now = high_resolution_clock::now();
+
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuReady.push_back(now);
+        m->gpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
+        next->gpuReady.clear();
+        next->gpuReady.push_back(now);  // the next frame's first gpuReady entry is this frame's finish time
+
+        gpuExecutionCheckGpuStart(frameId, m, now);
+
+        m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId);
+        m_mode->finishRender(frameId);
+        m_mode->signalRenderFinished(frameId);
+      }
+    }
+
+    FramePacerMode::Mode getMode() const {
+      return m_mode->m_mode;
+    }
+
+    void setTargetFrameRate( double frameRate ) {
+      m_mode->setTargetFrameRate(frameRate);
+    }
+
+    bool needsAutoMarkers() override {
+      return true;
+    }
+
+    LatencyMarkersStorage m_latencyMarkersStorage;
+
+
+    // not implemented methods: overrides below have empty bodies / return defaults
+
+
+    void notifyCpuPresentBegin( uint64_t frameId) override { }
+    void notifyCpuPresentEnd( uint64_t frameId ) override { }
+    void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { }
+    void discardTimings() override { }
+    DxvkLatencyStats getStatistics( uint64_t frameId ) override
+      { return DxvkLatencyStats(); }
+
+  private:
+
+    void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      m->gpuStart = std::chrono::duration_cast<microseconds>(t - m->start).count();  // us since frame start
+      m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId);
+      m_mode->signalGpuStart(frameId);
+    }
+
+    void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
+      uint16_t val = gpuStart.fetch_or(queueSubmitBit);
+      if (val == gpuReadyBit)  // only the gpu-ready bit was set before: this call completes the pair
+        signalGpuStart( frameId, m, t );
+    }
+
+    void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
+      uint16_t val = gpuStart.fetch_or(gpuReadyBit);
+      if (val == queueSubmitBit)  // only the queue-submit bit was set before: this call completes the pair
+        signalGpuStart( frameId, m, t );
+    }
+
+    std::unique_ptr<FramePacerMode> m_mode;
+
+    uint64_t m_lastSubmitFrameId      = { DXGI_MAX_SWAP_CHAIN_BUFFERS };  // updated in notifyPresent
+    uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };  // updated in notifyQueuePresentBegin
+    uint64_t m_lastFinishedFrameId    = { DXGI_MAX_SWAP_CHAIN_BUFFERS };  // updated in notifyGpuPresentBegin
+
+    std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { };  // per-frame bit masks, indexed by frameId % 16
+    static constexpr uint16_t queueSubmitBit = 1;
+    static constexpr uint16_t gpuReadyBit    = 2;
+
+  };
+
+}
diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode.h b/src/dxvk/framepacer/dxvk_framepacer_mode.h
new file mode 100644
index 000000000..109a240a2
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "dxvk_latency_markers.h"
+#include "../../util/sync/sync_signal.h"
+#include "../../util/util_env.h"
+#include <dxgi.h>
+
+namespace dxvk {
+
+  /*
+   * \brief Abstract frame pacer mode in order to support different strategies of synchronization.
+   */
+
+  class FramePacerMode {
+
+  public:
+
+    enum Mode {
+      MAX_FRAME_LATENCY = 0,
+      LOW_LATENCY,
+      MIN_LATENCY
+    };
+
+    FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 )
+    : m_mode( mode ),
+      m_waitLatency( maxFrameLatency+1 ),
+      m_latencyMarkersStorage( markerStorage ) {
+      setFpsLimitFrametimeFromEnv();
+    }
+
+    virtual ~FramePacerMode() = default;
+
+    virtual void startFrame( uint64_t frameId ) { }   // hook, no-op in the base class
+    virtual void endFrame( uint64_t frameId ) { }     // hook, no-op in the base class
+
+    virtual void finishRender( uint64_t frameId ) { } // hook, no-op in the base class
+
+    void waitRenderFinished( uint64_t frameId ) {     // fences are bypassed for MAX_FRAME_LATENCY
+      if (m_mode != MAX_FRAME_LATENCY) m_fenceGpuFinished.wait(frameId-m_waitLatency); }
+
+    void signalRenderFinished( uint64_t frameId ) {
+      if (m_mode != MAX_FRAME_LATENCY) m_fenceGpuFinished.signal(frameId); }
+
+    void signalGpuStart( uint64_t frameId ) {
+      if (m_mode != MAX_FRAME_LATENCY) m_fenceGpuStart.signal(frameId); }
+
+    void signalCsFinished( uint64_t frameId ) {
+      if (m_mode != MAX_FRAME_LATENCY) m_fenceCsFinished.signal(frameId); }
+
+    void setTargetFrameRate( double frameRate ) {     // stores the frame time in microseconds
+      const bool applies = !m_fpsLimitEnvOverride && frameRate > 1.0;
+      if (applies) m_fpsLimitFrametime.store( 1'000'000/frameRate );
+    }
+
+    const Mode m_mode;
+
+    static bool getDoubleFromEnv( const char* name, double* result );
+    static bool getIntFromEnv( const char* name, int* result );
+
+  protected:
+
+    void setFpsLimitFrametimeFromEnv();
+
+    const uint32_t m_waitLatency;
+    LatencyMarkersStorage* m_latencyMarkersStorage;
+    std::atomic<int32_t> m_fpsLimitFrametime = { 0 };  // microseconds; 0 adds no delay
+    bool m_fpsLimitEnvOverride = { false };            // set once DXVK_FRAME_RATE parsed
+
+    sync::Fence m_fenceGpuStart    = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
+    sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
+    sync::Fence m_fenceCsFinished  = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) };
+
+  };
+
+
+
+  // Parses environment variable 'name' as a double into *result. Returns false, leaving
+  // *result untouched, if the variable is unset/empty or std::stod rejects its value.
+  inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) {
+    std::string env = env::getEnvVar(name);
+    if (env.empty()) return false;
+
+    try {
+      *result = std::stod(env);
+      return true;
+    } catch (const std::logic_error&) {  // std::invalid_argument or std::out_of_range
+      return false;
+    }
+  }
+
+
+  // Parses environment variable 'name' as an int into *result. Returns false, leaving
+  // *result untouched, if the variable is unset/empty or std::stoi rejects its value.
+  inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) {
+    std::string env = env::getEnvVar(name);
+    if (env.empty()) return false;
+
+    try {
+      *result = std::stoi(env);
+      return true;
+    } catch (const std::logic_error&) {  // std::invalid_argument or std::out_of_range
+      return false;
+    }
+  }
+
+
+  // Applies the DXVK_FRAME_RATE environment override, if present. Once the variable
+  // parses, setTargetFrameRate() no longer changes the limit; values < 1 set no limit.
+  inline void FramePacerMode::setFpsLimitFrametimeFromEnv() {
+    double envFps = 0.0;
+    const bool parsed = getDoubleFromEnv("DXVK_FRAME_RATE", &envFps);
+    if (!parsed) return;
+
+    m_fpsLimitEnvOverride = true;
+    if (!(envFps < 1.0))
+      m_fpsLimitFrametime.store( 1'000'000/envFps );
+  }
+
+}
diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
new file mode 100644
index 000000000..4e39145b4
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
@@ -0,0 +1,43 @@
+#include "dxvk_framepacer_mode_low_latency.h"
+
+namespace dxvk {
+
+
+  // Reads DXVK_LOW_LATENCY_OFFSET into 'offset'; returns whether a value was parsed.
+  bool getLowLatencyOffsetFromEnv( int32_t& offset ) {
+    const bool found = FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset);
+    return found;
+  }
+
+
+  // Reads DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP as an integer; non-zero means true.
+  bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) {
+    int32_t envValue = 0;
+    const bool found = FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &envValue);
+    if (found) allowOverlap = (envValue != 0);
+    return found;
+  }
+
+
+  // Resolves the low-latency offset in microseconds: the environment variable
+  // takes precedence over the config option; the result is limited to +-10000.
+  int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) {
+    int32_t envOffset = 0;
+    const bool fromEnv = getLowLatencyOffsetFromEnv(envOffset);
+    const int32_t requested = fromEnv ? envOffset : options.lowLatencyOffset;
+
+    const int32_t lowerBounded = std::max( -10000, requested );
+    return std::min( 10000, lowerBounded );
+  }
+
+
+  // Resolves the overlap setting; the environment variable overrides the config option.
+  bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) {
+    bool envOverlap = false;
+    if (getLowLatencyAllowCpuFramesOverlapFromEnv(envOverlap))
+      return envOverlap;
+    return options.lowLatencyAllowCpuFramesOverlap;
+  }
+
+
+}
diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
new file mode 100644
index 000000000..06fdaf0dd
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
@@ -0,0 +1,255 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+#include "../dxvk_options.h"
+#include "../../util/log/log.h"
+#include "../../util/util_string.h"
+#include <assert.h>
+
+namespace dxvk {
+
+  /*
+   * This low-latency mode aims to reduce latency with minimal impact in fps.
+   * Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well.
+   *
+   * Greatly reduces input lag variations when switching between CPU- and GPU-limit, and
+   * compared to the max-frame-latency approach, it has a much more stable input lag when
+   * GPU running times change dramatically, which can happen for example when rotating within a scene.
+   *
+   * The current implementation rather generates fluctuations alternating frame-by-frame
+   * depending on the game's and dxvk's CPU-time variations. This might be visible as a loss
+   * in smoothness, which is an area this implementation can be further improved. Unsuitable
+   * smoothing however might degrade input-lag feel, so it's not implemented for now, but
+   * more advanced smoothing techniques will be investigated in the future.
+   * In some situations however, this low-latency pacing actually improves smoothing though,
+   * it will depend on the game.
+   *
+   * An interesting observation while playtesting was that not only the input lag was affected,
+   * but the video generated did progress more cleanly in time as well with regards to
+   * medium-term time consistency, in other words, the video playback speed remained more steady.
+   *
+   * Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter
+   * which is typically used to prevent the game's fps exceeding the monitor's refresh rate,
+   * and which is tightly integrated into the pacing logic.
+   *
+   * Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap
+   * variables (or their respective environment variables)
+   * Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved.
+   */
+
+  class LowLatencyMode : public FramePacerMode {
+    using microseconds = std::chrono::microseconds;
+    using time_point = high_resolution_clock::time_point;
+  public:
+
+    LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options)
+    : FramePacerMode(mode, storage),
+      m_lowLatencyOffset(getLowLatencyOffset(options)),
+      m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) {
+      Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) );
+      Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) );
+    }
+
+    ~LowLatencyMode() {}
+
+
+    // Delays the start of frame 'frameId' based on the predicted gpu sync point and the fps limit.
+    void startFrame( uint64_t frameId ) override {
+      using std::chrono::duration_cast;
+
+      if (!m_allowCpuFramesOverlap)
+        m_fenceCsFinished.wait( frameId-1 );
+
+      m_fenceGpuStart.wait( frameId-1 );
+
+      time_point now = high_resolution_clock::now();
+      uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load();
+      if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull)
+        return;
+
+      if (finishedId == frameId-1) {
+        // we are the only in-flight frame, nothing to do other than to apply fps-limiter if needed
+        m_lastStart = sleepFor( now, 0 );
+        return;
+      }
+
+      if (finishedId != frameId-2) {
+        Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=",
+          frameId-2, ", got: ", finishedId) );
+      }
+
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1);
+
+      // estimate the target gpu sync point for this frame
+      // and calculate backwards when we want to start this frame
+
+      const SyncProps props = getSyncPrediction();
+      int32_t gpuReadyPrediction = duration_cast<microseconds>(
+        m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count();
+
+      int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync;
+      int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset;
+
+      m_lastStart = sleepFor( now, delay );
+
+    }
+
+    // Derives the SyncProps of a finished frame from its markers and publishes them via m_propsFinished.
+    void finishRender( uint64_t frameId ) override {
+
+      using std::chrono::duration_cast;
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
+      // frames with too few gpu runs, or with fewer submit markers than gpu runs, are outliers
+      int32_t numLoop = (int32_t)(m->gpuReady.size())-1;
+      if (numLoop <= 1 || m->gpuQueueSubmit.size() < size_t(numLoop) || m->gpuSubmit.size() < size_t(numLoop)) {
+        m_props[frameId % m_props.size()] = SyncProps();
+        m_props[frameId % m_props.size()].isOutlier = true;
+        m_propsFinished.store( frameId );
+        return;
+      }
+
+      // estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first
+      // such that the gpu doesn't go into idle for this frame, and then aligning cpu submits
+      // where gpuSubmit[i] <= gpuRun[i] for all i
+
+      std::vector<int32_t>& gpuRun = m_tempGpuRun;
+      std::vector<int32_t>& gpuRunDurations = m_tempGpuRunDurations;
+      gpuRun.clear();
+      gpuRunDurations.clear();
+      int32_t optimizedGpuTime = 0;
+      gpuRun.push_back(optimizedGpuTime);
+
+      for (int i=0; i<numLoop; ++i) {
+        time_point _gpuRun = std::max( m->gpuReady[i], m->gpuQueueSubmit[i] );
+        int32_t duration = duration_cast<microseconds>( m->gpuReady[i+1] - _gpuRun ).count();
+        optimizedGpuTime += duration;
+        gpuRun.push_back(optimizedGpuTime);
+        gpuRunDurations.push_back(duration);
+      }
+
+      int32_t alignment = duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count()
+        - gpuRun[numLoop-1];
+
+      int32_t offset = 0;
+      for (int i=numLoop-2; i>=0; --i) {
+        int32_t curSubmit = duration_cast<microseconds>( m->gpuSubmit[i] - m->gpuSubmit[0] ).count();
+        int32_t diff = curSubmit - gpuRun[i] - alignment;
+        diff = std::max( 0, diff );
+        offset += diff;
+        alignment += diff;
+      }
+
+
+      SyncProps& props = m_props[frameId % m_props.size()];
+      props.gpuSync = gpuRun[numLoop-1];
+      props.cpuUntilGpuSync = offset + duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->start ).count();
+      props.optimizedGpuTime = optimizedGpuTime;
+      props.isOutlier = isOutlier(frameId);
+
+      m_propsFinished.store( frameId );
+
+    }
+
+    // Sleeps from 't' for 'delay' us, raised to honor the fps limit and clamped to [0, 20000]; returns the wake-up time.
+    Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) {
+
+      // account for the fps limit and ensure we won't sleep too long, just in case
+      int32_t frametime = std::chrono::duration_cast<microseconds>( t - m_lastStart ).count();
+      int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
+      delay = std::max( delay, frametimeDiff );
+      delay = std::max( 0, std::min( delay, 20000 ) );
+
+      Sleep::TimePoint nextStart = t + microseconds(delay);
+      Sleep::sleepUntil( t, nextStart );
+      return nextStart;
+
+    }
+
+
+  private:
+
+    struct SyncProps {
+      int32_t optimizedGpuTime;   // gpu executing packed submits in one go
+      int32_t gpuSync;            // us after gpuStart
+      int32_t cpuUntilGpuSync;
+      bool    isOutlier;
+    };
+
+    // Returns the props of the newest non-outlier among the last 7 finished frames, else those of the newest frame.
+    SyncProps getSyncPrediction() {
+      // in the future we might use more samples to get a prediction
+      // however, simple averaging gives a slightly artificial mouse input
+      // more advanced methods will be investigated
+      SyncProps res = {};
+      uint64_t id = m_propsFinished;
+      if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
+        return res;
+
+      for (size_t i=0; i<7; ++i) {
+        const SyncProps& props = m_props[ (id-i) % m_props.size() ];
+        if (!props.isOutlier) {
+          id = id-i;
+          break;
+        }
+      }
+
+      return m_props[ id % m_props.size() ];
+    }
+
+    // Predicts the time in us from gpuStart until the gpu finishes, using the newest non-outlier frame.
+    int32_t getGpuStartToFinishPrediction() {
+      uint64_t id = m_propsFinished;
+      if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
+        return 0;
+
+      for (size_t i=0; i<7; ++i) {
+        const SyncProps& props = m_props[ (id-i) % m_props.size() ];
+        if (!props.isOutlier) {
+          const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i);
+          if (m->gpuReady.empty() || m->gpuSubmit.empty())
+            return m->gpuFinished - m->gpuStart;
+
+          time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] );
+          return std::chrono::duration_cast<microseconds>( t - m->start ).count()
+            + props.optimizedGpuTime
+            - m->gpuStart;
+        }
+      }
+
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id);
+      return m->gpuFinished - m->gpuStart;
+    }
+
+    // True if the frame's cpu time exceeds 1.7x the 7-frame average or its submit/ready markers don't pair up.
+    bool isOutlier( uint64_t frameId ) {
+      constexpr size_t numLoop = 7;
+      int32_t totalCpuTime = 0;
+      for (size_t i=0; i<numLoop; ++i) {
+        const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i);
+        totalCpuTime += m->cpuFinished;
+      }
+
+      int32_t avgCpuTime = totalCpuTime / numLoop;
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
+      if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) )
+        return true;
+
+      return false;
+    }
+
+    // Resolve the tuning values from the environment or, failing that, from the options.
+    int32_t getLowLatencyOffset( const DxvkOptions& options );
+    bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options );
+
+    const int32_t m_lowLatencyOffset;
+    const bool    m_allowCpuFramesOverlap;
+
+    Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
+    std::array<SyncProps, 16> m_props = { };  // ring buffer indexed by frameId % 16, zeroed until written
+    std::atomic<uint64_t> m_propsFinished = { 0 };
+
+    std::vector<int32_t>  m_tempGpuRun;
+    std::vector<int32_t>  m_tempGpuRunDurations;
+
+  };
+
+}
diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
new file mode 100644
index 000000000..763a5368c
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+
+namespace dxvk {
+
+  /*
+   * Minimal latency is achieved here by waiting for the previous
+   * frame to complete, which results in significantly reduced fps.
+   * Generally not recommended, but helpful for gaining insights to
+   * fine-tune the low-latency mode, and possibly useful for running
+   * games in the CPU limit.
+   */
+
+  class MinLatencyMode : public FramePacerMode {
+
+  public:
+
+    MinLatencyMode(Mode mode, LatencyMarkersStorage* storage)
+    : FramePacerMode(mode, storage, 0) {}
+
+    ~MinLatencyMode() {}
+
+    // Sleeps until the fps-limit frame time has passed since the previous frame start (capped per call).
+    void startFrame( uint64_t frameId ) override {
+      constexpr int64_t maxDelay = 20000; // upper bound for a single sleep, in microseconds
+      Sleep::TimePoint now = high_resolution_clock::now();
+      // Computed in 64 bits so that a long gap since the last frame cannot wrap around
+      int64_t frametime = std::chrono::duration_cast<std::chrono::microseconds>(
+        now - m_lastStart ).count();
+      int64_t remaining = static_cast<int64_t>( m_fpsLimitFrametime.load() ) - frametime;
+      int64_t delay = std::min<int64_t>( std::max<int64_t>( remaining, 0 ), maxDelay );
+
+      Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay);
+      Sleep::sleepUntil( now, nextStart );
+      m_lastStart = nextStart;
+    }
+
+  private:
+
+    Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
+
+  };
+
+}
diff --git a/src/dxvk/framepacer/dxvk_latency_markers.h b/src/dxvk/framepacer/dxvk_latency_markers.h
new file mode 100644
index 000000000..7658f0737
--- /dev/null
+++ b/src/dxvk/framepacer/dxvk_latency_markers.h
@@ -0,0 +1,148 @@
+#pragma once
+
+#include <atomic>
+#include <dxgi.h>
+#include <vector>
+#include <array>
+#include <assert.h>
+#include "../../util/util_sleep.h"
+#include "../../util/log/log.h"
+#include "../../util/util_string.h"
+
+
+namespace dxvk {
+
+  class FramePacer;
+  class LatencyMarkersStorage;
+
+
+  struct LatencyMarkers {
+    // Timing data recorded for one frame; LatencyMarkersStorage keeps one instance per slot.
+    using time_point = high_resolution_clock::time_point;
+
+    time_point start;  // set in LatencyMarkersStorage::registerFrameStart()
+    time_point end;    // set in LatencyMarkersStorage::registerFrameEnd()
+
+    int32_t csStart;          // NOTE(review): the fields below are presumably all
+    int32_t csFinished;       // microseconds since `start`, as presentFinished is;
+    int32_t cpuFinished;      // only presentFinished is written in this header — confirm
+    int32_t gpuStart;
+    int32_t gpuFinished;
+    int32_t presentFinished;  // microseconds from `start` to registerFrameEnd()
+
+    std::vector<time_point> gpuReady;
+    std::vector<time_point> gpuSubmit;
+    std::vector<time_point> gpuQueueSubmit;
+
+  };
+
+
+  /*
+   * stores which information is accessible for which frame
+   */
+  struct LatencyMarkersTimeline {
+    // NOTE(review): each counter presumably holds the newest frameId that reached that stage — confirm
+    std::atomic<uint64_t> cpuFinished   = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> gpuStart      = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> gpuFinished   = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };  // frameId stored by registerFrameEnd()
+
+  };
+
+
+  class LatencyMarkersReader {
+    // Walks over the markers of recently finished frames stored in a LatencyMarkersStorage.
+  public:
+
+    LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries );
+    bool getNext( const LatencyMarkers*& result );  // false once no further finished frame is available
+
+  private:
+
+    const LatencyMarkersStorage* m_storage;
+    uint64_t m_index;  // frameId handed out by the next getNext(); 0 means nothing to read
+
+  };
+
+
+  class LatencyMarkersStorage {
+    friend class LatencyMarkersReader;
+    friend class FramePacer;
+  public:
+
+    LatencyMarkersStorage() { }
+    ~LatencyMarkersStorage() { }
+
+    // Creates a reader over the most recently finished frames of this storage
+    LatencyMarkersReader getReader( uint32_t numEntries ) const {
+      return LatencyMarkersReader(this, numEntries);
+    }
+
+    // Records the start time of frameId; warns if frameId is not newer than the last finished frame
+    void registerFrameStart( uint64_t frameId ) {
+      if (m_timeline.frameFinished.load() >= frameId) {
+        Logger::warn( str::format("internal error during registerFrameStart: expected frameId=",
+          m_timeline.frameFinished.load()+1, ", got: ", frameId) );
+      }
+      const auto timestamp = high_resolution_clock::now();
+      getMarkers(frameId)->start = timestamp;
+    }
+
+    // Records end time and start-to-end duration of frameId, then marks it as finished
+    void registerFrameEnd( uint64_t frameId ) {
+      if (m_timeline.frameFinished.load() >= frameId) {
+        Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=",
+          m_timeline.frameFinished.load()+1, ", got: ", frameId) );
+      }
+      const auto timestamp = high_resolution_clock::now();
+
+      LatencyMarkers& slot = *getMarkers(frameId);
+      const auto sinceStart = timestamp - slot.start;
+      slot.presentFinished = std::chrono::duration_cast<std::chrono::microseconds>(sinceStart).count();
+      slot.end = timestamp;
+      m_timeline.frameFinished.store(frameId);
+    }
+
+    const LatencyMarkersTimeline* getTimeline() const {
+      return &m_timeline;
+    }
+
+    // Read-only access to the slot frameId maps to
+    const LatencyMarkers* getConstMarkers( uint64_t frameId ) const {
+      return m_markers.data() + (frameId % m_numMarkers);
+    }
+
+  private:
+
+    LatencyMarkers* getMarkers( uint64_t frameId ) {
+      return m_markers.data() + (frameId % m_numMarkers);
+    }
+
+    // Frame ids are mapped to slots by a plain modulo; they are expected to increase by one per frame.
+    // The slot count has to be large enough that the producer never overtakes a reader.
+    static constexpr uint16_t m_numMarkers = 128;
+    std::array<LatencyMarkers, m_numMarkers> m_markers = { };
+    LatencyMarkersTimeline m_timeline;
+
+  };
+
+
+
+  inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries )
+  : m_storage(storage), m_index(0) {
+    // Start numEntries frames before the last finished one; stay at 0 while that frameId is still small
+    const bool warmedUp = m_storage->m_timeline.frameFinished.load() > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2;
+    if (warmedUp) m_index = m_storage->m_timeline.frameFinished.load() - numEntries;
+  }
+
+
+  inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) {
+    // Nothing to hand out for an idle reader (index 0) or once past the last finished frame
+    const bool exhausted = m_index == 0 || m_index > m_storage->m_timeline.frameFinished.load();
+    if (exhausted) return false;
+    result = m_storage->m_markers.data() + (m_index % m_storage->m_numMarkers);
+    ++m_index;
+    return true;
+  }
+
+}
diff --git a/src/dxvk/meson.build b/src/dxvk/meson.build
index 9b2b07356..e5d990543 100644
--- a/src/dxvk/meson.build
+++ b/src/dxvk/meson.build
@@ -120,6 +120,9 @@ dxvk_src = [
   'hud/dxvk_hud_font.cpp',
   'hud/dxvk_hud_item.cpp',
   'hud/dxvk_hud_renderer.cpp',
+
+  'framepacer/dxvk_framepacer.cpp',
+  'framepacer/dxvk_framepacer_mode_low_latency.cpp',
 ]
 
 if platform == 'windows'

From a8bd6f069e1367dd1020a32d16e095fd38115fde Mon Sep 17 00:00:00 2001
From: netborg <137700136+netborg-afps@users.noreply.github.com>
Date: Tue, 18 Feb 2025 21:39:01 +0100
Subject: [PATCH 3/5] [dxvk] Integrate frame pacing

---
 src/d3d11/d3d11_swapchain.cpp |  5 +++++
 src/d3d9/d3d9_swapchain.cpp   |  5 +++++
 src/dxvk/dxvk_context.cpp     |  2 +-
 src/dxvk/dxvk_device.cpp      |  5 +++--
 src/dxvk/dxvk_presenter.cpp   |  9 ++++-----
 src/dxvk/dxvk_queue.cpp       |  9 ++++++++-
 src/util/util_fps_limiter.cpp | 15 +++++++++++++--
 src/util/util_fps_limiter.h   |  6 +++++-
 8 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/src/d3d11/d3d11_swapchain.cpp b/src/d3d11/d3d11_swapchain.cpp
index a2d356067..cb5679854 100644
--- a/src/d3d11/d3d11_swapchain.cpp
+++ b/src/d3d11/d3d11_swapchain.cpp
@@ -3,6 +3,7 @@
 #include "d3d11_swapchain.h"
 
 #include "../dxvk/dxvk_latency_builtin.h"
+#include "../dxvk/framepacer/dxvk_framepacer.h"
 
 #include "../util/util_win32_compat.h"
 
@@ -354,6 +355,10 @@ namespace dxvk {
 
     if (m_presenter != nullptr)
       m_presenter->setFrameRateLimit(m_targetFrameRate, GetActualFrameLatency());
+
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latency.ptr());
+    if (framePacer != nullptr)
+      framePacer->setTargetFrameRate(FrameRate);
   }
 
 
diff --git a/src/d3d9/d3d9_swapchain.cpp b/src/d3d9/d3d9_swapchain.cpp
index 73218c516..05466eca5 100644
--- a/src/d3d9/d3d9_swapchain.cpp
+++ b/src/d3d9/d3d9_swapchain.cpp
@@ -5,6 +5,8 @@
 #include "d3d9_hud.h"
 #include "d3d9_window.h"
 
+#include "../dxvk/framepacer/dxvk_framepacer.h"
+
 namespace dxvk {
 
   static uint16_t MapGammaControlPoint(float x) {
@@ -1112,6 +1114,9 @@ namespace dxvk {
     }
 
     m_wctx->presenter->setFrameRateLimit(frameRate, GetActualFrameLatency());
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latencyTracker.ptr());
+    if (framePacer != nullptr)
+      framePacer->setTargetFrameRate(frameRate);
     m_targetFrameRate = frameRate;
   }
 
diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp
index ec9fa0761..d33938dda 100644
--- a/src/dxvk/dxvk_context.cpp
+++ b/src/dxvk/dxvk_context.cpp
@@ -113,7 +113,7 @@ namespace dxvk {
   void DxvkContext::beginLatencyTracking(
     const Rc<DxvkLatencyTracker>&     tracker,
           uint64_t                    frameId) {
-    if (tracker && (!m_latencyTracker || m_latencyTracker == tracker)) {
+    if (tracker && m_latencyTracker != tracker) {
       tracker->notifyCsRenderBegin(frameId);
 
       m_latencyTracker = tracker;
diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp
index eed3eb3cf..cdb59e6c7 100644
--- a/src/dxvk/dxvk_device.cpp
+++ b/src/dxvk/dxvk_device.cpp
@@ -2,6 +2,7 @@
 #include "dxvk_instance.h"
 #include "dxvk_latency_builtin.h"
 #include "dxvk_latency_reflex.h"
+#include "framepacer/dxvk_framepacer.h"
 
 namespace dxvk {
   
@@ -310,13 +311,13 @@ namespace dxvk {
   Rc<DxvkLatencyTracker> DxvkDevice::createLatencyTracker(
     const Rc<Presenter>&            presenter) {
     if (m_options.latencySleep == Tristate::False)
-      return nullptr;
+      return new FramePacer(m_options);
 
     if (m_options.latencySleep == Tristate::Auto) {
       if (m_features.nvLowLatency2)
         return new DxvkReflexLatencyTrackerNv(presenter);
       else
-        return nullptr;
+        return new FramePacer(m_options);
     }
 
     return new DxvkBuiltInLatencyTracker(presenter,
diff --git a/src/dxvk/dxvk_presenter.cpp b/src/dxvk/dxvk_presenter.cpp
index 0e3c87762..79e10ad66 100644
--- a/src/dxvk/dxvk_presenter.cpp
+++ b/src/dxvk/dxvk_presenter.cpp
@@ -270,7 +270,7 @@ namespace dxvk {
       if (canSignal)
         m_signal->signal(frameId);
     } else {
-      m_fpsLimiter.delay();
+      m_fpsLimiter.delay(tracker);
       m_signal->signal(frameId);
 
       if (tracker)
@@ -1243,15 +1243,14 @@ namespace dxvk {
 
       // Signal latency tracker right away to get more accurate
       // measurements if the frame rate limiter is enabled.
-      if (frame.tracker) {
+      if (frame.tracker)
         frame.tracker->notifyGpuPresentEnd(frame.frameId);
-        frame.tracker = nullptr;
-      }
 
       // Apply FPS limiter here to align it as closely with scanout as we can,
       // and delay signaling the frame latency event to emulate behaviour of a
       // low refresh rate display as closely as we can.
-      m_fpsLimiter.delay();
+      m_fpsLimiter.delay(frame.tracker);
+      frame.tracker = nullptr;
 
       // Wake up any thread that may be waiting for the queue to become empty
       bool canSignal = false;
diff --git a/src/dxvk/dxvk_queue.cpp b/src/dxvk/dxvk_queue.cpp
index 6d2d153b6..0c74428a0 100644
--- a/src/dxvk/dxvk_queue.cpp
+++ b/src/dxvk/dxvk_queue.cpp
@@ -1,5 +1,6 @@
 #include "dxvk_device.h"
 #include "dxvk_queue.h"
+#include "framepacer/dxvk_framepacer.h"
 
 namespace dxvk {
   
@@ -46,6 +47,8 @@ namespace dxvk {
           DxvkSubmitInfo            submitInfo,
           DxvkLatencyInfo           latencyInfo,
           DxvkSubmitStatus*         status) {
+    if (latencyInfo.tracker)
+      latencyInfo.tracker->notifySubmit();
     std::unique_lock<dxvk::mutex> lock(m_mutex);
 
     m_finishCond.wait(lock, [this] {
@@ -66,6 +69,8 @@ namespace dxvk {
           DxvkPresentInfo           presentInfo,
           DxvkLatencyInfo           latencyInfo,
           DxvkSubmitStatus*         status) {
+    if (latencyInfo.tracker)
+      latencyInfo.tracker->notifyPresent(presentInfo.frameId);
     std::unique_lock<dxvk::mutex> lock(m_mutex);
 
     DxvkSubmitEntry entry = { };
@@ -274,7 +279,9 @@ namespace dxvk {
       } else if (entry.present.presenter != nullptr) {
         // Signal the frame and then immediately destroy the reference.
         // This is necessary since the front-end may want to explicitly
-        // destroy the presenter object. 
+        // destroy the presenter object.
+        if (entry.latency.tracker)
+          entry.latency.tracker->notifyGpuPresentBegin(entry.present.frameId);
         entry.present.presenter->signalFrame(entry.present.frameId, entry.latency.tracker);
         entry.present.presenter = nullptr;
       }
diff --git a/src/util/util_fps_limiter.cpp b/src/util/util_fps_limiter.cpp
index 621e9a453..95fb79e7e 100644
--- a/src/util/util_fps_limiter.cpp
+++ b/src/util/util_fps_limiter.cpp
@@ -5,12 +5,15 @@
 #include "util_fps_limiter.h"
 #include "util_sleep.h"
 #include "util_string.h"
+#include "../dxvk/framepacer/dxvk_framepacer.h"
 
 #include "./log/log.h"
 
 using namespace std::chrono_literals;
 
 namespace dxvk {
+
+  std::atomic<bool> FpsLimiter::m_isActive = { false };
   
   FpsLimiter::FpsLimiter() {
     auto override = getEnvironmentOverride();
@@ -48,7 +51,12 @@ namespace dxvk {
   }
 
 
-  void FpsLimiter::delay() {
+  void FpsLimiter::delay(const Rc<DxvkLatencyTracker>& tracker) {
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(tracker.ptr());
+    if (framePacer && framePacer->getMode()) {
+      return;
+    }
+
     std::unique_lock<dxvk::mutex> lock(m_mutex);
     auto interval = m_targetInterval;
     auto latency = m_maxLatency;
@@ -71,8 +79,11 @@ namespace dxvk {
     // that can be written by setTargetFrameRate
     lock.unlock();
 
-    if (t1 < m_nextFrame)
+    m_isActive.store(false);
+    if (t1 < m_nextFrame) {
+      m_isActive.store(true);
       Sleep::sleepUntil(t1, m_nextFrame);
+    }
 
     m_nextFrame = (t1 < m_nextFrame + interval)
       ? m_nextFrame + interval
diff --git a/src/util/util_fps_limiter.h b/src/util/util_fps_limiter.h
index 7c33a559f..d5610afd4 100644
--- a/src/util/util_fps_limiter.h
+++ b/src/util/util_fps_limiter.h
@@ -7,6 +7,8 @@
 #include "util_time.h"
 
 namespace dxvk {
+
+  class DxvkLatencyTracker;
   
   /**
    * \brief Frame rate limiter
@@ -38,7 +40,7 @@ namespace dxvk {
      * and the time since the last call to \ref delay is
      * shorter than the target interval.
      */
-    void delay();
+    void delay(const Rc<DxvkLatencyTracker>& tracker);
 
     /**
      * \brief Queries environment override
@@ -46,6 +48,8 @@ namespace dxvk {
      */
     static std::optional<double> getEnvironmentOverride();
 
+    static std::atomic<bool> m_isActive;
+
   private:
 
     using TimePoint = dxvk::high_resolution_clock::time_point;

From 2c26eb2118eddc8fee7edbd1c408727cf6f077f1 Mon Sep 17 00:00:00 2001
From: netborg <137700136+netborg-afps@users.noreply.github.com>
Date: Tue, 18 Feb 2025 21:47:03 +0100
Subject: [PATCH 4/5] [hud] Add render latency

---
 src/d3d11/d3d11_swapchain.cpp  | 11 ++++++-
 src/d3d11/d3d11_swapchain.h    |  3 +-
 src/d3d9/d3d9_swapchain.cpp    | 11 ++++++-
 src/d3d9/d3d9_swapchain.h      |  5 +--
 src/dxvk/hud/dxvk_hud.h        |  5 +++
 src/dxvk/hud/dxvk_hud_item.cpp | 58 ++++++++++++++++++++++++++++++++++
 src/dxvk/hud/dxvk_hud_item.h   | 45 ++++++++++++++++++++++++++
 7 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/src/d3d11/d3d11_swapchain.cpp b/src/d3d11/d3d11_swapchain.cpp
index cb5679854..e2100eb8d 100644
--- a/src/d3d11/d3d11_swapchain.cpp
+++ b/src/d3d11/d3d11_swapchain.cpp
@@ -295,6 +295,9 @@ namespace dxvk {
     if (m_latencyHud)
       m_latencyHud->accumulateStats(latencyStats);
 
+    if (m_renderLatencyHud)
+      m_renderLatencyHud->updateLatencyTracker(m_latency);
+
     return hr;
   }
 
@@ -604,8 +607,14 @@ namespace dxvk {
     if (hud) {
       hud->addItem<hud::HudClientApiItem>("api", 1, GetApiName());
 
-      if (m_latency)
+      if (m_latency) {
         m_latencyHud = hud->addItem<hud::HudLatencyItem>("latency", 4);
+        FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latency.ptr());
+        if (framePacer) {
+          int32_t fpsItemPos = hud->getItemPos<hud::HudFpsItem>();
+          m_renderLatencyHud = hud->addItem<hud::HudRenderLatencyItem>("renderlatency", fpsItemPos+1);
+        }
+      }
     }
 
     m_blitter = new DxvkSwapchainBlitter(m_device, std::move(hud));
diff --git a/src/d3d11/d3d11_swapchain.h b/src/d3d11/d3d11_swapchain.h
index 99f09450c..6a77c7351 100644
--- a/src/d3d11/d3d11_swapchain.h
+++ b/src/d3d11/d3d11_swapchain.h
@@ -125,7 +125,8 @@ namespace dxvk {
     dxvk::mutex               m_frameStatisticsLock;
     DXGI_VK_FRAME_STATISTICS  m_frameStatistics = { };
 
-    Rc<hud::HudLatencyItem>   m_latencyHud;
+    Rc<hud::HudLatencyItem>       m_latencyHud;
+    Rc<hud::HudRenderLatencyItem> m_renderLatencyHud;
 
     Rc<DxvkImageView> GetBackBufferView();
 
diff --git a/src/d3d9/d3d9_swapchain.cpp b/src/d3d9/d3d9_swapchain.cpp
index 05466eca5..539b0815c 100644
--- a/src/d3d9/d3d9_swapchain.cpp
+++ b/src/d3d9/d3d9_swapchain.cpp
@@ -925,6 +925,9 @@ namespace dxvk {
     if (m_latencyHud)
       m_latencyHud->accumulateStats(latencyStats);
 
+    if (m_renderLatencyHud)
+      m_renderLatencyHud->updateLatencyTracker(m_latencyTracker);
+
     // Rotate swap chain buffers so that the back
     // buffer at index 0 becomes the front buffer.
     for (uint32_t i = 1; i < m_backBuffers.size(); i++)
@@ -1062,8 +1065,14 @@ namespace dxvk {
     if (hud) {
       m_apiHud = hud->addItem<hud::HudClientApiItem>("api", 1, GetApiName());
 
-      if (m_latencyTracking)
+      if (m_latencyTracking) {
         m_latencyHud = hud->addItem<hud::HudLatencyItem>("latency", 4);
+        FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latencyTracker.ptr());
+        if (framePacer) {
+          int32_t fpsItemPos = hud->getItemPos<hud::HudFpsItem>();
+          m_renderLatencyHud = hud->addItem<hud::HudRenderLatencyItem>("renderlatency", fpsItemPos+1);
+        }
+      }
 
       hud->addItem<hud::HudSamplerCount>("samplers", -1, m_parent);
       hud->addItem<hud::HudFixedFunctionShaders>("ffshaders", -1, m_parent);
diff --git a/src/d3d9/d3d9_swapchain.h b/src/d3d9/d3d9_swapchain.h
index 6ea0d96cb..d06c388a9 100644
--- a/src/d3d9/d3d9_swapchain.h
+++ b/src/d3d9/d3d9_swapchain.h
@@ -183,8 +183,9 @@ namespace dxvk {
     bool                      m_latencyTracking = false;
     Rc<DxvkLatencyTracker>    m_latencyTracker = nullptr;
 
-    Rc<hud::HudClientApiItem> m_apiHud;
-    Rc<hud::HudLatencyItem>   m_latencyHud;
+    Rc<hud::HudClientApiItem>     m_apiHud;
+    Rc<hud::HudLatencyItem>       m_latencyHud;
+    Rc<hud::HudRenderLatencyItem> m_renderLatencyHud;
 
     std::optional<VkHdrMetadataEXT> m_hdrMetadata;
     bool m_unlockAdditionalFormats = false;
diff --git a/src/dxvk/hud/dxvk_hud.h b/src/dxvk/hud/dxvk_hud.h
index 58c383f07..388cbf22a 100644
--- a/src/dxvk/hud/dxvk_hud.h
+++ b/src/dxvk/hud/dxvk_hud.h
@@ -59,6 +59,11 @@ namespace dxvk::hud {
     Rc<T> addItem(const char* name, int32_t at, Args... args) {
       return m_hudItems.add<T>(name, at, std::forward<Args>(args)...);
     }
+
+    template<typename T>
+    int32_t getItemPos() {  // forwards to m_hudItems' lookup for an item of type T
+      return m_hudItems.getItemPos<T>();
+    }
     
     /**
      * \brief Creates the HUD
diff --git a/src/dxvk/hud/dxvk_hud_item.cpp b/src/dxvk/hud/dxvk_hud_item.cpp
index f80ba77d4..138f1333f 100644
--- a/src/dxvk/hud/dxvk_hud_item.cpp
+++ b/src/dxvk/hud/dxvk_hud_item.cpp
@@ -1,4 +1,5 @@
 #include "dxvk_hud_item.h"
+#include "../framepacer/dxvk_framepacer.h"
 
 #include <hud_chunk_frag_background.h>
 #include <hud_chunk_frag_visualize.h>
@@ -213,6 +214,63 @@ namespace dxvk::hud {
   }
 
 
+  HudRenderLatencyItem::HudRenderLatencyItem() { }   // no setup needed: m_lastUpdate has an in-class initializer
+  HudRenderLatencyItem::~HudRenderLatencyItem() { }
+
+  void HudRenderLatencyItem::update(dxvk::high_resolution_clock::time_point time) {
+    // Latency cannot be measured while fps-limiting is performed in Presenter::runFrameThread(),
+    // because that interferes with getting the right timestamp from vkWaitForPresent().
+    // Truly measuring it would require one additional thread.
+    if (FpsLimiter::m_isActive) {
+      m_latency = "N/A";
+      return;
+    }
+
+    const Rc<DxvkLatencyTracker> tracker = m_tracker;
+    const FramePacer* framePacer = dynamic_cast<FramePacer*>( tracker.ptr() );
+    if (framePacer == nullptr)
+      return;
+
+    // Refresh the text at most once per UpdateInterval
+    const auto sinceLastUpdate = std::chrono::duration_cast<std::chrono::microseconds>(time - m_lastUpdate);
+    if (sinceLastUpdate.count() < UpdateInterval)
+      return;
+
+    m_lastUpdate = time;
+    // Average presentFinished (microseconds) over the frames the reader yields
+    LatencyMarkersReader reader = framePacer->m_latencyMarkersStorage.getReader(100);
+    const LatencyMarkers* markers;
+    uint32_t numFrames = 0;
+    uint64_t latencySum = 0;
+    for (; reader.getNext(markers); ++numFrames)
+      latencySum += markers->presentFinished;
+
+    if (numFrames == 0)
+      return;
+
+    // Shown in milliseconds with one decimal digit
+    const uint64_t avgLatency = latencySum / numFrames;
+    m_latency = str::format(avgLatency / 1000, ".", (avgLatency/100) % 10, " ms");
+  }
+
+
+  HudPos HudRenderLatencyItem::render(
+    const DxvkContextObjects& ctx,
+    const HudPipelineKey&     key,
+    const HudOptions&         options,
+          HudRenderer&        renderer,
+          HudPos              position) {
+    // Label first, then the value 195 units to the right on the same text line
+    position.y += 12;
+    const HudPos valuePos = { position.x + 195, position.y };
+    renderer.drawText(16, position, 0xff4040ffu, "Render latency:");
+    renderer.drawText(16, valuePos, 0xffffffffu, m_latency);
+
+    position.y += 8;
+    return position;
+  }
+
+
   HudFrameTimeItem::HudFrameTimeItem(const Rc<DxvkDevice>& device, HudRenderer* renderer)
   : m_device            (device),
     m_gfxSetLayout      (createDescriptorSetLayout()),
diff --git a/src/dxvk/hud/dxvk_hud_item.h b/src/dxvk/hud/dxvk_hud_item.h
index 866849b4e..fab3fc97f 100644
--- a/src/dxvk/hud/dxvk_hud_item.h
+++ b/src/dxvk/hud/dxvk_hud_item.h
@@ -131,6 +131,15 @@ namespace dxvk::hud {
       return value;
     }
 
+    template<typename T>
+    int32_t getItemPos() {
+      // Index of the first item that dynamic_casts to T, or -1 if there is none
+      const int count = static_cast<int>(m_items.size());
+      for (int pos = 0; pos < count; ++pos)
+        if (dynamic_cast<T*>(m_items[pos].ptr())) return pos;
+      return -1;
+    }
+
   private:
 
     bool                                          m_enableFull = false;
@@ -244,6 +253,42 @@ namespace dxvk::hud {
   };
 
 
+  /**
+   * \brief HUD item to display render latency
+   */
+  class HudRenderLatencyItem : public HudItem {
+    constexpr static int64_t UpdateInterval = 500'000;  // microseconds between refreshes of the text
+  public:
+
+    HudRenderLatencyItem();
+
+    ~HudRenderLatencyItem();
+    // Sets the tracker update() reads from; update() ignores it unless it is a FramePacer
+    void updateLatencyTracker( const Rc<DxvkLatencyTracker>& tracker ) {
+      m_tracker = tracker;
+    }
+
+    void update(dxvk::high_resolution_clock::time_point time);
+
+    HudPos render(
+      const DxvkContextObjects& ctx,
+      const HudPipelineKey&     key,
+      const HudOptions&         options,
+            HudRenderer&        renderer,
+            HudPos              position);
+
+  private:
+
+    Rc<DxvkLatencyTracker> m_tracker;  // last tracker passed to updateLatencyTracker()
+
+    dxvk::high_resolution_clock::time_point m_lastUpdate
+      = dxvk::high_resolution_clock::now();
+
+    std::string m_latency;  // text drawn by render(); "N/A" while FpsLimiter::m_isActive is set
+
+  };
+
+
   /**
    * \brief HUD item to display the frame rate
    */

From 8e2a509eb6711afe20f2a5426ca5b111add82373 Mon Sep 17 00:00:00 2001
From: netborg <137700136+netborg-afps@users.noreply.github.com>
Date: Wed, 19 Feb 2025 15:47:45 +0100
Subject: [PATCH 5/5] Revert "[dxvk] Fix lack of forward progress guarantee in
 presenter"

This reverts commit efeb15edbd6dc913030f0846cbc1b587f6fb7c5d because it broke the ordering guarantee that notifyGpuPresentEnd happens after notifyGpuPresentBegin, which in turn led to wrong latency measurements in case vkWaitForPresent was skipped.
---
 src/dxvk/dxvk_presenter.cpp | 61 ++++++++++++++-----------------------
 src/dxvk/dxvk_presenter.h   |  1 -
 2 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/src/dxvk/dxvk_presenter.cpp b/src/dxvk/dxvk_presenter.cpp
index 79e10ad66..3297d14a0 100644
--- a/src/dxvk/dxvk_presenter.cpp
+++ b/src/dxvk/dxvk_presenter.cpp
@@ -259,16 +259,9 @@ namespace dxvk {
       return;
 
     if (m_device->features().khrPresentWait.presentWait) {
-      bool canSignal = false;
-
-      { std::unique_lock lock(m_frameMutex);
-
-        m_lastSignaled = frameId;
-        canSignal = m_lastCompleted >= frameId;
-      }
-
-      if (canSignal)
-        m_signal->signal(frameId);
+      std::lock_guard lock(m_frameMutex);
+      m_lastSignaled = frameId;
+      m_frameCond.notify_one();
     } else {
       m_fpsLimiter.delay(tracker);
       m_signal->signal(frameId);
@@ -1210,26 +1203,25 @@ namespace dxvk {
   void Presenter::runFrameThread() {
     env::setThreadName("dxvk-frame");
 
-    while (true) {
-      PresenterFrame frame = { };
+    std::unique_lock lock(m_frameMutex);
 
+    while (true) {
       // Wait for all GPU work for this frame to complete in order to maintain
       // ordering guarantees of the frame signal w.r.t. objects being released
-      { std::unique_lock lock(m_frameMutex);
+      m_frameCond.wait(lock, [this] {
+        return !m_frameQueue.empty() && m_frameQueue.front().frameId <= m_lastSignaled;
+      });
 
-        m_frameCond.wait(lock, [this] {
-          return !m_frameQueue.empty();
-        });
+      // Use a frame ID of 0 as an exit condition
+      PresenterFrame frame = m_frameQueue.front();
 
-        // Use a frame ID of 0 as an exit condition
-        frame = m_frameQueue.front();
-
-        if (!frame.frameId) {
-          m_frameQueue.pop();
-          return;
-        }
+      if (!frame.frameId) {
+        m_frameQueue.pop();
+        return;
       }
 
+      lock.unlock();
+
       // If the present operation has succeeded, actually wait for it to complete.
       // Don't bother with it on MAILBOX / IMMEDIATE modes since doing so would
       // restrict us to the display refresh rate on some platforms (XWayland).
@@ -1246,28 +1238,21 @@ namespace dxvk {
       if (frame.tracker)
         frame.tracker->notifyGpuPresentEnd(frame.frameId);
 
-      // Apply FPS limiter here to align it as closely with scanout as we can,
+      // Apply FPS limiter here to align it as closely with scanout as we can,
       // and delay signaling the frame latency event to emulate behaviour of a
       // low refresh rate display as closely as we can.
       m_fpsLimiter.delay(frame.tracker);
       frame.tracker = nullptr;
 
-      // Wake up any thread that may be waiting for the queue to become empty
-      bool canSignal = false;
-
-      { std::unique_lock lock(m_frameMutex);
-
-        m_frameQueue.pop();
-        m_frameDrain.notify_one();
-
-        m_lastCompleted = frame.frameId;
-        canSignal = m_lastSignaled >= frame.frameId;
-      }
-
       // Always signal even on error, since failures here
       // are transparent to the front-end.
-      if (canSignal)
-        m_signal->signal(frame.frameId);
+      m_signal->signal(frame.frameId);
+
+      // Wake up any thread that may be waiting for the queue to become empty
+      lock.lock();
+
+      m_frameQueue.pop();
+      m_frameDrain.notify_one();
     }
   }
 
diff --git a/src/dxvk/dxvk_presenter.h b/src/dxvk/dxvk_presenter.h
index 8e403b244..afbe465c3 100644
--- a/src/dxvk/dxvk_presenter.h
+++ b/src/dxvk/dxvk_presenter.h
@@ -315,7 +315,6 @@ namespace dxvk {
     std::queue<PresenterFrame>  m_frameQueue;
 
     uint64_t                    m_lastSignaled = 0u;
-    uint64_t                    m_lastCompleted = 0u;
 
     alignas(CACHE_LINE_SIZE)
     FpsLimiter                  m_fpsLimiter;