Merge 8e2a509eb6711afe20f2a5426ca5b111add82373 into c04410ca00f33162d0875bc8500d3f8185bc73df

2025-03-14 04:29:15 +01:00 · 2025-02-28 20:52:05 +07:00 · 2025-02-28 20:52:05 +07:00 · 46aeecc4e4
commit 46aeecc4e4
parent c04410ca00 8e2a509eb6
26 changed files with 1144 additions and 59 deletions
--- a/dxvk.conf
+++ b/dxvk.conf
@ -18,6 +18,51 @@
 # dxgi.enableHDR = True


+# Frame pacing mode managing CPU-GPU synchronization.
+# Defaults to "low-latency" in the draft-PR for demonstration purposes.
+#
+# "max-frame-latency" provides stable latency in the GPU-limit as long as
+# GPU render times are stable. Latency generally is higher but offers great
+# visual smoothness.
+#
+# "low-latency" provides lower latency in the GPU-limit and can be fine-tuned
+# via dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap.
+#
+# "min-latency" possibly provides the lowest latency (low-latency can be
+# quicker in some situations), and offers less fps in the GPU-limit
+# due to stalling the GPU between frames. Generally not recommended,
+# but helpful to get insights to fine-tune the low-latency mode and
+# possibly is useful for running games in the CPU-limit.
+#
+# The "low-latency" and "min-latency" modes also provide their own fps limiter,
+# which is enabled via the common frame rate variables (e.g. DXVK_FRAME_RATE).
+#
+# Supported values: "max-frame-latency", "low-latency", "min-latency"
+
+# dxvk.framePace = ""
+
+
+# Allows fine-tuning the low-latency frame pacing mode.
+# Positive values make a frame begin later which might improve responsiveness,
+# although only very slightly, but may be relevant for edge cases.
+# Negative values make a frame begin earlier which might improve fps.
+# Values are given in microseconds. Defaults to 0.
+#
+# Supported values: -10000 to 10000
+
+# dxvk.lowLatencyOffset = 0
+
+
+# Determines whether a frame is allowed to begin before finishing processing
+# the cpu-part of the previous one, when low-latency frame pacing is used.
+# Snappiness may be improved when disallowing overlap. On the other hand, this
+# might also decrease fps in certain cases. Defaults to True.
+#
+# Supported values: True, False
+
+# dxvk.lowLatencyAllowCpuFramesOverlap = True
+
+
 # Expose support for dcomp swap chains with a dummy window.
 #
 # This is not a valid implementation of DirectComposition swapchains,
@ -104,8 +149,13 @@
 #         The implementation will either use VK_NV_low_latency2 if supported
 #         by the driver, or a custom algorithm.
 # - False: Disable Reflex support as well as built-in latency reduction.
+#         This build defaults to False to enable dxvk.framePace. You need to
+#         enable Reflex manually (Auto) until we support switching back and
+#         forth between Reflex and the low-latency frame pacing - for example
+#         via the ingame options - and more critically we want to enable
+#         low-latency frame pacing if the game doesn't support Reflex.
  
-# dxvk.latencySleep = Auto
+# dxvk.latencySleep = False


 # Tolerance for the latency sleep heuristic, in microseconds. Higher values
--- a/src/d3d11/d3d11_swapchain.cpp
+++ b/src/d3d11/d3d11_swapchain.cpp
@ -3,6 +3,7 @@
 #include "d3d11_swapchain.h"

 #include "../dxvk/dxvk_latency_builtin.h"
+#include "../dxvk/framepacer/dxvk_framepacer.h"

 #include "../util/util_win32_compat.h"

@ -294,6 +295,9 @@ namespace dxvk {
    if (m_latencyHud)
      m_latencyHud->accumulateStats(latencyStats);

+    if (m_renderLatencyHud)
+      m_renderLatencyHud->updateLatencyTracker(m_latency);
+
    return hr;
  }

@ -354,6 +358,10 @@ namespace dxvk {

    if (m_presenter != nullptr)
      m_presenter->setFrameRateLimit(m_targetFrameRate, GetActualFrameLatency());
+
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latency.ptr());
+    if (framePacer != nullptr)
+      framePacer->setTargetFrameRate(FrameRate);
  }


@ -599,8 +607,14 @@ namespace dxvk {
    if (hud) {
      hud->addItem<hud::HudClientApiItem>("api", 1, GetApiName());

-      if (m_latency)
+      if (m_latency) {
        m_latencyHud = hud->addItem<hud::HudLatencyItem>("latency", 4);
+        FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latency.ptr());
+        if (framePacer) {
+          int32_t fpsItemPos = hud->getItemPos<hud::HudFpsItem>();
+          m_renderLatencyHud = hud->addItem<hud::HudRenderLatencyItem>("renderlatency", fpsItemPos+1);
+        }
+      }
    }

    m_blitter = new DxvkSwapchainBlitter(m_device, std::move(hud));
--- a/src/d3d11/d3d11_swapchain.h
+++ b/src/d3d11/d3d11_swapchain.h
@ -125,7 +125,8 @@ namespace dxvk {
    dxvk::mutex               m_frameStatisticsLock;
    DXGI_VK_FRAME_STATISTICS  m_frameStatistics = { };

-    Rc<hud::HudLatencyItem>   m_latencyHud;
+    Rc<hud::HudLatencyItem>       m_latencyHud;
+    Rc<hud::HudRenderLatencyItem> m_renderLatencyHud;

    Rc<DxvkImageView> GetBackBufferView();

--- a/src/d3d9/d3d9_swapchain.cpp
+++ b/src/d3d9/d3d9_swapchain.cpp
@ -5,6 +5,8 @@
 #include "d3d9_hud.h"
 #include "d3d9_window.h"

+#include "../dxvk/framepacer/dxvk_framepacer.h"
+
 namespace dxvk {

  static uint16_t MapGammaControlPoint(float x) {
@ -923,6 +925,9 @@ namespace dxvk {
    if (m_latencyHud)
      m_latencyHud->accumulateStats(latencyStats);

+    if (m_renderLatencyHud)
+      m_renderLatencyHud->updateLatencyTracker(m_latencyTracker);
+
    // Rotate swap chain buffers so that the back
    // buffer at index 0 becomes the front buffer.
    for (uint32_t i = 1; i < m_backBuffers.size(); i++)
@ -1060,8 +1065,14 @@ namespace dxvk {
    if (hud) {
      m_apiHud = hud->addItem<hud::HudClientApiItem>("api", 1, GetApiName());

-      if (m_latencyTracking)
+      if (m_latencyTracking) {
        m_latencyHud = hud->addItem<hud::HudLatencyItem>("latency", 4);
+        FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latencyTracker.ptr());
+        if (framePacer) {
+          int32_t fpsItemPos = hud->getItemPos<hud::HudFpsItem>();
+          m_renderLatencyHud = hud->addItem<hud::HudRenderLatencyItem>("renderlatency", fpsItemPos+1);
+        }
+      }

      hud->addItem<hud::HudSamplerCount>("samplers", -1, m_parent);
      hud->addItem<hud::HudFixedFunctionShaders>("ffshaders", -1, m_parent);
@ -1112,6 +1123,9 @@ namespace dxvk {
    }

    m_wctx->presenter->setFrameRateLimit(frameRate, GetActualFrameLatency());
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(m_latencyTracker.ptr());
+    if (framePacer != nullptr)
+      framePacer->setTargetFrameRate(frameRate);
    m_targetFrameRate = frameRate;
  }

--- a/src/d3d9/d3d9_swapchain.h
+++ b/src/d3d9/d3d9_swapchain.h
@ -183,8 +183,9 @@ namespace dxvk {
    bool                      m_latencyTracking = false;
    Rc<DxvkLatencyTracker>    m_latencyTracker = nullptr;

-    Rc<hud::HudClientApiItem> m_apiHud;
-    Rc<hud::HudLatencyItem>   m_latencyHud;
+    Rc<hud::HudClientApiItem>     m_apiHud;
+    Rc<hud::HudLatencyItem>       m_latencyHud;
+    Rc<hud::HudRenderLatencyItem> m_renderLatencyHud;

    std::optional<VkHdrMetadataEXT> m_hdrMetadata;
    bool m_unlockAdditionalFormats = false;
--- a/src/dxvk/dxvk_context.cpp
+++ b/src/dxvk/dxvk_context.cpp
@ -110,7 +110,7 @@ namespace dxvk {
  void DxvkContext::beginLatencyTracking(
    const Rc<DxvkLatencyTracker>&     tracker,
          uint64_t                    frameId) {
-    if (tracker && (!m_latencyTracker || m_latencyTracker == tracker)) {
+    if (tracker && m_latencyTracker != tracker) {
      tracker->notifyCsRenderBegin(frameId);

      m_latencyTracker = tracker;
--- a/src/dxvk/dxvk_device.cpp
+++ b/src/dxvk/dxvk_device.cpp
@ -2,6 +2,7 @@
 #include "dxvk_instance.h"
 #include "dxvk_latency_builtin.h"
 #include "dxvk_latency_reflex.h"
+#include "framepacer/dxvk_framepacer.h"

 namespace dxvk {
  
@ -310,13 +311,13 @@ namespace dxvk {
  Rc<DxvkLatencyTracker> DxvkDevice::createLatencyTracker(
    const Rc<Presenter>&            presenter) {
    if (m_options.latencySleep == Tristate::False)
-      return nullptr;
+      return new FramePacer(m_options);

    if (m_options.latencySleep == Tristate::Auto) {
      if (m_features.nvLowLatency2)
        return new DxvkReflexLatencyTrackerNv(presenter);
      else
-        return nullptr;
+        return new FramePacer(m_options);
    }

    return new DxvkBuiltInLatencyTracker(presenter,
--- a/src/dxvk/dxvk_latency.h
+++ b/src/dxvk/dxvk_latency.h
@ -128,6 +128,10 @@ namespace dxvk {
    virtual void notifyCpuPresentEnd(
            uint64_t                  frameId) = 0;

+    virtual void notifySubmit() { }
+    virtual void notifyPresent(
+            uint64_t                  frameId) { }
+
    /**
     * \brief Called when a command list is submitted to the GPU
     *
@ -174,6 +178,9 @@ namespace dxvk {
    virtual void notifyGpuExecutionEnd(
            uint64_t                  frameId) = 0;

+    virtual void notifyGpuPresentBegin(
+            uint64_t                  frameId) { }
+
    /**
     * \brief Called when presentation of a given frame finishes on the GPU
     *
--- a/src/dxvk/dxvk_options.cpp
+++ b/src/dxvk/dxvk_options.cpp
@ -12,12 +12,16 @@ namespace dxvk {
    useRawSsbo            = config.getOption<Tristate>("dxvk.useRawSsbo",             Tristate::Auto);
    hud                   = config.getOption<std::string>("dxvk.hud", "");
    tearFree              = config.getOption<Tristate>("dxvk.tearFree",               Tristate::Auto);
-    latencySleep          = config.getOption<Tristate>("dxvk.latencySleep",           Tristate::Auto);
+    latencySleep          = config.getOption<Tristate>("dxvk.latencySleep",           Tristate::False);
    latencyTolerance      = config.getOption<int32_t> ("dxvk.latencyTolerance",       1000);
    disableNvLowLatency2  = config.getOption<Tristate>("dxvk.disableNvLowLatency2",   Tristate::Auto);
    hideIntegratedGraphics = config.getOption<bool>   ("dxvk.hideIntegratedGraphics", false);
    zeroMappedMemory      = config.getOption<bool>    ("dxvk.zeroMappedMemory",       false);
    allowFse              = config.getOption<bool>    ("dxvk.allowFse",               false);
+    framePace             = config.getOption<std::string>("dxvk.framePace",           "");
+    lowLatencyOffset      = config.getOption<int32_t> ("dxvk.lowLatencyOffset",       0);
+    lowLatencyAllowCpuFramesOverlap
+                          = config.getOption<bool>    ("dxvk.lowLatencyAllowCpuFramesOverlap", true);
    deviceFilter          = config.getOption<std::string>("dxvk.deviceFilter",        "");
    tilerMode             = config.getOption<Tristate>("dxvk.tilerMode",              Tristate::Auto);
  }
--- a/src/dxvk/dxvk_options.h
+++ b/src/dxvk/dxvk_options.h
@ -38,7 +38,9 @@ namespace dxvk {
    Tristate tearFree = Tristate::Auto;

    /// Enables latency sleep
-    Tristate latencySleep = Tristate::Auto;
+    /// Defaults to false in this build to activate the FramePacer,
+    /// especially for the case when the game doesn't support Reflex
+    Tristate latencySleep = Tristate::False;

    /// Latency tolerance, in microseconds
    int32_t latencyTolerance = 0u;
@ -61,6 +63,18 @@ namespace dxvk {
    /// Whether to enable tiler optimizations
    Tristate tilerMode = Tristate::Auto;

+    /// Frame pacing
+    std::string framePace;
+
+    /// A value in microseconds to fine-tune the low-latency frame pacing.
+    /// Positive values make a frame begin later which might improve responsiveness.
+    /// Negative values make a frame begin earlier which might improve fps.
+    int32_t lowLatencyOffset;
+
+    /// Determines whether a frame is allowed to begin before finishing processing
+    /// the cpu-part of the previous one, when low-latency frame pacing is used.
+    bool lowLatencyAllowCpuFramesOverlap;
+
    // Device name
    std::string deviceFilter;
  };
--- a/src/dxvk/dxvk_presenter.cpp
+++ b/src/dxvk/dxvk_presenter.cpp
@ -259,18 +259,11 @@ namespace dxvk {
      return;

    if (m_device->features().khrPresentWait.presentWait) {
-      bool canSignal = false;
-
-      { std::unique_lock lock(m_frameMutex);
-
-        m_lastSignaled = frameId;
-        canSignal = m_lastCompleted >= frameId;
-      }
-
-      if (canSignal)
-        m_signal->signal(frameId);
+      std::lock_guard lock(m_frameMutex);
+      m_lastSignaled = frameId;
+      m_frameCond.notify_one();
    } else {
-      m_fpsLimiter.delay();
+      m_fpsLimiter.delay(tracker);
      m_signal->signal(frameId);

      if (tracker)
@ -1210,26 +1203,25 @@ namespace dxvk {
  void Presenter::runFrameThread() {
    env::setThreadName("dxvk-frame");

-    while (true) {
-      PresenterFrame frame = { };
+    std::unique_lock lock(m_frameMutex);

+    while (true) {
      // Wait for all GPU work for this frame to complete in order to maintain
      // ordering guarantees of the frame signal w.r.t. objects being released
-      { std::unique_lock lock(m_frameMutex);
+      m_frameCond.wait(lock, [this] {
+        return !m_frameQueue.empty() && m_frameQueue.front().frameId <= m_lastSignaled;
+      });

-        m_frameCond.wait(lock, [this] {
-          return !m_frameQueue.empty();
-        });
+      // Use a frame ID of 0 as an exit condition
+      PresenterFrame frame = m_frameQueue.front();

-        // Use a frame ID of 0 as an exit condition
-        frame = m_frameQueue.front();
-
-        if (!frame.frameId) {
-          m_frameQueue.pop();
-          return;
-        }
+      if (!frame.frameId) {
+        m_frameQueue.pop();
+        return;
      }

+      lock.unlock();
+
      // If the present operation has succeeded, actually wait for it to complete.
      // Don't bother with it on MAILBOX / IMMEDIATE modes since doing so would
      // restrict us to the display refresh rate on some platforms (XWayland).
@ -1243,32 +1235,24 @@ namespace dxvk {

      // Signal latency tracker right away to get more accurate
      // measurements if the frame rate limiter is enabled.
-      if (frame.tracker) {
+      if (frame.tracker)
        frame.tracker->notifyGpuPresentEnd(frame.frameId);
-        frame.tracker = nullptr;
-      }

-      // Apply FPS limiter here to align it as closely with scanout as we can,
+      // Apply FPS limiter here to align it as closely with scanout as we can,
      // and delay signaling the frame latency event to emulate behaviour of a
      // low refresh rate display as closely as we can.
-      m_fpsLimiter.delay();
-
-      // Wake up any thread that may be waiting for the queue to become empty
-      bool canSignal = false;
-
-      { std::unique_lock lock(m_frameMutex);
-
-        m_frameQueue.pop();
-        m_frameDrain.notify_one();
-
-        m_lastCompleted = frame.frameId;
-        canSignal = m_lastSignaled >= frame.frameId;
-      }
+      m_fpsLimiter.delay(frame.tracker);
+      frame.tracker = nullptr;

      // Always signal even on error, since failures here
      // are transparent to the front-end.
-      if (canSignal)
-        m_signal->signal(frame.frameId);
+      m_signal->signal(frame.frameId);
+
+      // Wake up any thread that may be waiting for the queue to become empty
+      lock.lock();
+
+      m_frameQueue.pop();
+      m_frameDrain.notify_one();
    }
  }

--- a/src/dxvk/dxvk_presenter.h
+++ b/src/dxvk/dxvk_presenter.h
@ -315,7 +315,6 @@ namespace dxvk {
    std::queue<PresenterFrame>  m_frameQueue;

    uint64_t                    m_lastSignaled = 0u;
-    uint64_t                    m_lastCompleted = 0u;

    alignas(CACHE_LINE_SIZE)
    FpsLimiter                  m_fpsLimiter;
--- a/src/dxvk/dxvk_queue.cpp
+++ b/src/dxvk/dxvk_queue.cpp
@ -1,5 +1,6 @@
 #include "dxvk_device.h"
 #include "dxvk_queue.h"
+#include "framepacer/dxvk_framepacer.h"

 namespace dxvk {
  
@ -46,6 +47,8 @@ namespace dxvk {
          DxvkSubmitInfo            submitInfo,
          DxvkLatencyInfo           latencyInfo,
          DxvkSubmitStatus*         status) {
+    if (latencyInfo.tracker)
+      latencyInfo.tracker->notifySubmit();
    std::unique_lock<dxvk::mutex> lock(m_mutex);

    m_finishCond.wait(lock, [this] {
@ -66,6 +69,8 @@ namespace dxvk {
          DxvkPresentInfo           presentInfo,
          DxvkLatencyInfo           latencyInfo,
          DxvkSubmitStatus*         status) {
+    if (latencyInfo.tracker)
+      latencyInfo.tracker->notifyPresent(presentInfo.frameId);
    std::unique_lock<dxvk::mutex> lock(m_mutex);

    DxvkSubmitEntry entry = { };
@ -274,7 +279,9 @@ namespace dxvk {
      } else if (entry.present.presenter != nullptr) {
        // Signal the frame and then immediately destroy the reference.
        // This is necessary since the front-end may want to explicitly
-        // destroy the presenter object. 
+        // destroy the presenter object.
+        if (entry.latency.tracker)
+          entry.latency.tracker->notifyGpuPresentBegin(entry.present.frameId);
        entry.present.presenter->signalFrame(entry.present.frameId, entry.latency.tracker);
        entry.present.presenter = nullptr;
      }
--- a/src/dxvk/framepacer/dxvk_framepacer.cpp
+++ b/src/dxvk/framepacer/dxvk_framepacer.cpp
@ -0,0 +1,64 @@
+#include "dxvk_framepacer.h"
+#include "dxvk_framepacer_mode_low_latency.h"
+#include "dxvk_framepacer_mode_min_latency.h"
+#include "dxvk_options.h"
+#include "../../util/util_env.h"
+#include "../../util/log/log.h"
+
+namespace dxvk {
+
+
+  FramePacer::FramePacer( const DxvkOptions& options ) {
+    // Selects the first known mode name contained in the given string.
+    // Leaves the result untouched and returns false if no name matches.
+    auto parseMode = [] (const std::string& str, FramePacerMode::Mode& result) {
+      if (str.find("max-frame-latency") != std::string::npos)
+        result = FramePacerMode::MAX_FRAME_LATENCY;
+      else if (str.find("low-latency") != std::string::npos)
+        result = FramePacerMode::LOW_LATENCY;
+      else if (str.find("min-latency") != std::string::npos)
+        result = FramePacerMode::MIN_LATENCY;
+      else
+        return false;
+
+      return true;
+    };
+
+    // LOW_LATENCY is the fallback in the draft-PR for now, for demonstration
+    // purposes, highlighting the generally much better input lag and medium-term
+    // time consistency. MAX_FRAME_LATENCY has advantages in many games and is
+    // likely the better default, for its higher fps throughput and lower
+    // susceptibility to short-term time inconsistencies. Which mode is
+    // smoother depends on the game.
+    FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY;
+
+    // The DXVK_FRAME_PACE environment variable takes precedence; the config
+    // option is only consulted if the variable doesn't name a known mode.
+    if (!parseMode(env::getEnvVar("DXVK_FRAME_PACE"), mode))
+      parseMode(options.framePace, mode);
+
+    switch (mode) {
+      case FramePacerMode::MAX_FRAME_LATENCY: {
+        Logger::info( "Frame pace: max-frame-latency" );
+        m_mode = std::make_unique<FramePacerMode>(mode, &m_latencyMarkersStorage);
+      } break;
+
+      case FramePacerMode::LOW_LATENCY: {
+        Logger::info( "Frame pace: low-latency" );
+        m_mode = std::make_unique<LowLatencyMode>(mode, &m_latencyMarkersStorage, options);
+      } break;
+
+      case FramePacerMode::MIN_LATENCY: {
+        Logger::info( "Frame pace: min-latency" );
+        m_mode = std::make_unique<MinLatencyMode>(mode, &m_latencyMarkersStorage);
+      } break;
+    }
+
+    for (auto& slot : m_gpuStarts)
+      slot.store(0);
+
+    // seed a gpuReady timestamp, so that every frame consistently
+    // has a gpuReady event from the previous frame
+    LatencyMarkers* seed = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1);
+    seed->gpuReady.push_back(high_resolution_clock::now());
+  }
+
+
+  // Intentionally empty: no explicit cleanup is performed here.
+  FramePacer::~FramePacer() {}
+
+}
--- a/src/dxvk/framepacer/dxvk_framepacer.h
+++ b/src/dxvk/framepacer/dxvk_framepacer.h
@ -0,0 +1,191 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+#include "dxvk_latency_markers.h"
+#include "../dxvk_latency.h"
+#include "../../util/util_time.h"
+#include <dxgi.h>
+
+
+namespace dxvk {
+
+  struct DxvkOptions;
+
+  /** \brief Frame pacer interface managing the CPU - GPU synchronization.
+   *
+   * GPUs render frames asynchronously to the game's and dxvk's CPU-side work
+   * in order to improve fps-throughput. Aligning the cpu work to chosen time-
+   * points allows to tune certain characteristics of the video presentation,
+   * like smoothness and latency.
+   *
+   * The implemented DxvkLatencyTracker callbacks record timestamps in
+   * m_latencyMarkersStorage and forward events to the selected FramePacerMode.
+   */
+
+  class FramePacer : public DxvkLatencyTracker {
+    using microseconds = std::chrono::microseconds;
+  public:
+
+    FramePacer( const DxvkOptions& options );
+    ~FramePacer();
+
+    // Blocks according to the active mode, then registers the start of the frame.
+    // maxFrameRate is not used by this implementation.
+    void sleepAndBeginFrame(
+            uint64_t                  frameId,
+            double                    maxFrameRate) override {
+      // wait for finished rendering of a previous frame, typically the one before last
+      m_mode->waitRenderFinished(frameId);
+      // potentially wait some more if the cpu gets too much ahead
+      m_mode->startFrame(frameId);
+      m_latencyMarkersStorage.registerFrameStart(frameId);
+      // reset the gpu-start bookkeeping slot used by this frame
+      m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0);
+    }
+
+    void notifyGpuPresentEnd( uint64_t frameId ) override {
+      // the frame has been displayed to the screen
+      m_latencyMarkersStorage.registerFrameEnd(frameId);
+      m_mode->endFrame(frameId);
+    }
+
+    // Stores csStart in microseconds relative to the frame's m->start marker.
+    void notifyCsRenderBegin( uint64_t frameId ) override {
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->csStart = std::chrono::duration_cast<microseconds>(now - m->start).count();
+    }
+
+    // Stores csFinished in microseconds relative to m->start and signals the mode.
+    void notifyCsRenderEnd( uint64_t frameId ) override {
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->csFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
+      m_mode->signalCsFinished( frameId );
+    }
+
+    // No frame ID is passed here: the submission timestamp is attributed to
+    // the frame following the one last seen by notifyPresent().
+    void notifySubmit() override {
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1);
+      m->gpuSubmit.push_back(high_resolution_clock::now());
+    }
+
+    void notifyPresent( uint64_t frameId ) override {
+      // dx to vk translation is finished
+      if (frameId != 0) {
+        auto now = high_resolution_clock::now();
+        m_lastSubmitFrameId = frameId;
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuSubmit.push_back(now);
+        m->cpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
+        // the next frame starts with an empty list of submit timestamps
+        next->gpuSubmit.clear();
+
+        m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId);
+      }
+    }
+
+    // Records the queue submission time and checks whether it marks the gpu start.
+    void notifyQueueSubmit( uint64_t frameId ) override {
+      assert( frameId == m_lastQueueSubmitFrameId + 1 );
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+      m->gpuQueueSubmit.push_back(now);
+      queueSubmitCheckGpuStart(frameId, m, now);
+    }
+
+    // Like notifyQueueSubmit(), but additionally advances m_lastQueueSubmitFrameId
+    // and clears the next frame's list of queue submit timestamps.
+    void notifyQueuePresentBegin( uint64_t frameId ) override {
+      if (frameId != 0) {
+        auto now = high_resolution_clock::now();
+        m_lastQueueSubmitFrameId = frameId;
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuQueueSubmit.push_back(now);
+        next->gpuQueueSubmit.clear();
+        queueSubmitCheckGpuStart(frameId, m, now);
+      }
+    }
+
+    // The markers are looked up via m_lastFinishedFrameId+1, which the
+    // assertion expects to be equal to frameId.
+    void notifyGpuExecutionBegin( uint64_t frameId ) override {
+      assert( frameId == m_lastFinishedFrameId+1 );
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
+      gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now());
+    }
+
+    // frameId is not used here: the timestamp is attributed to the frame
+    // following the one last seen by notifyGpuPresentBegin().
+    void notifyGpuExecutionEnd( uint64_t frameId ) override {
+      auto now = high_resolution_clock::now();
+      LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
+      m->gpuReady.push_back(now);
+    }
+
+    virtual void notifyGpuPresentBegin( uint64_t frameId ) override {
+      // we get frameId == 0 for repeated presents (SyncInterval)
+      if (frameId != 0) {
+        m_lastFinishedFrameId = frameId;
+        auto now = high_resolution_clock::now();
+
+        LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
+        LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
+        m->gpuReady.push_back(now);
+        m->gpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
+        // the next frame's gpuReady list starts with this frame's finish time
+        next->gpuReady.clear();
+        next->gpuReady.push_back(now);
+
+        gpuExecutionCheckGpuStart(frameId, m, now);
+
+        m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId);
+        m_mode->finishRender(frameId);
+        m_mode->signalRenderFinished(frameId);
+      }
+    }
+
+    FramePacerMode::Mode getMode() const {
+      return m_mode->m_mode;
+    }
+
+    // Forwards the frame rate to the active mode's setTargetFrameRate().
+    void setTargetFrameRate( double frameRate ) {
+      m_mode->setTargetFrameRate(frameRate);
+    }
+
+    bool needsAutoMarkers() override {
+      return true;
+    }
+
+    LatencyMarkersStorage m_latencyMarkersStorage;
+
+
+    // not implemented methods
+
+
+    void notifyCpuPresentBegin( uint64_t frameId) override { }
+    void notifyCpuPresentEnd( uint64_t frameId ) override { }
+    void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { }
+    void discardTimings() override { }
+    DxvkLatencyStats getStatistics( uint64_t frameId ) override
+      { return DxvkLatencyStats(); }
+
+  private:
+
+    // Stores gpuStart in microseconds relative to m->start, publishes the
+    // frame ID on the timeline and signals the mode.
+    void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      m->gpuStart = std::chrono::duration_cast<microseconds>(t - m->start).count();
+      m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId);
+      m_mode->signalGpuStart(frameId);
+    }
+
+    // fetch_or returns the previous value: if exactly gpuReadyBit was set before,
+    // both bits are set now and this queue submit is the later of the two events,
+    // so its timestamp t is taken as the gpu start.
+    void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
+      uint16_t val = gpuStart.fetch_or(queueSubmitBit);
+      if (val == gpuReadyBit)
+        signalGpuStart( frameId, m, t );
+    }
+
+    // Counterpart of queueSubmitCheckGpuStart(): signals the gpu start if
+    // exactly queueSubmitBit was set before setting gpuReadyBit.
+    void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
+      auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
+      uint16_t val = gpuStart.fetch_or(gpuReadyBit);
+      if (val == queueSubmitBit)
+        signalGpuStart( frameId, m, t );
+    }
+
+    std::unique_ptr<FramePacerMode> m_mode;
+
+    // NOTE(review): the initial values presumably correspond to the frame ID
+    // preceding the first real frame - confirm against the callers.
+    uint64_t m_lastSubmitFrameId      = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    uint64_t m_lastFinishedFrameId    = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+
+    // Bit sets of queueSubmitBit / gpuReadyBit, indexed by frameId % size
+    // and reset to 0 in sleepAndBeginFrame().
+    std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { };
+    static constexpr uint16_t queueSubmitBit = 1;
+    static constexpr uint16_t gpuReadyBit    = 2;
+
+  };
+
+}
--- a/src/dxvk/framepacer/dxvk_framepacer_mode.h
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h
@ -0,0 +1,117 @@
+#pragma once
+
+#include "dxvk_latency_markers.h"
+#include "../../util/sync/sync_signal.h"
+#include "../../util/util_env.h"
+#include <dxgi.h>
+
+namespace dxvk {
+
+  /**
+   * \brief Abstract frame pacer mode in order to support different strategies of synchronization.
+   */
+
+  class FramePacerMode {
+
+  public:
+
+    // Unscoped enum: MAX_FRAME_LATENCY is 0, so `if (m_mode)` below is
+    // true for LOW_LATENCY and MIN_LATENCY only.
+    enum Mode {
+      MAX_FRAME_LATENCY = 0,
+      LOW_LATENCY,
+      MIN_LATENCY
+    };
+
+    // m_waitLatency is set to maxFrameLatency+1. The constructor also reads
+    // the fps limit from the environment, see setFpsLimitFrametimeFromEnv().
+    FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 )
+    : m_mode( mode ),
+      m_waitLatency( maxFrameLatency+1 ),
+      m_latencyMarkersStorage( markerStorage ) {
+      setFpsLimitFrametimeFromEnv();
+    }
+
+    virtual ~FramePacerMode() { }
+
+    // Hooks for derived modes, no-ops in this class.
+    virtual void startFrame( uint64_t frameId ) { }
+    virtual void endFrame( uint64_t frameId ) { }
+
+    virtual void finishRender( uint64_t frameId ) { }
+
+    // The fence operations below are skipped in MAX_FRAME_LATENCY mode (m_mode == 0).
+
+    // Waits until frame (frameId - m_waitLatency) has been signaled as finished.
+    void waitRenderFinished( uint64_t frameId ) {
+      if (m_mode) m_fenceGpuFinished.wait(frameId-m_waitLatency); }
+
+    void signalRenderFinished( uint64_t frameId ) {
+      if (m_mode) m_fenceGpuFinished.signal(frameId); }
+
+    void signalGpuStart( uint64_t frameId ) {
+      if (m_mode) m_fenceGpuStart.signal(frameId); }
+
+    void signalCsFinished( uint64_t frameId ) {
+      if (m_mode) m_fenceCsFinished.signal(frameId); }
+
+    // Stores the frame time in microseconds (1'000'000/frameRate). The request
+    // is ignored if the fps limit was taken from the environment or if
+    // frameRate is not greater than 1.0.
+    void setTargetFrameRate( double frameRate ) {
+      if (!m_fpsLimitEnvOverride && frameRate > 1.0)
+        m_fpsLimitFrametime.store( 1'000'000/frameRate );
+    }
+
+    const Mode m_mode;
+
+    // Environment helpers: return false if the variable is empty or cannot be parsed.
+    static bool getDoubleFromEnv( const char* name, double* result );
+    static bool getIntFromEnv( const char* name, int* result );
+
+  protected:
+
+    void setFpsLimitFrametimeFromEnv();
+
+    const uint32_t m_waitLatency;
+    LatencyMarkersStorage* m_latencyMarkersStorage;
+    // frame time of the fps limit in microseconds, initially 0
+    std::atomic<int32_t> m_fpsLimitFrametime = { 0 };
+    // set when DXVK_FRAME_RATE could be parsed, see setFpsLimitFrametimeFromEnv()
+    bool m_fpsLimitEnvOverride = { false };
+
+    // NOTE(review): the initial fence values presumably relate to the first
+    // frame ID in use - confirm against the swap chain implementations.
+    sync::Fence m_fenceGpuStart    = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
+    sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
+    sync::Fence m_fenceCsFinished  = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) };
+
+  };
+
+
+
+  /**
+   * \brief Reads a floating point value from an environment variable
+   *
+   * \param [in] name Name of the environment variable
+   * \param [out] result Receives the parsed value, left untouched on failure
+   * \returns \c true if the variable is non-empty and could be parsed
+   */
+  inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) {
+    std::string env = env::getEnvVar(name);
+    if (env.empty())
+      return false;
+
+    try {
+      *result = std::stod(env);
+      return true;
+    } catch (const std::logic_error&) {
+      // std::stod throws std::invalid_argument for malformed input and
+      // std::out_of_range for values that cannot be represented. Both derive
+      // from std::logic_error; neither may escape and terminate the process.
+      return false;
+    }
+  }
+
+
+  /**
+   * \brief Reads an integer value from an environment variable
+   *
+   * \param [in] name Name of the environment variable
+   * \param [out] result Receives the parsed value, left untouched on failure
+   * \returns \c true if the variable is non-empty and could be parsed
+   */
+  inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) {
+    std::string env = env::getEnvVar(name);
+    if (env.empty())
+      return false;
+
+    try {
+      *result = std::stoi(env);
+      return true;
+    } catch (const std::logic_error&) {
+      // std::stoi throws std::invalid_argument for malformed input and
+      // std::out_of_range for values that don't fit into an int. Both derive
+      // from std::logic_error; neither may escape and terminate the process.
+      return false;
+    }
+  }
+
+
+  /**
+   * \brief Initializes the fps limit from the DXVK_FRAME_RATE environment variable
+   *
+   * If the variable could be parsed, m_fpsLimitEnvOverride is set, which makes
+   * setTargetFrameRate() ignore later requests. Values below 1.0 (and NaN)
+   * leave m_fpsLimitFrametime at its initial value.
+   */
+  inline void FramePacerMode::setFpsLimitFrametimeFromEnv() {
+    double fpsLimit = 0.0;
+    if (!getDoubleFromEnv("DXVK_FRAME_RATE", &fpsLimit))
+      return;
+
+    m_fpsLimitEnvOverride = true;
+
+    // Written as a negated >= so that NaN is rejected as well: std::stod
+    // accepts "nan", and converting NaN to an integer is undefined behavior.
+    if (!(fpsLimit >= 1.0))
+      return;
+
+    // frame time in microseconds
+    m_fpsLimitFrametime = 1'000'000/fpsLimit;
+  }
+
+}
--- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
@ -0,0 +1,43 @@
+#include "dxvk_framepacer_mode_low_latency.h"
+
+namespace dxvk {
+
+
+  // Reads DXVK_LOW_LATENCY_OFFSET into offset and returns
+  // whether FramePacerMode::getIntFromEnv() succeeded.
+  bool getLowLatencyOffsetFromEnv( int32_t& offset ) {
+    return FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset);
+  }
+
+
+  // Reads DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP as an integer, where any
+  // non-zero value means true. allowOverlap is only written on success.
+  bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) {
+    int32_t value;
+    const bool found = FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &value);
+
+    if (found)
+      allowOverlap = value != 0;
+
+    return found;
+  }
+
+
+  // Returns the low-latency offset: the environment variable wins over the
+  // config option, and the result is limited to the range -10000 to 10000.
+  int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) {
+    int32_t envOffset;
+    const int32_t requested = getLowLatencyOffsetFromEnv(envOffset)
+      ? envOffset
+      : options.lowLatencyOffset;
+
+    return std::min( 10000, std::max( -10000, requested ) );
+  }
+
+
+  // Returns whether cpu frames may overlap: the environment
+  // variable, if it could be read, wins over the config option.
+  bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) {
+    bool envValue;
+    if (getLowLatencyAllowCpuFramesOverlapFromEnv(envValue))
+      return envValue;
+
+    return options.lowLatencyAllowCpuFramesOverlap;
+  }
+
+
+}
--- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
@ -0,0 +1,255 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+#include "../dxvk_options.h"
+#include "../../util/log/log.h"
+#include "../../util/util_string.h"
+#include <assert.h>
+
+namespace dxvk {
+
+  /*
+   * This low-latency mode aims to reduce latency with minimal impact in fps.
+   * Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well.
+   *
+   * Greatly reduces input lag variations when switching between CPU- and GPU-limit, and
+   * compared to the max-frame-latency approach, it has a much more stable input lag when
+   * GPU running times change dramatically, which can happen for example when rotating within a scene.
+   *
+   * The current implementation rather generates fluctuations alternating frame-by-frame
+   * depending on the game's and dxvk's CPU-time variations. This might be visible as a loss
+   * in smoothness, which is an area this implementation can be further improved. Unsuitable
+   * smoothing however might degrade input-lag feel, so it's not implemented for now, but
+   * more advanced smoothing techniques will be investigated in the future.
+   * In some situations however, this low-latency pacing actually improves smoothing though,
+   * it will depend on the game.
+   *
+   * An interesting observation while playtesting was that not only the input lag was affected,
+   * but the video generated did progress more cleanly in time as well with regards to
+   * medium-term time consistency, in other words, the video playback speed remained more steady.
+   *
+   * Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter
+   * which is typically used to prevent the game's fps exceeding the monitor's refresh rate,
+   * and which is tightly integrated into the pacing logic.
+   *
+   * Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap
+   * variables (or their respective environment variables)
+   * Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved.
+   */
+
+  class LowLatencyMode : public FramePacerMode {
+    using microseconds = std::chrono::microseconds;
+    using time_point = high_resolution_clock::time_point;
+  public:
+
+    /**
+     * \brief Creates the low-latency pacing mode
+     *
+     * The offset and the cpu-overlap flag are resolved once from the
+     * given options and their environment overrides, then logged.
+     */
+    LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options)
+    : FramePacerMode(mode, storage),
+      m_lowLatencyOffset(getLowLatencyOffset(options)),
+      m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) {
+      Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) );
+      Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) );
+    }
+
+    ~LowLatencyMode() {}
+
+
+    /**
+     * \brief Delays the start of a frame
+     *
+     * Waits on the fences of the previous frame and then sleeps for a
+     * delay derived from the measurements of recently finished frames.
+     * The fps limit is applied as part of that sleep.
+     * \param [in] frameId Id of the frame that is about to start
+     */
+    void startFrame( uint64_t frameId ) override {
+      using std::chrono::duration_cast;
+
+      if (!m_allowCpuFramesOverlap)
+        m_fenceCsFinished.wait( frameId-1 );
+
+      m_fenceGpuStart.wait( frameId-1 );
+
+      time_point now = high_resolution_clock::now();
+      uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load();
+
+      // the timeline counters start at DXGI_MAX_SWAP_CHAIN_BUFFERS,
+      // so this skips pacing for the very first frames
+      if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull)
+        return;
+
+      if (finishedId == frameId-1) {
+        // we are the only in-flight frame, nothing to do other than to apply fps-limiter if needed
+        m_lastStart = sleepFor( now, 0 );
+        return;
+      }
+
+      if (finishedId != frameId-2) {
+        Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=",
+          frameId-2, ", got: ", finishedId) );
+      }
+
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1);
+
+      // estimate the target gpu sync point for this frame
+      // and calculate backwards when we want to start this frame
+
+      const SyncProps props = getSyncPrediction();
+
+      // microseconds from now until the previous frame is predicted to finish on the gpu
+      int32_t gpuReadyPrediction = duration_cast<microseconds>(
+        m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count();
+
+      int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync;
+      int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset;
+
+      m_lastStart = sleepFor( now, delay );
+
+    }
+
+
+    /**
+     * \brief Evaluates the markers of a rendered frame
+     *
+     * Derives the sync properties of the frame from its gpu markers and
+     * stores them in the ring of recent frames, then publishes the frame
+     * id through m_propsFinished.
+     * \param [in] frameId Id of the frame that finished rendering
+     */
+    void finishRender( uint64_t frameId ) override {
+
+      using std::chrono::duration_cast;
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
+
+      const int32_t numLoop = static_cast<int32_t>(m->gpuReady.size())-1;
+
+      // the code below indexes gpuSubmit and gpuQueueSubmit up to numLoop-1.
+      // Frames with too few markers cannot be evaluated and are recorded as
+      // outliers instead of reading past the end of those vectors.
+      if (numLoop <= 1
+       || m->gpuSubmit.size()      < static_cast<size_t>(numLoop)
+       || m->gpuQueueSubmit.size() < static_cast<size_t>(numLoop)) {
+        SyncProps& skipped = m_props[frameId % m_props.size()];
+        skipped = SyncProps();
+        skipped.isOutlier = true;
+        m_propsFinished.store( frameId );
+        return;
+      }
+
+      // estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first
+      // such that the gpu doesn't go into idle for this frame, and then aligning cpu submits
+      // where gpuSubmit[i] <= gpuRun[i] for all i
+
+      std::vector<int32_t>& gpuRun = m_tempGpuRun;
+      std::vector<int32_t>& gpuRunDurations = m_tempGpuRunDurations;
+      gpuRun.clear();
+      gpuRunDurations.clear();
+      int32_t optimizedGpuTime = 0;
+      gpuRun.push_back(optimizedGpuTime);
+
+      // gpuRun[i] is the start of interval i if all intervals ran back to back
+      for (int i=0; i<numLoop; ++i) {
+        time_point _gpuRun = std::max( m->gpuReady[i], m->gpuQueueSubmit[i] );
+        int32_t duration = duration_cast<microseconds>( m->gpuReady[i+1] - _gpuRun ).count();
+        optimizedGpuTime += duration;
+        gpuRun.push_back(optimizedGpuTime);
+        gpuRunDurations.push_back(duration);
+      }
+
+      int32_t alignment = duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count()
+        - gpuRun[numLoop-1];
+
+      // walk backwards and accumulate how much earlier submits would
+      // have to be shifted so that none is later than its gpuRun slot
+      int32_t offset = 0;
+      for (int i=numLoop-2; i>=0; --i) {
+        int32_t curSubmit = duration_cast<microseconds>( m->gpuSubmit[i] - m->gpuSubmit[0] ).count();
+        int32_t diff = curSubmit - gpuRun[i] - alignment;
+        diff = std::max( 0, diff );
+        offset += diff;
+        alignment += diff;
+      }
+
+
+      SyncProps& props = m_props[frameId % m_props.size()];
+      props.gpuSync = gpuRun[numLoop-1];
+      props.cpuUntilGpuSync = offset + duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->start ).count();
+      props.optimizedGpuTime = optimizedGpuTime;
+      props.isOutlier = isOutlier(frameId);
+
+      m_propsFinished.store( frameId );
+
+    }
+
+
+    /**
+     * \brief Sleeps for the given delay, honoring the fps limit
+     *
+     * The delay is raised to what the fps limit requires relative to
+     * m_lastStart, and limited to the range of 0 to 20000 microseconds.
+     * \param [in] t Current time
+     * \param [in] delay Requested delay in microseconds
+     * \returns The time point that was slept until
+     */
+    Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) {
+
+      // account for the fps limit and ensure we won't sleep too long, just in case
+      int32_t frametime = std::chrono::duration_cast<microseconds>( t - m_lastStart ).count();
+      int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
+      delay = std::max( delay, frametimeDiff );
+      delay = std::max( 0, std::min( delay, 20000 ) );
+
+      Sleep::TimePoint nextStart = t + microseconds(delay);
+      Sleep::sleepUntil( t, nextStart );
+      return nextStart;
+
+    }
+
+
+  private:
+
+    // per-frame results of finishRender, all durations in microseconds.
+    // Slots that were never filled count as outliers so that they are
+    // skipped when looking for a prediction.
+    struct SyncProps {
+      int32_t optimizedGpuTime  = 0;      // gpu executing packed submits in one go
+      int32_t gpuSync           = 0;      // us after gpuStart
+      int32_t cpuUntilGpuSync   = 0;
+      bool    isOutlier         = true;
+    };
+
+
+    // Returns the properties of the most recent frame, out of the last
+    // seven, that is not an outlier. Falls back to the latest frame if all
+    // of them are outliers, and to zeroed values while too few frames exist.
+    SyncProps getSyncPrediction() {
+      // in the future we might use more samples to get a prediction
+      // however, simple averaging gives a slightly artificial mouse input
+      // more advanced methods will be investigated
+      SyncProps res = {};
+      uint64_t id = m_propsFinished;
+      if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
+        return res;
+
+      for (size_t i=0; i<7; ++i) {
+        const SyncProps& props = m_props[ (id-i) % m_props.size() ];
+        if (!props.isOutlier) {
+          id = id-i;
+          break;
+        }
+      }
+
+      return m_props[ id % m_props.size() ];
+    }
+
+
+    // Predicts the time in microseconds between gpu start and gpu finish,
+    // based on the most recent non-outlier frame out of the last seven.
+    // Falls back to the measured gpuFinished - gpuStart if markers are missing.
+    int32_t getGpuStartToFinishPrediction() {
+      uint64_t id = m_propsFinished;
+      if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
+        return 0;
+
+      for (size_t i=0; i<7; ++i) {
+        const SyncProps& props = m_props[ (id-i) % m_props.size() ];
+        if (!props.isOutlier) {
+          const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i);
+          if (m->gpuReady.empty() || m->gpuSubmit.empty())
+            return m->gpuFinished - m->gpuStart;
+
+          time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] );
+          return std::chrono::duration_cast<microseconds>( t - m->start ).count()
+            + props.optimizedGpuTime
+            - m->gpuStart;
+        }
+      }
+
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id);
+      return m->gpuFinished - m->gpuStart;
+    }
+
+
+    // A frame is an outlier if its cpuFinished marker exceeds 1.7 times the
+    // average over this and the six preceding frames, or if its gpu markers
+    // are missing or do not match up.
+    bool isOutlier( uint64_t frameId ) {
+      constexpr size_t numLoop = 7;
+      int32_t totalCpuTime = 0;
+      for (size_t i=0; i<numLoop; ++i) {
+        const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i);
+        totalCpuTime += m->cpuFinished;
+      }
+
+      // divide as signed integers; dividing by a size_t would convert the sum to unsigned
+      int32_t avgCpuTime = totalCpuTime / static_cast<int32_t>(numLoop);
+      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
+      if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) )
+        return true;
+
+      return false;
+    }
+
+
+    int32_t getLowLatencyOffset( const DxvkOptions& options );
+    bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options );
+
+    const int32_t m_lowLatencyOffset;
+    const bool    m_allowCpuFramesOverlap;
+
+    Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
+
+    // ring of per-frame results, indexed by frameId modulo its size
+    std::array<SyncProps, 16> m_props = { };
+    // id of the latest frame whose entry in m_props is complete
+    std::atomic<uint64_t> m_propsFinished = { 0 };
+
+    // scratch buffers reused by finishRender to avoid reallocating per frame;
+    // m_tempGpuRunDurations is filled but not read within this class
+    std::vector<int32_t>  m_tempGpuRun;
+    std::vector<int32_t>  m_tempGpuRunDurations;
+
+  };
+
+}
--- a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
+++ b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
@ -0,0 +1,45 @@
+#pragma once
+
+#include "dxvk_framepacer_mode.h"
+
+namespace dxvk {
+
+  /*
+   * Minimal latency is achieved here by waiting for the previous
+   * frame to complete, which results in very much reduced fps.
+   * Generally not recommended, but helpful to get insights to fine-tune
+   * the low-latency mode, and possibly is useful for running games
+   * in the cpu limit.
+   */
+
+  class MinLatencyMode : public FramePacerMode {
+
+  public:
+
+    MinLatencyMode(Mode mode, LatencyMarkersStorage* storage)
+    : FramePacerMode(mode, storage, 0) {}
+
+    ~MinLatencyMode() {}
+
+    // Only applies the fps limit: sleeps until the configured frametime
+    // has passed since the previous start, for at most 20000 microseconds.
+    void startFrame( uint64_t frameId ) override {
+      using std::chrono::microseconds;
+
+      const Sleep::TimePoint frameBegin = high_resolution_clock::now();
+      const int32_t sinceLastStart = std::chrono::duration_cast<microseconds>(
+        frameBegin - m_lastStart ).count();
+
+      int32_t waitUs = m_fpsLimitFrametime.load() - sinceLastStart;
+      if (waitUs < 0)
+        waitUs = 0;
+      if (waitUs > 20000)
+        waitUs = 20000;
+
+      const Sleep::TimePoint wakeUp = frameBegin + microseconds(waitUs);
+      Sleep::sleepUntil( frameBegin, wakeUp );
+      m_lastStart = wakeUp;
+    }
+
+  private:
+
+    // time at which the previous frame was allowed to start
+    Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
+
+  };
+
+}
--- a/src/dxvk/framepacer/dxvk_latency_markers.h
+++ b/src/dxvk/framepacer/dxvk_latency_markers.h
@ -0,0 +1,148 @@
+#pragma once
+
+#include <atomic>
+#include <dxgi.h>
+#include <vector>
+#include <array>
+#include <assert.h>
+#include "../../util/util_sleep.h"
+#include "../../util/log/log.h"
+#include "../../util/util_string.h"
+
+
+namespace dxvk {
+
+  class FramePacer;
+  class LatencyMarkersStorage;
+
+
+  /*
+   * timing record of a single frame
+   */
+  struct LatencyMarkers {
+
+    using time_point = high_resolution_clock::time_point;
+
+    // absolute time stamps, written by LatencyMarkersStorage::registerFrameStart
+    // and LatencyMarkersStorage::registerFrameEnd respectively
+    time_point start;
+    time_point end;
+
+    // integer markers. presentFinished is stored as microseconds since start;
+    // the others are presumably microseconds since start as well - TODO confirm
+    // against the code that writes them
+    int32_t csStart;
+    int32_t csFinished;
+    int32_t cpuFinished;
+    int32_t gpuStart;
+    int32_t gpuFinished;
+    int32_t presentFinished;
+
+    // absolute time stamps, one entry per submission within the frame.
+    // NOTE(review): consumers expect gpuReady to hold one entry more than gpuSubmit
+    std::vector<time_point> gpuReady;
+    std::vector<time_point> gpuSubmit;
+    std::vector<time_point> gpuQueueSubmit;
+
+  };
+
+
+  /*
+   * stores which information is accessible for which frame.
+   * Each counter holds a frame id and starts at DXGI_MAX_SWAP_CHAIN_BUFFERS.
+   * frameFinished is set to the frame id passed to registerFrameEnd;
+   * the other counters are presumably advanced the same way once the
+   * respective marker of a frame was written - TODO confirm against the writer
+   */
+  struct LatencyMarkersTimeline {
+
+    std::atomic<uint64_t> cpuFinished   = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> gpuStart      = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> gpuFinished   = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+    std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
+
+  };
+
+
+  /*
+   * forward iterator over the markers of recently finished frames
+   */
+  class LatencyMarkersReader {
+
+  public:
+
+    // positions the reader numEntries frames behind the last finished frame,
+    // or leaves it without entries if not enough frames have finished yet
+    LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries );
+
+    // stores the next markers in result and returns true,
+    // or returns false once all finished frames were visited
+    bool getNext( const LatencyMarkers*& result );
+
+  private:
+
+    const LatencyMarkersStorage* m_storage;
+    // frame id returned by the next call to getNext, 0 if there is nothing to read
+    uint64_t m_index;
+
+  };
+
+
+  class LatencyMarkersStorage {
+    friend class LatencyMarkersReader;
+    friend class FramePacer;
+  public:
+
+    LatencyMarkersStorage() { }
+    ~LatencyMarkersStorage() { }
+
+    // creates a reader over the most recently finished frames
+    LatencyMarkersReader getReader( uint32_t numEntries ) const {
+      return LatencyMarkersReader{ this, numEntries };
+    }
+
+    // records the start time of the given frame
+    void registerFrameStart( uint64_t frameId ) {
+      const bool isNewFrame = frameId > m_timeline.frameFinished.load();
+      if (!isNewFrame) {
+        Logger::warn( str::format("internal error during registerFrameStart: expected frameId=",
+          m_timeline.frameFinished.load()+1, ", got: ", frameId) );
+      }
+
+      const auto startTime = high_resolution_clock::now();
+      getMarkers(frameId)->start = startTime;
+    }
+
+    // records the end time of the given frame and marks the frame as finished
+    void registerFrameEnd( uint64_t frameId ) {
+      using std::chrono::duration_cast;
+      using std::chrono::microseconds;
+
+      const bool isNewFrame = frameId > m_timeline.frameFinished.load();
+      if (!isNewFrame) {
+        Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=",
+          m_timeline.frameFinished.load()+1, ", got: ", frameId) );
+      }
+
+      const auto endTime = high_resolution_clock::now();
+
+      LatencyMarkers& entry = *getMarkers(frameId);
+      entry.presentFinished = duration_cast<microseconds>(endTime - entry.start).count();
+      entry.end = endTime;
+
+      // publish only after the markers above were written
+      m_timeline.frameFinished.store(frameId);
+    }
+
+    const LatencyMarkersTimeline* getTimeline() const {
+      return &m_timeline;
+    }
+
+    // read-only access to the slot the given frame id maps to
+    const LatencyMarkers* getConstMarkers( uint64_t frameId ) const {
+      const uint64_t slot = frameId % m_numMarkers;
+      return &m_markers[slot];
+    }
+
+
+  private:
+
+    // writable access to the slot the given frame id maps to
+    LatencyMarkers* getMarkers( uint64_t frameId ) {
+      const uint64_t slot = frameId % m_numMarkers;
+      return &m_markers[slot];
+    }
+
+    // frame ids are mapped to slots by a plain modulo and are expected to increase by one per frame.
+    // The number of slots has to be large enough that a reader always keeps up with the producer.
+    static constexpr uint16_t m_numMarkers = 128;
+    std::array<LatencyMarkers, m_numMarkers> m_markers = { };
+    LatencyMarkersTimeline m_timeline;
+
+  };
+
+
+
+  // Starts numEntries frames behind the last finished frame. If not enough
+  // frames have finished yet, the index stays 0 and getNext yields nothing.
+  inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries )
+  : m_storage(storage),
+    m_index(0) {
+    const bool enoughFrames =
+      m_storage->m_timeline.frameFinished.load() > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2;
+
+    if (enoughFrames)
+      m_index = m_storage->m_timeline.frameFinished.load() - numEntries;
+  }
+
+
+  // Hands out the markers at the current index and advances by one frame.
+  // Returns false, leaving result untouched, if the reader has no entries
+  // or has moved past the last finished frame.
+  inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) {
+    const bool hasEntry = m_index != 0
+      && m_index <= m_storage->m_timeline.frameFinished.load();
+
+    if (hasEntry) {
+      result = &m_storage->m_markers[m_index % m_storage->m_numMarkers];
+      ++m_index;
+    }
+
+    return hasEntry;
+  }
+
+}
--- a/src/dxvk/hud/dxvk_hud.h
+++ b/src/dxvk/hud/dxvk_hud.h
@ -59,6 +59,11 @@ namespace dxvk::hud {
    Rc<T> addItem(const char* name, int32_t at, Args... args) {
      return m_hudItems.add<T>(name, at, std::forward<Args>(args)...);
    }
+
+    /**
+     * \brief Looks up the position of a HUD item by type
+     * \returns Whatever the item set reports for type \c T
+     */
+    template<typename T>
+    int32_t getItemPos() {
+      const int32_t pos = m_hudItems.getItemPos<T>();
+      return pos;
+    }
    
    /**
     * \brief Creates the HUD
--- a/src/dxvk/hud/dxvk_hud_item.cpp
+++ b/src/dxvk/hud/dxvk_hud_item.cpp
@ -1,4 +1,5 @@
 #include "dxvk_hud_item.h"
+#include "../framepacer/dxvk_framepacer.h"

 #include <hud_chunk_frag_background.h>
 #include <hud_chunk_frag_visualize.h>
@ -213,6 +214,63 @@ namespace dxvk::hud {
  }


+  // nothing to set up or tear down; all members have default initializers
+  HudRenderLatencyItem::HudRenderLatencyItem() { }
+  HudRenderLatencyItem::~HudRenderLatencyItem() { }
+
+  /**
+   * Refreshes the latency text once per UpdateInterval with the average
+   * presentFinished value of the most recently finished frames.
+   * Shows "N/A" while the fps limiter is sleeping, and leaves the text
+   * unchanged if the tracker is not a FramePacer or no frames are available.
+   */
+  void HudRenderLatencyItem::update(dxvk::high_resolution_clock::time_point time) {
+    // we cannot measure latency when fps-limiting is performed in Presenter::runFrameThread()
+    // because it's interfering with getting the right timestamp from vkWaitForPresent()
+    // if we truly wanted to measure it, we would need one additional thread
+    if (FpsLimiter::m_isActive) {
+      m_latency = "N/A";
+      return;
+    }
+
+    // hold our own reference while reading from the tracker
+    const Rc<DxvkLatencyTracker> tracker = m_tracker;
+    const FramePacer* framePacer = dynamic_cast<FramePacer*>( tracker.ptr() );
+    if (framePacer == nullptr)
+      return;
+
+    const auto sinceUpdate = std::chrono::duration_cast<std::chrono::microseconds>(time - m_lastUpdate);
+    if (sinceUpdate.count() < UpdateInterval)
+      return;
+
+    m_lastUpdate = time;
+
+    LatencyMarkersReader reader = framePacer->m_latencyMarkersStorage.getReader(100);
+    uint64_t latencySum = 0;
+    uint32_t numFrames = 0;
+    for (const LatencyMarkers* entry; reader.getNext(entry); ++numFrames)
+      latencySum += entry->presentFinished;
+
+    if (numFrames == 0)
+      return;
+
+    // average in microseconds, printed as milliseconds with one decimal
+    const uint64_t avgLatency = latencySum / numFrames;
+    m_latency = str::format(avgLatency / 1000, ".", (avgLatency/100) % 10, " ms");
+  }
+
+
+  /**
+   * Draws the label and the current latency text on one line,
+   * and returns the position for the next item.
+   */
+  HudPos HudRenderLatencyItem::render(
+    const DxvkContextObjects& ctx,
+    const HudPipelineKey&     key,
+    const HudOptions&         options,
+          HudRenderer&        renderer,
+          HudPos              position) {
+    constexpr uint32_t labelColor = 0xff4040ffu;
+    constexpr uint32_t valueColor = 0xffffffffu;
+
+    position.y += 12;
+
+    const HudPos valuePos = { position.x + 195, position.y };
+    renderer.drawText(16, position, labelColor, "Render latency:");
+    renderer.drawText(16, valuePos, valueColor, m_latency);
+
+    position.y += 8;
+    return position;
+  }
+
+
  HudFrameTimeItem::HudFrameTimeItem(const Rc<DxvkDevice>& device, HudRenderer* renderer)
  : m_device            (device),
    m_gfxSetLayout      (createDescriptorSetLayout()),
--- a/src/dxvk/hud/dxvk_hud_item.h
+++ b/src/dxvk/hud/dxvk_hud_item.h
@ -131,6 +131,15 @@ namespace dxvk::hud {
      return value;
    }

+    /**
+     * \brief Finds the first item of a given type
+     * \returns Index of the first item that can be cast to \c T, or -1
+     */
+    template<typename T>
+    int32_t getItemPos() {
+      const int itemCount = (int)m_items.size();
+
+      int pos = 0;
+      while (pos < itemCount) {
+        if (dynamic_cast<T*>(m_items[pos].ptr()) != nullptr)
+          return pos;
+        ++pos;
+      }
+
+      return -1;
+    }
+
  private:

    bool                                          m_enableFull = false;
@ -244,6 +253,42 @@ namespace dxvk::hud {
  };


+  /**
+   * \brief HUD item to display render latency
+   *
+   * Shows a latency text that is refreshed by \ref update
+   * from the latency tracker set via \ref updateLatencyTracker.
+   */
+  class HudRenderLatencyItem : public HudItem {
+    // minimum time between two refreshes of the text, in microseconds
+    constexpr static int64_t UpdateInterval = 500'000;
+  public:
+
+    HudRenderLatencyItem();
+
+    ~HudRenderLatencyItem();
+
+    /**
+     * \brief Sets the latency tracker to read from
+     * \param [in] tracker New tracker, replaces the previous one
+     */
+    void updateLatencyTracker( const Rc<DxvkLatencyTracker>& tracker ) {
+      m_tracker = tracker;
+    }
+
+    void update(dxvk::high_resolution_clock::time_point time);
+
+    HudPos render(
+      const DxvkContextObjects& ctx,
+      const HudPipelineKey&     key,
+      const HudOptions&         options,
+            HudRenderer&        renderer,
+            HudPos              position);
+
+  private:
+
+    Rc<DxvkLatencyTracker> m_tracker;
+
+    // time of the last refresh of m_latency
+    dxvk::high_resolution_clock::time_point m_lastUpdate
+      = dxvk::high_resolution_clock::now();
+
+    // text drawn next to the label; empty until the first refresh
+    std::string m_latency;
+
+  };
+
+
  /**
   * \brief HUD item to display the frame rate
   */
--- a/src/dxvk/meson.build
+++ b/src/dxvk/meson.build
@ -120,6 +120,9 @@ dxvk_src = [
  'hud/dxvk_hud_font.cpp',
  'hud/dxvk_hud_item.cpp',
  'hud/dxvk_hud_renderer.cpp',
+
+  'framepacer/dxvk_framepacer.cpp',
+  'framepacer/dxvk_framepacer_mode_low_latency.cpp',
 ]

 if platform == 'windows'
--- a/src/util/util_fps_limiter.cpp
+++ b/src/util/util_fps_limiter.cpp
@ -5,12 +5,15 @@
 #include "util_fps_limiter.h"
 #include "util_sleep.h"
 #include "util_string.h"
+#include "../dxvk/framepacer/dxvk_framepacer.h"

 #include "./log/log.h"

 using namespace std::chrono_literals;

 namespace dxvk {
+
+  // true while the most recent call to FpsLimiter::delay had to sleep,
+  // reset to false by each call that reaches the limiter
+  std::atomic<bool> FpsLimiter::m_isActive = { false };
  
  FpsLimiter::FpsLimiter() {
    auto override = getEnvironmentOverride();
@ -48,7 +51,12 @@ namespace dxvk {
  }


-  void FpsLimiter::delay() {
+  void FpsLimiter::delay(const Rc<DxvkLatencyTracker>& tracker) {
+    FramePacer* framePacer = dynamic_cast<FramePacer*>(tracker.ptr());
+    if (framePacer && framePacer->getMode()) {
+      return;
+    }
+
    std::unique_lock<dxvk::mutex> lock(m_mutex);
    auto interval = m_targetInterval;
    auto latency = m_maxLatency;
@ -71,8 +79,11 @@ namespace dxvk {
    // that can be written by setTargetFrameRate
    lock.unlock();

-    if (t1 < m_nextFrame)
+    m_isActive.store(false);
+    if (t1 < m_nextFrame) {
+      m_isActive.store(true);
      Sleep::sleepUntil(t1, m_nextFrame);
+    }

    m_nextFrame = (t1 < m_nextFrame + interval)
      ? m_nextFrame + interval
--- a/src/util/util_fps_limiter.h
+++ b/src/util/util_fps_limiter.h
@ -7,6 +7,8 @@
 #include "util_time.h"

 namespace dxvk {
+
+  class DxvkLatencyTracker;
  
  /**
   * \brief Frame rate limiter
@ -38,7 +40,7 @@ namespace dxvk {
     * and the time since the last call to \ref delay is
     * shorter than the target interval.
     */
-    void delay();
+    void delay(const Rc<DxvkLatencyTracker>& tracker);

    /**
     * \brief Queries environment override
@ -46,6 +48,8 @@ namespace dxvk {
     */
    static std::optional<double> getEnvironmentOverride();

+    // whether the limiter slept during its most recent delay; shared by
+    // all limiter instances and read by the HUD to hide the render latency
+    static std::atomic<bool> m_isActive;
+
  private:

    using TimePoint = dxvk::high_resolution_clock::time_point;