From 33febe985904a04e6cb56798279db024ca655a64 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 17 Jan 2025 21:58:56 +0100 Subject: [PATCH] [dxvk] Add latency tracker Implements a basic latency sleep solution that is intended to work without requiring games to support any related vendor features. This alone is not enough to expose the Reflex API to applications via dxvk-nvapi, but since that relies on NV_low_latency2 specifics anyway, we are going to add an implementation based on that extension later with an extended interface. --- dxvk.conf | 30 +++ src/dxvk/dxvk_device.cpp | 11 ++ src/dxvk/dxvk_device.h | 11 ++ src/dxvk/dxvk_latency.h | 185 +++++++++++++++++ src/dxvk/dxvk_latency_builtin.cpp | 317 ++++++++++++++++++++++++++++++ src/dxvk/dxvk_latency_builtin.h | 134 +++++++++++++ src/dxvk/dxvk_options.cpp | 2 + src/dxvk/dxvk_options.h | 6 + src/dxvk/meson.build | 1 + 9 files changed, 697 insertions(+) create mode 100644 src/dxvk/dxvk_latency.h create mode 100644 src/dxvk/dxvk_latency_builtin.cpp create mode 100644 src/dxvk/dxvk_latency_builtin.h diff --git a/dxvk.conf b/dxvk.conf index d8ff2989a..55cc232f3 100644 --- a/dxvk.conf +++ b/dxvk.conf @@ -76,6 +76,36 @@ # d3d9.maxFrameRate = 0 +# Controls latency sleep and Nvidia Reflex support. +# +# Supported values: +# - Auto: By default, DXVK only supports latency sleep in D3D11 games that +# use Reflex if the graphics driver supports VK_NV_low_latency2, +# and if dxvk-nvapi is enabled in Proton. +# - True: Enables built-in latency reduction based on internal timings. +# This assumes that input sampling for any given frame happens after +# the D3D9 or DXGI Present call returns; games that render and present +# asynchronously will not behave as intended. +# Similarly, this will not have any effect in games with built-in frame +# rate limiters, or if an external limiter (such as MangoHud) is used. +# In some games, enabling this may reduce performance or lead to less +# consistent frame pacing. 
+# The implementation will either use VK_NV_low_latency2 if supported +# by the driver, or a custom algorithm. +# - False: Disable Reflex support as well as built-in latency reduction. + +# dxvk.latencySleep = Auto + + +# Tolerance for the latency sleep heuristic, in microseconds. Higher values +# increase latency, but may lead to better frame pacing in some cases. Does +# not have any effect if NV_low_latency2 is used. +# +# Supported values: Any non-negative number + +# dxvk.latencyTolerance = 1000 + + # Override PCI vendor and device IDs reported to the application. Can # cause the app to adjust behaviour depending on the selected values. # diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp index d9707c246..c029949aa 100644 --- a/src/dxvk/dxvk_device.cpp +++ b/src/dxvk/dxvk_device.cpp @@ -1,5 +1,6 @@ #include "dxvk_device.h" #include "dxvk_instance.h" +#include "dxvk_latency_builtin.h" namespace dxvk { @@ -305,6 +306,16 @@ namespace dxvk { } + Rc DxvkDevice::createLatencyTracker( + const Rc& presenter) { + if (m_options.latencySleep != Tristate::True) /* Auto and False yield no tracker */ + return nullptr; + + return new DxvkBuiltInLatencyTracker( + m_options.latencyTolerance); + } + + void DxvkDevice::presentImage( const Rc& presenter, uint64_t frameId, diff --git a/src/dxvk/dxvk_device.h b/src/dxvk/dxvk_device.h index a40a25fec..5d8b4e37b 100644 --- a/src/dxvk/dxvk_device.h +++ b/src/dxvk/dxvk_device.h @@ -10,6 +10,7 @@ #include "dxvk_framebuffer.h" #include "dxvk_image.h" #include "dxvk_instance.h" +#include "dxvk_latency.h" #include "dxvk_memory.h" #include "dxvk_meta_clear.h" #include "dxvk_objects.h" @@ -478,6 +479,16 @@ namespace dxvk { void requestCompileShader( const Rc& shader); + /** + * \brief Creates latency tracker for a presenter + * + * The specific implementation and parameters used + * depend on user configuration. 
+ * \param [in] presenter Presenter instance + */ + Rc createLatencyTracker( + const Rc& presenter); + /** * \brief Presents a swap chain image * diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h new file mode 100644 index 000000000..20b348365 --- /dev/null +++ b/src/dxvk/dxvk_latency.h @@ -0,0 +1,185 @@ +#pragma once + +#include +#include +#include + +#include "../util/util_likely.h" +#include "../util/util_time.h" + +#include "../util/rc/util_rc_ptr.h" + +#include "../vulkan/vulkan_loader.h" + +namespace dxvk { + + /** + * \brief Latency tracker statistics + */ + struct DxvkLatencyStats { + std::chrono::microseconds frameLatency; + std::chrono::microseconds sleepDuration; + }; + + + /** + * \brief Latency tracker + * + * Accumulates time stamps of certain parts of a frame. + */ + class DxvkLatencyTracker { + + public: + + virtual ~DxvkLatencyTracker() { } + + /** + * \brief Increments ref count + */ + void incRef() { + m_refCount.fetch_add(1, std::memory_order_acquire); + } + + /** + * \brief Decrements ref count + * + * Destroys the object when no users are left; acq_rel makes their prior writes visible. + */ + void decRef() { + if (m_refCount.fetch_sub(1, std::memory_order_acq_rel) == 1u) + delete this; + } + + /** + * \brief Called when presentation begins on the CPU timeline + * + * Must happen before acquiring an image from the presenter. + * \param [in] frameId Current frame ID + */ + virtual void notifyCpuPresentBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when the CS thread reaches a given frame + * + * Should be recorded into the CS thread after completing + * the previous frame on the application's CPU timeline. + * \param [in] frameId Current frame ID + */ + virtual void notifyCsRenderBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when the CS thread completes a frame + * + * Should be recorded into the CS thread after recording + * presentation commands for that frame. 
+ * \param [in] frameId Current frame ID + */ + virtual void notifyCsRenderEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when presentation ends on the CPU timeline + * + * Must happen after acquiring an image for presentation, but + * before synchronizing with previous frames or performing + * latency sleep. The intention is to measure acquire delays. + * \param [in] frameId Current frame ID + */ + virtual void notifyCpuPresentEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when a command list is submitted to the GPU + * + * \param [in] frameId Associated frame ID + */ + virtual void notifyQueueSubmit( + uint64_t frameId) = 0; + + /** + * \brief Called when a frame is queued for presentation + * + * \param [in] frameId Associated frame ID + */ + virtual void notifyQueuePresentBegin( + uint64_t frameId) = 0; + + /** + * \brief Called after a frame has been queued for presentation + * + * \param [in] frameId Associated frame ID + * \param [in] status Result of the present operation + */ + virtual void notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) = 0; + + /** + * \brief Called when a submission begins execution on the GPU + * + * Any previous submissions will have completed by this time. This + * can be used to measure GPU idle time throughout a frame. + * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuExecutionBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when a submission completes execution on the GPU + * + * The previous submission will have completed by the time this + * gets called. This may be used to measure GPU idle time. + * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuExecutionEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when presentation of a given frame finishes on the GPU + * + * This is generally the last thing that happens within a frame. 
+ * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuPresentEnd( + uint64_t frameId) = 0; + + /** + * \brief Performs latency sleep and begins next frame + * + * Uses latency data from previous frames to estimate when to wake + * up the application thread in order to minimize input latency. + * \param [in] frameId Frame ID of the upcoming frame + * \param [in] maxFrameRate Maximum frame rate or refresh rate + */ + virtual void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) = 0; + + /** + * \brief Discards all current timing data + * + * Should be called to reset latency tracking in case + * presentation failed for any given frame. + */ + virtual void discardTimings() = 0; + + /** + * \brief Queries statistics for the given frame + * + * Returns statistics for the frame closest to \c frameId for + * which data is available. If no such frame exists, the stat + * counters will return 0. + * \param [in] frameId Frame to query + */ + virtual DxvkLatencyStats getStatistics( + uint64_t frameId) = 0; + + private: + + std::atomic m_refCount = { 0u }; + + }; + +} diff --git a/src/dxvk/dxvk_latency_builtin.cpp b/src/dxvk/dxvk_latency_builtin.cpp new file mode 100644 index 000000000..8d3c0fa1a --- /dev/null +++ b/src/dxvk/dxvk_latency_builtin.cpp @@ -0,0 +1,317 @@ +#include + +#include "dxvk_latency_builtin.h" + +#include "../util/log/log.h" + +#include "../util/util_fps_limiter.h" +#include "../util/util_string.h" + +namespace dxvk { + + DxvkBuiltInLatencyTracker::DxvkBuiltInLatencyTracker( + int32_t toleranceUs) + : m_tolerance(std::chrono::duration_cast( + std::chrono::microseconds(std::max(toleranceUs, 0)))) { + Logger::info("Latency control enabled, using built-in algorithm"); + auto limit = FpsLimiter::getEnvironmentOverride(); + + if (limit) + m_envFpsLimit = *limit; + } + + + DxvkBuiltInLatencyTracker::~DxvkBuiltInLatencyTracker() { + + } + + + void DxvkBuiltInLatencyTracker::notifyCpuPresentBegin( + uint64_t frameId) { + // Not 
interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyCpuPresentEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) + frame->cpuPresentEnd = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyCsRenderBegin( + uint64_t frameId) { + // Not interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyCsRenderEnd( + uint64_t frameId) { + // Not interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyQueueSubmit( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame && frame->queueSubmit == time_point()) + frame->queueSubmit = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyQueuePresentBegin( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) + frame->queuePresent = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) { + // Not interesting + } + + + void DxvkBuiltInLatencyTracker::notifyGpuExecutionBegin( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) { + auto now = dxvk::high_resolution_clock::now(); + + if (frame->gpuExecStart == time_point()) + frame->gpuExecStart = now; + + if (frame->gpuIdleStart != time_point()) { + frame->gpuIdleTime += now - frame->gpuIdleStart; + frame->gpuIdleEnd = now; + } + } + + m_cond.notify_one(); + } + + + void DxvkBuiltInLatencyTracker::notifyGpuExecutionEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) { + auto now = dxvk::high_resolution_clock::now(); + + frame->gpuExecEnd = now; + frame->gpuIdleStart = now; + } + } + + + void DxvkBuiltInLatencyTracker::notifyGpuPresentEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) 
+ frame->gpuPresent = dxvk::high_resolution_clock::now(); + + m_cond.notify_one(); + } + + + void DxvkBuiltInLatencyTracker::sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) { + auto duration = sleep(frameId, maxFrameRate); + + std::unique_lock lock(m_mutex); + + auto next = initFrame(frameId); + next->frameStart = dxvk::high_resolution_clock::now(); + next->sleepDuration = duration; + } + + + void DxvkBuiltInLatencyTracker::discardTimings() { + std::unique_lock lock(m_mutex); + m_validRangeBegin = m_validRangeEnd + 1u; + } + + + DxvkLatencyStats DxvkBuiltInLatencyTracker::getStatistics( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + + DxvkLatencyStats stats = { }; + + while (frameId && frameId >= m_validRangeBegin) { + auto f = findFrame(frameId--); + + if (f && f->gpuPresent != time_point()) { + stats.frameLatency = std::chrono::duration_cast(f->gpuPresent - f->frameStart); + stats.sleepDuration = std::chrono::duration_cast(f->sleepDuration); + break; + } + } + + return stats; + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::sleep( + uint64_t frameId, + double maxFrameRate) { + // Wait for all relevant timings to become available. This should + // generally not stall for very long if a maximum frame latency of + // 1 is enforced correctly by the swap chain. + std::unique_lock lock(m_mutex); + + for (uint32_t i = 2; i <= FrameCount; i++) { + auto f = findFrame(frameId - i); + + if (!f || f->cpuPresentEnd == time_point()) + return duration(0u); + + m_cond.wait(lock, [f] { + return f->gpuPresent != time_point(); + }); + } + + // Frame entry of the last frame that fully completed + auto prev = findFrame(frameId - 2u); + + // The way we want to align subsequent frames depends on whether + // we are limited by GPU performance or display refresh. 
+ // + // In either case, we estimate the amount of CPU time the game requires + // before any GPU work can start to be the delay between frame start and + // first submission, plus any GPU idle time during the frame. This is not + // accurate if there are forced GPU sync points, but we can't work around + // that in a meaningful way. + constexpr size_t EntryCount = FrameCount - 1u; + + std::array cpuTimes = { }; + std::array gpuTimes = { }; + + for (uint32_t i = 0; i < EntryCount; i++) { + auto f = findFrame(frameId - (i + 2u)); + + cpuTimes[i] = (f->queueSubmit - f->frameStart) + f->gpuIdleTime; + gpuTimes[i] = (f->gpuExecEnd - f->gpuExecStart) - f->gpuIdleTime; + } + + duration nextCpuTime = estimateTime(cpuTimes.data(), cpuTimes.size()); + duration nextGpuTime = estimateTime(gpuTimes.data(), gpuTimes.size()); + + // Compute the initial deadline based on GPU execution times + time_point gpuDeadline = prev->gpuExecEnd + 2u * nextGpuTime; + + // If we're rendering faster than refresh, use present_wait timings from + // previous frames as a starting point and compute an average in order to + // account for potentially erratic present_wait delays. 
+ duration frameInterval = computeFrameInterval(maxFrameRate); + + if (frameInterval.count()) { + duration nextPresentFromPrev = duration(0u); + + for (uint32_t i = 2; i <= FrameCount; i++) { + auto f = findFrame(frameId - i); + + time_point deadline = f->gpuPresent + i * frameInterval - m_tolerance; + nextPresentFromPrev += deadline - prev->gpuPresent; + } + + time_point wsiDeadline = prev->gpuPresent + (nextPresentFromPrev / int32_t(FrameCount - 1u)); + gpuDeadline = std::max(gpuDeadline, wsiDeadline); + } + + // Line up the next frame in such a way that the first GPU submission + // happens just before the current frame's final submission completes + time_point gpuStartTime = gpuDeadline - nextGpuTime; + time_point cpuStartTime = gpuStartTime - nextCpuTime - m_tolerance; + + time_point now = dxvk::high_resolution_clock::now(); + + // Release lock before actually sleeping, or + // it will affect the time measurements. + lock.unlock(); + + Sleep::sleepUntil(now, cpuStartTime); + return std::max(duration(0u), cpuStartTime - now); + } + + + DxvkLatencyFrameData* DxvkBuiltInLatencyTracker::initFrame( + uint64_t frameId) { + if (m_validRangeEnd + 1u != frameId) + m_validRangeBegin = frameId; + + if (m_validRangeBegin + FrameCount <= frameId) + m_validRangeBegin = frameId + 1u - FrameCount; + + m_validRangeEnd = frameId; + + auto& frame = m_frames[frameId % FrameCount]; + frame = DxvkLatencyFrameData(); + frame.frameId = frameId; + return &frame; + } + + + DxvkLatencyFrameData* DxvkBuiltInLatencyTracker::findFrame( + uint64_t frameId) { + return frameId >= m_validRangeBegin && frameId <= m_validRangeEnd + ? 
&m_frames[frameId % FrameCount] + : nullptr; + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::computeFrameInterval( + double maxFrameRate) { + if (m_envFpsLimit > 0.0) + maxFrameRate = m_envFpsLimit; + + return computeIntervalFromRate(maxFrameRate); + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::computeIntervalFromRate( + double frameRate) { + if (frameRate <= 0.0 || !std::isnormal(frameRate)) + return duration(0u); + + uint64_t ns = uint64_t(1'000'000'000.0 / frameRate); + return std::chrono::duration_cast(std::chrono::nanoseconds(ns)); + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::estimateTime( + const duration* frames, + size_t frameCount) { + // Estimate is the maximum over all medians of three adjacent + // samples; i + 2u < frameCount avoids size_t wrap for < 3. + duration result = duration(0u); + + for (size_t i = 0u; i + 2u < frameCount; i++) { + duration a = frames[i]; + duration b = frames[i + 1]; + duration c = frames[i + 2]; + + duration min = std::min(std::min(a, b), c); + duration max = std::max(std::max(a, b), c); + + result = std::max(result, a + b + c - min - max); + } + + return result; + } +} diff --git a/src/dxvk/dxvk_latency_builtin.h b/src/dxvk/dxvk_latency_builtin.h new file mode 100644 index 000000000..dcbc4be31 --- /dev/null +++ b/src/dxvk/dxvk_latency_builtin.h @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include "dxvk_latency.h" + +#include "../util/thread.h" + +#include "../util/util_sleep.h" +#include "../util/util_time.h" + +#include "../util/config/config.h" + +#include "../util/sync/sync_spinlock.h" + +namespace dxvk { + + /** + * \brief Timings for a single tracked frame + */ + struct DxvkLatencyFrameData { + using time_point = dxvk::high_resolution_clock::time_point; + using duration = dxvk::high_resolution_clock::duration; + + uint64_t frameId = 0u; + time_point frameStart = time_point(); + time_point cpuPresentEnd = time_point(); + time_point queueSubmit = 
time_point(); + time_point queuePresent = time_point(); + time_point gpuExecStart = time_point(); + time_point gpuExecEnd = time_point(); + time_point gpuIdleStart = time_point(); + time_point gpuIdleEnd = time_point(); + duration gpuIdleTime = duration(0u); + time_point gpuPresent = time_point(); + duration sleepDuration = duration(0u); + }; + + + /** + * \brief Built-in latency tracker + * + * Implements a simple latency reduction algorithm + * based on CPU timestamps received from the backend. + */ + class DxvkBuiltInLatencyTracker : public DxvkLatencyTracker { + using time_point = typename DxvkLatencyFrameData::time_point; + using duration = typename DxvkLatencyFrameData::duration; + + constexpr static size_t FrameCount = 8u; + public: + + DxvkBuiltInLatencyTracker( + int32_t toleranceUs); + + ~DxvkBuiltInLatencyTracker(); + + void notifyCpuPresentBegin( + uint64_t frameId) override; + + void notifyCpuPresentEnd( + uint64_t frameId) override; + + void notifyCsRenderBegin( + uint64_t frameId) override; + + void notifyCsRenderEnd( + uint64_t frameId) override; + + void notifyQueueSubmit( + uint64_t frameId) override; + + void notifyQueuePresentBegin( + uint64_t frameId) override; + + void notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) override; + + void notifyGpuExecutionBegin( + uint64_t frameId) override; + + void notifyGpuExecutionEnd( + uint64_t frameId) override; + + void notifyGpuPresentEnd( + uint64_t frameId) override; + + void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) override; + + void discardTimings() override; + + DxvkLatencyStats getStatistics( + uint64_t frameId) override; + + private: + + dxvk::mutex m_mutex; + dxvk::condition_variable m_cond; + + duration m_tolerance; + + double m_envFpsLimit = 0.0; + + std::array m_frames = { }; + + uint64_t m_validRangeBegin = 0u; + uint64_t m_validRangeEnd = 0u; + + duration sleep( + uint64_t frameId, + double maxFrameRate); + + DxvkLatencyFrameData* initFrame( + uint64_t frameId); + + DxvkLatencyFrameData* findFrame( + uint64_t frameId); + + duration computeFrameInterval( + double 
maxFrameRate); + + static duration computeIntervalFromRate( + double frameRate); + + static duration estimateTime( + const duration* frames, + size_t frameCount); + + }; + +} diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp index c939a7a41..8fbf20bb6 100644 --- a/src/dxvk/dxvk_options.cpp +++ b/src/dxvk/dxvk_options.cpp @@ -12,6 +12,8 @@ namespace dxvk { useRawSsbo = config.getOption("dxvk.useRawSsbo", Tristate::Auto); hud = config.getOption("dxvk.hud", ""); tearFree = config.getOption("dxvk.tearFree", Tristate::Auto); + latencySleep = config.getOption("dxvk.latencySleep", Tristate::Auto); + latencyTolerance = config.getOption ("dxvk.latencyTolerance", 1000); hideIntegratedGraphics = config.getOption ("dxvk.hideIntegratedGraphics", false); zeroMappedMemory = config.getOption ("dxvk.zeroMappedMemory", false); allowFse = config.getOption ("dxvk.allowFse", false); diff --git a/src/dxvk/dxvk_options.h b/src/dxvk/dxvk_options.h index 0994f1d5f..dcbd52230 100644 --- a/src/dxvk/dxvk_options.h +++ b/src/dxvk/dxvk_options.h @@ -37,6 +37,12 @@ namespace dxvk { /// or FIFO_RELAXED (if false) present mode Tristate tearFree = Tristate::Auto; + /// Enables latency sleep + Tristate latencySleep = Tristate::Auto; + + /// Latency tolerance, in microseconds + int32_t latencyTolerance = 0u; + // Hides integrated GPUs if dedicated GPUs are // present. May be necessary for some games that // incorrectly assume monitor layouts. diff --git a/src/dxvk/meson.build b/src/dxvk/meson.build index 9ab23dab2..789b911c8 100644 --- a/src/dxvk/meson.build +++ b/src/dxvk/meson.build @@ -90,6 +90,7 @@ dxvk_src = [ 'dxvk_graphics.cpp', 'dxvk_image.cpp', 'dxvk_instance.cpp', + 'dxvk_latency_builtin.cpp', 'dxvk_memory.cpp', 'dxvk_meta_blit.cpp', 'dxvk_meta_clear.cpp',