diff --git a/dxvk.conf b/dxvk.conf index d8ff2989a..55cc232f3 100644 --- a/dxvk.conf +++ b/dxvk.conf @@ -76,6 +76,36 @@ # d3d9.maxFrameRate = 0 +# Controls latency sleep and Nvidia Reflex support. +# +# Supported values: +# - Auto: By default, DXVK only supports latency sleep in D3D11 games that +# use Reflex if the graphics driver supports VK_NV_low_latency2, +# and if dxvk-nvapi is enabled in Proton. +# - True: Enables built-in latency reduction based on internal timings. +# This assumes that input sampling for any given frame happens after +# the D3D9 or DXGI Present call returns; games that render and present +# asynchronously will not behave as intended. +# Similarly, this will not have any effect in games with built-in frame +# rate limiters, or if an external limiter (such as MangoHud) is used. +# In some games, enabling this may reduce performance or lead to less +# consistent frame pacing. +# The implementation will either use VK_NV_low_latency2 if supported +# by the driver, or a custom algorithm. +# - False: Disables Reflex support as well as built-in latency reduction. + +# dxvk.latencySleep = Auto + + +# Tolerance for the latency sleep heuristic, in microseconds. Higher values +# increase latency, but may lead to better frame pacing in some cases. Does +# not have any effect if VK_NV_low_latency2 is used. +# +# Supported values: Any non-negative number + +# dxvk.latencyTolerance = 1000 + + # Override PCI vendor and device IDs reported to the application. Can # cause the app to adjust behaviour depending on the selected values. 
# diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp index d9707c246..c029949aa 100644 --- a/src/dxvk/dxvk_device.cpp +++ b/src/dxvk/dxvk_device.cpp @@ -1,5 +1,6 @@ #include "dxvk_device.h" #include "dxvk_instance.h" +#include "dxvk_latency_builtin.h" namespace dxvk { @@ -305,6 +306,16 @@ namespace dxvk { } + Rc DxvkDevice::createLatencyTracker( + const Rc& presenter) { + if (m_options.latencySleep != Tristate::True) + return nullptr; + + return new DxvkBuiltInLatencyTracker( + m_options.latencyTolerance); + } + + void DxvkDevice::presentImage( const Rc& presenter, uint64_t frameId, diff --git a/src/dxvk/dxvk_device.h b/src/dxvk/dxvk_device.h index a40a25fec..5d8b4e37b 100644 --- a/src/dxvk/dxvk_device.h +++ b/src/dxvk/dxvk_device.h @@ -10,6 +10,7 @@ #include "dxvk_framebuffer.h" #include "dxvk_image.h" #include "dxvk_instance.h" +#include "dxvk_latency.h" #include "dxvk_memory.h" #include "dxvk_meta_clear.h" #include "dxvk_objects.h" @@ -478,6 +479,16 @@ namespace dxvk { void requestCompileShader( const Rc& shader); + /** + * \brief Creates latency tracker for a presenter + * + * The specific implementation and parameters used + * depend on user configuration. + * \param [in] presenter Presenter instance + */ + Rc createLatencyTracker( + const Rc& presenter); + /** * \brief Presents a swap chain image * diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h new file mode 100644 index 000000000..20b348365 --- /dev/null +++ b/src/dxvk/dxvk_latency.h @@ -0,0 +1,185 @@ +#pragma once + +#include +#include +#include + +#include "../util/util_likely.h" +#include "../util/util_time.h" + +#include "../util/rc/util_rc_ptr.h" + +#include "../vulkan/vulkan_loader.h" + +namespace dxvk { + + /** + * \brief Latency tracker statistics + */ + struct DxvkLatencyStats { + std::chrono::microseconds frameLatency; + std::chrono::microseconds sleepDuration; + }; + + + /** + * \brief Latency tracker + * + * Accumulates time stamps of certain parts of a frame. 
+ */ + class DxvkLatencyTracker { + + public: + + virtual ~DxvkLatencyTracker() { } + + /** + * \brief Increments ref count + */ + void incRef() { + m_refCount.fetch_add(1, std::memory_order_acquire); + } + + /** + * \brief Decrements ref count + * + * Destroys the object when there are no users left. Acquire-release ordering makes all prior accesses by other owners visible before deletion. + */ + void decRef() { + if (m_refCount.fetch_sub(1, std::memory_order_acq_rel) == 1u) + delete this; + } + + /** + * \brief Called when presentation begins on the CPU timeline + * + * Must happen before acquiring an image from the presenter. + * \param [in] frameId Current frame ID + */ + virtual void notifyCpuPresentBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when the CS thread reaches a given frame + * + * Should be recorded into the CS thread after completing + * the previous frame on the application's CPU timeline. + * \param [in] frameId Current frame ID + */ + virtual void notifyCsRenderBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when the CS thread completes a frame + * + * Should be recorded into the CS thread after recording + * presentation commands for that frame. + * \param [in] frameId Current frame ID + */ + virtual void notifyCsRenderEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when presentation ends on the CPU timeline + * + * Must happen after acquiring an image for presentation, but + * before synchronizing with previous frames or performing + * latency sleep. The intention is to measure acquire delays. 
+ * \param [in] frameId Current frame ID + */ + virtual void notifyCpuPresentEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when a command list is submitted to the GPU + * + * \param [in] frameId Associated frame ID + */ + virtual void notifyQueueSubmit( + uint64_t frameId) = 0; + + /** + * \brief Called when a frame is queued for presentation + * + * \param [in] frameId Associated frame ID + */ + virtual void notifyQueuePresentBegin( + uint64_t frameId) = 0; + + /** + * \brief Called after a frame has been queued for presentation + * + * \param [in] frameId Associated frame ID + * \param [in] status Result of the present operation + */ + virtual void notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) = 0; + + /** + * \brief Called when a submission begins execution on the GPU + * + * Any previous submissions will have completed by this time. This + * can be used to measure GPU idle time throughout a frame. + * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuExecutionBegin( + uint64_t frameId) = 0; + + /** + * \brief Called when a submission completes execution on the GPU + * + * The previous submission will have completed by the time this + * gets called. This may be used to measure GPU idle time. + * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuExecutionEnd( + uint64_t frameId) = 0; + + /** + * \brief Called when presentation of a given frame finishes on the GPU + * + * This is generally the last thing that happens within a frame. + * \param [in] frameId Associated frame ID + */ + virtual void notifyGpuPresentEnd( + uint64_t frameId) = 0; + + /** + * \brief Performs latency sleep and begins next frame + * + * Uses latency data from previous frames to estimate when to wake + * up the application thread in order to minimize input latency. 
+ * \param [in] frameId Frame ID of the upcoming frame + * \param [in] maxFrameRate Maximum frame rate or refresh rate + */ + virtual void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) = 0; + + /** + * \brief Discards all current timing data + * + * Should be called to reset latency tracking in case + * presentation failed for any given frame. + */ + virtual void discardTimings() = 0; + + /** + * \brief Queries statistics for the given frame + * + * Returns statistics for the frame closest to \c frameId for + * which data is available. If no such frame exists, the stat + * counters will return 0. + * \param [in] frameId Frame to query + */ + virtual DxvkLatencyStats getStatistics( + uint64_t frameId) = 0; + + private: + + std::atomic m_refCount = { 0u }; + + }; + +} diff --git a/src/dxvk/dxvk_latency_builtin.cpp b/src/dxvk/dxvk_latency_builtin.cpp new file mode 100644 index 000000000..8d3c0fa1a --- /dev/null +++ b/src/dxvk/dxvk_latency_builtin.cpp @@ -0,0 +1,317 @@ +#include + +#include "dxvk_latency_builtin.h" + +#include "../util/log/log.h" + +#include "../util/util_fps_limiter.h" +#include "../util/util_string.h" + +namespace dxvk { + + DxvkBuiltInLatencyTracker::DxvkBuiltInLatencyTracker( + int32_t toleranceUs) + : m_tolerance(std::chrono::duration_cast( + std::chrono::microseconds(std::max(toleranceUs, 0)))) { + Logger::info("Latency control enabled, using built-in algorithm"); + auto limit = FpsLimiter::getEnvironmentOverride(); + + if (limit) + m_envFpsLimit = *limit; + } + + + DxvkBuiltInLatencyTracker::~DxvkBuiltInLatencyTracker() { + + } + + + void DxvkBuiltInLatencyTracker::notifyCpuPresentBegin( + uint64_t frameId) { + // Not interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyCpuPresentEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) + frame->cpuPresentEnd = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyCsRenderBegin( + 
uint64_t frameId) { + // Not interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyCsRenderEnd( + uint64_t frameId) { + // Not interesting here + } + + + void DxvkBuiltInLatencyTracker::notifyQueueSubmit( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame && frame->queueSubmit == time_point()) + frame->queueSubmit = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyQueuePresentBegin( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) + frame->queuePresent = dxvk::high_resolution_clock::now(); + } + + + void DxvkBuiltInLatencyTracker::notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) { + // Not interesting + } + + + void DxvkBuiltInLatencyTracker::notifyGpuExecutionBegin( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) { + auto now = dxvk::high_resolution_clock::now(); + + if (frame->gpuExecStart == time_point()) + frame->gpuExecStart = now; + + if (frame->gpuIdleStart != time_point()) { + frame->gpuIdleTime += now - frame->gpuIdleStart; + frame->gpuIdleEnd = now; + } + } + + m_cond.notify_one(); + } + + + void DxvkBuiltInLatencyTracker::notifyGpuExecutionEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) { + auto now = dxvk::high_resolution_clock::now(); + + frame->gpuExecEnd = now; + frame->gpuIdleStart = now; + } + } + + + void DxvkBuiltInLatencyTracker::notifyGpuPresentEnd( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + auto frame = findFrame(frameId); + + if (frame) + frame->gpuPresent = dxvk::high_resolution_clock::now(); + + m_cond.notify_one(); + } + + + void DxvkBuiltInLatencyTracker::sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) { + auto duration = sleep(frameId, maxFrameRate); + + std::unique_lock lock(m_mutex); + + auto next = initFrame(frameId); + 
next->frameStart = dxvk::high_resolution_clock::now(); + next->sleepDuration = duration; + } + + + void DxvkBuiltInLatencyTracker::discardTimings() { + std::unique_lock lock(m_mutex); + m_validRangeBegin = m_validRangeEnd + 1u; + } + + + DxvkLatencyStats DxvkBuiltInLatencyTracker::getStatistics( + uint64_t frameId) { + std::unique_lock lock(m_mutex); + + DxvkLatencyStats stats = { }; + + while (frameId && frameId >= m_validRangeBegin) { + auto f = findFrame(frameId--); + + if (f && f->gpuPresent != time_point()) { + stats.frameLatency = std::chrono::duration_cast(f->gpuPresent - f->frameStart); + stats.sleepDuration = std::chrono::duration_cast(f->sleepDuration); + break; + } + } + + return stats; + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::sleep( + uint64_t frameId, + double maxFrameRate) { + // Wait for all relevant timings to become available. This should + // generally not stall for very long if a maximum frame latency of + // 1 is enforced correctly by the swap chain. + std::unique_lock lock(m_mutex); + + for (uint32_t i = 2; i <= FrameCount; i++) { + auto f = findFrame(frameId - i); + + if (!f || f->cpuPresentEnd == time_point()) + return duration(0u); + + m_cond.wait(lock, [f] { + return f->gpuPresent != time_point(); + }); + } + + // Frame entry of the last frame that fully completed + auto prev = findFrame(frameId - 2u); + + // The way we want to align subsequent frames depends on whether + // we are limited by GPU performance or display refresh. + // + // In either case, we estimate the amount of CPU time the game requires + // before any GPU work can start to be the delay between frame start and + // first submission, plus any GPU idle time during the frame. This is not + // accurate if there are forced GPU sync points, but we can't work around + // that in a meaningful way. 
+ constexpr size_t EntryCount = FrameCount - 1u; + + std::array cpuTimes = { }; + std::array gpuTimes = { }; + + for (uint32_t i = 0; i < EntryCount; i++) { + auto f = findFrame(frameId - (i + 2u)); + + cpuTimes[i] = (f->queueSubmit - f->frameStart) + f->gpuIdleTime; + gpuTimes[i] = (f->gpuExecEnd - f->gpuExecStart) - f->gpuIdleTime; + } + + duration nextCpuTime = estimateTime(cpuTimes.data(), cpuTimes.size()); + duration nextGpuTime = estimateTime(gpuTimes.data(), gpuTimes.size()); + + // Compute the initial deadline based on GPU execution times + time_point gpuDeadline = prev->gpuExecEnd + 2u * nextGpuTime; + + // If we're rendering faster than refresh, use present_wait timings from + // previous frames as a starting point and compute an average in order to + // account for potentially erratic present_wait delays. + duration frameInterval = computeFrameInterval(maxFrameRate); + + if (frameInterval.count()) { + duration nextPresentFromPrev = duration(0u); + + for (uint32_t i = 2; i <= FrameCount; i++) { + auto f = findFrame(frameId - i); + + time_point deadline = f->gpuPresent + i * frameInterval - m_tolerance; + nextPresentFromPrev += deadline - prev->gpuPresent; + } + + time_point wsiDeadline = prev->gpuPresent + (nextPresentFromPrev / int32_t(FrameCount - 1u)); + gpuDeadline = std::max(gpuDeadline, wsiDeadline); + } + + // Line up the next frame in such a way that the first GPU submission + // happens just before the current frame's final submission completes + time_point gpuStartTime = gpuDeadline - nextGpuTime; + time_point cpuStartTime = gpuStartTime - nextCpuTime - m_tolerance; + + time_point now = dxvk::high_resolution_clock::now(); + + // Release lock before actually sleeping, or + // it will affect the time measurements. 
+ lock.unlock(); + + Sleep::sleepUntil(now, cpuStartTime); + return std::max(duration(0u), cpuStartTime - now); + } + + + DxvkLatencyFrameData* DxvkBuiltInLatencyTracker::initFrame( + uint64_t frameId) { + if (m_validRangeEnd + 1u != frameId) + m_validRangeBegin = frameId; + + if (m_validRangeBegin + FrameCount <= frameId) + m_validRangeBegin = frameId + 1u - FrameCount; + + m_validRangeEnd = frameId; + + auto& frame = m_frames[frameId % FrameCount]; + frame = DxvkLatencyFrameData(); + frame.frameId = frameId; + return &frame; + } + + + DxvkLatencyFrameData* DxvkBuiltInLatencyTracker::findFrame( + uint64_t frameId) { + return frameId >= m_validRangeBegin && frameId <= m_validRangeEnd + ? &m_frames[frameId % FrameCount] + : nullptr; + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::computeFrameInterval( + double maxFrameRate) { + if (m_envFpsLimit > 0.0) + maxFrameRate = m_envFpsLimit; + + return computeIntervalFromRate(maxFrameRate); + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::computeIntervalFromRate( + double frameRate) { + if (frameRate <= 0.0 || !std::isnormal(frameRate)) + return duration(0u); + + uint64_t ns = uint64_t(1'000'000'000.0 / frameRate); + return std::chrono::duration_cast(std::chrono::nanoseconds(ns)); + } + + + DxvkBuiltInLatencyTracker::duration DxvkBuiltInLatencyTracker::estimateTime( + const duration* frames, + size_t frameCount) { + // For each frame, find the median of its neighbours, then + // use the maximum of those medians as our estimate. 
+ duration result = duration(0u); + + for (size_t i = 0u; i + 2u < frameCount; i++) { + duration a = frames[i]; + duration b = frames[i + 1]; + duration c = frames[i + 2]; + + duration min = std::min(std::min(a, b), c); + duration max = std::max(std::max(a, b), c); + + result = std::max(result, a + b + c - min - max); + } + + return result; + } +} diff --git a/src/dxvk/dxvk_latency_builtin.h b/src/dxvk/dxvk_latency_builtin.h new file mode 100644 index 000000000..dcbc4be31 --- /dev/null +++ b/src/dxvk/dxvk_latency_builtin.h @@ -0,0 +1,134 @@ +#pragma once + +#include + +#include "dxvk_latency.h" + +#include "../util/thread.h" + +#include "../util/util_sleep.h" +#include "../util/util_time.h" + +#include "../util/config/config.h" + +#include "../util/sync/sync_spinlock.h" + +namespace dxvk { + + /** + * \brief Timings for a single tracked frame + */ + struct DxvkLatencyFrameData { + using time_point = dxvk::high_resolution_clock::time_point; + using duration = dxvk::high_resolution_clock::duration; + + uint64_t frameId = 0u; + time_point frameStart = time_point(); + time_point cpuPresentEnd = time_point(); + time_point queueSubmit = time_point(); + time_point queuePresent = time_point(); + time_point gpuExecStart = time_point(); + time_point gpuExecEnd = time_point(); + time_point gpuIdleStart = time_point(); + time_point gpuIdleEnd = time_point(); + duration gpuIdleTime = duration(0u); + time_point gpuPresent = time_point(); + duration sleepDuration = duration(0u); + }; + + + /** + * \brief Built-in latency tracker + * + * Implements a simple latency reduction algorithm + * based on CPU timestamps received from the backend. 
+ */ + class DxvkBuiltInLatencyTracker : public DxvkLatencyTracker { + using time_point = typename DxvkLatencyFrameData::time_point; + using duration = typename DxvkLatencyFrameData::duration; + + constexpr static size_t FrameCount = 8u; + public: + + DxvkBuiltInLatencyTracker( + int32_t toleranceUs); + + ~DxvkBuiltInLatencyTracker(); + + void notifyCpuPresentBegin( + uint64_t frameId) override; + + void notifyCpuPresentEnd( + uint64_t frameId) override; + + void notifyCsRenderBegin( + uint64_t frameId) override; + + void notifyCsRenderEnd( + uint64_t frameId) override; + + void notifyQueueSubmit( + uint64_t frameId) override; + + void notifyQueuePresentBegin( + uint64_t frameId) override; + + void notifyQueuePresentEnd( + uint64_t frameId, + VkResult status) override; + + void notifyGpuExecutionBegin( + uint64_t frameId) override; + + void notifyGpuExecutionEnd( + uint64_t frameId) override; + + void notifyGpuPresentEnd( + uint64_t frameId) override; + + void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) override; + + void discardTimings() override; + + DxvkLatencyStats getStatistics( + uint64_t frameId) override; + + private: + + dxvk::mutex m_mutex; + dxvk::condition_variable m_cond; + + duration m_tolerance; + + double m_envFpsLimit = 0.0; + + std::array m_frames = { }; + + uint64_t m_validRangeBegin = 0u; + uint64_t m_validRangeEnd = 0u; + + duration sleep( + uint64_t frameId, + double maxFrameRate); + + DxvkLatencyFrameData* initFrame( + uint64_t frameId); + + DxvkLatencyFrameData* findFrame( + uint64_t frameId); + + duration computeFrameInterval( + double maxFrameRate); + + static duration computeIntervalFromRate( + double frameRate); + + static duration estimateTime( + const duration* frames, + size_t frameCount); + + }; + +} diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp index c939a7a41..8fbf20bb6 100644 --- a/src/dxvk/dxvk_options.cpp +++ b/src/dxvk/dxvk_options.cpp @@ -12,6 +12,8 @@ namespace dxvk { useRawSsbo = config.getOption("dxvk.useRawSsbo", Tristate::Auto); hud = config.getOption("dxvk.hud", ""); tearFree = 
config.getOption("dxvk.tearFree", Tristate::Auto); + latencySleep = config.getOption("dxvk.latencySleep", Tristate::Auto); + latencyTolerance = config.getOption ("dxvk.latencyTolerance", 1000); hideIntegratedGraphics = config.getOption ("dxvk.hideIntegratedGraphics", false); zeroMappedMemory = config.getOption ("dxvk.zeroMappedMemory", false); allowFse = config.getOption ("dxvk.allowFse", false); diff --git a/src/dxvk/dxvk_options.h b/src/dxvk/dxvk_options.h index 0994f1d5f..dcbd52230 100644 --- a/src/dxvk/dxvk_options.h +++ b/src/dxvk/dxvk_options.h @@ -37,6 +37,12 @@ namespace dxvk { /// or FIFO_RELAXED (if false) present mode Tristate tearFree = Tristate::Auto; + /// Enables latency sleep + Tristate latencySleep = Tristate::Auto; + + /// Latency tolerance, in microseconds + int32_t latencyTolerance = 0u; + // Hides integrated GPUs if dedicated GPUs are // present. May be necessary for some games that // incorrectly assume monitor layouts. diff --git a/src/dxvk/meson.build b/src/dxvk/meson.build index 9ab23dab2..789b911c8 100644 --- a/src/dxvk/meson.build +++ b/src/dxvk/meson.build @@ -90,6 +90,7 @@ dxvk_src = [ 'dxvk_graphics.cpp', 'dxvk_image.cpp', 'dxvk_instance.cpp', + 'dxvk_latency_builtin.cpp', 'dxvk_memory.cpp', 'dxvk_meta_blit.cpp', 'dxvk_meta_clear.cpp',