From 5bb8d09a96998fed552c920e72f9eb374bddb044 Mon Sep 17 00:00:00 2001 From: Robin Kertels Date: Wed, 18 Sep 2024 23:57:05 +0200 Subject: [PATCH] [d3d9] Always use per-draw buffer uploads on pure SWVP devices --- src/d3d9/d3d9_common_buffer.cpp | 11 ++------ src/d3d9/d3d9_common_buffer.h | 7 +++-- src/d3d9/d3d9_device.cpp | 48 ++++++++++++++++++++++++++++----- src/d3d9/d3d9_device.h | 12 ++++----- 4 files changed, 52 insertions(+), 26 deletions(-) diff --git a/src/d3d9/d3d9_common_buffer.cpp b/src/d3d9/d3d9_common_buffer.cpp index e40a799bf..370301dbe 100644 --- a/src/d3d9/d3d9_common_buffer.cpp +++ b/src/d3d9/d3d9_common_buffer.cpp @@ -74,14 +74,6 @@ namespace dxvk { if (!(m_desc.Usage & (D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY))) return D3D9_COMMON_BUFFER_MAP_MODE_BUFFER; - // Tests show that DISCARD does not work for pure SWVP devices. - // So force staging buffer path to avoid stalls. - // Dark Romance: Vampire in Love also expects draws to be synchronous - // and breaks if we respect NOOVERWRITE. - // D&D Temple of Elemental Evil breaks if we respect DISCARD. - if (m_parent->CanOnlySWVP()) - return D3D9_COMMON_BUFFER_MAP_MODE_BUFFER; - if (!options->allowDirectBufferMapping) return D3D9_COMMON_BUFFER_MAP_MODE_BUFFER; @@ -134,7 +126,8 @@ namespace dxvk { memoryFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; } - if ((memoryFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) && m_parent->GetOptions()->cachedDynamicBuffers) { + if ((memoryFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) && (m_parent->GetOptions()->cachedDynamicBuffers || m_parent->CanOnlySWVP())) { + // Never use uncached memory on devices that support SWVP because we might end up reading from it. 
memoryFlags &= ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; memoryFlags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; diff --git a/src/d3d9/d3d9_common_buffer.h b/src/d3d9/d3d9_common_buffer.h index 00bea9530..3d588afd7 100644 --- a/src/d3d9/d3d9_common_buffer.h +++ b/src/d3d9/d3d9_common_buffer.h @@ -200,18 +200,17 @@ namespace dxvk { /** - * \brief Queries sequence number for a given subresource + * \brief Queries sequence number * * Returns which CS chunk the resource was last used on. - * \param [in] Subresource Subresource index - * \returns Sequence number for the given subresource + * \returns Sequence number */ uint64_t GetMappingBufferSequenceNumber() const { return HasSequenceNumber() ? m_seq : DxvkCsThread::SynchronizeAll; } - bool IsSysmemDynamic() const { + bool DoPerDrawUpload() const { return m_desc.Pool == D3DPOOL_SYSTEMMEM && (m_desc.Usage & D3DUSAGE_DYNAMIC) != 0; } diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index 9fc9a8c9f..b8a0e7eec 100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -2698,7 +2698,7 @@ namespace dxvk { uint32_t firstIndex = 0; int32_t baseVertexIndex = 0; uint32_t vertexCount = GetVertexCount(PrimitiveType, PrimitiveCount); - UploadDynamicSysmemBuffers( + UploadPerDrawData( StartVertex, vertexCount, firstIndex, @@ -2747,7 +2747,7 @@ namespace dxvk { bool dynamicSysmemVBOs; bool dynamicSysmemIBO; uint32_t indexCount = GetVertexCount(PrimitiveType, PrimitiveCount); - UploadDynamicSysmemBuffers( + UploadPerDrawData( MinVertexIndex, NumVertices, StartIndex, @@ -2932,7 +2932,20 @@ namespace dxvk { D3D9CommonBuffer* dst = static_cast(pDestBuffer)->GetCommonBuffer(); D3D9VertexDecl* decl = static_cast (pVertexDecl); - PrepareDraw(D3DPT_FORCE_DWORD, true, true); + bool dynamicSysmemVBOs; + uint32_t firstIndex = 0; + int32_t baseVertexIndex = 0; + UploadPerDrawData( + SrcStartIndex, + VertexCount, + firstIndex, + 0, + baseVertexIndex, + &dynamicSysmemVBOs, + nullptr + ); 
+ + PrepareDraw(D3DPT_FORCE_DWORD, !dynamicSysmemVBOs, false); if (decl == nullptr) { DWORD FVF = dst->Desc()->FVF; @@ -5057,7 +5070,7 @@ namespace dxvk { // Ignore DISCARD and NOOVERWRITE if the buffer is not DEFAULT pool (tests + Halo 2) // The docs say DISCARD and NOOVERWRITE are ignored if the buffer is not DYNAMIC // but tests say otherwise! - if (desc.Pool != D3DPOOL_DEFAULT) + if (desc.Pool != D3DPOOL_DEFAULT || CanOnlySWVP()) Flags &= ~(D3DLOCK_DISCARD | D3DLOCK_NOOVERWRITE); // Ignore DONOTWAIT if we are DYNAMIC @@ -5069,6 +5082,12 @@ namespace dxvk { if (unlikely(m_deviceLostState != D3D9DeviceLostState::Ok)) Flags &= ~D3DLOCK_DISCARD; + // In SWVP mode, we always use the per-draw upload path. + // So the buffer will never be in use on the device. + // FVF Buffers are the exception. Those can be used as a destination for ProcessVertices. + if (unlikely(CanOnlySWVP() && !pResource->NeedsReadback())) + Flags |= D3DLOCK_NOOVERWRITE; + // We only bounds check for MANAGED. // (TODO: Apparently this is meant to happen for DYNAMIC too but I am not sure // how that works given it is meant to be a DIRECT access..?) 
@@ -5209,7 +5228,7 @@ namespace dxvk { - void D3D9DeviceEx::UploadDynamicSysmemBuffers( + void D3D9DeviceEx::UploadPerDrawData( UINT& FirstVertexIndex, UINT NumVertices, UINT& FirstIndex, @@ -5221,10 +5240,10 @@ namespace dxvk { bool dynamicSysmemVBOs = true; for (uint32_t i = 0; i < caps::MaxStreams && dynamicSysmemVBOs; i++) { auto* vbo = GetCommonBuffer(m_state.vertexBuffers[i].vertexBuffer); - dynamicSysmemVBOs &= vbo == nullptr || vbo->IsSysmemDynamic(); + dynamicSysmemVBOs &= vbo == nullptr || (vbo->DoPerDrawUpload() || CanOnlySWVP()); } D3D9CommonBuffer* ibo = GetCommonBuffer(m_state.indices); - bool dynamicSysmemIBO = NumIndices != 0 && ibo != nullptr && ibo->IsSysmemDynamic(); + bool dynamicSysmemIBO = NumIndices != 0 && ibo != nullptr && (ibo->DoPerDrawUpload() || CanOnlySWVP()); *pDynamicVBOs = dynamicSysmemVBOs; @@ -5255,6 +5274,21 @@ namespace dxvk { if (likely(vbo == nullptr)) { continue; } + + if (unlikely(vbo->NeedsReadback())) { + // There are three ways the GPU can write to buffers in D3D9: + // - Copy data from a staging buffer to the primary one either on Unlock or at draw time depending on the D3DPOOL + // for buffers with MAP_MODE_STAGING. + // The backend handles inserting the required barriers. + // - Write data between Lock and Unlock to the buffer directly for buffers with MAP_MODE_DIRECT. + // - Write to the primary buffer using ProcessVertices. That is why we need to ensure the resource is idle. + // Even when using MAP_MODE_BUFFER, ProcessVertices copies the data over from the primary buffer to the staging buffer + // at the end. So it could end up writing to the buffer on the GPU while the same buffer gets read here on the CPU. + // ProcessVertices is also exceptionally rare though which is why we're using a second sequence number + // to avoid unnecessary CS thread synchronization. 
+ WaitForResource(vbo->GetBuffer(), vbo->GetMappingBufferSequenceNumber(), D3DLOCK_READONLY); + } + const uint32_t vertexSize = m_state.vertexDecl->GetSize(i); const uint32_t vertexStride = m_state.vertexBuffers[i].stride; const uint32_t srcStride = vertexStride; diff --git a/src/d3d9/d3d9_device.h b/src/d3d9/d3d9_device.h index 5827e2606..0892222ed 100644 --- a/src/d3d9/d3d9_device.h +++ b/src/d3d9/d3d9_device.h @@ -774,7 +774,7 @@ namespace dxvk { * @param FirstIndex The first index * @param NumIndices The number of indices that will be drawn. If this is 0, the index buffer binding will not be modified. */ - void UploadDynamicSysmemBuffers( + void UploadPerDrawData( UINT& FirstVertexIndex, UINT NumVertices, UINT& FirstIndex, @@ -782,7 +782,7 @@ namespace dxvk { INT& BaseVertexIndex, bool* pDynamicVBOs, bool* pDynamicIBO); - + void SetupFPU(); @@ -1022,6 +1022,10 @@ namespace dxvk { return m_behaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING; } + bool CanSWVP() const { + return m_behaviorFlags & (D3DCREATE_MIXED_VERTEXPROCESSING | D3DCREATE_SOFTWARE_VERTEXPROCESSING); + } + UINT GetFixedFunctionVSCount() const { return m_ffModules.GetVSCount(); } @@ -1063,10 +1067,6 @@ namespace dxvk { } } - bool CanSWVP() const { - return m_behaviorFlags & (D3DCREATE_MIXED_VERTEXPROCESSING | D3DCREATE_SOFTWARE_VERTEXPROCESSING); - } - // Device Reset detection for D3D9SwapChainEx::Present bool IsDeviceReset() { return std::exchange(m_deviceHasBeenReset, false);