diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 8d42c283b..163837689 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -1009,6 +1009,9 @@ namespace dxvk { if (!ctrBuf.defined()) return; + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // We bind the SO counter as an indirect count buffer, // so reset any tracking we may have been doing here. m_state.id.reset(); @@ -1035,6 +1038,9 @@ namespace dxvk { UINT StartVertexLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->draw( VertexCount, 1, @@ -1050,6 +1056,9 @@ namespace dxvk { INT BaseVertexLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->drawIndexed( IndexCount, 1, @@ -1067,6 +1076,9 @@ namespace dxvk { UINT StartInstanceLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->draw( VertexCountPerInstance, @@ -1086,6 +1098,9 @@ namespace dxvk { UINT StartInstanceLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->drawIndexed( IndexCountPerInstance, @@ -1107,6 +1122,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDrawIndexedIndirectCommand))) return; + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // If possible, batch up multiple indirect draw calls of // the same type into one single multiDrawIndirect call auto cmdData = static_cast(m_cmdData); @@ -1142,6 +1160,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDrawIndirectCommand))) return; + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // If possible, batch up multiple indirect draw calls of // the same type into one single multiDrawIndirect call auto cmdData = static_cast(m_cmdData); @@ -1174,6 +1195,9 @@ namespace dxvk { UINT ThreadGroupCountZ) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyComputeBindings())) + ApplyDirtyComputeBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->dispatch( ThreadGroupCountX, @@ -1193,6 +1217,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDispatchIndirectCommand))) return; + if (unlikely(HasDirtyComputeBindings())) + ApplyDirtyComputeBindings(); + EmitCs([cOffset = AlignedByteOffsetForArgs] (DxvkContext* ctx) { ctx->dispatchIndirect(cOffset); @@ -3157,6 +3184,59 @@ namespace dxvk { } + template + void D3D11CommonContext::ApplyDirtyConstantBuffers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask) { + uint32_t bindMask = BoundMask.cbvMask & DirtyMask.cbvMask; + + if (!bindMask) + return; + + const auto& state = m_state.cbv[Stage]; + DirtyMask.cbvMask -= bindMask; + + for (uint32_t slot : bit::BitMask(bindMask)) { + const auto& cbv = state.buffers[slot]; + + BindConstantBuffer(Stage, slot, cbv.buffer.ptr(), + cbv.constantOffset, cbv.constantBound); + } + } + + + template + void D3D11CommonContext::ApplyDirtyGraphicsBindings() { + auto dirtyMask = m_state.lazy.shadersDirty & m_state.lazy.shadersUsed; + dirtyMask.clr(DxbcProgramType::ComputeShader); + + for (uint32_t stageIndex : bit::BitMask(uint32_t(dirtyMask.raw()))) { + DxbcProgramType stage = DxbcProgramType(stageIndex); + + auto& boundMask = m_state.lazy.bindingsUsed[stage]; + auto& dirtyMask = m_state.lazy.bindingsDirty[stage]; + + ApplyDirtyConstantBuffers(stage, boundMask, dirtyMask); + + m_state.lazy.shadersDirty.clr(stage); + } + } + + + template + void D3D11CommonContext::ApplyDirtyComputeBindings() { + DxbcProgramType stage = DxbcProgramType::ComputeShader; + + auto& boundMask = m_state.lazy.bindingsUsed[stage]; + auto& dirtyMask = m_state.lazy.bindingsDirty[stage]; + + ApplyDirtyConstantBuffers(stage, boundMask, dirtyMask); + + m_state.lazy.shadersDirty.clr(stage); + } + + template void D3D11CommonContext::ApplyInputLayout() { auto inputLayout = m_state.ia.inputLayout.prvRef(); @@ -3686,45 +3766,49 @@ namespace dxvk { template - template void D3D11CommonContext::BindConstantBuffer( + DxbcProgramType ShaderStage, UINT Slot, D3D11Buffer* pBuffer, UINT Offset, UINT Length) { + uint32_t slotId = computeConstantBufferBinding(ShaderStage, Slot); + if (pBuffer) { EmitCs([ - cSlotId = Slot, + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), cBufferSlice = pBuffer->GetBufferSlice(16 * Offset, 16 * Length) ] (DxvkContext* ctx) mutable { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBuffer(stage, cSlotId, + ctx->bindUniformBuffer(cStage, cSlotId, Forwarder::move(cBufferSlice)); }); } else { EmitCs([ - cSlotId = Slot + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage) ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBuffer(stage, cSlotId, DxvkBufferSlice()); + ctx->bindUniformBuffer(cStage, cSlotId, DxvkBufferSlice()); }); } } template - template void D3D11CommonContext::BindConstantBufferRange( + DxbcProgramType ShaderStage, UINT Slot, UINT Offset, UINT Length) { + uint32_t slotId = computeConstantBufferBinding(ShaderStage, Slot); + EmitCs([ - cSlotId = Slot, - cOffset = 16 * Offset, - cLength = 16 * Length + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), + cOffset = 16u * Offset, + cLength = 16u * Length ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBufferRange(stage, cSlotId, cOffset, cLength); + ctx->bindUniformBufferRange(cStage, cSlotId, cOffset, cLength); }); } @@ -4236,6 +4320,48 @@ namespace dxvk { } + template + template + bool D3D11CommonContext::DirtyBindingGeneric( + DxbcProgramType ShaderStage, + T BoundMask, + T& DirtyMask, + T DirtyBit, + bool IsNull) { + if ((BoundMask & ~DirtyMask) & DirtyBit) { + // If we're binding a non-null resource to an active slot that has not been + // marked for lazy binding yet, forward the call immediately in order to + // avoid tracking overhead. This is by far the most common case. + if (likely(!IsNull)) + return false; + + // If we are binding a null resource to an active slot, the app will likely + // either bind something else or bind a shader that does not use this slot. + // In that case, avoid likely redundant CS traffic and apply the binding on + // the next draw. + m_state.lazy.shadersDirty.set(ShaderStage); + } + + // Binding is either inactive or already dirty. In the inactive case, there + // is no need to mark the shader stage as dirty since binding a shader that + // activates the binding will implicitly do so. + DirtyMask |= DirtyBit; + return true; + } + + + template + bool D3D11CommonContext::DirtyConstantBuffer( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull) { + return DirtyBindingGeneric(ShaderStage, + m_state.lazy.bindingsUsed[ShaderStage].cbvMask, + m_state.lazy.bindingsDirty[ShaderStage].cbvMask, + 1u << Slot, IsNull); + } + + template void D3D11CommonContext::DiscardBuffer( ID3D11Resource* pResource) { @@ -4398,6 +4524,21 @@ namespace dxvk { } + template + bool D3D11CommonContext::HasDirtyComputeBindings() { + return m_state.lazy.shadersDirty.test(DxbcProgramType::ComputeShader); + } + + + template + bool D3D11CommonContext::HasDirtyGraphicsBindings() { + return (m_state.lazy.shadersDirty & m_state.lazy.shadersUsed).any( + DxbcProgramType::VertexShader, DxbcProgramType::GeometryShader, + DxbcProgramType::HullShader, DxbcProgramType::DomainShader, + DxbcProgramType::PixelShader); + } + + template void D3D11CommonContext::ResetCommandListState() { EmitCs([ @@ -4646,36 +4787,6 @@ namespace dxvk { } - template - void D3D11CommonContext::RestoreUsedBindings() { - // Mark all bindings used since the last reset as dirty so that subsequent draws - // and dispatches will reapply them as necessary. Marking null bindings here may - // lead to some redundant CS thread traffic, but is otherwise harmless. - auto maxBindings = GetMaxUsedBindings(); - - for (uint32_t i = 0; i < uint32_t(DxbcProgramType::Count); i++) { - auto stage = DxbcProgramType(i); - auto stageInfo = maxBindings.stages[i]; - - m_state.lazy.bindingsDirty[stage].cbvMask |= (1u << stageInfo.cbvCount) - 1u; - m_state.lazy.bindingsDirty[stage].samplerMask |= (1u << stageInfo.samplerCount) - 1u; - - if (stageInfo.uavCount) - m_state.lazy.bindingsDirty[stage].uavMask |= uint64_t(-1) >> (64u - stageInfo.uavCount); - - if (stageInfo.srvCount > 64u) { - m_state.lazy.bindingsDirty[stage].srvMask[0] |= uint64_t(-1); - m_state.lazy.bindingsDirty[stage].srvMask[1] |= uint64_t(-1) >> (128u - stageInfo.srvCount); - } else if (stageInfo.srvCount) { - m_state.lazy.bindingsDirty[stage].srvMask[0] |= uint64_t(-1) >> (64u - stageInfo.srvCount); - } - - if (m_state.lazy.shadersUsed.test(stage) && !m_state.lazy.bindingsDirty[stage].empty()) - m_state.lazy.shadersDirty.set(stage); - } - } - - template void D3D11CommonContext::RestoreCommandListState() { BindFramebuffer(); @@ -4747,10 +4858,8 @@ namespace dxvk { template void D3D11CommonContext::RestoreConstantBuffers() { const auto& bindings = m_state.cbv[Stage]; - uint32_t slotId = computeConstantBufferBinding(Stage, 0); - for (uint32_t i = 0; i < bindings.maxCount; i++) { - BindConstantBuffer(slotId + i, bindings.buffers[i].buffer.ptr(), + BindConstantBuffer(Stage, i, bindings.buffers[i].buffer.ptr(), bindings.buffers[i].constantOffset, bindings.buffers[i].constantBound); } } @@ -4807,7 +4916,6 @@ namespace dxvk { UINT NumBuffers, ID3D11Buffer* const* ppConstantBuffers) { auto& bindings = m_state.cbv[ShaderStage]; - uint32_t slotId = computeConstantBufferBinding(ShaderStage, StartSlot); for (uint32_t i = 0; i < NumBuffers; i++) { auto newBuffer = static_cast(ppConstantBuffers[i]); @@ -4824,7 +4932,8 @@ namespace dxvk { bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantCount; - BindConstantBuffer(slotId + i, newBuffer, 0, constantCount); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBuffer(ShaderStage, StartSlot + i, newBuffer, 0, constantCount); } } @@ -4843,8 +4952,6 @@ namespace dxvk { const UINT* pNumConstants) { auto& bindings = m_state.cbv[ShaderStage]; - uint32_t slotId = computeConstantBufferBinding(ShaderStage, StartSlot); - for (uint32_t i = 0; i < NumBuffers; i++) { auto newBuffer = static_cast(ppConstantBuffers[i]); @@ -4883,14 +4990,16 @@ namespace dxvk { bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantBound; - BindConstantBuffer(slotId + i, newBuffer, constantOffset, constantBound); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBuffer(ShaderStage, StartSlot + i, newBuffer, constantOffset, constantBound); } else if (bindings.buffers[StartSlot + i].constantOffset != constantOffset || bindings.buffers[StartSlot + i].constantCount != constantCount) { bindings.buffers[StartSlot + i].constantOffset = constantOffset; bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantBound; - BindConstantBufferRange(slotId + i, constantOffset, constantBound); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBufferRange(ShaderStage, StartSlot + i, constantOffset, constantBound); } } diff --git a/src/d3d11/d3d11_context.h b/src/d3d11/d3d11_context.h index 32fed700c..734cede69 100644 --- a/src/d3d11/d3d11_context.h +++ b/src/d3d11/d3d11_context.h @@ -799,6 +799,15 @@ namespace dxvk { DxvkBufferSlice AllocStagingBuffer( VkDeviceSize Size); + void ApplyDirtyConstantBuffers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask); + + void ApplyDirtyGraphicsBindings(); + + void ApplyDirtyComputeBindings(); + void ApplyInputLayout(); void ApplyPrimitiveTopology(); @@ -854,15 +863,15 @@ namespace dxvk { D3D11Buffer* pBuffer, UINT Offset); - template void BindConstantBuffer( + DxbcProgramType ShaderStage, UINT Slot, D3D11Buffer* pBuffer, UINT Offset, UINT Length); - template void BindConstantBufferRange( + DxbcProgramType ShaderStage, UINT Slot, UINT Offset, UINT Length); @@ -911,6 +920,19 @@ namespace dxvk { DxvkBufferSlice BufferSlice, UINT Flags); + template + bool DirtyBindingGeneric( + DxbcProgramType ShaderStage, + T BoundMask, + T& DirtyMask, + T DirtyBit, + bool IsNull); + + bool DirtyConstantBuffer( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull); + void DiscardBuffer( ID3D11Resource* pResource); @@ -943,6 +965,10 @@ namespace dxvk { D3D11MaxUsedBindings GetMaxUsedBindings(); + bool HasDirtyComputeBindings(); + + bool HasDirtyGraphicsBindings(); + void ResetCommandListState(); void ResetContextState(); @@ -967,8 +993,6 @@ namespace dxvk { void ResolveOmUavHazards( D3D11RenderTargetView* pView); - void RestoreUsedBindings(); - void RestoreCommandListState(); template diff --git a/src/d3d11/d3d11_context_ext.cpp b/src/d3d11/d3d11_context_ext.cpp index 5254f480f..c933f571d 100644 --- a/src/d3d11/d3d11_context_ext.cpp +++ b/src/d3d11/d3d11_context_ext.cpp @@ -48,6 +48,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, nullptr); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cCount = DrawCount, cOffset = ByteOffsetForArgs, @@ -67,6 +70,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, nullptr); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cCount = DrawCount, cOffset = ByteOffsetForArgs, @@ -88,6 +94,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cMaxCount = MaxDrawCount, cArgOffset = ByteOffsetForArgs, @@ -110,6 +119,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cMaxCount = MaxDrawCount, cArgOffset = ByteOffsetForArgs,