From 016f05a770989d11e432115b37418870d7246edd Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 21 Feb 2025 15:52:32 +0100 Subject: [PATCH] [dxvk] Implement draw batching via VK_EXT_multi_draw --- src/d3d11/d3d11_context.cpp | 15 +--- src/d3d11/d3d11_video.cpp | 6 +- src/d3d9/d3d9_device.cpp | 45 ++++++---- src/dxvk/dxvk_context.cpp | 158 ++++++++++++++++++++++++++++------ src/dxvk/dxvk_context.h | 39 ++++----- src/dxvk/dxvk_context_state.h | 1 + src/vulkan/vulkan_util.h | 2 +- 7 files changed, 186 insertions(+), 80 deletions(-) diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 16b396aa9..f9aacc317 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -3578,12 +3578,7 @@ namespace dxvk { EmitCsCmd(D3D11CmdType::Draw, 1u, [] (DxvkContext* ctx, const VkDrawIndirectCommand* draws, size_t count) { - for (size_t i = 0; i < count; i++) { - ctx->draw(draws[i].vertexCount, - draws[i].instanceCount, - draws[i].firstVertex, - draws[i].firstInstance); - } + ctx->draw(count, draws); }); new (m_csData->first()) VkDrawIndirectCommand(draw); @@ -3608,13 +3603,7 @@ namespace dxvk { EmitCsCmd(D3D11CmdType::DrawIndexed, 1u, [] (DxvkContext* ctx, const VkDrawIndexedIndirectCommand* draws, size_t count) { - for (size_t i = 0; i < count; i++) { - ctx->drawIndexed(draws[i].indexCount, - draws[i].instanceCount, - draws[i].firstIndex, - draws[i].vertexOffset, - draws[i].firstInstance); - } + ctx->drawIndexed(count, draws); }); new (m_csData->first()) VkDrawIndexedIndirectCommand(draw); diff --git a/src/d3d11/d3d11_video.cpp b/src/d3d11/d3d11_video.cpp index a5b20351d..bdcd80daf 100644 --- a/src/d3d11/d3d11_video.cpp +++ b/src/d3d11/d3d11_video.cpp @@ -1312,7 +1312,11 @@ namespace dxvk { for (uint32_t i = 0; i < cViews.size(); i++) ctx->bindResourceImageView(VK_SHADER_STAGE_FRAGMENT_BIT, 1 + i, Rc(cViews[i])); - ctx->draw(3, 1, 0, 0); + VkDrawIndirectCommand draw = { }; + draw.vertexCount = 3u; + draw.instanceCount = 1u; + + 
ctx->draw(1, &draw); for (uint32_t i = 0; i < cViews.size(); i++) ctx->bindResourceImageView(VK_SHADER_STAGE_FRAGMENT_BIT, 1 + i, nullptr); diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index f704723db..51e190405 100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -2890,9 +2890,12 @@ namespace dxvk { // Tests on Windows show that D3D9 does not do non-indexed instanced draws. - ctx->draw( - vertexCount, 1, - cStartVertex, 0); + VkDrawIndirectCommand draw = { }; + draw.vertexCount = vertexCount; + draw.instanceCount = 1u; + draw.firstVertex = cStartVertex; + + ctx->draw(1u, &draw); }); return D3D_OK; @@ -2939,10 +2942,13 @@ namespace dxvk { ApplyPrimitiveType(ctx, cPrimType); - ctx->drawIndexed( - drawInfo.vertexCount, drawInfo.instanceCount, - cStartIndex, - cBaseVertexIndex, 0); + VkDrawIndexedIndirectCommand draw = { }; + draw.indexCount = drawInfo.vertexCount; + draw.instanceCount = drawInfo.instanceCount; + draw.firstIndex = cStartIndex; + draw.vertexOffset = cBaseVertexIndex; + + ctx->drawIndexed(1u, &draw); }); return D3D_OK; @@ -2981,11 +2987,12 @@ namespace dxvk { ApplyPrimitiveType(ctx, cPrimType); // Tests on Windows show that D3D9 does not do non-indexed instanced draws. 
+ VkDrawIndirectCommand draw = { }; + draw.vertexCount = cVertexCount; + draw.instanceCount = 1u; ctx->bindVertexBuffer(0, std::move(cBufferSlice), cStride); - ctx->draw( - cVertexCount, 1, - 0, 0); + ctx->draw(1u, &draw); ctx->bindVertexBuffer(0, DxvkBufferSlice(), 0); }); @@ -3045,12 +3052,13 @@ namespace dxvk { ApplyPrimitiveType(ctx, cPrimType); + VkDrawIndexedIndirectCommand draw = { }; + draw.indexCount = drawInfo.vertexCount; + draw.instanceCount = drawInfo.instanceCount; + ctx->bindVertexBuffer(0, cBufferSlice.subSlice(0, cVertexSize), cStride); ctx->bindIndexBuffer(cBufferSlice.subSlice(cVertexSize, cBufferSlice.length() - cVertexSize), cIndexType); - ctx->drawIndexed( - drawInfo.vertexCount, drawInfo.instanceCount, - 0, - 0, 0); + ctx->drawIndexed(1u, &draw); ctx->bindVertexBuffer(0, DxvkBufferSlice(), 0); ctx->bindIndexBuffer(DxvkBufferSlice(), VK_INDEX_TYPE_UINT32); }); @@ -3162,11 +3170,14 @@ namespace dxvk { // to avoid val errors / UB. ctx->bindShader(nullptr); + VkDrawIndirectCommand draw = { }; + draw.vertexCount = drawInfo.vertexCount; + draw.instanceCount = drawInfo.instanceCount; + draw.firstVertex = cStartIndex; + ctx->bindShader(std::move(shader)); ctx->bindUniformBuffer(VK_SHADER_STAGE_GEOMETRY_BIT, getSWVPBufferSlot(), std::move(cBufferSlice)); - ctx->draw( - drawInfo.vertexCount, drawInfo.instanceCount, - cStartIndex, 0); + ctx->draw(1u, &draw); ctx->bindUniformBuffer(VK_SHADER_STAGE_GEOMETRY_BIT, getSWVPBufferSlot(), DxvkBufferSlice()); ctx->bindShader(nullptr); }); diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index e14720e2f..d4ad98b09 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -49,6 +49,11 @@ namespace dxvk { if (m_device->features().khrMaintenance5.maintenance5) m_features.set(DxvkContextFeature::IndexBufferRobustness); + // Check whether we can batch direct draws + if (m_device->features().extMultiDraw.multiDraw + && m_device->properties().extMultiDraw.maxMultiDrawCount >= 
DirectMultiDrawBatchSize) + m_features.set(DxvkContextFeature::DirectMultiDraw); + // Add a fast path to query debug utils support if (m_device->isDebugEnabled()) m_features.set(DxvkContextFeature::DebugUtils); @@ -922,15 +927,9 @@ namespace dxvk { void DxvkContext::draw( - uint32_t vertexCount, - uint32_t instanceCount, - uint32_t firstVertex, - uint32_t firstInstance) { - if (this->commitGraphicsState()) { - m_cmd->cmdDraw( - vertexCount, instanceCount, - firstVertex, firstInstance); - } + uint32_t count, + const VkDrawIndirectCommand* draws) { + drawGeneric(count, draws); } @@ -953,20 +952,12 @@ namespace dxvk { void DxvkContext::drawIndexed( - uint32_t indexCount, - uint32_t instanceCount, - uint32_t firstIndex, - int32_t vertexOffset, - uint32_t firstInstance) { - if (this->commitGraphicsState()) { - m_cmd->cmdDrawIndexed( - indexCount, instanceCount, - firstIndex, vertexOffset, - firstInstance); - } + uint32_t count, + const VkDrawIndexedIndirectCommand* draws) { + drawGeneric(count, draws); } - - + + void DxvkContext::drawIndexedIndirect( VkDeviceSize offset, uint32_t count, @@ -1689,6 +1680,116 @@ namespace dxvk { } + template + void DxvkContext::drawGeneric( + uint32_t count, + const T* draws) { + if (this->commitGraphicsState()) { + if (count == 1u) { + // Most common case, just emit a single draw + if constexpr (Indexed) { + m_cmd->cmdDrawIndexed(draws->indexCount, draws->instanceCount, + draws->firstIndex, draws->vertexOffset, draws->firstInstance); + } else { + m_cmd->cmdDraw(draws->vertexCount, draws->instanceCount, + draws->firstVertex, draws->firstInstance); + } + } else if (unlikely(needsDrawBarriers())) { + // If the current pipeline has storage resource hazards, + // unroll draws and insert a barrier after each one. 
+ for (uint32_t i = 0; i < count; i++) { + if (i) + this->commitGraphicsState(); + + if constexpr (Indexed) { + m_cmd->cmdDrawIndexed(draws[i].indexCount, draws[i].instanceCount, + draws[i].firstIndex, draws[i].vertexOffset, draws[i].firstInstance); + } else { + m_cmd->cmdDraw(draws[i].vertexCount, draws[i].instanceCount, + draws[i].firstVertex, draws[i].firstInstance); + } + } + } else { + using MultiDrawInfo = std::conditional_t; + + // Intentionally don't initialize this; we'll probably not use + // the full batch size anyway, so doing so would be wasteful. + std::array batch; + + uint32_t instanceCount = 0u; + uint32_t instanceIndex = 0u; + + uint32_t batchSize = 0u; + + for (uint32_t i = 0; i < count; i++) { + if (!batchSize) { + instanceCount = draws[i].instanceCount; + instanceIndex = draws[i].firstInstance; + } + + if constexpr (Indexed) { + auto& drawInfo = batch[batchSize++]; + drawInfo.firstIndex = draws[i].firstIndex; + drawInfo.indexCount = draws[i].indexCount; + drawInfo.vertexOffset = draws[i].vertexOffset; + } else { + auto& drawInfo = batch[batchSize++]; + drawInfo.firstVertex = draws[i].firstVertex; + drawInfo.vertexCount = draws[i].vertexCount; + } + + bool emitDraw = i + 1u == count || batchSize == DirectMultiDrawBatchSize; + + if (!emitDraw) { + const auto& next = draws[i + 1u]; + + emitDraw = instanceCount != next.instanceCount + || instanceIndex != next.firstInstance; + } + + if (emitDraw) { + if (m_features.test(DxvkContextFeature::DirectMultiDraw)) { + if constexpr (Indexed) { + m_cmd->cmdDrawMultiIndexed(batchSize, batch.data(), + instanceCount, instanceIndex); + } else { + m_cmd->cmdDrawMulti(batchSize, batch.data(), + instanceCount, instanceIndex); + } + } else { + // This path only really exists for consistency reasons; all drivers + // we care about support MultiDraw natively, but debug tools may not. + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) { + const char* procName = Indexed ? 
"vkCmdDrawMultiIndexedEXT" : "vkCmdDrawMultiEXT"; + m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, + vk::makeLabel(0u, str::format(procName, "(", batchSize, ")").c_str())); + } + + for (uint32_t i = 0; i < batchSize; i++) { + const auto& entry = batch[i]; + + if constexpr (Indexed) { + m_cmd->cmdDrawIndexed(entry.indexCount, instanceCount, + entry.firstIndex, entry.vertexOffset, instanceIndex); + } else { + m_cmd->cmdDraw(entry.vertexCount, instanceCount, + entry.firstVertex, instanceIndex); + } + } + + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); + } + + batchSize = 0u; + } + } + } + } + } + + template void DxvkContext::drawIndirectGeneric( VkDeviceSize offset, @@ -1698,11 +1799,8 @@ namespace dxvk { if (this->commitGraphicsState()) { auto descriptor = m_state.id.argBuffer.getDescriptor(); - if (unroll) { - // Need to do this check after initially setting up the pipeline - unroll = m_state.gp.flags.test(DxvkGraphicsPipelineFlag::UnrollMergedDraws) - && !m_barrierControl.test(DxvkBarrierControl::GraphicsAllowReadWriteOverlap); - } + if (unroll) + unroll = needsDrawBarriers(); // If draws are merged but the pipeline has order-dependent stores, submit // one draw at a time as well as barriers in between. 
Otherwise, keep the @@ -2854,6 +2952,12 @@ namespace dxvk { } + bool DxvkContext::needsDrawBarriers() { + return m_state.gp.flags.test(DxvkGraphicsPipelineFlag::UnrollMergedDraws) + && !m_barrierControl.test(DxvkBarrierControl::GraphicsAllowReadWriteOverlap); + } + + void DxvkContext::beginRenderPassDebugRegion() { bool hasColorAttachments = false; bool hasDepthAttachment = m_state.om.renderTargets.depth.view != nullptr; diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 921405472..b32a12b50 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -32,6 +32,8 @@ namespace dxvk { class DxvkContext : public RcObject { constexpr static VkDeviceSize MaxDiscardSizeInRp = 256u << 10u; constexpr static VkDeviceSize MaxDiscardSize = 16u << 10u; + + constexpr static uint32_t DirectMultiDrawBatchSize = 256u; public: DxvkContext(const Rc& device); @@ -744,17 +746,13 @@ namespace dxvk { /** * \brief Draws primitive without using an index buffer * - * \param [in] vertexCount Number of vertices to draw - * \param [in] instanceCount Number of instances to render - * \param [in] firstVertex First vertex in vertex buffer - * \param [in] firstInstance First instance ID + * \param [in] count Number of draws + * \param [in] draws Draw parameters */ void draw( - uint32_t vertexCount, - uint32_t instanceCount, - uint32_t firstVertex, - uint32_t firstInstance); - + uint32_t count, + const VkDrawIndirectCommand* draws); + /** * \brief Indirect draw call * @@ -791,19 +789,13 @@ namespace dxvk { /** * \brief Draws primitives using an index buffer * - * \param [in] indexCount Number of indices to draw - * \param [in] instanceCount Number of instances to render - * \param [in] firstIndex First index within the index buffer - * \param [in] vertexOffset Vertex ID that corresponds to index 0 - * \param [in] firstInstance First instance ID + * \param [in] count Number of draws + * \param [in] draws Draw parameters */ void drawIndexed( - uint32_t indexCount, - 
uint32_t instanceCount, - uint32_t firstIndex, - int32_t vertexOffset, - uint32_t firstInstance); - + uint32_t count, + const VkDrawIndexedIndirectCommand* draws); + /** * \brief Indirect indexed draw call * @@ -1595,6 +1587,11 @@ namespace dxvk { const Rc& buffer, VkDeviceSize offset); + template + void drawGeneric( + uint32_t count, + const T* draws); + template void drawIndirectGeneric( VkDeviceSize offset, @@ -2103,7 +2100,7 @@ namespace dxvk { return pred(DxvkAccess::Read); } - void invalidateWriteAfterWriteTracking(); + bool needsDrawBarriers(); void beginRenderPassDebugRegion(); diff --git a/src/dxvk/dxvk_context_state.h b/src/dxvk/dxvk_context_state.h index f91357e1e..564828860 100644 --- a/src/dxvk/dxvk_context_state.h +++ b/src/dxvk/dxvk_context_state.h @@ -75,6 +75,7 @@ namespace dxvk { VariableMultisampleRate, IndexBufferRobustness, DebugUtils, + DirectMultiDraw, FeatureCount }; diff --git a/src/vulkan/vulkan_util.h b/src/vulkan/vulkan_util.h index f872d5801..1c2556525 100644 --- a/src/vulkan/vulkan_util.h +++ b/src/vulkan/vulkan_util.h @@ -244,7 +244,7 @@ namespace dxvk::vk { label.color[0] = ((color >> 16u) & 0xffu) / 255.0f; label.color[1] = ((color >> 8u) & 0xffu) / 255.0f; label.color[2] = ((color >> 0u) & 0xffu) / 255.0f; - label.color[3] = 1.0f; + label.color[3] = color ? 1.0f : 0.0f; label.pLabelName = text; return label; }