From f250dcd4f1e503e431ffa07f1c19b1713fab4935 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 26 Feb 2025 22:26:57 +0100 Subject: [PATCH 1/3] [dxbc] Add option to implicitly synchronize UAV accesses --- src/dxbc/dxbc_compiler.cpp | 138 +++++++++++++++++++++++++++++++------ src/dxbc/dxbc_compiler.h | 11 ++- src/dxbc/dxbc_options.h | 4 ++ 3 files changed, 131 insertions(+), 22 deletions(-) diff --git a/src/dxbc/dxbc_compiler.cpp b/src/dxbc/dxbc_compiler.cpp index 79126c8ad..f768c6e3b 100644 --- a/src/dxbc/dxbc_compiler.cpp +++ b/src/dxbc/dxbc_compiler.cpp @@ -2831,6 +2831,9 @@ namespace dxvk { const DxbcRegister& dstReg = ins.dst[0]; const DxbcRegister& srcReg = isStructured ? ins.src[2] : ins.src[1]; + if (dstReg.type == DxbcOperandType::UnorderedAccessView) + emitUavBarrier(uint64_t(1u) << srcReg.idx[0].offset, 0u); + // Retrieve common info about the buffer const DxbcBufferInfo bufferInfo = getBufferInfo(srcReg); @@ -3049,6 +3052,9 @@ namespace dxvk { const DxbcRegister& dstReg = ins.dst[0]; const DxbcRegister& srcReg = isStructured ? ins.src[2] : ins.src[1]; + if (dstReg.type == DxbcOperandType::UnorderedAccessView) + emitUavBarrier(0u, uint64_t(1u) << dstReg.idx[0].offset); + DxbcRegisterValue value = emitRegisterLoad(srcReg, dstReg.mask); value = emitRegisterBitcast(value, DxbcScalarType::Uint32); @@ -4157,7 +4163,9 @@ namespace dxvk { // (src1) The UAV to load from const uint32_t registerId = ins.src[1].idx[0].offset; const DxbcUav uavInfo = m_uavs.at(registerId); - + + emitUavBarrier(uint64_t(1u) << registerId, 0u); + // Load texture coordinates DxbcRegisterValue texCoord = emitLoadTexCoord( ins.src[0], uavInfo.imageInfo); @@ -4210,6 +4218,7 @@ namespace dxvk { // (src0) The texture or buffer coordinates // (src1) The value to store const DxbcBufferInfo uavInfo = getBufferInfo(ins.dst[0]); + emitUavBarrier(0u, uint64_t(1u) << ins.dst[0].idx[0].offset); // Set image operands for coherent access if necessary SpirvImageOperands imageOperands; @@ -4551,6 +4560,8 @@ namespace dxvk { // return can be used in place of break to terminate a case block if (m_controlFlowBlocks.back().type == DxbcCfgBlockType::Switch) m_controlFlowBlocks.back().b_switch.labelCase = labelId; + + m_topLevelIsUniform = false; } else { // Last instruction in the current function this->emitFunctionEnd(); @@ -4580,6 +4591,9 @@ namespace dxvk { m_module.opReturn(); m_module.opLabel(continueLabel); + + // The return condition may be non-uniform + m_topLevelIsUniform = false; } @@ -4606,8 +4620,11 @@ namespace dxvk { m_module.opLabel(cond.labelEnd); m_module.enableCapability(spv::CapabilityDemoteToHelperInvocation); + + // Discard is just retc in a trenchcoat + m_topLevelIsUniform = false; } - + void DxbcCompiler::emitControlFlowLabel(const DxbcShaderInstruction& ins) { uint32_t functionNr = ins.dst[0].idx[0].offset; @@ -4623,6 +4640,10 @@ namespace dxvk { m_module.setDebugName(functionId, str::format("label", functionNr).c_str()); m_insideFunction = true; + + // We have to assume that this function gets + // called from non-uniform control flow + m_topLevelIsUniform = false; } @@ -4670,57 +4691,85 @@ namespace dxvk { void DxbcCompiler::emitControlFlow(const DxbcShaderInstruction& ins) { switch (ins.op) { case DxbcOpcode::If: - return this->emitControlFlowIf(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowIf(ins); + break; case DxbcOpcode::Else: - return this->emitControlFlowElse(ins); + this->emitControlFlowElse(ins); + break; case DxbcOpcode::EndIf: - return this->emitControlFlowEndIf(ins); + this->emitControlFlowEndIf(ins); + this->emitUavBarrier(0, 0); + break; case DxbcOpcode::Switch: - return this->emitControlFlowSwitch(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowSwitch(ins); + break; case DxbcOpcode::Case: - return this->emitControlFlowCase(ins); + this->emitControlFlowCase(ins); + break; case DxbcOpcode::Default: - return this->emitControlFlowDefault(ins); + this->emitControlFlowDefault(ins); + break; case DxbcOpcode::EndSwitch: - return this->emitControlFlowEndSwitch(ins); - + this->emitControlFlowEndSwitch(ins); + this->emitUavBarrier(0, 0); + break; + case DxbcOpcode::Loop: - return this->emitControlFlowLoop(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowLoop(ins); + break; case DxbcOpcode::EndLoop: - return this->emitControlFlowEndLoop(ins); - + this->emitControlFlowEndLoop(ins); + this->emitUavBarrier(0, 0); + break; + case DxbcOpcode::Break: case DxbcOpcode::Continue: - return this->emitControlFlowBreak(ins); + this->emitControlFlowBreak(ins); + break; case DxbcOpcode::Breakc: case DxbcOpcode::Continuec: - return this->emitControlFlowBreakc(ins); + this->emitControlFlowBreakc(ins); + break; case DxbcOpcode::Ret: - return this->emitControlFlowRet(ins); + this->emitControlFlowRet(ins); + break; case DxbcOpcode::Retc: - return this->emitControlFlowRetc(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowRetc(ins); + break; case DxbcOpcode::Discard: - return this->emitControlFlowDiscard(ins); + this->emitControlFlowDiscard(ins); + break; case DxbcOpcode::Label: - return this->emitControlFlowLabel(ins); + this->emitControlFlowLabel(ins); + break; case DxbcOpcode::Call: - return this->emitControlFlowCall(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowCall(ins); + this->emitUavBarrier(-1, -1); + break; case DxbcOpcode::Callc: - return this->emitControlFlowCallc(ins); + this->emitUavBarrier(0, 0); + this->emitControlFlowCallc(ins); + this->emitUavBarrier(-1, -1); + break; default: Logger::warn(str::format( @@ -7872,6 +7921,53 @@ namespace dxvk { } } + + void DxbcCompiler::emitUavBarrier(uint64_t readMask, uint64_t writeMask) { + if (!m_moduleInfo.options.forceComputeUavBarriers + || m_programInfo.type() != DxbcProgramType::ComputeShader) + return; + + // If both masks are 0, emit a barrier in case at least one read-write UAV + // has a pending unsynchronized access. Only consider read-after-write and + // write-after-read hazards, assume that back-to-back stores are safe and + // do not overlap in memory. Atomics are also completely ignored here. + uint64_t rdMask = m_uavRdMask; + uint64_t wrMask = m_uavWrMask; + + bool insertBarrier = bool(rdMask & wrMask); + + if (readMask || writeMask) { + rdMask &= m_uavWrMask; + wrMask &= m_uavRdMask; + } + + for (auto uav : bit::BitMask(rdMask | wrMask)) { + constexpr VkAccessFlags rwAccess = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + insertBarrier |= (m_analysis->uavInfos[uav].accessFlags & rwAccess) == rwAccess; + } + + // Need to be in uniform top-level control flow, or otherwise + // it is not safe to insert control barriers. + if (insertBarrier && m_controlFlowBlocks.empty() && m_topLevelIsUniform) { + m_module.opControlBarrier( + m_module.constu32(spv::ScopeWorkgroup), + m_module.constu32(m_hasGloballyCoherentUav ? spv::ScopeQueueFamily : spv::ScopeWorkgroup), + m_module.constu32(spv::MemorySemanticsWorkgroupMemoryMask + | spv::MemorySemanticsImageMemoryMask + | spv::MemorySemanticsUniformMemoryMask + | spv::MemorySemanticsAcquireReleaseMask + | spv::MemorySemanticsMakeAvailableMask + | spv::MemorySemanticsMakeVisibleMask)); + + m_uavWrMask = 0u; + m_uavRdMask = 0u; + } + + // Mark pending accesses + m_uavWrMask |= writeMask; + m_uavRdMask |= readMask; + } + DxbcVectorType DxbcCompiler::getInputRegType(uint32_t regIdx) const { switch (m_programInfo.type()) { diff --git a/src/dxbc/dxbc_compiler.h b/src/dxbc/dxbc_compiler.h index 7f4595a73..0695e2a10 100644 --- a/src/dxbc/dxbc_compiler.h +++ b/src/dxbc/dxbc_compiler.h @@ -483,7 +483,12 @@ namespace dxvk { // Control flow information. Stores labels for // currently active if-else blocks and loops. std::vector m_controlFlowBlocks; - + + bool m_topLevelIsUniform = true; + + uint64_t m_uavRdMask = 0u; + uint64_t m_uavWrMask = 0u; + ////////////////////////////////////////////// // Function state tracking. Required in order // to properly end functions in some cases. @@ -1258,6 +1263,10 @@ namespace dxvk { bool ignoreInputSystemValue( DxbcSystemValue sv) const; + void emitUavBarrier( + uint64_t readMask, + uint64_t writeMask); + /////////////////////////// // Type definition methods uint32_t getScalarTypeId( diff --git a/src/dxbc/dxbc_options.h b/src/dxbc/dxbc_options.h index 986004924..2cbb968b4 100644 --- a/src/dxbc/dxbc_options.h +++ b/src/dxbc/dxbc_options.h @@ -39,6 +39,10 @@ namespace dxvk { /// Insert memory barriers after TGSM stoes bool forceVolatileTgsmAccess = false; + /// Try to detect hazards in UAV access and insert + /// barriers when we know control flow is uniform. + bool forceComputeUavBarriers = false; + /// Replace ld_ms with ld bool disableMsaa = false; From 4d549d7a842ca5906b8f9ad4af708533f5fce9a3 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 26 Feb 2025 22:29:38 +0100 Subject: [PATCH 2/3] [d3d11] Add option to force compute shader UAV synchronization --- src/d3d11/d3d11_options.cpp | 1 + src/d3d11/d3d11_options.h | 7 +++++++ src/dxbc/dxbc_options.cpp | 1 + 3 files changed, 9 insertions(+) diff --git a/src/d3d11/d3d11_options.cpp b/src/d3d11/d3d11_options.cpp index b6d4d8045..e2281a3be 100644 --- a/src/d3d11/d3d11_options.cpp +++ b/src/d3d11/d3d11_options.cpp @@ -15,6 +15,7 @@ namespace dxvk { D3D11Options::D3D11Options(const Config& config) { this->zeroInitWorkgroupMemory = config.getOption("d3d11.zeroInitWorkgroupMemory", false); this->forceVolatileTgsmAccess = config.getOption("d3d11.forceVolatileTgsmAccess", false); + this->forceComputeUavBarriers = config.getOption("d3d11.forceComputeUavBarriers", false); this->relaxedBarriers = config.getOption("d3d11.relaxedBarriers", false); this->relaxedGraphicsBarriers = config.getOption("d3d11.relaxedGraphicsBarriers", false); this->maxTessFactor = config.getOption("d3d11.maxTessFactor", 0); diff --git a/src/d3d11/d3d11_options.h b/src/d3d11/d3d11_options.h index ec1dde73b..45405af9c 100644 --- a/src/d3d11/d3d11_options.h +++ b/src/d3d11/d3d11_options.h @@ -26,6 +26,13 @@ namespace dxvk { /// without explicit synchronization. bool forceVolatileTgsmAccess = false; + /// Force UAV synchronization insided compute shaders + /// + /// Workaround for compute shaders that access overlapping + /// memory regions within a UAV without proper workgroup + /// synchroniation. Will have a negative performance impact. + bool forceComputeUavBarriers = false; + /// Use relaxed memory barriers /// /// May improve performance in some games, diff --git a/src/dxbc/dxbc_options.cpp b/src/dxbc/dxbc_options.cpp index 7c269f792..257e822f1 100644 --- a/src/dxbc/dxbc_options.cpp +++ b/src/dxbc/dxbc_options.cpp @@ -35,6 +35,7 @@ namespace dxvk { invariantPosition = options.invariantPosition; zeroInitWorkgroupMemory = options.zeroInitWorkgroupMemory; forceVolatileTgsmAccess = options.forceVolatileTgsmAccess; + forceComputeUavBarriers = options.forceComputeUavBarriers; disableMsaa = options.disableMsaa; forceSampleRateShading = options.forceSampleRateShading; enableSampleShadingInterlock = device->features().extFragmentShaderInterlock.fragmentShaderSampleInterlock; From f1f9ae979d970936b6c9484d136066f6b5685ffb Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 26 Feb 2025 22:46:35 +0100 Subject: [PATCH 3/3] [util] Enable compute UAV barriers for Watch Dogs 2 --- src/util/config/config.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/util/config/config.cpp b/src/util/config/config.cpp index f85d7159b..47399b50b 100644 --- a/src/util/config/config.cpp +++ b/src/util/config/config.cpp @@ -458,6 +458,11 @@ namespace dxvk { { R"(\\FarCry(5|NewDawn)\.exe$)", {{ { "d3d11.zeroInitWorkgroupMemory", "True" }, }} }, + /* Watch Dogs 2 - ships broken compute shaders * + * with no barriers when they are needed */ + { R"(\\WatchDogs2\.exe$)", {{ + { "d3d11.forceComputeUavBarriers", "True" }, + }} }, /**********************************************/ /* D3D9 GAMES */