1
0
mirror of https://github.com/doitsujin/dxvk.git synced 2024-12-12 04:08:52 +01:00

[dxbc] Remove broken atomic counter subgroup optimization

This is not a legal optimization inside non-uniform control flow due
to Vulkan's extremely permissive convergence rules, and apparently
breaks on Nvidia as a result.

Mesa drivers already do the same thing internally anyway.
This commit is contained in:
Philip Rebohle 2024-04-03 14:55:43 +02:00
parent 855b2746b6
commit f06c646315
3 changed files with 4 additions and 82 deletions

View File

@@ -2464,58 +2464,6 @@ namespace dxvk {
if (m_uavs.at(registerId).ctrId == 0)
m_uavs.at(registerId).ctrId = emitDclUavCounter(registerId);
// Only use subgroup ops on compute to avoid having to
// deal with helper invocations or hardware limitations
bool useSubgroupOps = m_moduleInfo.options.useSubgroupOpsForAtomicCounters
&& m_programInfo.type() == DxbcProgramType::ComputeShader;
// Current block ID used in a phi later on
uint32_t baseBlockId = m_module.getBlockId();
// In case we have subgroup ops enabled, we need to
// count the number of active lanes, the lane index,
// and we need to perform the atomic op conditionally
uint32_t laneCount = 0;
uint32_t laneIndex = 0;
DxbcConditional elect;
if (useSubgroupOps) {
m_module.enableCapability(spv::CapabilityGroupNonUniform);
m_module.enableCapability(spv::CapabilityGroupNonUniformBallot);
uint32_t ballot = m_module.opGroupNonUniformBallot(
getVectorTypeId({ DxbcScalarType::Uint32, 4 }),
m_module.constu32(spv::ScopeSubgroup),
m_module.constBool(true));
laneCount = m_module.opGroupNonUniformBallotBitCount(
getScalarTypeId(DxbcScalarType::Uint32),
m_module.constu32(spv::ScopeSubgroup),
spv::GroupOperationReduce, ballot);
laneIndex = m_module.opGroupNonUniformBallotBitCount(
getScalarTypeId(DxbcScalarType::Uint32),
m_module.constu32(spv::ScopeSubgroup),
spv::GroupOperationExclusiveScan, ballot);
// Elect one lane to perform the atomic op
uint32_t election = m_module.opGroupNonUniformElect(
m_module.defBoolType(),
m_module.constu32(spv::ScopeSubgroup));
elect.labelIf = m_module.allocateId();
elect.labelEnd = m_module.allocateId();
m_module.opSelectionMerge(elect.labelEnd, spv::SelectionControlMaskNone);
m_module.opBranchConditional(election, elect.labelIf, elect.labelEnd);
m_module.opLabel(elect.labelIf);
} else {
// We're going to use this for the increment
laneCount = m_module.constu32(1);
}
// Get a pointer to the atomic counter in question
DxbcRegisterInfo ptrType;
ptrType.type.ctype = DxbcScalarType::Uint32;
@@ -2547,13 +2495,14 @@ namespace dxvk {
switch (ins.op) {
case DxbcOpcode::ImmAtomicAlloc:
value.id = m_module.opAtomicIAdd(typeId, ptrId, value.id = m_module.opAtomicIAdd(typeId, ptrId,
scopeId, semanticsId, laneCount); scopeId, semanticsId, m_module.constu32(1));
break;
case DxbcOpcode::ImmAtomicConsume:
value.id = m_module.opAtomicISub(typeId, ptrId, value.id = m_module.opAtomicISub(typeId, ptrId,
scopeId, semanticsId, laneCount); scopeId, semanticsId, m_module.constu32(1));
value.id = m_module.opISub(typeId, value.id, laneCount); value.id = m_module.opISub(typeId, value.id,
m_module.constu32(1));
break;
default:
@@ -2563,26 +2512,6 @@ namespace dxvk {
return;
}
// If we're using subgroup ops, we have to broadcast
// the result of the atomic op and compute the index
if (useSubgroupOps) {
m_module.opBranch(elect.labelEnd);
m_module.opLabel (elect.labelEnd);
uint32_t undef = m_module.constUndef(typeId);
std::array<SpirvPhiLabel, 2> phiLabels = {{
{ value.id, elect.labelIf },
{ undef, baseBlockId },
}};
value.id = m_module.opPhi(typeId,
phiLabels.size(), phiLabels.data());
value.id = m_module.opGroupNonUniformBroadcastFirst(typeId,
m_module.constu32(spv::ScopeSubgroup), value.id);
value.id = m_module.opIAdd(typeId, value.id, laneIndex);
}
// Store the result
emitRegisterStore(ins.dst[0], value);
}

View File

@@ -17,9 +17,6 @@ namespace dxvk {
useDepthClipWorkaround
= !devFeatures.extDepthClipEnable.depthClipEnable;
useSubgroupOpsForAtomicCounters
= (devInfo.vk11.subgroupSupportedStages & VK_SHADER_STAGE_COMPUTE_BIT)
&& (devInfo.vk11.subgroupSupportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT);
VkFormatFeatureFlags2 r32Features
= device->getFormatFeatures(VK_FORMAT_R32_SFLOAT).optimal

View File

@@ -30,10 +30,6 @@ namespace dxvk {
/// Determines whether raw access chains are supported
bool supportsRawAccessChains = false;
/// Use subgroup operations to reduce the number of
/// atomic operations for append/consume buffers.
bool useSubgroupOpsForAtomicCounters = false;
/// Clear thread-group shared memory to zero
bool zeroInitWorkgroupMemory = false;