mirror of
https://github.com/doitsujin/dxvk.git
synced 2024-12-12 04:08:52 +01:00
[dxbc] Remove broken atomic counter subgroup optimization
This is not a legal optimization inside non-uniform control flow due to Vulkan's extremely permissive convergence rules, and it apparently breaks on Nvidia as a result. Mesa drivers already perform this optimization internally anyway.
This commit is contained in:
parent
855b2746b6
commit
f06c646315
@@ -2464,58 +2464,6 @@ namespace dxvk {
|
|||||||
if (m_uavs.at(registerId).ctrId == 0)
|
if (m_uavs.at(registerId).ctrId == 0)
|
||||||
m_uavs.at(registerId).ctrId = emitDclUavCounter(registerId);
|
m_uavs.at(registerId).ctrId = emitDclUavCounter(registerId);
|
||||||
|
|
||||||
// Only use subgroup ops on compute to avoid having to
|
|
||||||
// deal with helper invocations or hardware limitations
|
|
||||||
bool useSubgroupOps = m_moduleInfo.options.useSubgroupOpsForAtomicCounters
|
|
||||||
&& m_programInfo.type() == DxbcProgramType::ComputeShader;
|
|
||||||
|
|
||||||
// Current block ID used in a phi later on
|
|
||||||
uint32_t baseBlockId = m_module.getBlockId();
|
|
||||||
|
|
||||||
// In case we have subgroup ops enabled, we need to
|
|
||||||
// count the number of active lanes, the lane index,
|
|
||||||
// and we need to perform the atomic op conditionally
|
|
||||||
uint32_t laneCount = 0;
|
|
||||||
uint32_t laneIndex = 0;
|
|
||||||
|
|
||||||
DxbcConditional elect;
|
|
||||||
|
|
||||||
if (useSubgroupOps) {
|
|
||||||
m_module.enableCapability(spv::CapabilityGroupNonUniform);
|
|
||||||
m_module.enableCapability(spv::CapabilityGroupNonUniformBallot);
|
|
||||||
|
|
||||||
uint32_t ballot = m_module.opGroupNonUniformBallot(
|
|
||||||
getVectorTypeId({ DxbcScalarType::Uint32, 4 }),
|
|
||||||
m_module.constu32(spv::ScopeSubgroup),
|
|
||||||
m_module.constBool(true));
|
|
||||||
|
|
||||||
laneCount = m_module.opGroupNonUniformBallotBitCount(
|
|
||||||
getScalarTypeId(DxbcScalarType::Uint32),
|
|
||||||
m_module.constu32(spv::ScopeSubgroup),
|
|
||||||
spv::GroupOperationReduce, ballot);
|
|
||||||
|
|
||||||
laneIndex = m_module.opGroupNonUniformBallotBitCount(
|
|
||||||
getScalarTypeId(DxbcScalarType::Uint32),
|
|
||||||
m_module.constu32(spv::ScopeSubgroup),
|
|
||||||
spv::GroupOperationExclusiveScan, ballot);
|
|
||||||
|
|
||||||
// Elect one lane to perform the atomic op
|
|
||||||
uint32_t election = m_module.opGroupNonUniformElect(
|
|
||||||
m_module.defBoolType(),
|
|
||||||
m_module.constu32(spv::ScopeSubgroup));
|
|
||||||
|
|
||||||
elect.labelIf = m_module.allocateId();
|
|
||||||
elect.labelEnd = m_module.allocateId();
|
|
||||||
|
|
||||||
m_module.opSelectionMerge(elect.labelEnd, spv::SelectionControlMaskNone);
|
|
||||||
m_module.opBranchConditional(election, elect.labelIf, elect.labelEnd);
|
|
||||||
|
|
||||||
m_module.opLabel(elect.labelIf);
|
|
||||||
} else {
|
|
||||||
// We're going to use this for the increment
|
|
||||||
laneCount = m_module.constu32(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Get a pointer to the atomic counter in question
|
// Get a pointer to the atomic counter in question
|
||||||
DxbcRegisterInfo ptrType;
|
DxbcRegisterInfo ptrType;
|
||||||
ptrType.type.ctype = DxbcScalarType::Uint32;
|
ptrType.type.ctype = DxbcScalarType::Uint32;
|
||||||
@@ -2547,13 +2495,14 @@ namespace dxvk {
|
|||||||
switch (ins.op) {
|
switch (ins.op) {
|
||||||
case DxbcOpcode::ImmAtomicAlloc:
|
case DxbcOpcode::ImmAtomicAlloc:
|
||||||
value.id = m_module.opAtomicIAdd(typeId, ptrId,
|
value.id = m_module.opAtomicIAdd(typeId, ptrId,
|
||||||
scopeId, semanticsId, laneCount);
|
scopeId, semanticsId, m_module.constu32(1));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DxbcOpcode::ImmAtomicConsume:
|
case DxbcOpcode::ImmAtomicConsume:
|
||||||
value.id = m_module.opAtomicISub(typeId, ptrId,
|
value.id = m_module.opAtomicISub(typeId, ptrId,
|
||||||
scopeId, semanticsId, laneCount);
|
scopeId, semanticsId, m_module.constu32(1));
|
||||||
value.id = m_module.opISub(typeId, value.id, laneCount);
|
value.id = m_module.opISub(typeId, value.id,
|
||||||
|
m_module.constu32(1));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
@@ -2563,26 +2512,6 @@ namespace dxvk {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we're using subgroup ops, we have to broadcast
|
|
||||||
// the result of the atomic op and compute the index
|
|
||||||
if (useSubgroupOps) {
|
|
||||||
m_module.opBranch(elect.labelEnd);
|
|
||||||
m_module.opLabel (elect.labelEnd);
|
|
||||||
|
|
||||||
uint32_t undef = m_module.constUndef(typeId);
|
|
||||||
|
|
||||||
std::array<SpirvPhiLabel, 2> phiLabels = {{
|
|
||||||
{ value.id, elect.labelIf },
|
|
||||||
{ undef, baseBlockId },
|
|
||||||
}};
|
|
||||||
|
|
||||||
value.id = m_module.opPhi(typeId,
|
|
||||||
phiLabels.size(), phiLabels.data());
|
|
||||||
value.id = m_module.opGroupNonUniformBroadcastFirst(typeId,
|
|
||||||
m_module.constu32(spv::ScopeSubgroup), value.id);
|
|
||||||
value.id = m_module.opIAdd(typeId, value.id, laneIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Store the result
|
// Store the result
|
||||||
emitRegisterStore(ins.dst[0], value);
|
emitRegisterStore(ins.dst[0], value);
|
||||||
}
|
}
|
||||||
|
@@ -17,9 +17,6 @@ namespace dxvk {
|
|||||||
|
|
||||||
useDepthClipWorkaround
|
useDepthClipWorkaround
|
||||||
= !devFeatures.extDepthClipEnable.depthClipEnable;
|
= !devFeatures.extDepthClipEnable.depthClipEnable;
|
||||||
useSubgroupOpsForAtomicCounters
|
|
||||||
= (devInfo.vk11.subgroupSupportedStages & VK_SHADER_STAGE_COMPUTE_BIT)
|
|
||||||
&& (devInfo.vk11.subgroupSupportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT);
|
|
||||||
|
|
||||||
VkFormatFeatureFlags2 r32Features
|
VkFormatFeatureFlags2 r32Features
|
||||||
= device->getFormatFeatures(VK_FORMAT_R32_SFLOAT).optimal
|
= device->getFormatFeatures(VK_FORMAT_R32_SFLOAT).optimal
|
||||||
|
@@ -30,10 +30,6 @@ namespace dxvk {
|
|||||||
/// Determines whether raw access chains are supported
|
/// Determines whether raw access chains are supported
|
||||||
bool supportsRawAccessChains = false;
|
bool supportsRawAccessChains = false;
|
||||||
|
|
||||||
/// Use subgroup operations to reduce the number of
|
|
||||||
/// atomic operations for append/consume buffers.
|
|
||||||
bool useSubgroupOpsForAtomicCounters = false;
|
|
||||||
|
|
||||||
/// Clear thread-group shared memory to zero
|
/// Clear thread-group shared memory to zero
|
||||||
bool zeroInitWorkgroupMemory = false;
|
bool zeroInitWorkgroupMemory = false;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user