[dxbc] Remove broken atomic counter subgroup optimization

This is not a legal optimization inside non-uniform control flow due to Vulkan's extremely permissive convergence rules, and apparently breaks on Nvidia as a result. Mesa drivers already do the same thing internally anyway.
2024-12-04 07:24:15 +01:00 · 2024-04-03 14:55:43 +02:00 · 2024-04-03 14:55:43 +02:00 · f06c646315
commit f06c646315
parent 855b2746b6
3 changed files with 4 additions and 82 deletions
--- a/src/dxbc/dxbc_compiler.cpp
+++ b/src/dxbc/dxbc_compiler.cpp
@ -2464,58 +2464,6 @@ namespace dxvk {
    if (m_uavs.at(registerId).ctrId == 0)
      m_uavs.at(registerId).ctrId = emitDclUavCounter(registerId);
    // Only use subgroup ops on compute to avoid having to
    // deal with helper invocations or hardware limitations
    bool useSubgroupOps = m_moduleInfo.options.useSubgroupOpsForAtomicCounters
      && m_programInfo.type() == DxbcProgramType::ComputeShader;
    // Current block ID used in a phi later on
    uint32_t baseBlockId = m_module.getBlockId();
    // In case we have subgroup ops enabled, we need to
    // count the number of active lanes, the lane index,
    // and we need to perform the atomic op conditionally
    uint32_t laneCount = 0;
    uint32_t laneIndex = 0;
    DxbcConditional elect;
    if (useSubgroupOps) {
      m_module.enableCapability(spv::CapabilityGroupNonUniform);
      m_module.enableCapability(spv::CapabilityGroupNonUniformBallot);
      uint32_t ballot = m_module.opGroupNonUniformBallot(
        getVectorTypeId({ DxbcScalarType::Uint32, 4 }),
        m_module.constu32(spv::ScopeSubgroup),
        m_module.constBool(true));
      laneCount = m_module.opGroupNonUniformBallotBitCount(
        getScalarTypeId(DxbcScalarType::Uint32),
        m_module.constu32(spv::ScopeSubgroup),
        spv::GroupOperationReduce, ballot);
      laneIndex = m_module.opGroupNonUniformBallotBitCount(
        getScalarTypeId(DxbcScalarType::Uint32),
        m_module.constu32(spv::ScopeSubgroup),
        spv::GroupOperationExclusiveScan, ballot);
      // Elect one lane to perform the atomic op
      uint32_t election = m_module.opGroupNonUniformElect(
        m_module.defBoolType(),
        m_module.constu32(spv::ScopeSubgroup));
      elect.labelIf  = m_module.allocateId();
      elect.labelEnd = m_module.allocateId();
      m_module.opSelectionMerge(elect.labelEnd, spv::SelectionControlMaskNone);
      m_module.opBranchConditional(election, elect.labelIf, elect.labelEnd);
      m_module.opLabel(elect.labelIf);
    } else {
      // We're going to use this for the increment
      laneCount = m_module.constu32(1);
    }
    // Get a pointer to the atomic counter in question
    DxbcRegisterInfo ptrType;
    ptrType.type.ctype   = DxbcScalarType::Uint32;
@ -2547,13 +2495,14 @@ namespace dxvk {
    switch (ins.op) {
      case DxbcOpcode::ImmAtomicAlloc:
        value.id = m_module.opAtomicIAdd(typeId, ptrId,
-          scopeId, semanticsId, laneCount);
+          scopeId, semanticsId, m_module.constu32(1));
        break;
      case DxbcOpcode::ImmAtomicConsume:
        value.id = m_module.opAtomicISub(typeId, ptrId,
-          scopeId, semanticsId, laneCount);
+          scopeId, semanticsId, m_module.constu32(1));
-        value.id = m_module.opISub(typeId, value.id, laneCount);
+        value.id = m_module.opISub(typeId, value.id,
          m_module.constu32(1));
        break;
      default:
@ -2563,26 +2512,6 @@ namespace dxvk {
        return;
    }
    // If we're using subgroup ops, we have to broadcast
    // the result of the atomic op and compute the index
    if (useSubgroupOps) {
      m_module.opBranch(elect.labelEnd);
      m_module.opLabel (elect.labelEnd);
      uint32_t undef = m_module.constUndef(typeId);
      std::array<SpirvPhiLabel, 2> phiLabels = {{
        { value.id, elect.labelIf },
        { undef,    baseBlockId   },
      }};
      value.id = m_module.opPhi(typeId,
        phiLabels.size(), phiLabels.data());
      value.id = m_module.opGroupNonUniformBroadcastFirst(typeId,
        m_module.constu32(spv::ScopeSubgroup), value.id);
      value.id = m_module.opIAdd(typeId, value.id, laneIndex);
    }
    // Store the result
    emitRegisterStore(ins.dst[0], value);
  }
--- a/src/dxbc/dxbc_options.cpp
+++ b/src/dxbc/dxbc_options.cpp
@ -17,9 +17,6 @@ namespace dxvk {
    useDepthClipWorkaround
      = !devFeatures.extDepthClipEnable.depthClipEnable;
    useSubgroupOpsForAtomicCounters
      = (devInfo.vk11.subgroupSupportedStages     & VK_SHADER_STAGE_COMPUTE_BIT)
     && (devInfo.vk11.subgroupSupportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT);
    VkFormatFeatureFlags2 r32Features
      = device->getFormatFeatures(VK_FORMAT_R32_SFLOAT).optimal
--- a/src/dxbc/dxbc_options.h
+++ b/src/dxbc/dxbc_options.h
@ -30,10 +30,6 @@ namespace dxvk {
    /// Determines whether raw access chains are supported
    bool supportsRawAccessChains = false;
    /// Use subgroup operations to reduce the number of
    /// atomic operations for append/consume buffers.
    bool useSubgroupOpsForAtomicCounters = false;
    /// Clear thread-group shared memory to zero
    bool zeroInitWorkgroupMemory = false;