From 8c4c814fb72f2af3be4c258bb71b449dc74ad52f Mon Sep 17 00:00:00 2001
From: Autumn Ashton <misyl@froggi.es>
Date: Wed, 4 Dec 2024 16:29:13 +0000
Subject: [PATCH] [d3d9] Spec-constant out writes to clip distances when
 disabled

Add a new spec constant holding a mask of the enabled clip planes, so that writes to the clip distances of disabled planes can be optimized out to improve performance.

For GPL shaders, override the quick (UBO) value of the mask so that every clip plane is always treated as enabled, and don't bother putting the mask in the UBO.

Signed-off-by: Autumn Ashton <misyl@froggi.es>
---
 src/d3d9/d3d9_device.cpp         | 12 +++++++++---
 src/d3d9/d3d9_fixed_function.cpp | 15 ++++++++++-----
 src/d3d9/d3d9_spec_constants.h   | 13 ++++++++++---
 src/d3d9/d3d9_state.h            |  8 ++++++++
 src/dxso/dxso_compiler.cpp       |  9 ++++++++-
 5 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp
index 068e89519..1e2eee497 100644
--- a/src/d3d9/d3d9_device.cpp
+++ b/src/d3d9/d3d9_device.cpp
@@ -5783,7 +5783,7 @@ namespace dxvk {
         VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
         VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT,
         getSpecConstantBufferSlot(),
-        sizeof(D3D9SpecializationInfo));
+        D3D9SpecializationInfo::UBOSize);
     }
   }
 
@@ -5933,11 +5933,18 @@ namespace dxvk {
     auto mapPtr = m_vsClipPlanes.AllocSlice();
     auto dst = reinterpret_cast<D3D9ClipPlane*>(mapPtr);
 
+    uint32_t clipPlaneMask = 0u;
     for (uint32_t i = 0; i < caps::MaxClipPlanes; i++) {
       dst[i] = (m_state.renderStates[D3DRS_CLIPPLANEENABLE] & (1 << i))
         ? m_state.clipPlanes[i]
         : D3D9ClipPlane();
+
+      if (dst[i] != D3D9ClipPlane())
+        clipPlaneMask |= 1u << i;
     }
+
+    if (m_specInfo.set<SpecClipPlaneMask>(clipPlaneMask))
+      m_flags.set(D3D9DeviceFlag::DirtySpecializationEntries);
   }
 
 
@@ -8589,8 +8596,7 @@ namespace dxvk {
     if (m_usingGraphicsPipelines) {
       // TODO: Make uploading specialization information less naive.
       auto mapPtr = m_specBuffer.AllocSlice();
-      auto dst = reinterpret_cast<D3D9SpecializationInfo*>(mapPtr);
-      *dst = m_specInfo;
+      memcpy(mapPtr, m_specInfo.data.data(), D3D9SpecializationInfo::UBOSize);
     }
 
     m_flags.clr(D3D9DeviceFlag::DirtySpecializationEntries);
diff --git a/src/d3d9/d3d9_fixed_function.cpp b/src/d3d9/d3d9_fixed_function.cpp
index 46cdd4ec9..4451d8b26 100644
--- a/src/d3d9/d3d9_fixed_function.cpp
+++ b/src/d3d9/d3d9_fixed_function.cpp
@@ -2366,6 +2366,7 @@ namespace dxvk {
     
     uint32_t floatType = m_module.defFloatType(32);
     uint32_t vec4Type  = m_module.defVectorType(floatType, 4);
+    uint32_t boolType  = m_module.defBoolType();
     
     // Declare uniform buffer containing clip planes
     uint32_t clipPlaneArray  = m_module.defArrayTypeUnique(vec4Type, clipPlaneCountId);
@@ -2419,12 +2420,16 @@ namespace dxvk {
           clipPlaneBlock, blockMembers.size(), blockMembers.data()));
       
       uint32_t distId = m_module.opDot(floatType, worldPos, planeId);
+
+      // Always consider clip planes enabled when doing GPL by forcing a mask of 0xffffffff for the quick value.
+      uint32_t clipPlaneEnabledBit = m_spec.get(m_module, m_specUbo, SpecClipPlaneMask, i, 1, m_module.constu32(0xffffffff));
+      uint32_t clipPlaneEnabled = m_module.opINotEqual(boolType, clipPlaneEnabledBit, m_module.constu32(0));
+
+      uint32_t value = m_module.opSelect(floatType, clipPlaneEnabled, distId, m_module.constf32(0.0f));
       
-      m_module.opStore(
-        m_module.opAccessChain(
-          m_module.defPointerType(floatType, spv::StorageClassOutput),
-          clipDistArray, 1, &blockMembers[1]),
-        distId);
+      m_module.opStore(m_module.opAccessChain(
+        m_module.defPointerType(floatType, spv::StorageClassOutput),
+        clipDistArray, 1, &blockMembers[1]), value);
     }
   }
 
diff --git a/src/d3d9/d3d9_spec_constants.h b/src/d3d9/d3d9_spec_constants.h
index 4ecf710c9..835717bd1 100644
--- a/src/d3d9/d3d9_spec_constants.h
+++ b/src/d3d9/d3d9_spec_constants.h
@@ -30,6 +30,8 @@ namespace dxvk {
     SpecDrefClamp,          // 1 bit for 16 PS samplers       | Bits: 16
     SpecFetch4,             // 1 bit for 16 PS samplers       | Bits: 16
 
+    SpecClipPlaneMask,      // 6 bits for 6 clip planes       | Bits : 6
+
     SpecConstantCount,
   };
 
@@ -44,7 +46,10 @@ namespace dxvk {
   };
 
   struct D3D9SpecializationInfo {
-    static constexpr uint32_t MaxSpecDwords = 5;
+    static constexpr uint32_t MaxSpecDwords = 6;
+
+    static constexpr uint32_t MaxUBODwords  = 5;
+    static constexpr size_t UBOSize = MaxUBODwords * sizeof(uint32_t);
 
     static constexpr std::array<BitfieldPosition, SpecConstantCount> Layout{{
       { 0, 0, 32 },  // SamplerType
@@ -65,6 +70,8 @@ namespace dxvk {
 
       { 4, 0,  16 }, // DrefClamp
       { 4, 16, 16 }, // Fetch4
+
+      { 5, 0, 6 },   // ClipPlaneEnabled
     }};
 
     template <D3D9SpecConstantId Id, typename T>
@@ -97,13 +104,13 @@ namespace dxvk {
       return get(module, specUbo, id, 0, 32);
     }
 
-    uint32_t get(SpirvModule &module, uint32_t specUbo, D3D9SpecConstantId id, uint32_t bitOffset, uint32_t bitCount) {
+    uint32_t get(SpirvModule &module, uint32_t specUbo, D3D9SpecConstantId id, uint32_t bitOffset, uint32_t bitCount, uint32_t uboOverride = 0) {
       const auto &layout = D3D9SpecializationInfo::Layout[id];
 
       uint32_t uintType = module.defIntType(32, 0);
       uint32_t optimized = getOptimizedBool(module);
 
-      uint32_t quickValue     = getSpecUBODword(module, specUbo, layout.dwordOffset);
+      uint32_t quickValue     = uboOverride ? uboOverride : getSpecUBODword(module, specUbo, layout.dwordOffset);
       uint32_t optimizedValue = getSpecConstDword(module, layout.dwordOffset);
 
       uint32_t val = module.opSelect(uintType, optimized, optimizedValue, quickValue);
diff --git a/src/d3d9/d3d9_state.h b/src/d3d9/d3d9_state.h
index ddd3eaa5f..79aa0d9d7 100644
--- a/src/d3d9/d3d9_state.h
+++ b/src/d3d9/d3d9_state.h
@@ -28,6 +28,14 @@ namespace dxvk {
   
   struct D3D9ClipPlane {
     float coeff[4] = {};
+    // Bytewise comparison of the whole struct; const so it works on const objects too.
+    bool operator == (const D3D9ClipPlane& other) const {
+      return std::memcmp(this, &other, sizeof(D3D9ClipPlane)) == 0;
+    }
+
+    bool operator != (const D3D9ClipPlane& other) const {
+      return !this->operator == (other);
+    }
   };
 
   struct D3D9RenderStateInfo {
diff --git a/src/dxso/dxso_compiler.cpp b/src/dxso/dxso_compiler.cpp
index 10a7bceb9..ca171ace4 100644
--- a/src/dxso/dxso_compiler.cpp
+++ b/src/dxso/dxso_compiler.cpp
@@ -3482,6 +3482,7 @@ void DxsoCompiler::emitControlFlowGenericLoop(
     
     uint32_t floatType = m_module.defFloatType(32);
     uint32_t vec4Type  = m_module.defVectorType(floatType, 4);
+    uint32_t boolType  = m_module.defBoolType();
     
     // Declare uniform buffer containing clip planes
     uint32_t clipPlaneArray  = m_module.defArrayTypeUnique(vec4Type, clipPlaneCountId);
@@ -3551,9 +3552,15 @@ void DxsoCompiler::emitControlFlowGenericLoop(
 
       DxsoRegisterValue dist = emitDot(position, plane);
 
+      // Always consider clip planes enabled when doing GPL by forcing a mask of 0xffffffff for the quick value.
+      uint32_t clipPlaneEnabledBit = m_spec.get(m_module, m_specUbo, SpecClipPlaneMask, i, 1, m_module.constu32(0xffffffff));
+      uint32_t clipPlaneEnabled = m_module.opINotEqual(boolType, clipPlaneEnabledBit, m_module.constu32(0));
+
+      uint32_t value = m_module.opSelect(floatType, clipPlaneEnabled, dist.id, m_module.constf32(0.0f));
+
       m_module.opStore(m_module.opAccessChain(
         m_module.defPointerType(floatType, spv::StorageClassOutput),
-        clipDistArray, 1, &blockMembers[1]), dist.id);
+        clipDistArray, 1, &blockMembers[1]), value);
     }
   }