From a51439fb29365e9945d1525880978672c85f16de Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 29 Dec 2017 00:51:31 +0100 Subject: [PATCH] [dxbc] Implemented thread group shared memory and barriers --- src/dxbc/dxbc_compiler.cpp | 199 +++++++++++++++++++++-------- src/dxbc/dxbc_compiler.h | 18 ++- src/dxbc/dxbc_defs.cpp | 2 +- src/dxbc/dxbc_defs.h | 1 + src/spirv/spirv_module.cpp | 20 +++ src/spirv/spirv_module.h | 9 ++ tests/d3d11/test_d3d11_compute.cpp | 22 +++- 7 files changed, 212 insertions(+), 59 deletions(-) diff --git a/src/dxbc/dxbc_compiler.cpp b/src/dxbc/dxbc_compiler.cpp index d2021dd45..61b95b459 100644 --- a/src/dxbc/dxbc_compiler.cpp +++ b/src/dxbc/dxbc_compiler.cpp @@ -69,6 +69,9 @@ namespace dxvk { case DxbcInstClass::Atomic: return this->emitAtomic(ins); + case DxbcInstClass::Barrier: + return this->emitBarrier(ins); + case DxbcInstClass::BufferLoad: return this->emitBufferLoad(ins); @@ -354,6 +357,39 @@ namespace dxvk { ins.op)); } } break; + + case DxbcOperandType::InputThreadId: { + m_cs.builtinGlobalInvocationId = emitNewBuiltinVariable({ + { DxbcScalarType::Uint32, 3, 0 }, + spv::StorageClassInput }, + spv::BuiltInGlobalInvocationId, + "vThreadId"); + } break; + + case DxbcOperandType::InputThreadGroupId: { + m_cs.builtinWorkgroupId = emitNewBuiltinVariable({ + { DxbcScalarType::Uint32, 3, 0 }, + spv::StorageClassInput }, + spv::BuiltInWorkgroupId, + "vThreadGroupId"); + } break; + + case DxbcOperandType::InputThreadIdInGroup: { + m_cs.builtinLocalInvocationId = emitNewBuiltinVariable({ + { DxbcScalarType::Uint32, 3, 0 }, + spv::StorageClassInput }, + spv::BuiltInLocalInvocationId, + "vThreadIdInGroup"); + } break; + + case DxbcOperandType::InputThreadIndexInGroup: { + // FIXME this might not be supported by Vulkan? + m_cs.builtinLocalInvocationIndex = emitNewBuiltinVariable({ + { DxbcScalarType::Uint32, 1, 0 }, + spv::StorageClassInput }, + spv::BuiltInLocalInvocationIndex, + "vThreadIndexInGroup"); + } break; default: Logger::err(str::format( @@ -734,8 +770,34 @@ namespace dxvk { // dcl_tgsm_structured takes three arguments: // (dst0) The resource register ID // (imm0) Structure stride, in bytes - // (imm0) Structure count - Logger::err("DxbcCompiler: emitDclThreadGroupSharedMemory not implemented"); + // (imm1) Structure count + const bool isStructured = ins.op == DxbcOpcode::DclThreadGroupSharedMemoryStructured; + + const uint32_t regId = ins.dst[0].idx[0].offset; + + if (regId >= m_gRegs.size()) + m_gRegs.resize(regId + 1); + + const uint32_t elementStride = isStructured ? ins.imm[0].u32 : 0; + const uint32_t elementCount = isStructured ? ins.imm[1].u32 : ins.imm[0].u32; + + DxbcRegisterInfo varInfo; + varInfo.type.ctype = DxbcScalarType::Uint32; + varInfo.type.ccount = 1; + varInfo.type.alength = isStructured + ? elementCount * elementStride / 4 + : elementCount; + varInfo.sclass = spv::StorageClassWorkgroup; + + m_gRegs[regId].type = isStructured + ? DxbcResourceType::Structured + : DxbcResourceType::Raw; + m_gRegs[regId].elementStride = elementStride; + m_gRegs[regId].elementCount = elementCount; + m_gRegs[regId].varId = emitNewVariable(varInfo); + + m_module.setDebugName(m_gRegs[regId].varId, + str::format("g", regId).c_str()); } @@ -1459,6 +1521,48 @@ namespace dxvk { } + void DxbcCompiler::emitBarrier(const DxbcShaderInstruction& ins) { + // sync takes no operands. Instead, the synchronization + // scope is defined by the operand control bits. + const DxbcSyncFlags flags = ins.controls.syncFlags; + + uint32_t executionScope = 0; + uint32_t memoryScope = 0; + uint32_t memorySemantics = 0; + + if (flags.test(DxbcSyncFlag::ThreadsInGroup)) + executionScope = spv::ScopeWorkgroup; + + if (flags.test(DxbcSyncFlag::ThreadGroupSharedMemory)) { + memoryScope = spv::ScopeWorkgroup; + memorySemantics |= spv::MemorySemanticsWorkgroupMemoryMask; + } + + if (flags.test(DxbcSyncFlag::UavMemoryGroup)) { + memoryScope = spv::ScopeWorkgroup; + memorySemantics |= spv::MemorySemanticsUniformMemoryMask; + } + + if (flags.test(DxbcSyncFlag::UavMemoryGlobal)) { + memoryScope = spv::ScopeDevice; + memorySemantics |= spv::MemorySemanticsUniformMemoryMask; + } + + if (executionScope != 0) { + m_module.opControlBarrier( + m_module.constu32(executionScope), + m_module.constu32(memoryScope), + m_module.constu32(memorySemantics)); + } else if (memorySemantics != spv::MemorySemanticsMaskNone) { + m_module.opMemoryBarrier( + m_module.constu32(memoryScope), + m_module.constu32(memorySemantics)); + } else { + Logger::warn("DxbcCompiler: sync instruction has no effect"); + } + } + + void DxbcCompiler::emitBufferLoad(const DxbcShaderInstruction& ins) { // ld_raw takes three arguments: // (dst0) Destination register @@ -2142,7 +2246,7 @@ namespace dxvk { uint32_t dstIndex = 0; - for (uint32_t i = 0; i < value.type.ccount; i++) { + for (uint32_t i = 0; i < 4; i++) { if (writeMask[i]) indices[dstIndex++] = swizzle[i]; } @@ -2500,6 +2604,26 @@ namespace dxvk { case DxbcOperandType::ImmediateConstantBuffer: return emitGetImmConstBufPtr(operand); + case DxbcOperandType::InputThreadId: + return DxbcRegisterPointer { + { DxbcScalarType::Uint32, 3 }, + m_cs.builtinGlobalInvocationId }; + + case DxbcOperandType::InputThreadGroupId: + return DxbcRegisterPointer { + { DxbcScalarType::Uint32, 3 }, + m_cs.builtinWorkgroupId }; + + case DxbcOperandType::InputThreadIdInGroup: + return DxbcRegisterPointer { + { DxbcScalarType::Uint32, 3 }, + m_cs.builtinLocalInvocationId }; + + case DxbcOperandType::InputThreadIndexInGroup: + return DxbcRegisterPointer { + { DxbcScalarType::Uint32, 1 }, + m_cs.builtinLocalInvocationIndex }; + default: throw DxvkError(str::format( "DxbcCompiler: Unhandled operand type: ", @@ -2561,6 +2685,11 @@ namespace dxvk { scalarTypeId, bufferId, elementIndexAdjusted, SpirvImageOperands()); + case DxbcOperandType::ThreadGroupSharedMemory: + return m_module.opLoad(scalarTypeId, + m_module.opAccessChain(bufferInfo.typeId, + bufferInfo.varId, 1, &elementIndexAdjusted)); + default: throw DxvkError("DxbcCompiler: Invalid operand type for strucured/raw load"); } @@ -2626,6 +2755,13 @@ namespace dxvk { SpirvImageOperands()); break; + case DxbcOperandType::ThreadGroupSharedMemory: + m_module.opStore( + m_module.opAccessChain(bufferInfo.typeId, + bufferInfo.varId, 1, &elementIndexAdjusted), + srcComponentId); + break; + default: throw DxvkError("DxbcCompiler: Invalid operand type for strucured/raw store"); } @@ -3090,46 +3226,6 @@ namespace dxvk { } - void DxbcCompiler::emitCsInitBuiltins() { - m_cs.builtinGlobalInvocationId = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 3, 0 }, - spv::StorageClassInput }, - spv::BuiltInGlobalInvocationId, - "cs_global_invocation_id"); - - m_cs.builtinLocalInvocationId = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 3, 0 }, - spv::StorageClassInput }, - spv::BuiltInLocalInvocationId, - "cs_local_invocation_id"); - - // FIXME Vulkan might not support this? not documented - m_cs.builtinLocalInvocationIndex = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 1, 0 }, - spv::StorageClassInput }, - spv::BuiltInLocalInvocationIndex, - "cs_local_invocation_index"); - - m_cs.builtinWorkgroupId = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 3, 0 }, - spv::StorageClassInput }, - spv::BuiltInWorkgroupId, - "cs_workgroup_id"); - - m_cs.builtinWorkgroupSize = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 3, 0 }, - spv::StorageClassInput }, - spv::BuiltInWorkgroupSize, - "cs_workgroup_size"); - - m_cs.builtinWorkgroupCount = emitNewBuiltinVariable({ - { DxbcScalarType::Uint32, 3, 0 }, - spv::StorageClassInput }, - spv::BuiltInNumWorkgroups, - "cs_workgroup_count"); - } - - void DxbcCompiler::emitVsInit() { m_module.enableCapability(spv::CapabilityClipDistance); m_module.enableCapability(spv::CapabilityCullDistance); @@ -3247,10 +3343,6 @@ namespace dxvk { void DxbcCompiler::emitCsInit() { - // There are no input or output - // variables for compute shaders - emitCsInitBuiltins(); - // Main function of the compute shader m_cs.functionId = m_module.allocateId(); m_module.setDebugName(m_cs.functionId, "cs_main"); @@ -3403,9 +3495,16 @@ namespace dxvk { return result; } break; - // TODO implement -// case DxbcOperandType::ThreadGroupSharedMemory: { -// } break; + case DxbcOperandType::ThreadGroupSharedMemory: { + DxbcBufferInfo result; + result.type = m_gRegs.at(registerId).type; + result.typeId = m_module.defPointerType( + getScalarTypeId(DxbcScalarType::Uint32), + spv::StorageClassWorkgroup); + result.varId = m_gRegs.at(registerId).varId; + result.stride = m_gRegs.at(registerId).elementStride; + return result; + } break; default: throw DxvkError(str::format("DxbcCompiler: Invalid operand type for buffer: ", reg.type)); diff --git a/src/dxbc/dxbc_compiler.h b/src/dxbc/dxbc_compiler.h index 186fdb126..ee931d833 100644 --- a/src/dxbc/dxbc_compiler.h +++ b/src/dxbc/dxbc_compiler.h @@ -86,6 +86,14 @@ namespace dxvk { }; + struct DxbcGreg { + DxbcResourceType type = DxbcResourceType::Raw; + uint32_t elementStride = 0; + uint32_t elementCount = 0; + uint32_t varId = 0; + }; + + /** * \brief Vertex shader-specific structure */ @@ -133,8 +141,6 @@ namespace dxvk { uint32_t builtinLocalInvocationId = 0; uint32_t builtinLocalInvocationIndex = 0; uint32_t builtinWorkgroupId = 0; - uint32_t builtinWorkgroupSize = 0; - uint32_t builtinWorkgroupCount = 0; }; @@ -226,6 +232,10 @@ namespace dxvk { std::vector m_rRegs; std::vector m_xRegs; + ///////////////////////////////////////////// + // Thread group shared memory (g#) registers + std::vector m_gRegs; + /////////////////////////////////////////////////////////// // v# registers as defined by the shader. The type of each // of these inputs is either float4 or an array of float4. @@ -382,6 +392,9 @@ namespace dxvk { void emitAtomic( const DxbcShaderInstruction& ins); + void emitBarrier( + const DxbcShaderInstruction& ins); + void emitBufferLoad( const DxbcShaderInstruction& ins); @@ -577,7 +590,6 @@ namespace dxvk { void emitVsInitBuiltins(); void emitGsInitBuiltins(uint32_t vertexCount); void emitPsInitBuiltins(); - void emitCsInitBuiltins(); ///////////////////////////////// // Shader initialization methods diff --git a/src/dxbc/dxbc_defs.cpp b/src/dxbc/dxbc_defs.cpp index 8d92c406b..0745e91d1 100644 --- a/src/dxbc/dxbc_defs.cpp +++ b/src/dxbc/dxbc_defs.cpp @@ -811,7 +811,7 @@ namespace dxvk { { DxbcOperandKind::SrcReg, DxbcScalarType::Uint32 }, } }, /* Sync */ - { }, + { 0, DxbcInstClass::Barrier }, /* DAdd */ { }, /* DMax */ diff --git a/src/dxbc/dxbc_defs.h b/src/dxbc/dxbc_defs.h index 70f5e6258..eb14643b2 100644 --- a/src/dxbc/dxbc_defs.h +++ b/src/dxbc/dxbc_defs.h @@ -33,6 +33,7 @@ namespace dxvk { ControlFlow, ///< Control flow instructions GeometryEmit, ///< Special geometry shader instructions Atomic, ///< Atomic operations + Barrier, ///< Execution or memory barrier BufferLoad, ///< Structured or raw buffer load BufferStore, ///< Structured or raw buffer store TextureQuery, ///< Texture query instruction diff --git a/src/spirv/spirv_module.cpp b/src/spirv/spirv_module.cpp index 59e785853..8cc102cec 100644 --- a/src/spirv/spirv_module.cpp +++ b/src/spirv/spirv_module.cpp @@ -1869,6 +1869,26 @@ namespace dxvk { } + void SpirvModule::opControlBarrier( + uint32_t execution, + uint32_t memory, + uint32_t semantics) { + m_code.putIns (spv::OpControlBarrier, 4); + m_code.putWord(execution); + m_code.putWord(memory); + m_code.putWord(semantics); + } + + + void SpirvModule::opMemoryBarrier( + uint32_t memory, + uint32_t semantics) { + m_code.putIns (spv::OpMemoryBarrier, 3); + m_code.putWord(memory); + m_code.putWord(semantics); + } + + void SpirvModule::opLoopMerge( uint32_t mergeBlock, uint32_t continueTarget, diff --git a/src/spirv/spirv_module.h b/src/spirv/spirv_module.h index cde50109e..538539b32 100644 --- a/src/spirv/spirv_module.h +++ b/src/spirv/spirv_module.h @@ -643,6 +643,15 @@ namespace dxvk { uint32_t reference, const SpirvImageOperands& operands); + void opControlBarrier( + uint32_t execution, + uint32_t memory, + uint32_t semantics); + + void opMemoryBarrier( + uint32_t memory, + uint32_t semantics); + void opLoopMerge( uint32_t mergeBlock, uint32_t continueTarget, diff --git a/tests/d3d11/test_d3d11_compute.cpp b/tests/d3d11/test_d3d11_compute.cpp index eaaa04e2d..8875706ec 100644 --- a/tests/d3d11/test_d3d11_compute.cpp +++ b/tests/d3d11/test_d3d11_compute.cpp @@ -13,9 +13,21 @@ using namespace dxvk; const std::string g_computeShaderCode = "StructuredBuffer buf_in : register(t0);\n" "RWStructuredBuffer buf_out : register(u0);\n" - "[numthreads(1,1,1)]\n" - "void main() {\n" - " buf_out[0] = buf_in[0] * buf_in[1];\n" + "groupshared uint tmp[64];\n" + "[numthreads(64,1,1)]\n" + "void main(uint localId : SV_GroupIndex, uint3 globalId : SV_DispatchThreadID) {\n" + " tmp[localId] = buf_in[2 * globalId.x + 0]\n" + " + buf_in[2 * globalId.x + 1];\n" + " GroupMemoryBarrierWithGroupSync();\n" + " uint activeGroups = 32;\n" + " while (activeGroups != 0) {\n" + " if (localId < activeGroups)\n" + " tmp[localId] += tmp[localId + activeGroups];\n" + " GroupMemoryBarrierWithGroupSync();\n" + " activeGroups >>= 1;\n" + " }\n" + " if (localId == 0)\n" + " buf_out[0] = tmp[0];\n" "}\n"; int WINAPI WinMain(HINSTANCE hInstance, @@ -63,7 +75,7 @@ int WINAPI WinMain(HINSTANCE hInstance, return 1; } - std::array srcData; + std::array srcData; for (uint32_t i = 0; i < srcData.size(); i++) srcData[i] = i + 1; @@ -93,7 +105,7 @@ int WINAPI WinMain(HINSTANCE hInstance, dstBufferDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; dstBufferDesc.StructureByteStride = sizeof(uint32_t); - if (FAILED(device->CreateBuffer(&dstBufferDesc, nullptr, &dstBuffer))) { + if (FAILED(device->CreateBuffer(&dstBufferDesc, &srcDataInfo, &dstBuffer))) { std::cerr << "Failed to create destination buffer" << std::endl; return 1; }