From cc26064e1de080afb86e2cc761a8e0c6313592a6 Mon Sep 17 00:00:00 2001 From: yours3lf <0.tamas.marton@gmail.com> Date: Tue, 16 Jun 2020 20:59:59 +0100 Subject: [PATCH] multithreaded command submission now works --- driver/command.c | 85 ++++++++++++-------------- driver/draw.c | 38 ++++++------ test/multithreading/multithreading.cpp | 26 ++++---- 3 files changed, 74 insertions(+), 75 deletions(-) diff --git a/driver/command.c b/driver/command.c index 9d9aee6..d31b0f0 100644 --- a/driver/command.c +++ b/driver/command.c @@ -248,15 +248,15 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkBeginCommandBuffer)( //When a command buffer begins recording, all state in that command buffer is undefined - commandBuffer->usageFlags = pBeginInfo->flags; - commandBuffer->state = CMDBUF_STATE_RECORDING; - - if((pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) && + if((commandBuffer->state == CMDBUF_STATE_INVALID || commandBuffer->state == CMDBUF_STATE_EXECUTABLE) && commandBuffer->cp->resetAble) { RPIFUNC(vkResetCommandBuffer)(commandBuffer, 0); } + commandBuffer->usageFlags = pBeginInfo->flags; + commandBuffer->state = CMDBUF_STATE_RECORDING; + if(pBeginInfo->pInheritanceInfo && commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) { VkRenderPassBeginInfo rpbi = {0}; @@ -596,7 +596,7 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)( submitCl.shader_rec_count = marker->shaderRecCount; submitCl.uniforms_size = marker->uniformsSize; - /**/ + /** printf("BCL:\n"); uint8_t* mem = malloc(marker->size); memcpy(mem, marker+1, marker->size); @@ -613,10 +613,18 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)( { printf("%i ", *(((uint32_t*)getCPAptrFromOffset(cmdbuf->uniformsCl.CPA, marker->uniformsBufOffset + cmdbuf->uniformsCl.offset))+d)); } - printf("\nShader recs: "); - uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset + (3 + 3) * 4); + + printf("\nShader recs: "); + uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset); for(int d = 0; d < marker->shaderRecCount; ++d) { + printf("\nShader rec handle indices: "); + int numIndices = 3 + 1; + for(int d = 0; d < numIndices; ++d) + { + printf("%u ", *ptr); + ptr += 4; + } uint8_t flags = *ptr; uint8_t fragmentShaderIsSingleThreaded = flags & (1 << 0); uint8_t pointSizeIncludedInShadedVertexData = (flags & (1 << 1)) >> 1; @@ -985,6 +993,8 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)( _commandBuffer* primary = commandBuffer; + CLMarker* primaryMarker = getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset); + for(uint32_t c = 0; c < commandBufferCount; ++c) { _commandBuffer* secondary = pCommandBuffers[c]; @@ -1000,28 +1010,37 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)( secondaryMarker->shaderRecRelocSize = secondary->shaderRecRelocCl.nextFreeByteOffset - (secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset); } - for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize; ++d) + for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize / 4; ++d) { - uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset); + uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset + d * 4); - uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondary->uniformsCl.offset + offset); - *handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; + uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset + offset); + uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4); + clFit(&primary->handlesCl, 4); + uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle); + *handleIdx = idx; } - for(uint32_t d = 0; d < secondaryMarker->gemRelocSize; ++d) + for(uint32_t d = 0; d < secondaryMarker->gemRelocSize / 4; ++d) { - uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset); + uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset + d * 4); uint32_t* handleIdx = getCPAptrFromOffset(secondary->binCl.CPA, secondary->binCl.offset + offset); - *handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; + uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4); + clFit(&primary->handlesCl, 4); + uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle); + *handleIdx = idx; } - for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize; ++d) + for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize / 4; ++d) { - uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset); + uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset + d * 4); - uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondary->shaderRecCl.offset + offset); - *handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; + uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset + offset); + uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4); + clFit(&primary->handlesCl, 4); + uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle); + *handleIdx = idx; } clFit(&primary->binCl, secondaryMarker->size); @@ -1029,39 +1048,13 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)( ((CLMarker*)getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset))->numDrawCallsSubmitted += secondaryMarker->numDrawCallsSubmitted; - //TODO handles/handle indices might be grabled up like this... - clFit(&primary->handlesCl, secondaryMarker->handlesSize); - clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset)); + //clFit(&primary->handlesCl, secondaryMarker->handlesSize); + //clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset)); clFit(&primary->uniformsCl, secondaryMarker->uniformsSize); clInsertData(&primary->uniformsCl, secondaryMarker->uniformsSize, getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset)); clFit(&primary->shaderRecCl, secondaryMarker->shaderRecSize); clInsertData(&primary->shaderRecCl, secondaryMarker->shaderRecSize, getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset)); - - printf("\nUniforms: "); - for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d) - { - printf("%i ", *(((uint32_t*)getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset))+d)); - } - - printf("\nUniforms: "); - for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d) - { - printf("%i ", *(((uint32_t*)getCPAptrFromOffset(primary->uniformsCl.CPA, primary->uniformsCl.offset))+d)); - } - - printf("\nBO handles: "); - for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d) - { - printf("%u ", *(((uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset))+d)); - } - - printf("\nBO handles: "); - for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d) - { - printf("%u ", *(((uint32_t*)getCPAptrFromOffset(primary->handlesCl.CPA, primary->handlesCl.offset))+d)); - } - primary->shaderRecCount += secondary->shaderRecCount; } diff --git a/driver/draw.c b/driver/draw.c index f9929b3..4be43f7 100644 --- a/driver/draw.c +++ b/driver/draw.c @@ -235,6 +235,26 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset) assert(vertModule->numVertVPMreads == vertexAttribSize >> 2); assert(vertModule->numCoordVPMreads == coordAttribSize >> 2); + if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) + { + uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset; + + clFit(&commandBuffer->shaderRecRelocCl, 12); + clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); + offset += 4; + clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); + offset += 4; + clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); + + + clFit(&commandBuffer->shaderRecRelocCl, 4 * attribCount); + for(uint32_t c = 0; c < attribCount; ++c) + { + uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset + 12 + c * 4; + clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); + } + } + //number of attribs //3 is the number of type of possible shaders for(uint32_t c = 0; c < (3 + attribCount)*4; ++c) @@ -269,17 +289,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset) coordCode //coordinate shader code address ); - if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) - { - uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12; - clFit(&commandBuffer->shaderRecRelocCl, 12); - clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); - offset -= 16; - clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); - offset -= 16; - clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); - } - uint32_t vertexAttribOffsets[8] = {}; uint32_t coordAttribOffsets[8] = {}; for(uint32_t c = 1; c < 8; ++c) @@ -341,13 +350,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset) vertexAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location], //vertex vpm offset coordAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location] //coordinte vpm offset ); - - if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) - { - uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12; - clFit(&commandBuffer->shaderRecRelocCl, 4); - clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset); - } } } diff --git a/test/multithreading/multithreading.cpp b/test/multithreading/multithreading.cpp index 19d58bb..a59a6fa 100644 --- a/test/multithreading/multithreading.cpp +++ b/test/multithreading/multithreading.cpp @@ -666,6 +666,7 @@ void createCommandQueues() { VkCommandPoolCreateInfo poolCreateInfo = {}; poolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; poolCreateInfo.queueFamilyIndex = presentQueueFamily; + poolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; if (vkCreateCommandPool(device, &poolCreateInfo, nullptr, &primaryCommandPool) != VK_SUCCESS) { std::cerr << "failed to create command queue for presentation queue family" << std::endl; @@ -748,14 +749,14 @@ void threadFunc(uint32_t threadIdx, VkCommandBufferInheritanceInfo inheritanceIn vkCmdPushConstants(threadDataVector[threadIdx].commandBuffer, pipelineLayout, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pushConstants), &pushConstants); uint32_t numVerticesPerDrawCall = 3 * 20; - for(uint32_t c = threadDataVector[threadIdx].vertexOffset; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall) + for(uint32_t c = 0; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall) { - vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, c, 0); + vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, threadDataVector[threadIdx].vertexOffset + c, 0); } vkEndCommandBuffer(threadDataVector[threadIdx].commandBuffer); - std::cerr << "Recorded thread " << threadIdx << std::endl; + //std::cerr << "Recorded thread " << threadIdx << std::endl; } void recordCommandBuffers() @@ -795,7 +796,7 @@ void recordCommandBuffers() //update secondary command buffers //multi threaded mode - /** + /**/ std::vector threads; threads.reserve(numThreads); for(uint32_t c = 0; c < numThreads; ++c) @@ -810,8 +811,9 @@ void recordCommandBuffers() /**/ //single threaded mode for debugging - /**/ + /** for(uint32_t c = 0; c < numThreads; ++c) + //for(uint32_t c = 0; c < 1; ++c) { threadFunc(c, inheritanceInfo); } @@ -820,11 +822,13 @@ void recordCommandBuffers() VkCommandBuffer cmdBufs[numThreads]; for(uint32_t c = 0; c < numThreads; ++c) + //for(uint32_t c = 0; c < 1; ++c) { cmdBufs[c] = threadDataVector[c].commandBuffer; } vkCmdExecuteCommands(primaryCommandBuffer, numThreads, cmdBufs); + //vkCmdExecuteCommands(primaryCommandBuffer, 1, cmdBufs); vkCmdEndRenderPass(primaryCommandBuffer); @@ -834,7 +838,7 @@ void recordCommandBuffers() } - std::cout << "recorded command buffer for image " << imageIndex << std::endl; + //std::cout << "recorded command buffer for image " << imageIndex << std::endl; } void draw() { @@ -845,7 +849,7 @@ void draw() { assert(0); } - std::cout << "acquired image" << std::endl; + //std::cout << "acquired image" << std::endl; recordCommandBuffers(); @@ -867,7 +871,7 @@ void draw() { assert(0); } - std::cout << "submitted draw command buffer" << std::endl; + //std::cout << "submitted draw command buffer" << std::endl; // Present drawn image // Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue @@ -887,7 +891,7 @@ void draw() { assert(0); } - std::cout << "submitted presentation command buffer" << std::endl; + //std::cout << "submitted presentation command buffer" << std::endl; } void CreateRenderPass() @@ -1333,8 +1337,8 @@ void CreateVertexBuffer() float w = 2.0; float h = 2.0; - float stepH = 90*6.0*h/1080.0; - float stepW = 90*8.0*w/1920.0; + float stepH = 2*6.0*h/1080.0; + float stepW = 2*8.0*w/1920.0; vertices.reserve(3 * 2 * 960 * 540);