1
0
mirror of https://github.com/Yours3lf/rpi-vk-driver.git synced 2025-02-21 18:54:18 +01:00

multithreaded command submission now works

This commit is contained in:
yours3lf 2020-06-16 20:59:59 +01:00
parent 80d90ca190
commit cc26064e1d
3 changed files with 74 additions and 75 deletions

View File

@ -248,15 +248,15 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkBeginCommandBuffer)(
//When a command buffer begins recording, all state in that command buffer is undefined //When a command buffer begins recording, all state in that command buffer is undefined
commandBuffer->usageFlags = pBeginInfo->flags; if((commandBuffer->state == CMDBUF_STATE_INVALID || commandBuffer->state == CMDBUF_STATE_EXECUTABLE) &&
commandBuffer->state = CMDBUF_STATE_RECORDING;
if((pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
commandBuffer->cp->resetAble) commandBuffer->cp->resetAble)
{ {
RPIFUNC(vkResetCommandBuffer)(commandBuffer, 0); RPIFUNC(vkResetCommandBuffer)(commandBuffer, 0);
} }
commandBuffer->usageFlags = pBeginInfo->flags;
commandBuffer->state = CMDBUF_STATE_RECORDING;
if(pBeginInfo->pInheritanceInfo && commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) if(pBeginInfo->pInheritanceInfo && commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{ {
VkRenderPassBeginInfo rpbi = {0}; VkRenderPassBeginInfo rpbi = {0};
@ -596,7 +596,7 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)(
submitCl.shader_rec_count = marker->shaderRecCount; submitCl.shader_rec_count = marker->shaderRecCount;
submitCl.uniforms_size = marker->uniformsSize; submitCl.uniforms_size = marker->uniformsSize;
/**/ /**
printf("BCL:\n"); printf("BCL:\n");
uint8_t* mem = malloc(marker->size); uint8_t* mem = malloc(marker->size);
memcpy(mem, marker+1, marker->size); memcpy(mem, marker+1, marker->size);
@ -613,10 +613,18 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)(
{ {
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(cmdbuf->uniformsCl.CPA, marker->uniformsBufOffset + cmdbuf->uniformsCl.offset))+d)); printf("%i ", *(((uint32_t*)getCPAptrFromOffset(cmdbuf->uniformsCl.CPA, marker->uniformsBufOffset + cmdbuf->uniformsCl.offset))+d));
} }
printf("\nShader recs: ");
uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset + (3 + 3) * 4); printf("\nShader recs: ");
uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset);
for(int d = 0; d < marker->shaderRecCount; ++d) for(int d = 0; d < marker->shaderRecCount; ++d)
{ {
printf("\nShader rec handle indices: ");
int numIndices = 3 + 1;
for(int d = 0; d < numIndices; ++d)
{
printf("%u ", *ptr);
ptr += 4;
}
uint8_t flags = *ptr; uint8_t flags = *ptr;
uint8_t fragmentShaderIsSingleThreaded = flags & (1 << 0); uint8_t fragmentShaderIsSingleThreaded = flags & (1 << 0);
uint8_t pointSizeIncludedInShadedVertexData = (flags & (1 << 1)) >> 1; uint8_t pointSizeIncludedInShadedVertexData = (flags & (1 << 1)) >> 1;
@ -985,6 +993,8 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
_commandBuffer* primary = commandBuffer; _commandBuffer* primary = commandBuffer;
CLMarker* primaryMarker = getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset);
for(uint32_t c = 0; c < commandBufferCount; ++c) for(uint32_t c = 0; c < commandBufferCount; ++c)
{ {
_commandBuffer* secondary = pCommandBuffers[c]; _commandBuffer* secondary = pCommandBuffers[c];
@ -1000,28 +1010,37 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
secondaryMarker->shaderRecRelocSize = secondary->shaderRecRelocCl.nextFreeByteOffset - (secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset); secondaryMarker->shaderRecRelocSize = secondary->shaderRecRelocCl.nextFreeByteOffset - (secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset);
} }
for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize; ++d) for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize / 4; ++d)
{ {
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset); uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondary->uniformsCl.offset + offset); uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
} }
for(uint32_t d = 0; d < secondaryMarker->gemRelocSize; ++d) for(uint32_t d = 0; d < secondaryMarker->gemRelocSize / 4; ++d)
{ {
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset); uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->binCl.CPA, secondary->binCl.offset + offset); uint32_t* handleIdx = getCPAptrFromOffset(secondary->binCl.CPA, secondary->binCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
} }
for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize; ++d) for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize / 4; ++d)
{ {
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset); uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondary->shaderRecCl.offset + offset); uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset; uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
} }
clFit(&primary->binCl, secondaryMarker->size); clFit(&primary->binCl, secondaryMarker->size);
@ -1029,39 +1048,13 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
((CLMarker*)getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset))->numDrawCallsSubmitted += secondaryMarker->numDrawCallsSubmitted; ((CLMarker*)getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset))->numDrawCallsSubmitted += secondaryMarker->numDrawCallsSubmitted;
//TODO handles/handle indices might be garbled up like this... //clFit(&primary->handlesCl, secondaryMarker->handlesSize);
clFit(&primary->handlesCl, secondaryMarker->handlesSize); //clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset));
clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset));
clFit(&primary->uniformsCl, secondaryMarker->uniformsSize); clFit(&primary->uniformsCl, secondaryMarker->uniformsSize);
clInsertData(&primary->uniformsCl, secondaryMarker->uniformsSize, getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset)); clInsertData(&primary->uniformsCl, secondaryMarker->uniformsSize, getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset));
clFit(&primary->shaderRecCl, secondaryMarker->shaderRecSize); clFit(&primary->shaderRecCl, secondaryMarker->shaderRecSize);
clInsertData(&primary->shaderRecCl, secondaryMarker->shaderRecSize, getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset)); clInsertData(&primary->shaderRecCl, secondaryMarker->shaderRecSize, getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset));
printf("\nUniforms: ");
for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d)
{
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset))+d));
}
printf("\nUniforms: ");
for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d)
{
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(primary->uniformsCl.CPA, primary->uniformsCl.offset))+d));
}
printf("\nBO handles: ");
for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d)
{
printf("%u ", *(((uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset))+d));
}
printf("\nBO handles: ");
for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d)
{
printf("%u ", *(((uint32_t*)getCPAptrFromOffset(primary->handlesCl.CPA, primary->handlesCl.offset))+d));
}
primary->shaderRecCount += secondary->shaderRecCount; primary->shaderRecCount += secondary->shaderRecCount;
} }

View File

@ -235,6 +235,26 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
assert(vertModule->numVertVPMreads == vertexAttribSize >> 2); assert(vertModule->numVertVPMreads == vertexAttribSize >> 2);
assert(vertModule->numCoordVPMreads == coordAttribSize >> 2); assert(vertModule->numCoordVPMreads == coordAttribSize >> 2);
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset;
clFit(&commandBuffer->shaderRecRelocCl, 12);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset += 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset += 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
clFit(&commandBuffer->shaderRecRelocCl, 4 * attribCount);
for(uint32_t c = 0; c < attribCount; ++c)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset + 12 + c * 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
}
//number of attribs //number of attribs
//3 is the number of type of possible shaders //3 is the number of type of possible shaders
for(uint32_t c = 0; c < (3 + attribCount)*4; ++c) for(uint32_t c = 0; c < (3 + attribCount)*4; ++c)
@ -269,17 +289,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
coordCode //coordinate shader code address coordCode //coordinate shader code address
); );
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12;
clFit(&commandBuffer->shaderRecRelocCl, 12);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset -= 16;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset -= 16;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
uint32_t vertexAttribOffsets[8] = {}; uint32_t vertexAttribOffsets[8] = {};
uint32_t coordAttribOffsets[8] = {}; uint32_t coordAttribOffsets[8] = {};
for(uint32_t c = 1; c < 8; ++c) for(uint32_t c = 1; c < 8; ++c)
@ -341,13 +350,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
vertexAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location], //vertex vpm offset vertexAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location], //vertex vpm offset
coordAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location] //coordinte vpm offset coordAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location] //coordinte vpm offset
); );
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12;
clFit(&commandBuffer->shaderRecRelocCl, 4);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
} }
} }

View File

@ -666,6 +666,7 @@ void createCommandQueues() {
VkCommandPoolCreateInfo poolCreateInfo = {}; VkCommandPoolCreateInfo poolCreateInfo = {};
poolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; poolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
poolCreateInfo.queueFamilyIndex = presentQueueFamily; poolCreateInfo.queueFamilyIndex = presentQueueFamily;
poolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
if (vkCreateCommandPool(device, &poolCreateInfo, nullptr, &primaryCommandPool) != VK_SUCCESS) { if (vkCreateCommandPool(device, &poolCreateInfo, nullptr, &primaryCommandPool) != VK_SUCCESS) {
std::cerr << "failed to create command queue for presentation queue family" << std::endl; std::cerr << "failed to create command queue for presentation queue family" << std::endl;
@ -748,14 +749,14 @@ void threadFunc(uint32_t threadIdx, VkCommandBufferInheritanceInfo inheritanceIn
vkCmdPushConstants(threadDataVector[threadIdx].commandBuffer, pipelineLayout, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pushConstants), &pushConstants); vkCmdPushConstants(threadDataVector[threadIdx].commandBuffer, pipelineLayout, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pushConstants), &pushConstants);
uint32_t numVerticesPerDrawCall = 3 * 20; uint32_t numVerticesPerDrawCall = 3 * 20;
for(uint32_t c = threadDataVector[threadIdx].vertexOffset; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall) for(uint32_t c = 0; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall)
{ {
vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, c, 0); vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, threadDataVector[threadIdx].vertexOffset + c, 0);
} }
vkEndCommandBuffer(threadDataVector[threadIdx].commandBuffer); vkEndCommandBuffer(threadDataVector[threadIdx].commandBuffer);
std::cerr << "Recorded thread " << threadIdx << std::endl; //std::cerr << "Recorded thread " << threadIdx << std::endl;
} }
void recordCommandBuffers() void recordCommandBuffers()
@ -795,7 +796,7 @@ void recordCommandBuffers()
//update secondary command buffers //update secondary command buffers
//multi threaded mode //multi threaded mode
/** /**/
std::vector<std::thread> threads; std::vector<std::thread> threads;
threads.reserve(numThreads); threads.reserve(numThreads);
for(uint32_t c = 0; c < numThreads; ++c) for(uint32_t c = 0; c < numThreads; ++c)
@ -810,8 +811,9 @@ void recordCommandBuffers()
/**/ /**/
//single threaded mode for debugging //single threaded mode for debugging
/**/ /**
for(uint32_t c = 0; c < numThreads; ++c) for(uint32_t c = 0; c < numThreads; ++c)
//for(uint32_t c = 0; c < 1; ++c)
{ {
threadFunc(c, inheritanceInfo); threadFunc(c, inheritanceInfo);
} }
@ -820,11 +822,13 @@ void recordCommandBuffers()
VkCommandBuffer cmdBufs[numThreads]; VkCommandBuffer cmdBufs[numThreads];
for(uint32_t c = 0; c < numThreads; ++c) for(uint32_t c = 0; c < numThreads; ++c)
//for(uint32_t c = 0; c < 1; ++c)
{ {
cmdBufs[c] = threadDataVector[c].commandBuffer; cmdBufs[c] = threadDataVector[c].commandBuffer;
} }
vkCmdExecuteCommands(primaryCommandBuffer, numThreads, cmdBufs); vkCmdExecuteCommands(primaryCommandBuffer, numThreads, cmdBufs);
//vkCmdExecuteCommands(primaryCommandBuffer, 1, cmdBufs);
vkCmdEndRenderPass(primaryCommandBuffer); vkCmdEndRenderPass(primaryCommandBuffer);
@ -834,7 +838,7 @@ void recordCommandBuffers()
} }
std::cout << "recorded command buffer for image " << imageIndex << std::endl; //std::cout << "recorded command buffer for image " << imageIndex << std::endl;
} }
void draw() { void draw() {
@ -845,7 +849,7 @@ void draw() {
assert(0); assert(0);
} }
std::cout << "acquired image" << std::endl; //std::cout << "acquired image" << std::endl;
recordCommandBuffers(); recordCommandBuffers();
@ -867,7 +871,7 @@ void draw() {
assert(0); assert(0);
} }
std::cout << "submitted draw command buffer" << std::endl; //std::cout << "submitted draw command buffer" << std::endl;
// Present drawn image // Present drawn image
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue // Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
@ -887,7 +891,7 @@ void draw() {
assert(0); assert(0);
} }
std::cout << "submitted presentation command buffer" << std::endl; //std::cout << "submitted presentation command buffer" << std::endl;
} }
void CreateRenderPass() void CreateRenderPass()
@ -1333,8 +1337,8 @@ void CreateVertexBuffer()
float w = 2.0; float w = 2.0;
float h = 2.0; float h = 2.0;
float stepH = 90*6.0*h/1080.0; float stepH = 2*6.0*h/1080.0;
float stepW = 90*8.0*w/1920.0; float stepW = 2*8.0*w/1920.0;
vertices.reserve(3 * 2 * 960 * 540); vertices.reserve(3 * 2 * 960 * 540);