1
0
mirror of https://github.com/Yours3lf/rpi-vk-driver.git synced 2025-02-21 18:54:18 +01:00

multithreaded command submission now works

This commit is contained in:
yours3lf 2020-06-16 20:59:59 +01:00
parent 80d90ca190
commit cc26064e1d
3 changed files with 74 additions and 75 deletions

View File

@ -248,15 +248,15 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkBeginCommandBuffer)(
//When a command buffer begins recording, all state in that command buffer is undefined
commandBuffer->usageFlags = pBeginInfo->flags;
commandBuffer->state = CMDBUF_STATE_RECORDING;
if((pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) &&
if((commandBuffer->state == CMDBUF_STATE_INVALID || commandBuffer->state == CMDBUF_STATE_EXECUTABLE) &&
commandBuffer->cp->resetAble)
{
RPIFUNC(vkResetCommandBuffer)(commandBuffer, 0);
}
commandBuffer->usageFlags = pBeginInfo->flags;
commandBuffer->state = CMDBUF_STATE_RECORDING;
if(pBeginInfo->pInheritanceInfo && commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
VkRenderPassBeginInfo rpbi = {0};
@ -596,7 +596,7 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)(
submitCl.shader_rec_count = marker->shaderRecCount;
submitCl.uniforms_size = marker->uniformsSize;
/**/
/**
printf("BCL:\n");
uint8_t* mem = malloc(marker->size);
memcpy(mem, marker+1, marker->size);
@ -613,10 +613,18 @@ VKAPI_ATTR VkResult VKAPI_CALL RPIFUNC(vkQueueSubmit)(
{
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(cmdbuf->uniformsCl.CPA, marker->uniformsBufOffset + cmdbuf->uniformsCl.offset))+d));
}
printf("\nShader recs: ");
uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset + (3 + 3) * 4);
uint8_t* ptr = getCPAptrFromOffset(cmdbuf->shaderRecCl.CPA, marker->shaderRecBufOffset + cmdbuf->shaderRecCl.offset);
for(int d = 0; d < marker->shaderRecCount; ++d)
{
printf("\nShader rec handle indices: ");
int numIndices = 3 + 1;
for(int d = 0; d < numIndices; ++d)
{
printf("%u ", *ptr);
ptr += 4;
}
uint8_t flags = *ptr;
uint8_t fragmentShaderIsSingleThreaded = flags & (1 << 0);
uint8_t pointSizeIncludedInShadedVertexData = (flags & (1 << 1)) >> 1;
@ -985,6 +993,8 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
_commandBuffer* primary = commandBuffer;
CLMarker* primaryMarker = getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset);
for(uint32_t c = 0; c < commandBufferCount; ++c)
{
_commandBuffer* secondary = pCommandBuffers[c];
@ -1000,28 +1010,37 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
secondaryMarker->shaderRecRelocSize = secondary->shaderRecRelocCl.nextFreeByteOffset - (secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset);
}
for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize; ++d)
for(uint32_t d = 0; d < secondaryMarker->uniformRelocSize / 4; ++d)
{
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset);
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->uniformRelocCl.CPA, secondaryMarker->uniformRelocOffset + secondary->uniformRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondary->uniformsCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset;
uint32_t* handleIdx = getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset + offset);
uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
}
for(uint32_t d = 0; d < secondaryMarker->gemRelocSize; ++d)
for(uint32_t d = 0; d < secondaryMarker->gemRelocSize / 4; ++d)
{
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset);
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->gemRelocCl.CPA, secondaryMarker->gemRelocOffset + secondary->gemRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->binCl.CPA, secondary->binCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset;
uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
}
for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize; ++d)
for(uint32_t d = 0; d < secondaryMarker->shaderRecRelocSize / 4; ++d)
{
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset);
uint32_t offset = *(uint32_t*)getCPAptrFromOffset(secondary->shaderRecRelocCl.CPA, secondaryMarker->shaderRecRelocOffset + secondary->shaderRecRelocCl.offset + d * 4);
uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondary->shaderRecCl.offset + offset);
*handleIdx += primary->handlesCl.nextFreeByteOffset - primary->handlesCl.offset;
uint32_t* handleIdx = getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset + offset);
uint32_t handle = *(uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset + (*handleIdx) * 4);
clFit(&primary->handlesCl, 4);
uint32_t idx = clGetHandleIndex(&primary->handlesCl, primaryMarker->handlesBufOffset + primary->handlesCl.offset, primaryMarker->handlesSize, handle);
*handleIdx = idx;
}
clFit(&primary->binCl, secondaryMarker->size);
@ -1029,39 +1048,13 @@ VKAPI_ATTR void VKAPI_CALL RPIFUNC(vkCmdExecuteCommands)(
((CLMarker*)getCPAptrFromOffset(primary->binCl.CPA, primary->binCl.currMarkerOffset))->numDrawCallsSubmitted += secondaryMarker->numDrawCallsSubmitted;
//TODO handles/handle indices might be garbled up like this...
clFit(&primary->handlesCl, secondaryMarker->handlesSize);
clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset));
//clFit(&primary->handlesCl, secondaryMarker->handlesSize);
//clInsertData(&primary->handlesCl, secondaryMarker->handlesSize, getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset));
clFit(&primary->uniformsCl, secondaryMarker->uniformsSize);
clInsertData(&primary->uniformsCl, secondaryMarker->uniformsSize, getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset));
clFit(&primary->shaderRecCl, secondaryMarker->shaderRecSize);
clInsertData(&primary->shaderRecCl, secondaryMarker->shaderRecSize, getCPAptrFromOffset(secondary->shaderRecCl.CPA, secondaryMarker->shaderRecBufOffset + secondary->shaderRecCl.offset));
printf("\nUniforms: ");
for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d)
{
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(secondary->uniformsCl.CPA, secondaryMarker->uniformsBufOffset + secondary->uniformsCl.offset))+d));
}
printf("\nUniforms: ");
for(int d = 0; d < secondaryMarker->uniformsSize / 4; ++d)
{
printf("%i ", *(((uint32_t*)getCPAptrFromOffset(primary->uniformsCl.CPA, primary->uniformsCl.offset))+d));
}
printf("\nBO handles: ");
for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d)
{
printf("%u ", *(((uint32_t*)getCPAptrFromOffset(secondary->handlesCl.CPA, secondaryMarker->handlesBufOffset + secondary->handlesCl.offset))+d));
}
printf("\nBO handles: ");
for(int d = 0; d < secondaryMarker->handlesSize / 4; ++d)
{
printf("%u ", *(((uint32_t*)getCPAptrFromOffset(primary->handlesCl.CPA, primary->handlesCl.offset))+d));
}
primary->shaderRecCount += secondary->shaderRecCount;
}

View File

@ -235,6 +235,26 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
assert(vertModule->numVertVPMreads == vertexAttribSize >> 2);
assert(vertModule->numCoordVPMreads == coordAttribSize >> 2);
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset;
clFit(&commandBuffer->shaderRecRelocCl, 12);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset += 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset += 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
clFit(&commandBuffer->shaderRecRelocCl, 4 * attribCount);
for(uint32_t c = 0; c < attribCount; ++c)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset + 12 + c * 4;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
}
//number of attribs
//3 is the number of possible shader types
for(uint32_t c = 0; c < (3 + attribCount)*4; ++c)
@ -269,17 +289,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
coordCode //coordinate shader code address
);
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12;
clFit(&commandBuffer->shaderRecRelocCl, 12);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset -= 16;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
offset -= 16;
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
uint32_t vertexAttribOffsets[8] = {};
uint32_t coordAttribOffsets[8] = {};
for(uint32_t c = 1; c < 8; ++c)
@ -341,13 +350,6 @@ static uint32_t drawCommon(VkCommandBuffer commandBuffer, int32_t vertexOffset)
vertexAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location], //vertex vpm offset
coordAttribOffsets[cb->graphicsPipeline->vertexAttributeDescriptions[c].location] //coordinate vpm offset
);
if(commandBuffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
{
uint32_t offset = commandBuffer->shaderRecCl.nextFreeByteOffset - commandBuffer->shaderRecCl.offset - 12;
clFit(&commandBuffer->shaderRecRelocCl, 4);
clInsertData(&commandBuffer->shaderRecRelocCl, 4, &offset);
}
}
}

View File

@ -666,6 +666,7 @@ void createCommandQueues() {
VkCommandPoolCreateInfo poolCreateInfo = {};
poolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
poolCreateInfo.queueFamilyIndex = presentQueueFamily;
poolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
if (vkCreateCommandPool(device, &poolCreateInfo, nullptr, &primaryCommandPool) != VK_SUCCESS) {
std::cerr << "failed to create command queue for presentation queue family" << std::endl;
@ -748,14 +749,14 @@ void threadFunc(uint32_t threadIdx, VkCommandBufferInheritanceInfo inheritanceIn
vkCmdPushConstants(threadDataVector[threadIdx].commandBuffer, pipelineLayout, VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(pushConstants), &pushConstants);
uint32_t numVerticesPerDrawCall = 3 * 20;
for(uint32_t c = threadDataVector[threadIdx].vertexOffset; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall)
for(uint32_t c = 0; c < threadDataVector[threadIdx].numVertices; c += numVerticesPerDrawCall)
{
vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, c, 0);
vkCmdDraw(threadDataVector[threadIdx].commandBuffer, min(numVerticesPerDrawCall, threadDataVector[threadIdx].numVertices - c), 1, threadDataVector[threadIdx].vertexOffset + c, 0);
}
vkEndCommandBuffer(threadDataVector[threadIdx].commandBuffer);
std::cerr << "Recorded thread " << threadIdx << std::endl;
//std::cerr << "Recorded thread " << threadIdx << std::endl;
}
void recordCommandBuffers()
@ -795,7 +796,7 @@ void recordCommandBuffers()
//update secondary command buffers
//multi threaded mode
/**
/**/
std::vector<std::thread> threads;
threads.reserve(numThreads);
for(uint32_t c = 0; c < numThreads; ++c)
@ -810,8 +811,9 @@ void recordCommandBuffers()
/**/
//single threaded mode for debugging
/**/
/**
for(uint32_t c = 0; c < numThreads; ++c)
//for(uint32_t c = 0; c < 1; ++c)
{
threadFunc(c, inheritanceInfo);
}
@ -820,11 +822,13 @@ void recordCommandBuffers()
VkCommandBuffer cmdBufs[numThreads];
for(uint32_t c = 0; c < numThreads; ++c)
//for(uint32_t c = 0; c < 1; ++c)
{
cmdBufs[c] = threadDataVector[c].commandBuffer;
}
vkCmdExecuteCommands(primaryCommandBuffer, numThreads, cmdBufs);
//vkCmdExecuteCommands(primaryCommandBuffer, 1, cmdBufs);
vkCmdEndRenderPass(primaryCommandBuffer);
@ -834,7 +838,7 @@ void recordCommandBuffers()
}
std::cout << "recorded command buffer for image " << imageIndex << std::endl;
//std::cout << "recorded command buffer for image " << imageIndex << std::endl;
}
void draw() {
@ -845,7 +849,7 @@ void draw() {
assert(0);
}
std::cout << "acquired image" << std::endl;
//std::cout << "acquired image" << std::endl;
recordCommandBuffers();
@ -867,7 +871,7 @@ void draw() {
assert(0);
}
std::cout << "submitted draw command buffer" << std::endl;
//std::cout << "submitted draw command buffer" << std::endl;
// Present drawn image
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
@ -887,7 +891,7 @@ void draw() {
assert(0);
}
std::cout << "submitted presentation command buffer" << std::endl;
//std::cout << "submitted presentation command buffer" << std::endl;
}
void CreateRenderPass()
@ -1333,8 +1337,8 @@ void CreateVertexBuffer()
float w = 2.0;
float h = 2.0;
float stepH = 90*6.0*h/1080.0;
float stepW = 90*8.0*w/1920.0;
float stepH = 2*6.0*h/1080.0;
float stepW = 2*8.0*w/1920.0;
vertices.reserve(3 * 2 * 960 * 540);