1
0
mirror of https://github.com/Yours3lf/rpi-vk-driver.git synced 2024-12-01 13:24:20 +01:00

performance queries now seem to work

This commit is contained in:
Unknown 2020-02-24 21:45:47 +00:00
parent eaf884547e
commit 939b791183
10 changed files with 111 additions and 40 deletions

View File

@ -72,6 +72,7 @@ void clInsertNewCLMarker(ControlList* cl,
marker.performResolve = performResolve;
marker.readMSAAimage = readMSAAimage;
marker.readMSAAdepthStencilImage = readMSAAdepthStencilImage;
marker.perfmonID = 0;
marker.handlesSize = 0;
marker.shaderRecSize = 0;
marker.uniformsSize = 0;

View File

@ -27,6 +27,7 @@ typedef struct CLMarker
uint32_t performResolve;
uint32_t readMSAAimage;
uint32_t readMSAAdepthStencilImage;
void* perfmonID;
//pointers that point to where all the other CL data is
//plus sizes

View File

@ -145,6 +145,8 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkAllocateCommandBuffers(
pCommandBuffers[c]->descriptorSetDirty = 1;
pCommandBuffers[c]->pushConstantDirty = 1;
pCommandBuffers[c]->perfmonID = 0;
if(!pCommandBuffers[c]->binCl.buffer)
{
res = VK_ERROR_OUT_OF_HOST_MEMORY;
@ -481,6 +483,20 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueSubmit(
submitCl.shader_rec = marker->shaderRecBuf;
submitCl.uniforms = marker->uniformsBuf;
if(marker->perfmonID)
{
uint32_t perfmonSelector = 0;
uint32_t* perfmonIDptr = (uint32_t*)marker->perfmonID;
if(pSubmits->pNext)
{
VkPerformanceQuerySubmitInfoKHR* perfQuerySubmitInfo = pSubmits->pNext;
perfmonSelector = perfQuerySubmitInfo->counterPassIndex;
}
submitCl.perfmonid = *(perfmonIDptr + perfmonSelector);
}
//marker not closed yet
//close here
if(!marker->size)
@ -589,6 +605,7 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueSubmit(
printf("clear z %u\n", submitCl.clear_z);
printf("clear s %u\n", submitCl.clear_s);
printf("flags %u\n", submitCl.flags);
printf("perfmonID %u\n", submitCl.perfmonid);
/**/

View File

@ -373,6 +373,12 @@ typedef struct VkCommandBuffer_T
uint32_t indexBufferOffset;
_buffer* indexBuffer;
//A render-pass-scoped query must begin outside of the render pass,
//so there is no current marker yet when the query starts.
//Therefore the perfmonID is stored here and copied into the
//marker when the render pass begins.
void* perfmonID;
//dirty flags used to reduce command stream clutter
uint32_t vertexBufferDirty;
uint32_t indexBufferDirty;

View File

@ -203,7 +203,7 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkGetPhysicalDeviceQueueFamilyPerformanceQueryPas
assert(pPerformanceQueryCreateInfo);
assert(pNumPasses);
*pNumPasses = pPerformanceQueryCreateInfo->counterIndexCount / DRM_VC4_MAX_PERF_COUNTERS;
*pNumPasses = pPerformanceQueryCreateInfo->counterIndexCount / DRM_VC4_MAX_PERF_COUNTERS + 1;
}
/*

View File

@ -342,10 +342,12 @@ int vc4_seqno_wait(int fd, uint64_t* lastFinishedSeqno, uint64_t seqno, uint64_t
if (ret != -ETIME) {
fprintf(stderr, "Seqno wait failed: %s\n",
strerror(errno));
vc4_print_hang_state(controlFd);
}
else
{
//Timeout happened
vc4_print_hang_state(controlFd);
*timeout_ns = -1;
return -1;
}
@ -577,6 +579,7 @@ void vc4_cl_submit(int fd, struct drm_vc4_submit_cl* submit, uint64_t* lastEmitt
if (*lastEmittedSeqno - *lastFinishedSeqno > 5) {
uint64_t timeout = WAIT_TIMEOUT_INFINITE;
//uint64_t timeout = 1000ull * 1000ull * 1000ull; //TODO waits too long...
if (!vc4_seqno_wait(fd,
lastFinishedSeqno,
*lastFinishedSeqno > 0 ? *lastEmittedSeqno - 5 : *lastEmittedSeqno,
@ -686,7 +689,7 @@ void vc4_print_hang_state(int fd)
if (drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &arg))
{
fprintf(stderr, "Perfmon get values failed: %s\n",
fprintf(stderr, "vc4 get hang state failed: %s\n",
strerror(errno));
}
else

View File

@ -51,7 +51,7 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateQueryPool(
for(uint32_t d = 0; d < ci.counterIndexCount; d += DRM_VC4_MAX_PERF_COUNTERS)
{
qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS] = vc4_create_perfmon(controlFd, &qp->queryPool[c].enabledCounters[d], qp->queryPool[c].numEnabledCounters > DRM_VC4_MAX_PERF_COUNTERS ? DRM_VC4_MAX_PERF_COUNTERS : qp->queryPool[c].numEnabledCounters);
memset(&qp->queryPool[c].counterValues[d][0], 0, sizeof(uint64_t) * DRM_VC4_MAX_PERF_COUNTERS);
memset(&qp->queryPool[c].counterValues[d / DRM_VC4_MAX_PERF_COUNTERS][0], 0, sizeof(uint64_t) * DRM_VC4_MAX_PERF_COUNTERS);
}
}
@ -99,7 +99,9 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkCmdEndQuery(
assert(commandBuffer);
assert(queryPool);
//TODO
_commandBuffer* cmdBuf = commandBuffer;
cmdBuf->perfmonID = 0;
}
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdBeginQuery(
@ -111,7 +113,14 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkCmdBeginQuery(
assert(commandBuffer);
assert(queryPool);
//TODO
//TODO flags
_commandBuffer* cmdBuf = commandBuffer;
_queryPool* qp = queryPool;
//store the whole perfmon ID array; the counter pass index picks the entry to use at submit time
cmdBuf->perfmonID = qp->queryPool[query].perfmonIDs;
}
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdCopyQueryPoolResults(
@ -149,14 +158,14 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkGetQueryPoolResults(
{
for(uint32_t d = 0; d < qp->queryPool[c].numEnabledCounters; d += DRM_VC4_MAX_PERF_COUNTERS)
{
vc4_perfmon_get_values(controlFd, qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS], &qp->queryPool[c].counterValues[d][0]);
vc4_perfmon_get_values(controlFd, qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS], &qp->queryPool[c].counterValues[d / DRM_VC4_MAX_PERF_COUNTERS][0]);
}
uint32_t counter = 0;
for(uint32_t d = 0; d < dataSize; d += stride, ++counter)
{
VkPerformanceCounterResultKHR* result = ((char*)pData) + d;
result->uint64 = qp->queryPool[c].counterValues[counter];
result->uint64 = qp->queryPool[c].counterValues[counter / DRM_VC4_MAX_PERF_COUNTERS][counter % DRM_VC4_MAX_PERF_COUNTERS];
}
}

View File

@ -224,6 +224,8 @@ void rpi_vkCmdBeginRenderPass(VkCommandBuffer commandBuffer, const VkRenderPassB
//command list.
clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length);
clInsertStartTileBinning(&commandBuffer->binCl);
cb->binCl.currMarker->perfmonID = cb->perfmonID;
}
/*

View File

@ -467,93 +467,123 @@ static VkPerformanceCounterDescriptionKHR performanceCounterDescriptions[] =
{
{
.name = "FRONT_END_PIPELINE_VALID_PRIMS_NO_RENDER",
.description = "FEP Valid primitives that result in no rendered pixels, for all rendered tiles"
},
{
.name = "FRONT_END_PIPELINE_VALID_PRIMS_RENDER",
.description = "FEP Valid primitives for all rendered tiles. (primitives may be counted in more than one tile)"
},
{
.name = "FRONT_END_PIPELINE_CLIPPED_QUADS",
.description = "FEP Early-Z/Near/Far clipped quads"
},
{
.name = "FRONT_END_PIPELINE_VALID_QUADS",
.description = "FEP Valid quads"
},
{
.name = "TILE_BUFFER_QUADS_NOT_PASSING_STENCIL",
.description = "TLB Quads with no pixels passing the stencil test"
},
{
.name = "TILE_BUFFER_QUADS_NOT_PASSING_Z_AND_STENCIL",
.description = "TLB Quads with no pixels passing the Z and stencil tests"
},
{
.name = "TILE_BUFFER_QUADS_PASSING_Z_AND_STENCIL",
.description = "TLB Quads with any pixels passing the Z and stencil tests"
},
{
.name = "TILE_BUFFER_QUADS_ZERO_COVERAGE",
.description = "TLB Quads with all pixels having zero coverage"
},
{
.name = "TILE_BUFFER_QUADS_NON_ZERO_COVERAGE",
.description = "TLB Quads with any pixels having non-zero coverage"
},
{
.name = "TILE_BUFFER_QUADS_WRITTEN_TO_COLOR_BUF",
.description = "TLB Quads with valid pixels written to color buffer"
},
{
.name = "PLB_PRIMS_OUTSIDE_VIEWPORT",
.description = "PTB Primitives discarded by being outside the viewport"
},
{
.name = "PLB_PRIMS_NEED_CLIPPING",
.description = "PTB Primitives that need clipping"
},
{
.name = "PRIMITIVE_SETUP_ENGINE_PRIMS_REVERSED",
.description = "PSE Primitives that are discarded because they are reversed"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_IDLE_CYCLES",
.description = "QPU Total idle clock cycles for all QPUs"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING",
.description = "QPU Total clock cycles for all QPUs doing vertex/coordinate shading"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_FRAGMENT_SHADING",
.description = "QPU Total clock cycles for all QPUs doing fragment shading"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_EXEC_VALID_INST",
.description = "QPU Total clock cycles for all QPUs executing valid instructions"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_TMUS",
.description = "QPU Total clock cycles for all QPUs stalled waiting for TMUs"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD",
.description = "QPU Total clock cycles for all QPUs stalled waiting for Scoreboard"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_VARYINGS",
.description = "QPU Total clock cycles for all QPUs stalled waiting for Varyings"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_INST_CACHE_HIT",
.description = "QPU Total instruction cache hits for all slices"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_INST_CACHE_MISS",
.description = "QPU Total instruction cache misses for all slices"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_UNIFORM_CACHE_HIT",
.description = "QPU Total uniforms cache hits for all slices"
},
{
.name = "QUAD_PROCESSOR_UNIT_TOTAL_UNIFORM_CACHE_MISS",
.description = "QPU Total uniforms cache misses for all slices"
},
{
.name = "TEXTURE_MEMORY_LOOKUP_UNIT_TOTAL_TEXT_QUADS_PROCESSED",
.description = "TMU Total texture quads processed"
},
{
.name = "TEXTURE_MEMORY_LOOKUP_UNIT_TOTAL_TEXT_CACHE_MISS",
.description = "TMU Total texture cache misses (number of fetches from memory/L2cache)"
},
{
.name = "VERTEX_PIPE_MEMORY_TOTAL_CLK_CYCLES_VERTEX_DMA_WRITE_STALLED",
.description = "VPM Total clock cycles VDW is stalled waiting for VPM access"
},
{
.name = "VERTEX_PIPE_MEMORY_TOTAL_CLK_CYCLES_VERTEX_DMA_STALLED",
.description = "VPM Total clock cycles VCD is stalled waiting for VPM access"
},
{
.name = "L2C_TOTAL_L2_CACHE_HIT",
.description = "L2C Total Level 2 cache hits"
},
{
.name = "L2C_TOTAL_L2_CACHE_MISS",
.description = "L2C Total Level 2 cache misses"
}
};

View File

@ -735,19 +735,19 @@ void recordCommandBuffers()
}
void draw() {
// Acquire image
uint32_t imageIndex;
VkResult res = vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
if (res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) {
std::cerr << "failed to acquire image" << std::endl;
assert(0);
}
std::cout << "acquired image" << std::endl;
for(uint32_t c = 0; c < numQueryPasses; ++c)
{
// Acquire image
uint32_t imageIndex;
VkResult res = vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
if (res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) {
std::cerr << "failed to acquire image" << std::endl;
assert(0);
}
std::cout << "acquired image" << std::endl;
VkPerformanceQuerySubmitInfoKHR performanceQuerySubmitInfo = {};
performanceQuerySubmitInfo.sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR;
performanceQuerySubmitInfo.counterPassIndex = c;
@ -770,13 +770,33 @@ void draw() {
std::cerr << "failed to submit draw command buffer" << std::endl;
assert(0);
}
std::cout << "submitted draw command buffer" << std::endl;
my_vkReleaseProfilingLockKHR(device);
// Present drawn image
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
VkPresentInfoKHR presentInfo = {};
presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
presentInfo.waitSemaphoreCount = 1;
presentInfo.pWaitSemaphores = &renderingFinishedSemaphore;
presentInfo.swapchainCount = 1;
presentInfo.pSwapchains = &swapChain;
presentInfo.pImageIndices = &imageIndex;
res = vkQueuePresentKHR(presentQueue, &presentInfo);
if (res != VK_SUCCESS) {
std::cerr << "failed to submit present command buffer" << std::endl;
assert(0);
}
}
std::cout << "submitted draw command buffer" << std::endl;
my_vkReleaseProfilingLockKHR(device);
{ //Get query results
vkQueueWaitIdle(graphicsQueue);
VkPerformanceCounterResultKHR* recordedCounters = (VkPerformanceCounterResultKHR*)malloc(sizeof(VkPerformanceCounterResultKHR) * counterCount);
vkGetQueryPoolResults(device, queryPool, 0, 1, sizeof(VkPerformanceCounterResultKHR) * counterCount, recordedCounters, sizeof(VkPerformanceCounterResultKHR), 0);
@ -792,24 +812,6 @@ void draw() {
}
}
// Present drawn image
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
VkPresentInfoKHR presentInfo = {};
presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
presentInfo.waitSemaphoreCount = 1;
presentInfo.pWaitSemaphores = &renderingFinishedSemaphore;
presentInfo.swapchainCount = 1;
presentInfo.pSwapchains = &swapChain;
presentInfo.pImageIndices = &imageIndex;
res = vkQueuePresentKHR(presentQueue, &presentInfo);
if (res != VK_SUCCESS) {
std::cerr << "failed to submit present command buffer" << std::endl;
assert(0);
}
std::cout << "submitted presentation command buffer" << std::endl;
}