mirror of
https://github.com/Yours3lf/rpi-vk-driver.git
synced 2024-12-01 13:24:20 +01:00
performance queries now seem to work
This commit is contained in:
parent
eaf884547e
commit
939b791183
@ -72,6 +72,7 @@ void clInsertNewCLMarker(ControlList* cl,
|
||||
marker.performResolve = performResolve;
|
||||
marker.readMSAAimage = readMSAAimage;
|
||||
marker.readMSAAdepthStencilImage = readMSAAdepthStencilImage;
|
||||
marker.perfmonID = 0;
|
||||
marker.handlesSize = 0;
|
||||
marker.shaderRecSize = 0;
|
||||
marker.uniformsSize = 0;
|
||||
|
@ -27,6 +27,7 @@ typedef struct CLMarker
|
||||
uint32_t performResolve;
|
||||
uint32_t readMSAAimage;
|
||||
uint32_t readMSAAdepthStencilImage;
|
||||
void* perfmonID;
|
||||
|
||||
//pointers that point to where all the other CL data is
|
||||
//plus sizes
|
||||
|
@ -145,6 +145,8 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkAllocateCommandBuffers(
|
||||
pCommandBuffers[c]->descriptorSetDirty = 1;
|
||||
pCommandBuffers[c]->pushConstantDirty = 1;
|
||||
|
||||
pCommandBuffers[c]->perfmonID = 0;
|
||||
|
||||
if(!pCommandBuffers[c]->binCl.buffer)
|
||||
{
|
||||
res = VK_ERROR_OUT_OF_HOST_MEMORY;
|
||||
@ -481,6 +483,20 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueSubmit(
|
||||
submitCl.shader_rec = marker->shaderRecBuf;
|
||||
submitCl.uniforms = marker->uniformsBuf;
|
||||
|
||||
if(marker->perfmonID)
|
||||
{
|
||||
uint32_t perfmonSelector = 0;
|
||||
uint32_t* perfmonIDptr = (uint32_t*)marker->perfmonID;
|
||||
|
||||
if(pSubmits->pNext)
|
||||
{
|
||||
VkPerformanceQuerySubmitInfoKHR* perfQuerySubmitInfo = pSubmits->pNext;
|
||||
perfmonSelector = perfQuerySubmitInfo->counterPassIndex;
|
||||
}
|
||||
|
||||
submitCl.perfmonid = *(perfmonIDptr + perfmonSelector);
|
||||
}
|
||||
|
||||
//marker not closed yet
|
||||
//close here
|
||||
if(!marker->size)
|
||||
@ -589,6 +605,7 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueSubmit(
|
||||
printf("clear z %u\n", submitCl.clear_z);
|
||||
printf("clear s %u\n", submitCl.clear_s);
|
||||
printf("flags %u\n", submitCl.flags);
|
||||
printf("perfmonID %u\n", submitCl.perfmonid);
|
||||
/**/
|
||||
|
||||
|
||||
|
@ -373,6 +373,12 @@ typedef struct VkCommandBuffer_T
|
||||
uint32_t indexBufferOffset;
|
||||
_buffer* indexBuffer;
|
||||
|
||||
//Renderpass scope query must begin outside renderpass
|
||||
//so there won't be any current marker...
|
||||
//therefore store perfmonID here, and copy on beginrenderpass
|
||||
//into marker
|
||||
void* perfmonID;
|
||||
|
||||
//dirty flags used to reduce command stream clutter
|
||||
uint32_t vertexBufferDirty;
|
||||
uint32_t indexBufferDirty;
|
||||
|
@ -203,7 +203,7 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkGetPhysicalDeviceQueueFamilyPerformanceQueryPas
|
||||
assert(pPerformanceQueryCreateInfo);
|
||||
assert(pNumPasses);
|
||||
|
||||
*pNumPasses = pPerformanceQueryCreateInfo->counterIndexCount / DRM_VC4_MAX_PERF_COUNTERS;
|
||||
*pNumPasses = pPerformanceQueryCreateInfo->counterIndexCount / DRM_VC4_MAX_PERF_COUNTERS + 1;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -342,10 +342,12 @@ int vc4_seqno_wait(int fd, uint64_t* lastFinishedSeqno, uint64_t seqno, uint64_t
|
||||
if (ret != -ETIME) {
|
||||
fprintf(stderr, "Seqno wait failed: %s\n",
|
||||
strerror(errno));
|
||||
vc4_print_hang_state(controlFd);
|
||||
}
|
||||
else
|
||||
{
|
||||
//Timeout happened
|
||||
vc4_print_hang_state(controlFd);
|
||||
*timeout_ns = -1;
|
||||
return -1;
|
||||
}
|
||||
@ -577,6 +579,7 @@ void vc4_cl_submit(int fd, struct drm_vc4_submit_cl* submit, uint64_t* lastEmitt
|
||||
|
||||
if (*lastEmittedSeqno - *lastFinishedSeqno > 5) {
|
||||
uint64_t timeout = WAIT_TIMEOUT_INFINITE;
|
||||
//uint64_t timeout = 1000ull * 1000ull * 1000ull; //TODO waits too long...
|
||||
if (!vc4_seqno_wait(fd,
|
||||
lastFinishedSeqno,
|
||||
*lastFinishedSeqno > 0 ? *lastEmittedSeqno - 5 : *lastEmittedSeqno,
|
||||
@ -686,7 +689,7 @@ void vc4_print_hang_state(int fd)
|
||||
|
||||
if (drmIoctl(fd, DRM_IOCTL_VC4_GET_HANG_STATE, &arg))
|
||||
{
|
||||
fprintf(stderr, "Perfmon get values failed: %s\n",
|
||||
fprintf(stderr, "vc4 get hang state failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
else
|
||||
|
@ -51,7 +51,7 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateQueryPool(
|
||||
for(uint32_t d = 0; d < ci.counterIndexCount; d += DRM_VC4_MAX_PERF_COUNTERS)
|
||||
{
|
||||
qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS] = vc4_create_perfmon(controlFd, &qp->queryPool[c].enabledCounters[d], qp->queryPool[c].numEnabledCounters > DRM_VC4_MAX_PERF_COUNTERS ? DRM_VC4_MAX_PERF_COUNTERS : qp->queryPool[c].numEnabledCounters);
|
||||
memset(&qp->queryPool[c].counterValues[d][0], 0, sizeof(uint64_t) * DRM_VC4_MAX_PERF_COUNTERS);
|
||||
memset(&qp->queryPool[c].counterValues[d / DRM_VC4_MAX_PERF_COUNTERS][0], 0, sizeof(uint64_t) * DRM_VC4_MAX_PERF_COUNTERS);
|
||||
}
|
||||
}
|
||||
|
||||
@ -99,7 +99,9 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkCmdEndQuery(
|
||||
assert(commandBuffer);
|
||||
assert(queryPool);
|
||||
|
||||
//TODO
|
||||
_commandBuffer* cmdBuf = commandBuffer;
|
||||
|
||||
cmdBuf->perfmonID = 0;
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdBeginQuery(
|
||||
@ -111,7 +113,14 @@ VKAPI_ATTR void VKAPI_CALL rpi_vkCmdBeginQuery(
|
||||
assert(commandBuffer);
|
||||
assert(queryPool);
|
||||
|
||||
//TODO
|
||||
//TODO flags
|
||||
|
||||
_commandBuffer* cmdBuf = commandBuffer;
|
||||
_queryPool* qp = queryPool;
|
||||
|
||||
|
||||
//pass id will select the perfmon at submit
|
||||
cmdBuf->perfmonID = qp->queryPool[query].perfmonIDs;
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdCopyQueryPoolResults(
|
||||
@ -149,14 +158,14 @@ VKAPI_ATTR VkResult VKAPI_CALL rpi_vkGetQueryPoolResults(
|
||||
{
|
||||
for(uint32_t d = 0; d < qp->queryPool[c].numEnabledCounters; d += DRM_VC4_MAX_PERF_COUNTERS)
|
||||
{
|
||||
vc4_perfmon_get_values(controlFd, qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS], &qp->queryPool[c].counterValues[d][0]);
|
||||
vc4_perfmon_get_values(controlFd, qp->queryPool[c].perfmonIDs[d / DRM_VC4_MAX_PERF_COUNTERS], &qp->queryPool[c].counterValues[d / DRM_VC4_MAX_PERF_COUNTERS][0]);
|
||||
}
|
||||
|
||||
uint32_t counter = 0;
|
||||
for(uint32_t d = 0; d < dataSize; d += stride, ++counter)
|
||||
{
|
||||
VkPerformanceCounterResultKHR* result = ((char*)pData) + d;
|
||||
result->uint64 = qp->queryPool[c].counterValues[counter];
|
||||
result->uint64 = qp->queryPool[c].counterValues[counter / DRM_VC4_MAX_PERF_COUNTERS][counter % DRM_VC4_MAX_PERF_COUNTERS];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -224,6 +224,8 @@ void rpi_vkCmdBeginRenderPass(VkCommandBuffer commandBuffer, const VkRenderPassB
|
||||
//command list.
|
||||
clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length);
|
||||
clInsertStartTileBinning(&commandBuffer->binCl);
|
||||
|
||||
cb->binCl.currMarker->perfmonID = cb->perfmonID;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -467,93 +467,123 @@ static VkPerformanceCounterDescriptionKHR performanceCounterDescriptions[] =
|
||||
{
|
||||
{
|
||||
.name = "FRONT_END_PIPELINE_VALID_PRIMS_NO_RENDER",
|
||||
.description = "FEP Valid primitives that result in no rendered pixels, for all rendered tiles"
|
||||
},
|
||||
{
|
||||
.name = "FRONT_END_PIPELINE_VALID_PRIMS_RENDER",
|
||||
.description = "FEP Valid primitives for all rendered tiles. (primitives may be counted in more than one tile)"
|
||||
},
|
||||
{
|
||||
.name = "FRONT_END_PIPELINE_CLIPPED_QUADS",
|
||||
.description = "FEP Early-Z/Near/Far clipped quads"
|
||||
},
|
||||
{
|
||||
.name = "FRONT_END_PIPELINE_VALID_QUADS",
|
||||
.description = "FEP Valid quads"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_NOT_PASSING_STENCIL",
|
||||
.description = "TLB Quads with no pixels passing the stencil test"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_NOT_PASSING_Z_AND_STENCIL",
|
||||
.description = "TLB Quads with no pixels passing the Z and stencil tests"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_PASSING_Z_AND_STENCIL",
|
||||
.description = "TLB Quads with any pixels passing the Z and stencil tests"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_ZERO_COVERAGE",
|
||||
.description = "TLB Quads with all pixels having zero coverage"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_NON_ZERO_COVERAGE",
|
||||
.description = "TLB Quads with any pixels having non-zero coverage"
|
||||
},
|
||||
{
|
||||
.name = "TILE_BUFFER_QUADS_WRITTEN_TO_COLOR_BUF",
|
||||
.description = "TLB Quads with valid pixels written to color buffer"
|
||||
},
|
||||
{
|
||||
.name = "PLB_PRIMS_OUTSIDE_VIEWPORT",
|
||||
.description = "PTB Primitives discarded by being outside the viewport"
|
||||
},
|
||||
{
|
||||
.name = "PLB_PRIMS_NEED_CLIPPING",
|
||||
.description = "PTB Primitives that need clipping"
|
||||
},
|
||||
{
|
||||
.name = "PRIMITIVE_SETUP_ENGINE_PRIMS_REVERSED",
|
||||
.description = "PSE Primitives that are discarded because they are reversed"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_IDLE_CYCLES",
|
||||
.description = "QPU Total idle clock cycles for all QPUs"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING",
|
||||
.description = "QPU Total clock cycles for all QPUs doing vertex/coordinate shading"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_FRAGMENT_SHADING",
|
||||
.description = "QPU Total clock cycles for all QPUs doing fragment shading"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_EXEC_VALID_INST",
|
||||
.description = "QPU Total clock cycles for all QPUs executing valid instructions"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_TMUS",
|
||||
.description = "QPU Total clock cycles for all QPUs stalled waiting for TMUs"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD",
|
||||
.description = "QPU Total clock cycles for all QPUs stalled waiting for Scoreboard"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_CLK_CYCLES_WAITING_VARYINGS",
|
||||
.description = "QPU Total clock cycles for all QPUs stalled waiting for Varyings"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_INST_CACHE_HIT",
|
||||
.description = "QPU Total instruction cache hits for all slices"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_INST_CACHE_MISS",
|
||||
.description = "QPU Total instruction cache misses for all slices"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_UNIFORM_CACHE_HIT",
|
||||
.description = "QPU Total uniforms cache hits for all slices"
|
||||
},
|
||||
{
|
||||
.name = "QUAD_PROCESSOR_UNIT_TOTAL_UNIFORM_CACHE_MISS",
|
||||
.description = "QPU Total uniforms cache misses for all slices"
|
||||
},
|
||||
{
|
||||
.name = "TEXTURE_MEMORY_LOOKUP_UNIT_TOTAL_TEXT_QUADS_PROCESSED",
|
||||
.description = "TMU Total texture quads processed"
|
||||
},
|
||||
{
|
||||
.name = "TEXTURE_MEMORY_LOOKUP_UNIT_TOTAL_TEXT_CACHE_MISS",
|
||||
.description = "TMU Total texture cache misses (number of fetches from memory/L2cache)"
|
||||
},
|
||||
{
|
||||
.name = "VERTEX_PIPE_MEMORY_TOTAL_CLK_CYCLES_VERTEX_DMA_WRITE_STALLED",
|
||||
.description = "VPM Total clock cycles VDW is stalled waiting for VPM access"
|
||||
},
|
||||
{
|
||||
.name = "VERTEX_PIPE_MEMORY_TOTAL_CLK_CYCLES_VERTEX_DMA_STALLED",
|
||||
.description = "VPM Total clock cycles VCD is stalled waiting for VPM access"
|
||||
},
|
||||
{
|
||||
.name = "L2C_TOTAL_L2_CACHE_HIT",
|
||||
.description = "L2C Total Level 2 cache hits"
|
||||
},
|
||||
{
|
||||
.name = "L2C_TOTAL_L2_CACHE_MISS",
|
||||
.description = "L2C Total Level 2 cache misses"
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -735,19 +735,19 @@ void recordCommandBuffers()
|
||||
}
|
||||
|
||||
void draw() {
|
||||
// Acquire image
|
||||
uint32_t imageIndex;
|
||||
VkResult res = vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
|
||||
|
||||
if (res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) {
|
||||
std::cerr << "failed to acquire image" << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
std::cout << "acquired image" << std::endl;
|
||||
|
||||
for(uint32_t c = 0; c < numQueryPasses; ++c)
|
||||
{
|
||||
// Acquire image
|
||||
uint32_t imageIndex;
|
||||
VkResult res = vkAcquireNextImageKHR(device, swapChain, UINT64_MAX, imageAvailableSemaphore, VK_NULL_HANDLE, &imageIndex);
|
||||
|
||||
if (res != VK_SUCCESS && res != VK_SUBOPTIMAL_KHR) {
|
||||
std::cerr << "failed to acquire image" << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
std::cout << "acquired image" << std::endl;
|
||||
|
||||
VkPerformanceQuerySubmitInfoKHR performanceQuerySubmitInfo = {};
|
||||
performanceQuerySubmitInfo.sType = VK_STRUCTURE_TYPE_PERFORMANCE_QUERY_SUBMIT_INFO_KHR;
|
||||
performanceQuerySubmitInfo.counterPassIndex = c;
|
||||
@ -770,13 +770,33 @@ void draw() {
|
||||
std::cerr << "failed to submit draw command buffer" << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
std::cout << "submitted draw command buffer" << std::endl;
|
||||
|
||||
my_vkReleaseProfilingLockKHR(device);
|
||||
|
||||
// Present drawn image
|
||||
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
|
||||
VkPresentInfoKHR presentInfo = {};
|
||||
presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
|
||||
presentInfo.waitSemaphoreCount = 1;
|
||||
presentInfo.pWaitSemaphores = &renderingFinishedSemaphore;
|
||||
|
||||
presentInfo.swapchainCount = 1;
|
||||
presentInfo.pSwapchains = &swapChain;
|
||||
presentInfo.pImageIndices = &imageIndex;
|
||||
|
||||
res = vkQueuePresentKHR(presentQueue, &presentInfo);
|
||||
|
||||
if (res != VK_SUCCESS) {
|
||||
std::cerr << "failed to submit present command buffer" << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "submitted draw command buffer" << std::endl;
|
||||
|
||||
my_vkReleaseProfilingLockKHR(device);
|
||||
|
||||
{ //Get query results
|
||||
vkQueueWaitIdle(graphicsQueue);
|
||||
|
||||
VkPerformanceCounterResultKHR* recordedCounters = (VkPerformanceCounterResultKHR*)malloc(sizeof(VkPerformanceCounterResultKHR) * counterCount);
|
||||
vkGetQueryPoolResults(device, queryPool, 0, 1, sizeof(VkPerformanceCounterResultKHR) * counterCount, recordedCounters, sizeof(VkPerformanceCounterResultKHR), 0);
|
||||
|
||||
@ -792,24 +812,6 @@ void draw() {
|
||||
}
|
||||
}
|
||||
|
||||
// Present drawn image
|
||||
// Note: semaphore here is not strictly necessary, because commands are processed in submission order within a single queue
|
||||
VkPresentInfoKHR presentInfo = {};
|
||||
presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
|
||||
presentInfo.waitSemaphoreCount = 1;
|
||||
presentInfo.pWaitSemaphores = &renderingFinishedSemaphore;
|
||||
|
||||
presentInfo.swapchainCount = 1;
|
||||
presentInfo.pSwapchains = &swapChain;
|
||||
presentInfo.pImageIndices = &imageIndex;
|
||||
|
||||
res = vkQueuePresentKHR(presentQueue, &presentInfo);
|
||||
|
||||
if (res != VK_SUCCESS) {
|
||||
std::cerr << "failed to submit present command buffer" << std::endl;
|
||||
assert(0);
|
||||
}
|
||||
|
||||
std::cout << "submitted presentation command buffer" << std::endl;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user