// rpi-vk-driver/driver/command.c

#include "common.h"

#include "kernel/vc4_packet.h"
#include "../brcm/cle/v3d_decoder.h"
#include "../brcm/clif/clif_dump.h"

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffers-pools
 * Command pools are opaque objects that command buffer memory is allocated from, and which allow the implementation to amortize the
 * cost of resource creation across multiple command buffers. Command pools are externally synchronized, meaning that a command pool must
 * not be used concurrently in multiple threads. That includes use via recording commands on any command buffers allocated from the pool,
 * as well as operations that allocate, free, and reset command buffers or the pool itself.
 *
 * Allocates the pool object plus two backing buffers: one handed to a pool allocator with
 * _commandBuffer sized blocks, and one handed to a consecutive pool allocator with ARM_PAGE_SIZE sized blocks.
 *
 * device:       must be non-null (asserted)
 * pCreateInfo:  queueFamilyIndex and flags are read from it
 * pAllocator:   not referenced directly in this function
 * pCommandPool: receives the new pool handle on success
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if any allocation fails. On failure
 * everything allocated so far is released again and *pCommandPool is left untouched.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateCommandPool(
		VkDevice                                    device,
		const VkCommandPoolCreateInfo*              pCreateInfo,
		const VkAllocationCallbacks*                pAllocator,
		VkCommandPool*                              pCommandPool)
{
	assert(device);
	assert(pCreateInfo);
	assert(pCommandPool);

	//TODO VK_COMMAND_POOL_CREATE_TRANSIENT_BIT
	//specifies that command buffers allocated from the pool will be short-lived, meaning that they will be reset or freed in a relatively short timeframe.
	//This flag may be used by the implementation to control memory allocation behavior within the pool.
	//--> definitely use pool allocator

	//TODO pool family ignored for now

	//TODO CTS fails as we can't allocate enough memory for some reason
	//tweak system allocation as root using:
	//make sure kernel denies memory allocation that it won't be able to serve
	//sysctl -w vm.overcommit_memory="2"
	//specify after how much memory used the kernel will start denying requests
	//sysctl -w vm.overcommit_ratio="80"

	//initial number of command buffers to hold
	int numCommandBufs = 128;
	//block size and block count of the consecutive pool
	int consecutiveBlockSize = ARM_PAGE_SIZE;
	int consecutiveBlockNumber = 64;
	int consecutivePoolSize = consecutiveBlockNumber * consecutiveBlockSize;

	_commandPool* cp = ALLOCATE(sizeof(_commandPool), 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

	if(!cp)
	{
		return VK_ERROR_OUT_OF_HOST_MEMORY;
	}

	//backing memory for the command buffer objects
	void* pamem = ALLOCATE(numCommandBufs * sizeof(_commandBuffer), 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

	if(!pamem)
	{
		//don't leak the pool object
		FREE(cp);
		return VK_ERROR_OUT_OF_HOST_MEMORY;
	}

	//backing memory for the consecutive pool blocks
	void* cpamem = ALLOCATE(consecutivePoolSize, 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

	if(!cpamem)
	{
		//don't leak the pool object or the first backing buffer
		FREE(pamem);
		FREE(cp);
		return VK_ERROR_OUT_OF_HOST_MEMORY;
	}

	cp->queueFamilyIndex = pCreateInfo->queueFamilyIndex;

	cp->resetAble = pCreateInfo->flags & VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;

	//both allocators are only set up once all memory is known to be available
	cp->pa = createPoolAllocator(pamem, sizeof(_commandBuffer), numCommandBufs * sizeof(_commandBuffer));
	cp->cpa = createConsecutivePoolAllocator(cpamem, consecutiveBlockSize, consecutivePoolSize);

	*pCommandPool = (VkCommandPool)cp;

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffer-allocation
 * vkAllocateCommandBuffers can be used to create multiple command buffers. If the creation of any of those command buffers fails,
 * the implementation must destroy all successfully created command buffer objects from this command, set all entries of the pCommandBuffers array to NULL and return the error.
 *
 * Each command buffer is taken from the pool's pool allocator and gets four control lists
 * (binCl, handlesCl, shaderRecCl, uniformsCl), each backed by one block of the pool's consecutive pool allocator.
 *
 * device:          stored in every created command buffer
 * pAllocateInfo:   commandPool and commandBufferCount are read from it
 * pCommandBuffers: array of commandBufferCount entries that receives the handles
 *
 * Returns VK_SUCCESS, or VK_ERROR_OUT_OF_HOST_MEMORY if either allocator runs out of memory.
 * On failure every command buffer created by this call is released and all entries of pCommandBuffers are set to 0.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkAllocateCommandBuffers(
		VkDevice                                    device,
		const VkCommandBufferAllocateInfo*          pAllocateInfo,
		VkCommandBuffer*                            pCommandBuffers)
{
	assert(device);
	assert(pAllocateInfo);
	assert(pCommandBuffers);

	VkResult res = VK_SUCCESS;

	_commandPool* cp = (_commandPool*)pAllocateInfo->commandPool;

	//number of fully initialized command buffers stored in pCommandBuffers so far
	uint32_t numCreated = 0;

	//TODO secondary command buffers

	for(; numCreated < pAllocateInfo->commandBufferCount; ++numCreated)
	{
		VkCommandBuffer cb = poolAllocate(&cp->pa);

		if(!cb)
		{
			res = VK_ERROR_OUT_OF_HOST_MEMORY;
			break;
		}

		//grab the backing blocks first, so that control lists are never initialized with a null buffer
		void* binMem = consecutivePoolAllocate(&cp->cpa, 1);
		void* handlesMem = consecutivePoolAllocate(&cp->cpa, 1);
		void* shaderRecMem = consecutivePoolAllocate(&cp->cpa, 1);
		void* uniformsMem = consecutivePoolAllocate(&cp->cpa, 1);

		if(!binMem || !handlesMem || !shaderRecMem || !uniformsMem)
		{
			//release whatever this partially created command buffer got hold of
			if(binMem)
			{
				consecutivePoolFree(&cp->cpa, binMem, 1);
			}

			if(handlesMem)
			{
				consecutivePoolFree(&cp->cpa, handlesMem, 1);
			}

			if(shaderRecMem)
			{
				consecutivePoolFree(&cp->cpa, shaderRecMem, 1);
			}

			if(uniformsMem)
			{
				consecutivePoolFree(&cp->cpa, uniformsMem, 1);
			}

			poolFree(&cp->pa, cb);

			res = VK_ERROR_OUT_OF_HOST_MEMORY;
			break;
		}

		set_loader_magic_value(&cb->loaderData);

		cb->dev = device;

		cb->shaderRecCount = 0;
		cb->usageFlags = 0;
		cb->state = CMDBUF_STATE_INITIAL;
		cb->cp = cp;
		clInit(&cb->binCl, binMem, cp->cpa.blockSize);
		clInit(&cb->handlesCl, handlesMem, cp->cpa.blockSize);
		clInit(&cb->shaderRecCl, shaderRecMem, cp->cpa.blockSize);
		clInit(&cb->uniformsCl, uniformsMem, cp->cpa.blockSize);

		cb->graphicsPipeline = 0;
		cb->computePipeline = 0;
		cb->numDrawCallsSubmitted = 0;
		cb->indexBuffer = 0;
		cb->indexBufferOffset = 0;

		//every piece of state starts out dirty
		cb->vertexBufferDirty = 1;
		cb->indexBufferDirty = 1;
		cb->viewportDirty = 1;
		cb->lineWidthDirty = 1;
		cb->depthBiasDirty = 1;
		cb->graphicsPipelineDirty = 1;
		cb->computePipelineDirty = 1;
		cb->subpassDirty = 1;
		cb->blendConstantsDirty = 1;
		cb->scissorDirty = 1;
		cb->depthBoundsDirty = 1;
		cb->stencilCompareMaskDirty = 1;
		cb->stencilWriteMaskDirty = 1;
		cb->stencilReferenceDirty = 1;
		cb->descriptorSetDirty = 1;
		cb->pushConstantDirty = 1;

		cb->perfmonID = 0;

		pCommandBuffers[numCreated] = cb;
	}

	if(res != VK_SUCCESS)
	{
		//only entries [0, numCreated) hold valid command buffers, the rest was never written
		for(uint32_t c = 0; c < numCreated; ++c)
		{
			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->binCl.buffer, pCommandBuffers[c]->binCl.numBlocks);
			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->handlesCl.buffer, pCommandBuffers[c]->handlesCl.numBlocks);
			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->shaderRecCl.buffer, pCommandBuffers[c]->shaderRecCl.numBlocks);
			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->uniformsCl.buffer, pCommandBuffers[c]->uniformsCl.numBlocks);
			poolFree(&cp->pa, pCommandBuffers[c]);
		}

		//the whole output array has to be nulled, not just the entries that were created
		for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
		{
			pCommandBuffers[c] = 0;
		}
	}

	return res;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkBeginCommandBuffer
 *
 * Puts the command buffer into the recording state and remembers the usage flags passed in pBeginInfo.
 * Always returns VK_SUCCESS.
 *
 * Usage flags, as described by the spec:
 * - VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
 *   specifies that each recording of the command buffer will only be submitted once, and the command buffer
 *   will be reset and recorded again between each submission.
 * - TODO VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT
 *   specifies that a secondary command buffer is considered to be entirely inside a render pass.
 *   If this is a primary command buffer, then this bit is ignored
 * - TODO VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT
 *   specifies that a command buffer can be resubmitted to a queue while it is in the pending state,
 *   and recorded into multiple primary command buffers
 *
 * TODO secondary command buffers
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkBeginCommandBuffer(
		VkCommandBuffer                             commandBuffer,
		const VkCommandBufferBeginInfo*             pBeginInfo)
{
	assert(commandBuffer);
	assert(pBeginInfo);

	_commandBuffer* cb = commandBuffer;

	//When a command buffer begins recording, all state in that command buffer is undefined
	//TODO reset state?
	cb->state = CMDBUF_STATE_RECORDING;
	cb->usageFlags = pBeginInfo->flags;

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEndCommandBuffer
 * If there was an error during recording, the application will be notified by an unsuccessful return code returned by vkEndCommandBuffer.
 * If the application wishes to further use the command buffer, the command buffer must be reset. The command buffer must have been in the recording state,
 * and is moved to the executable state.
 *
 * This implementation only performs the state change and always returns VK_SUCCESS.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkEndCommandBuffer(
		VkCommandBuffer                             commandBuffer)
{
	assert(commandBuffer);

	_commandBuffer* cb = commandBuffer;

	//recording is finished, the buffer can be submitted from now on
	cb->state = CMDBUF_STATE_EXECUTABLE;

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueSubmit
 * vkQueueSubmit is a queue submission command, with each batch defined by an element of pSubmits as an instance of the VkSubmitInfo structure.
 * Batches begin execution in the order they appear in pSubmits, but may complete out of order.
 * Fence and semaphore operations submitted with vkQueueSubmit have additional ordering constraints compared to other submission commands,
 * with dependencies involving previous and subsequent queue operations. Information about these additional constraints can be found in the semaphore and
 * fence sections of the synchronization chapter.
 * Details on the interaction of pWaitDstStageMask with synchronization are described in the semaphore wait operation section of the synchronization chapter.
 * The order that batches appear in pSubmits is used to determine submission order, and thus all the implicit ordering guarantees that respect it.
 * Other than these implicit ordering guarantees and any explicit synchronization primitives, these batches may overlap or otherwise execute out of order.
 * If any command buffer submitted to this queue is in the executable state, it is moved to the pending state. Once execution of all submissions of a command buffer complete,
 * it moves from the pending state, back to the executable state. If a command buffer was recorded with the VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT flag,
 * it instead moves back to the invalid state.
 * If vkQueueSubmit fails, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
 * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced by the submitted command buffers and any semaphores
 * referenced by pSubmits is unaffected by the call or its failure. If vkQueueSubmit fails in such a way that the implementation is unable to make that guarantee,
 * the implementation must return VK_ERROR_DEVICE_LOST. See Lost Device.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueSubmit(
		VkQueue                                     queue,
		uint32_t                                    submitCount,
		const VkSubmitInfo*                         pSubmits,
		VkFence                                     fence)
{
	assert(queue);

	//TODO this is incorrect
	//see sync.c
	//TODO: deal with pSubmits->pWaitDstStageMask
	for(int c = 0; c < pSubmits->waitSemaphoreCount; ++c)
	{
		sem_wait((sem_t*)pSubmits->pWaitSemaphores[c]);
	}

	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
	{
		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_EXECUTABLE)
		{
			pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_PENDING;
		}
	}

	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
	{
		VkCommandBuffer cmdbuf = pSubmits->pCommandBuffers[c];

		if(!cmdbuf->binCl.currMarker)
		{
			//no markers recorded yet, skip
			continue;
		}

		//first entry is assumed to be a marker
		CLMarker* marker = cmdbuf->binCl.buffer;

		//a command buffer may contain multiple render passes
		//and commands outside render passes such as clear commands
		//each of these corresponds to a control list submit

		//submit each separate control list
		while(marker)
		{
			struct drm_vc4_submit_cl submitCl =
			{
				.color_read.hindex = ~0,
				.zs_read.hindex = ~0,
				.color_write.hindex = ~0,
				.msaa_color_write.hindex = ~0,
				.zs_write.hindex = ~0,
				.msaa_zs_write.hindex = ~0,
			};

			_image* writeImage = marker->writeImage;
			_image* readImage = marker->readImage;
			_image* writeDepthStencilImage = marker->writeDepthStencilImage;
			_image* readDepthStencilImage = marker->readDepthStencilImage;
			_image* writeMSAAimage = marker->writeMSAAimage;
			_image* writeMSAAdepthStencilImage = marker->writeMSAAdepthStencilImage;
			uint32_t performResolve = marker->performResolve;
			uint32_t readMSAAimage = marker->readMSAAimage;
			uint32_t readMSAAdepthStencilImage = marker->readMSAAdepthStencilImage;

			//This should not result in an insertion!
			uint32_t writeImageIdx = writeImage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, writeImage->boundMem->bo) : 0;
			uint32_t readImageIdx = readImage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, readImage->boundMem->bo) : 0;
			uint32_t writeDepthStencilImageIdx = writeDepthStencilImage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, writeDepthStencilImage->boundMem->bo) : 0;
			uint32_t readDepthStencilImageIdx = readDepthStencilImage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, readDepthStencilImage->boundMem->bo) : 0;
			uint32_t writeMSAAimageIdx = writeMSAAimage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, writeMSAAimage->boundMem->bo) : 0;
			uint32_t writeMSAAdepthStencilImageIdx = writeMSAAdepthStencilImage ? clGetHandleIndex(&cmdbuf->handlesCl, marker->handlesBuf, marker->handlesSize, writeMSAAdepthStencilImage->boundMem->bo) : 0;

//			fprintf(stderr, "writeImage: %u\n", writeImage);
//			fprintf(stderr, "readImage: %u\n", readImage);
//			fprintf(stderr, "writeDepthStencilImage: %u\n", writeDepthStencilImage);
//			fprintf(stderr, "readDepthStencilImage: %u\n", readDepthStencilImage);
//			fprintf(stderr, "writeMSAAimage: %u\n", writeMSAAimage);
//			fprintf(stderr, "writeMSAAdepthStencilImage: %u\n", writeMSAAdepthStencilImage);
//			fprintf(stderr, "performResolve: %u\n", performResolve);
//			fprintf(stderr, "readMSAAimage: %u\n", readMSAAimage);
//			fprintf(stderr, "readMSAAdepthStencilImage: %u\n", readMSAAdepthStencilImage);
//			fprintf(stderr, "writeImageIdx: %u\n", writeImageIdx);
//			fprintf(stderr, "readImageIdx: %u\n", readImageIdx);
//			fprintf(stderr, "writeDepthStencilImageIdx: %u\n", writeDepthStencilImageIdx);
//			fprintf(stderr, "readDepthStencilImageIdx: %u\n", readDepthStencilImageIdx);
//			fprintf(stderr, "writeMSAAimageIdx: %u\n", writeMSAAimageIdx);
//			fprintf(stderr, "writeMSAAdepthStencilImageIdx: %u\n", writeMSAAdepthStencilImageIdx);

			submitCl.clear_color[0] = 0;
			submitCl.clear_color[1] = 0;
			submitCl.clear_z = 0;
			submitCl.clear_s = 0;

			//fill out submit cl fields
			if(writeImage)
			{
				uint32_t nonPaddedSize = (marker->width * marker->height * getFormatBpp(writeImage->format)) >> 3;

				uint32_t tiling = writeImage->tiling;

				if(writeImage->tiling == VC4_TILING_FORMAT_T && nonPaddedSize <= 4096)
				{
					tiling = VC4_TILING_FORMAT_LT;
				}

				submitCl.color_write.hindex = writeImageIdx;
				submitCl.color_write.offset = marker->writeImageOffset;
				submitCl.color_write.flags = 0;
				submitCl.color_write.bits =
						VC4_SET_FIELD(getRenderTargetFormatVC4(writeImage->format), VC4_RENDER_CONFIG_FORMAT) |
						VC4_SET_FIELD(tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);

				if(performResolve)
				{
					submitCl.color_write.bits |= VC4_RENDER_CONFIG_MS_MODE_4X | VC4_RENDER_CONFIG_DECIMATE_MODE_4X;
				}
			}

			if(writeMSAAimage)
			{
				submitCl.msaa_color_write.hindex = writeMSAAimageIdx;
				submitCl.msaa_color_write.offset = marker->writeMSAAimageOffset;
				submitCl.msaa_color_write.flags = 0;
				submitCl.msaa_color_write.bits = VC4_RENDER_CONFIG_MS_MODE_4X;
			}

			if(readImage)
			{
				uint32_t nonPaddedSize = (marker->width * marker->height * getFormatBpp(readImage->format)) >> 3;

				uint32_t tiling = readImage->tiling;

				if(readImage->tiling == VC4_TILING_FORMAT_T && nonPaddedSize <= 4096)
				{
					tiling = VC4_TILING_FORMAT_LT;
				}

				submitCl.color_read.hindex = readImageIdx;
				submitCl.color_read.offset = marker->readImageOffset;
				submitCl.color_read.flags = readMSAAimage ? VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES : 0;
				submitCl.color_read.bits = VC4_SET_FIELD(getRenderTargetFormatVC4(readImage->format), VC4_RENDER_CONFIG_FORMAT) |
						VC4_SET_FIELD(tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
			}

			if(writeDepthStencilImage)
			{
				uint32_t nonPaddedSize = (marker->width * marker->height * getFormatBpp(writeDepthStencilImage->format)) >> 3;

				uint32_t tiling = writeDepthStencilImage->tiling;

				if(writeDepthStencilImage->tiling == VC4_TILING_FORMAT_T && nonPaddedSize <= 4096)
				{
					tiling = VC4_TILING_FORMAT_LT;
				}

				submitCl.zs_write.hindex = writeDepthStencilImageIdx;
				submitCl.zs_write.offset = marker->writeDepthStencilImageOffset;
				submitCl.zs_write.flags = 0;
				submitCl.zs_write.bits = VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS, VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
										 VC4_SET_FIELD(tiling, VC4_LOADSTORE_TILE_BUFFER_TILING);
			}

			if(writeMSAAdepthStencilImage)
			{
				submitCl.msaa_zs_write.hindex = writeMSAAdepthStencilImageIdx;
				submitCl.msaa_zs_write.offset = marker->writeMSAAdepthStencilImageOffset;
				submitCl.msaa_zs_write.flags = 0;
				submitCl.msaa_zs_write.bits = VC4_RENDER_CONFIG_MS_MODE_4X;
			}

			if(readDepthStencilImage)
			{
				uint32_t nonPaddedSize = (marker->width * marker->height * getFormatBpp(readDepthStencilImage->format)) >> 3;

				uint32_t tiling = readDepthStencilImage->tiling;

				if(readDepthStencilImage->tiling == VC4_TILING_FORMAT_T && nonPaddedSize <= 4096)
				{
					tiling = VC4_TILING_FORMAT_LT;
				}

				submitCl.zs_read.hindex = readDepthStencilImageIdx;
				submitCl.zs_read.offset = marker->readDepthStencilImageOffset;
				submitCl.zs_read.flags = readMSAAdepthStencilImage ? VC4_SUBMIT_RCL_SURFACE_READ_IS_FULL_RES : 0; //TODO is this valid?
				submitCl.zs_read.bits = VC4_SET_FIELD(getRenderTargetFormatVC4(readDepthStencilImage->format), VC4_RENDER_CONFIG_FORMAT) |
						VC4_SET_FIELD(tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
			}

			submitCl.clear_color[0] = marker->clearColor[0];
			submitCl.clear_color[1] = marker->clearColor[1];

			submitCl.clear_z = marker->clearDepth; //0...1 -> 0...0xffffff
			submitCl.clear_s = marker->clearStencil; //0...0xff


//			fprintf(stderr, "submitCl.clear_color[0]: %u\n", submitCl.clear_color[0]);
//			fprintf(stderr, "submitCl.clear_color[1]: %u\n", submitCl.clear_color[1]);
//			fprintf(stderr, "submitCl.clear_z: %u\n", submitCl.clear_z);
//			fprintf(stderr, "submitCl.clear_s: %u\n", submitCl.clear_s);

			submitCl.min_x_tile = 0;
			submitCl.min_y_tile = 0;

			uint32_t tileSizeW = 64;
			uint32_t tileSizeH = 64;

			uint32_t widthInTiles = 0, heightInTiles = 0;
			uint32_t width = 0, height = 0, bpp = 0;

			width = marker->width;
			height = marker->height;

			if(writeImage)
			{
				bpp = getFormatBpp(writeImage->format);
			}
			else if(writeMSAAimage)
			{
				bpp = getFormatBpp(writeMSAAimage->format);
			}

			if(bpp == 64)
			{
				tileSizeH >>= 1;
			}

			if(performResolve || writeMSAAimage || writeMSAAdepthStencilImage)
			{
				tileSizeW >>= 1;
				tileSizeH >>= 1;
			}

			widthInTiles = divRoundUp(width, tileSizeW);
			heightInTiles = divRoundUp(height, tileSizeH);

			//pad width if rendering to miplevel
			if(marker->renderToMip)
			{
				width = getPow2Pad(width);
				width = width < 4 ? 4 : width;
			}

			submitCl.max_x_tile = widthInTiles - 1;
			submitCl.max_y_tile = heightInTiles - 1;
			submitCl.width = width;
			submitCl.height = height;
			submitCl.flags |= marker->flags;

			submitCl.bo_handles = marker->handlesBuf;
			submitCl.bin_cl = ((uint8_t*)marker) + sizeof(CLMarker);
			submitCl.shader_rec = marker->shaderRecBuf;
			submitCl.uniforms = marker->uniformsBuf;

			if(marker->perfmonID)
			{
				uint32_t perfmonSelector = 0;
				uint32_t* perfmonIDptr = (uint32_t*)marker->perfmonID;

				if(pSubmits->pNext)
				{
					VkPerformanceQuerySubmitInfoKHR* perfQuerySubmitInfo = pSubmits->pNext;
					perfmonSelector = perfQuerySubmitInfo->counterPassIndex;
				}

				submitCl.perfmonid = *(perfmonIDptr + perfmonSelector);
			}

			//marker not closed yet
			//close here
			if(!marker->size)
			{
				clCloseCurrentMarker(&cmdbuf->binCl, &cmdbuf->handlesCl, &cmdbuf->shaderRecCl, cmdbuf->shaderRecCount, &cmdbuf->uniformsCl);
			}

			submitCl.bo_handle_count = marker->handlesSize / 4;
			submitCl.bin_cl_size = marker->size;
			submitCl.shader_rec_size = marker->shaderRecSize;
			submitCl.shader_rec_count = marker->shaderRecCount;
			submitCl.uniforms_size = marker->uniformsSize;

			/**/
			printf("BCL:\n");
			clDump(((uint8_t*)marker) + sizeof(CLMarker), marker->size);
			printf("BO handles: ");
			for(int d = 0; d < marker->handlesSize / 4; ++d)
			{
				printf("%u ", *((uint32_t*)(marker->handlesBuf)+d));
			}
			printf("\nUniforms: ");
			for(int d = 0; d < marker->uniformsSize / 4; ++d)
			{
				printf("%u ", *((uint32_t*)(marker->uniformsBuf)+d));
			}
			printf("\nShader recs: ");
			uint8_t* ptr = marker->shaderRecBuf + (3 + 1) * 4;
			for(int d = 0; d < marker->shaderRecCount; ++d)
			{
				uint8_t flags = *ptr;
				uint8_t fragmentShaderIsSingleThreaded = flags & (1 << 0);
				uint8_t pointSizeIncludedInShadedVertexData = (flags & (1 << 1)) >> 1;
				uint8_t enableClipping = (flags & (1 << 2)) >> 2;
				ptr += 2;

				uint8_t fragmentNumberOfUniforms = *ptr; ptr++;
				uint8_t fragmentNumberOfVaryings = *ptr; ptr++;
				uint32_t fragmentShaderCodeAddress = *(uint32_t*)ptr; ptr+=4;
				uint32_t fragmentShaderUniformAddress = *(uint32_t*)ptr; ptr+=4;

				uint16_t vertexNumberOfUniforms = *(uint16_t*)ptr; ptr+=2;
				uint8_t vertexAttribSelectBits = *ptr; ptr++;
				uint8_t vertexAttribTotalSize = *ptr; ptr++;
				uint32_t vertexShaderCodeAddress = *(uint32_t*)ptr; ptr+=4;
				uint32_t vertexShaderUniformAddress = *(uint32_t*)ptr; ptr+=4;

				uint16_t coordNumberOfUniforms = *(uint16_t*)ptr; ptr+=2;
				uint8_t coordAttribSelectBits = *ptr; ptr++;
				uint8_t coordAttribTotalSize = *ptr; ptr++;
				uint32_t coordShaderCodeAddress = *(uint32_t*)ptr; ptr+=4;
				uint32_t coordShaderUniformAddress = *(uint32_t*)ptr; ptr+=4;

				printf("\nfragmentShaderIsSingleThreaded: %i", fragmentShaderIsSingleThreaded);
				printf("\npointSizeIncludedInShadedVertexData: %i", pointSizeIncludedInShadedVertexData);
				printf("\nenableClipping: %i", enableClipping);

				printf("\nfragmentNumberOfUniforms: %i", fragmentNumberOfUniforms);
				printf("\nfragmentNumberOfVaryings: %i", fragmentNumberOfVaryings);
				printf("\nfragmentShaderCodeAddress: %i", fragmentShaderCodeAddress);
				printf("\nfragmentShaderUniformAddress: %i", fragmentShaderUniformAddress);

				printf("\nvertexNumberOfUniforms: %i", vertexNumberOfUniforms);
				printf("\nvertexAttribSelectBits: %i", vertexAttribSelectBits);
				printf("\nvertexAttribTotalSize: %i", vertexAttribTotalSize);
				printf("\nvertexShaderCodeAddress: %i", vertexShaderCodeAddress);
				printf("\nvertexShaderUniformAddress: %i", vertexShaderUniformAddress);

				printf("\ncoordNumberOfUniforms: %i", coordNumberOfUniforms);
				printf("\ncoordAttribSelectBits: %i", coordAttribSelectBits);
				printf("\ncoordAttribTotalSize: %i", coordAttribTotalSize);
				printf("\ncoordShaderCodeAddress: %i", coordShaderCodeAddress);
				printf("\ncoordShaderUniformAddress: %i", coordShaderUniformAddress);

				uint8_t numAttribs = 0;
				for(uint8_t e = 0; e < 8; ++e)
				{
					numAttribs += (vertexAttribSelectBits & (1 << e)) >> e;
				}

				for(uint8_t e = 0; e < numAttribs; ++e)
				{
					uint32_t attribBaseAddress = *(uint32_t*)ptr; ptr+=4;
					uint8_t attribNumBytes = *ptr; ptr++;
					uint8_t attribStride = *ptr; ptr++;
					uint8_t attribVsVPMOffset = *ptr; ptr++;
					uint8_t attribCsVPMOffset = *ptr; ptr++;

					printf("\nattrib \#%i", e);
					printf("\nattribBaseAddress: %i", attribBaseAddress);
					printf("\nattribNumBytes: %i", attribNumBytes);
					printf("\nattribStride: %i", attribStride);
					printf("\nattribVsVPMOffset: %i", attribVsVPMOffset);
					printf("\nattribCsVPMOffset: %i", attribCsVPMOffset);
				}
			}
			printf("\nwidth height: %u, %u\n", submitCl.width, submitCl.height);
			printf("tile min/max: %u,%u %u,%u\n", submitCl.min_x_tile, submitCl.min_y_tile, submitCl.max_x_tile, submitCl.max_y_tile);
			printf("color read surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.color_read.hindex, submitCl.color_read.offset, submitCl.color_read.bits, submitCl.color_read.flags);
			printf("color write surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.color_write.hindex, submitCl.color_write.offset, submitCl.color_write.bits, submitCl.color_write.flags);
			printf("zs read surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.zs_read.hindex, submitCl.zs_read.offset, submitCl.zs_read.bits, submitCl.zs_read.flags);
			printf("zs write surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.zs_write.hindex, submitCl.zs_write.offset, submitCl.zs_write.bits, submitCl.zs_write.flags);
			printf("msaa color write surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.msaa_color_write.hindex, submitCl.msaa_color_write.offset, submitCl.msaa_color_write.bits, submitCl.msaa_color_write.flags);
			printf("msaa zs write surf: hindex, offset, bits, flags %u %u %u %u\n", submitCl.msaa_zs_write.hindex, submitCl.msaa_zs_write.offset, submitCl.msaa_zs_write.bits, submitCl.msaa_zs_write.flags);
			printf("clear color packed rgba %u %u\n", submitCl.clear_color[0], submitCl.clear_color[1]);
			printf("clear z %u\n", submitCl.clear_z);
			printf("clear s %u\n", submitCl.clear_s);
			printf("flags %u\n", submitCl.flags);
			printf("perfmonID %u\n", submitCl.perfmonid);
			/**/

			assert(submitCl.bo_handle_count > 0);

			//TODO somehow store last finished globally
			//so waiting on fences is faster
			//eg. could be an atomic value
			static uint64_t lastFinishedSeqno = 0;

			//submit ioctl
			vc4_cl_submit(controlFd, &submitCl, &queue->lastEmitSeqno, &lastFinishedSeqno);

			//advance in linked list
			marker = marker->nextMarker;
		}
	}

	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
	{
		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_PENDING)
		{
			if(pSubmits->pCommandBuffers[c]->usageFlags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)
			{
				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_INVALID;
			}
			else
			{
				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_EXECUTABLE;
			}
		}
	}

	for(int c = 0; c < pSubmits->signalSemaphoreCount; ++c)
	{
		sem_post((sem_t*)pSubmits->pSignalSemaphores[c]);
	}

	_fence* f = fence;
	if(f)
	{
		f->seqno = queue->lastEmitSeqno;
	}

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkFreeCommandBuffers
 * Any primary command buffer that is in the recording or executable state and has any element of pCommandBuffers recorded into it, becomes invalid.
 *
 * Hands the blocks of all four control lists of each command buffer back to the pool's consecutive pool allocator,
 * then returns the command buffer itself to the pool allocator. Null entries of pCommandBuffers are skipped.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkFreeCommandBuffers(
		VkDevice                                    device,
		VkCommandPool                               commandPool,
		uint32_t                                    commandBufferCount,
		const VkCommandBuffer*                      pCommandBuffers)
{
	assert(device);
	assert(commandPool);
	assert(pCommandBuffers);

	_commandPool* pool = (_commandPool*)commandPool;

	for(uint32_t idx = 0; idx < commandBufferCount; ++idx)
	{
		VkCommandBuffer cmdbuf = pCommandBuffers[idx];

		if(!cmdbuf)
		{
			//nothing to release for this entry
			continue;
		}

		//control list memory first, the command buffer object last
		consecutivePoolFree(&pool->cpa, cmdbuf->binCl.buffer, cmdbuf->binCl.numBlocks);
		consecutivePoolFree(&pool->cpa, cmdbuf->handlesCl.buffer, cmdbuf->handlesCl.numBlocks);
		consecutivePoolFree(&pool->cpa, cmdbuf->shaderRecCl.buffer, cmdbuf->shaderRecCl.numBlocks);
		consecutivePoolFree(&pool->cpa, cmdbuf->uniformsCl.buffer, cmdbuf->uniformsCl.numBlocks);
		poolFree(&pool->pa, cmdbuf);
	}
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyCommandPool
 * When a pool is destroyed, all command buffers allocated from the pool are freed.
 * Any primary command buffer allocated from another VkCommandPool that is in the recording or executable state and has a secondary command buffer
 * allocated from commandPool recorded into it, becomes invalid.
 *
 * Frees the backing buffers of both allocators, destroys the allocators and finally frees the pool object.
 * A null commandPool is accepted and ignored.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkDestroyCommandPool(
		VkDevice                                    device,
		VkCommandPool                               commandPool,
		const VkAllocationCallbacks*                pAllocator)
{
	assert(device);

	_commandPool* pool = (_commandPool*)commandPool;

	if(!pool)
	{
		//nothing to destroy
		return;
	}

	//backing memory of the two allocators
	FREE(pool->pa.buf);
	FREE(pool->cpa.buf);

	//the allocators themselves
	destroyPoolAllocator(&pool->pa);
	destroyConsecutivePoolAllocator(&pool->cpa);

	//and the pool object
	FREE(pool);
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkTrimCommandPool
 *
 * Currently a no-op apart from validating its arguments.
 *
 * TODO trim the pool's pool allocator and consecutive pool allocator
 * by reallocating to just the used size
 * kinda silly, as if you need memory afterwards we need to reallocate again...
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkTrimCommandPool(
	VkDevice                                    device,
	VkCommandPool                               commandPool,
	VkCommandPoolTrimFlags                      flags)
{
	assert(device);
	assert(commandPool);

	//nothing is released yet
	(void)flags;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkResetCommandPool
 *
 * Moves every command buffer that is currently allocated from the pool to the initial state.
 * Allocated blocks are found by visiting every block of the pool allocator's buffer and skipping
 * those that are linked into the allocator's free chain.
 *
 * device, commandPool: must be non-null (asserted)
 * flags:               currently ignored
 *
 * Always returns VK_SUCCESS. None of the command buffers may be in the pending state (asserted).
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkResetCommandPool(
	VkDevice                                    device,
	VkCommandPool                               commandPool,
	VkCommandPoolResetFlags                     flags)
{
	assert(device);
	assert(commandPool);

	_commandPool* cp = (_commandPool*)commandPool;

	char* begin = (char*)cp->pa.buf;
	char* end = begin + cp->pa.size;

	for(char* block = begin; block != end; block += cp->pa.blockSize)
	{
		//look for the block in the free chain
		//the first 32 bits of a free block are read as the address of the next free block
		char* freeBlock = (char*)cp->pa.nextFreeBlock;
		while(freeBlock && freeBlock != block)
		{
			freeBlock = (char*)(uintptr_t)(*(uint32_t*)freeBlock);
		}

		if(freeBlock == block)
		{
			//block is free, as we found it in the free chain
			continue;
		}

		//we found a valid block
		_commandBuffer* cb = (_commandBuffer*)block;
		assert(cb->state != CMDBUF_STATE_PENDING);
		cb->state = CMDBUF_STATE_INITIAL;
	}

	//TODO secondary command buffers

	//TODO reset flag --> free all pool resources
	(void)flags;

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkResetCommandBuffer
 *
 * Updates the state of the command buffer: buffers that are being recorded or are executable become invalid,
 * all others go to the initial state. Recorded contents are not touched.
 *
 * commandBuffer: must not be pending and must come from a pool created with
 *                VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT (both asserted)
 * flags:         VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT is recognized, but nothing is released yet
 *
 * Always returns VK_SUCCESS.
 *
 * NOTE(review): the Vulkan spec describes a reset as putting the command buffer into the initial state;
 * the transition to the invalid state below is kept as it was - confirm that it is intended.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkResetCommandBuffer(
	VkCommandBuffer                             commandBuffer,
	VkCommandBufferResetFlags                   flags)
{
	assert(commandBuffer);

	_commandBuffer* cb = commandBuffer;

	assert(cb->state != CMDBUF_STATE_PENDING);

	assert(cb->cp->resetAble);

	if(cb->state == CMDBUF_STATE_RECORDING || cb->state == CMDBUF_STATE_EXECUTABLE)
	{
		cb->state = CMDBUF_STATE_INVALID;
	}
	else
	{
		cb->state = CMDBUF_STATE_INITIAL;
	}

	if(flags & VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT)
	{
		//TODO release resources
	}

	//TODO reset state?

	return VK_SUCCESS;
}

/*
 * Not implemented yet: the call is accepted and does nothing.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdExecuteCommands(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    commandBufferCount,
	const VkCommandBuffer*                      pCommandBuffers)
{
	//none of the arguments are used
	(void)commandBuffer;
	(void)commandBufferCount;
	(void)pCommandBuffers;
}

/*
 * Not supported by this driver: the call is only forwarded to the UNSUPPORTED macro.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdSetDeviceMask(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    deviceMask)
{
	//none of the arguments are used
	(void)commandBuffer;
	(void)deviceMask;

	UNSUPPORTED(vkCmdSetDeviceMask);
}