rpi-vk-driver/driver/sync.c

#include "common.h"

#include "kernel/vc4_packet.h"

//-----------------------------
//Semaphore vs Fence:
// Semaphore is GPU to GPU sync
// Fence is GPU to CPU sync
// Both are signalled by the GPU
// Both are multi-queue
// But Fence can be waited on by the CPU
// Semaphore can only be waited on by the GPU
//
//Events are general: they can be signalled by the CPU or the GPU
// But can only be waited on by the GPU
// Limited to a single queue
//
//TODO as a result the current semaphore
//implementation is wrong
//maybe use:
//clInsertWaitOnSemaphore
//clInsertIncrementSemaphore
//
//seems like each binCL needs to end with increment semaphore
//signalling that binning is done
//and each renderCL starts with a wait semaphore (to wait for binning)
//
//in theory we could add a wait for semaphore to the start of a binCL
//and an increment semaphore to the end of either another binCL or a renderCL
//but we can't control renderCLs as the kernel side creates those...
//
//also there's only one of this semaphore, and in Vulkan you can have many
//and should only signal those selected
//so maybe we could emulate this in shaders?
//ie. stall shader until a value is something?
//and increment said value?
//but we'd need to patch shaders and it'd probably be slow...
//
//Apparently the RPi contains 16 4bit semaphores that are accessible for each QPU via SFU
//-----------------------------

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSemaphore
 * Semaphores are a synchronization primitive that can be used to insert a dependency between batches submitted to queues.
 * Semaphores have two states - signaled and unsignaled. The state of a semaphore can be signaled after execution of a batch of commands is completed.
 * A batch can wait for a semaphore to become signaled before it begins execution, and the semaphore is also unsignaled before the batch begins execution.
 * As with most objects in Vulkan, semaphores are an interface to internal data which is typically opaque to applications.
 * This internal data is referred to as a semaphore’s payload. However, in order to enable communication with agents outside of the current device,
 * it is necessary to be able to export that payload to a commonly understood format, and subsequently import from that format as well.
 * The internal data of a semaphore may include a reference to any resources and pending work associated with signal or unsignal operations performed on that semaphore object.
 * Mechanisms to import and export that internal data to and from semaphores are provided below.
 * These mechanisms indirectly enable applications to share semaphore state between two or more semaphores and other synchronization primitives across process and API boundaries.
 * When created, the semaphore is in the unsignaled state.
 */
//Driver entry point for vkCreateSemaphore.
//Allocates a POSIX sem_t through the ALLOCATE macro, initialises it with a
//count of 0 (unsignalled, not shared across processes) and stores the pointer
//in *pSemaphore as the handle.
//pCreateInfo is not read. pAllocator is not referenced directly in this body
//(presumably ALLOCATE picks it up -- TODO confirm against the macro definition).
//Returns VK_ERROR_OUT_OF_HOST_MEMORY when the allocation fails, VK_SUCCESS otherwise.
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateSemaphore(
		VkDevice                                    device,
		const VkSemaphoreCreateInfo*                pCreateInfo,
		const VkAllocationCallbacks*                pAllocator,
		VkSemaphore*                                pSemaphore)
{
	PROFILESTART(rpi_vkCreateSemaphore);

	assert(device);
	assert(pSemaphore);

	//pessimistic default, overwritten once the semaphore exists
	VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;

	//we'll probably just use an IOCTL to wait for a GPU sequence number to complete.
	sem_t* hostSem = ALLOCATE(sizeof(sem_t), 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if(hostSem)
	{
		//pshared = 0: shared between threads of this process only
		//value   = 0: semaphore starts out unsignalled
		sem_init(hostSem, 0, 0);

		*pSemaphore = (VkSemaphore)hostSem;
		result = VK_SUCCESS;
	}

	PROFILEEND(rpi_vkCreateSemaphore);
	return result;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdPipelineBarrier
 * vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass.
 * When vkCmdPipelineBarrier is submitted to a queue, it defines a memory dependency between commands that were submitted before it, and those submitted after it.
 * If vkCmdPipelineBarrier was recorded outside a render pass instance, the first synchronization scope includes all commands that occur earlier in submission order.
 * If vkCmdPipelineBarrier was recorded inside a render pass instance, the first synchronization scope includes only commands that occur earlier in submission order within the same subpass.
 * In either case, the first synchronization scope is limited to operations on the pipeline stages determined by the source stage mask specified by srcStageMask.
 *
 * If vkCmdPipelineBarrier was recorded outside a render pass instance, the second synchronization scope includes all commands that occur later in submission order.
 * If vkCmdPipelineBarrier was recorded inside a render pass instance, the second synchronization scope includes only commands that occur later in submission order within the same subpass.
 * In either case, the second synchronization scope is limited to operations on the pipeline stages determined by the destination stage mask specified by dstStageMask.
 *
 * The first access scope is limited to access in the pipeline stages determined by the source stage mask specified by srcStageMask.
 * Within that, the first access scope only includes the first access scopes defined by elements of the pMemoryBarriers,
 * pBufferMemoryBarriers and pImageMemoryBarriers arrays, which each define a set of memory barriers. If no memory barriers are specified,
 * then the first access scope includes no accesses.
 *
 * The second access scope is limited to access in the pipeline stages determined by the destination stage mask specified by dstStageMask.
 * Within that, the second access scope only includes the second access scopes defined by elements of the pMemoryBarriers, pBufferMemoryBarriers and pImageMemoryBarriers arrays,
 * which each define a set of memory barriers. If no memory barriers are specified, then the second access scope includes no accesses.
 *
 * If dependencyFlags includes VK_DEPENDENCY_BY_REGION_BIT, then any dependency between framebuffer-space pipeline stages is framebuffer-local - otherwise it is framebuffer-global.
 */
//Driver entry point for vkCmdPipelineBarrier.
//Only image memory barriers have an effect at the moment: for each one the
//tracked layout of the referenced image is overwritten with newLayout.
//This happens immediately, when the command is recorded.
//Global and buffer memory barriers, the stage masks (apart from one sanity
//check), dependencyFlags and the access masks are not acted upon yet (see TODOs).
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdPipelineBarrier(
		VkCommandBuffer                             commandBuffer,
		VkPipelineStageFlags                        srcStageMask,
		VkPipelineStageFlags                        dstStageMask,
		VkDependencyFlags                           dependencyFlags,
		uint32_t                                    memoryBarrierCount,
		const VkMemoryBarrier*                      pMemoryBarriers,
		uint32_t                                    bufferMemoryBarrierCount,
		const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
		uint32_t                                    imageMemoryBarrierCount,
		const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
	PROFILESTART(rpi_vkCmdPipelineBarrier);

	assert(commandBuffer);
	//a non-empty barrier list must come with an array to read from
	assert(imageMemoryBarrierCount == 0 || pImageMemoryBarriers);

	//TODO pipeline stage flags
	//VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
	//VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT
	//VK_PIPELINE_STAGE_VERTEX_INPUT_BIT
	//VK_PIPELINE_STAGE_VERTEX_SHADER_BIT
	//VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT
	//VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT
	//VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT
	//VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT
	//VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT
	//VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
	//VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
	//VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
	//VK_PIPELINE_STAGE_TRANSFER_BIT
	//VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
	//VK_PIPELINE_STAGE_HOST_BIT
	//VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT
	//VK_PIPELINE_STAGE_ALL_COMMANDS_BIT

	//TODO dependency flags
	//VK_DEPENDENCY_BY_REGION_BIT,
	//VK_DEPENDENCY_DEVICE_GROUP_BIT,
	//VK_DEPENDENCY_VIEW_LOCAL_BIT

	//TODO access flags
	//VK_ACCESS_INDIRECT_COMMAND_READ_BIT
	//VK_ACCESS_INDEX_READ_BIT
	//VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT
	//VK_ACCESS_UNIFORM_READ_BIT
	//VK_ACCESS_INPUT_ATTACHMENT_READ_BIT
	//VK_ACCESS_SHADER_READ_BIT
	//VK_ACCESS_SHADER_WRITE_BIT
	//VK_ACCESS_COLOR_ATTACHMENT_READ_BIT
	//VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT
	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT
	//VK_ACCESS_TRANSFER_READ_BIT
	//VK_ACCESS_TRANSFER_WRITE_BIT
	//VK_ACCESS_HOST_READ_BIT
	//VK_ACCESS_HOST_WRITE_BIT
	//VK_ACCESS_MEMORY_READ_BIT
	//VK_ACCESS_MEMORY_WRITE_BIT
	//VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX
	//VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX

	//TODO Layout transition flags
	//VK_IMAGE_LAYOUT_UNDEFINED
	//VK_IMAGE_LAYOUT_GENERAL
	//VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
	//VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
	//VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
	//VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
	//VK_IMAGE_LAYOUT_PREINITIALIZED
	//VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
	//VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
	//VK_IMAGE_LAYOUT_PRESENT_SRC_KHR
	//VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR

	//loop counters are uint32_t so they have the same type as the counts
	//they are compared against
	for(uint32_t c = 0; c < memoryBarrierCount; ++c)
	{
		//TODO
	}

	for(uint32_t c = 0; c < bufferMemoryBarrierCount; ++c)
	{
		//TODO
	}

	for(uint32_t c = 0; c < imageMemoryBarrierCount; ++c)
	{
		const VkImageMemoryBarrier* barrier = &pImageMemoryBarriers[c];
		_image* img = barrier->image;

		//sanity check: a barrier that follows a transfer write expects the
		//image to still be tracked as a transfer destination
		if(srcStageMask & VK_PIPELINE_STAGE_TRANSFER_BIT &&
		   barrier->srcAccessMask & VK_ACCESS_TRANSFER_WRITE_BIT)
		{
			assert(img->layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
		}

		//transition to new layout
		img->layout = barrier->newLayout;
	}

	PROFILEEND(rpi_vkCmdPipelineBarrier);
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDeviceWaitIdle
 * vkDeviceWaitIdle is equivalent to calling vkQueueWaitIdle for all queues owned by device.
 *
 * Implementation: for every queue of every queue family, call vc4_seqno_wait on
 * the queue's lastEmitSeqno with an infinite timeout.
 * The result of vc4_seqno_wait is not inspected; VK_SUCCESS is always returned.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkDeviceWaitIdle(
		VkDevice									device)
{
	PROFILESTART(rpi_vkDeviceWaitIdle);

	assert(device);

	for(int c = 0; c < numQueueFamilies; ++c)
	{
		for(int d = 0; d < device->numQueues[c]; ++d)
		{
			//initialised to 0 (as rpi_vkWaitForFences does) so the helper never
			//sees an indeterminate value should it read through the pointer
			uint64_t lastFinishedSeqno = 0;
			//fresh per queue: the timeout is passed by address
			uint64_t timeout = WAIT_TIMEOUT_INFINITE;
			vc4_seqno_wait(controlFd, &lastFinishedSeqno, device->queues[c][d].lastEmitSeqno, &timeout);
		}
	}

	PROFILEEND(rpi_vkDeviceWaitIdle);
	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueWaitIdle
 *
 * Implementation: call vc4_seqno_wait on the queue's lastEmitSeqno with an
 * infinite timeout.
 * The result of vc4_seqno_wait is not inspected; VK_SUCCESS is always returned.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkQueueWaitIdle(
	VkQueue                                     queue)
{
	PROFILESTART(rpi_vkQueueWaitIdle);

	assert(queue);

	_queue* q = queue;
	//initialised to 0 (as rpi_vkWaitForFences does) so the helper never
	//sees an indeterminate value should it read through the pointer
	uint64_t lastFinishedSeqno = 0;
	uint64_t timeout = WAIT_TIMEOUT_INFINITE;
	vc4_seqno_wait(controlFd, &lastFinishedSeqno, q->lastEmitSeqno, &timeout);

	PROFILEEND(rpi_vkQueueWaitIdle);
	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySemaphore
 *
 * Destroys the POSIX semaphore behind the handle and releases its storage
 * through the FREE macro. A null handle is accepted and ignored.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkDestroySemaphore(
		VkDevice                                    device,
		VkSemaphore                                 semaphore,
		const VkAllocationCallbacks*                pAllocator)
{
	PROFILESTART(rpi_vkDestroySemaphore);

	assert(device);

	if(!semaphore)
	{
		//nothing to tear down
		PROFILEEND(rpi_vkDestroySemaphore);
		return;
	}

	//the handle is the sem_t pointer that was stored at creation time
	sem_destroy((sem_t*)semaphore);
	FREE(semaphore);

	PROFILEEND(rpi_vkDestroySemaphore);
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateFence
 *
 * Allocates a _fence through the ALLOCATE macro, with seqno 0 and a signaled
 * flag taken from VK_FENCE_CREATE_SIGNALED_BIT in pCreateInfo->flags, and
 * stores it in *pFence.
 * Returns VK_ERROR_OUT_OF_HOST_MEMORY when the allocation fails, VK_SUCCESS otherwise.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateFence(
	VkDevice                                    device,
	const VkFenceCreateInfo*                    pCreateInfo,
	const VkAllocationCallbacks*                pAllocator,
	VkFence*                                    pFence)
{
	PROFILESTART(rpi_vkCreateFence);

	assert(device);
	assert(pCreateInfo);
	assert(pFence);

	//pessimistic default, overwritten once the fence exists
	VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;

	_fence* newFence = ALLOCATE(sizeof(_fence), 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
	if(newFence)
	{
		//a fence may be requested in the signaled state at creation time
		newFence->signaled = pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT;
		//no submission is associated with the fence yet
		newFence->seqno = 0;

		*pFence = newFence;
		result = VK_SUCCESS;
	}

	PROFILEEND(rpi_vkCreateFence);
	return result;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyFence
 *
 * Releases the fence's storage through the FREE macro.
 * A null handle is accepted and ignored.
 */
VKAPI_ATTR void VKAPI_CALL rpi_vkDestroyFence(
	VkDevice                                    device,
	VkFence                                     fence,
	const VkAllocationCallbacks*                pAllocator)
{
	PROFILESTART(rpi_vkDestroyFence);

	assert(device);

	if(!fence)
	{
		//nothing to release
		PROFILEEND(rpi_vkDestroyFence);
		return;
	}

	FREE(fence);

	PROFILEEND(rpi_vkDestroyFence);
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetFenceStatus
 *
 * Reports the fence's cached signaled flag: VK_SUCCESS when it is set,
 * VK_NOT_READY otherwise. The GPU is not queried here.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkGetFenceStatus(
	VkDevice                                    device,
	VkFence                                     fence)
{
	PROFILESTART(rpi_vkGetFenceStatus);

	assert(device);
	assert(fence);

	//TODO update fence status based on last completed seqno?

	_fence* fenceState = fence;

	VkResult status = VK_NOT_READY;
	if(fenceState->signaled)
	{
		status = VK_SUCCESS;
	}

	PROFILEEND(rpi_vkGetFenceStatus);
	return status;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkResetFences
 *
 * Puts each of the fenceCount fences in pFences back into the unsignaled
 * state and clears its seqno. Always returns VK_SUCCESS.
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkResetFences(
	VkDevice                                    device,
	uint32_t                                    fenceCount,
	const VkFence*                              pFences)
{
	PROFILESTART(rpi_vkResetFences);

	assert(device);
	assert(pFences);
	assert(fenceCount > 0);

	//walk the handle array front to back
	const VkFence* const end = pFences + fenceCount;
	for(const VkFence* it = pFences; it != end; ++it)
	{
		_fence* current = *it;
		current->seqno = 0;
		current->signaled = 0;
	}

	PROFILEEND(rpi_vkResetFences);
	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkWaitForFences
 *
 * Waits on the host for one (waitAll == VK_FALSE) or all (waitAll != VK_FALSE)
 * of the fenceCount fences in pFences.
 *
 * timeout == 0 is a pure poll: only the cached signaled flags are examined and
 * vc4_seqno_wait is never called.
 * Otherwise unsignaled fences are waited on through vc4_seqno_wait using their
 * seqno; a fence whose wait succeeds is marked signaled and its seqno cleared.
 * The timeout is handed to vc4_seqno_wait by address and the same variable is
 * reused for every fence.
 *
 * Returns VK_SUCCESS when the wait condition is met, VK_TIMEOUT otherwise
 * (any negative result from vc4_seqno_wait is treated as a timeout).
 */
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkWaitForFences(
	VkDevice                                    device,
	uint32_t                                    fenceCount,
	const VkFence*                              pFences,
	VkBool32                                    waitAll,
	uint64_t                                    timeout)
{
	PROFILESTART(rpi_vkWaitForFences);

	assert(device);
	assert(pFences);
	assert(fenceCount > 0);

	//captured up front: timeout itself is passed by address further down,
	//so its value may no longer reflect what the caller asked for
	const int pollOnly = (timeout == 0);

	VkResult result;

	if(waitAll)
	{
		//succeed only if every fence turns out to be signaled
		result = VK_SUCCESS;

		for(uint32_t c = 0; c < fenceCount; ++c)
		{
			_fence* f = pFences[c];

			if(f->signaled)
			{
				continue;
			}

			if(pollOnly)
			{
				//one unsignaled fence is enough to fail a poll
				//(every fence is examined, not just the first one)
				result = VK_TIMEOUT;
				break;
			}

			uint64_t lastFinishedSeqno = 0;
			int ret = vc4_seqno_wait(controlFd, &lastFinishedSeqno, f->seqno, &timeout);

			if(ret < 0)
			{
				result = VK_TIMEOUT;
				break;
			}

			f->signaled = 1;
			f->seqno = 0;
		}
	}
	else
	{
		//succeed as soon as any fence is signaled
		result = VK_TIMEOUT;

		//first look for a fence that is already signaled, across the whole
		//array, so we never block on an earlier fence when a later one is done
		for(uint32_t c = 0; c < fenceCount; ++c)
		{
			_fence* f = pFences[c];
			if(f->signaled)
			{
				result = VK_SUCCESS;
				break;
			}
		}

		if(result != VK_SUCCESS && !pollOnly)
		{
			//none signaled yet: wait on them in order until one completes
			for(uint32_t c = 0; c < fenceCount; ++c)
			{
				_fence* f = pFences[c];
				uint64_t lastFinishedSeqno = 0;
				int ret = vc4_seqno_wait(controlFd, &lastFinishedSeqno, f->seqno, &timeout);

				if(ret < 0)
				{
					//this one did not complete, try the next fence
					continue;
				}

				f->signaled = 1;
				f->seqno = 0;
				result = VK_SUCCESS;
				break;
			}
		}
	}

	PROFILEEND(rpi_vkWaitForFences);
	return result;
}

//Stub for vkCmdWaitEvents: events are not implemented.
//All parameters are ignored; the body only invokes the UNSUPPORTED macro
//(defined elsewhere -- presumably it reports the call as unsupported, TODO confirm).
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdWaitEvents(
	VkCommandBuffer                             commandBuffer,
	uint32_t                                    eventCount,
	const VkEvent*                              pEvents,
	VkPipelineStageFlags                        srcStageMask,
	VkPipelineStageFlags                        dstStageMask,
	uint32_t                                    memoryBarrierCount,
	const VkMemoryBarrier*                      pMemoryBarriers,
	uint32_t                                    bufferMemoryBarrierCount,
	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
	uint32_t                                    imageMemoryBarrierCount,
	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
{
	UNSUPPORTED(vkCmdWaitEvents);
}

//Stub for vkGetEventStatus: events are not implemented.
//Both parameters are ignored; invokes the UNSUPPORTED macro and returns
//UNSUPPORTED_RETURN (both defined elsewhere).
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkGetEventStatus(
	VkDevice                                    device,
	VkEvent                                     event)
{
	UNSUPPORTED(vkGetEventStatus);
	return UNSUPPORTED_RETURN;
}

//Stub for vkDestroyEvent: events are not implemented.
//All parameters are ignored and nothing is freed; the body only invokes the
//UNSUPPORTED macro (defined elsewhere).
VKAPI_ATTR void VKAPI_CALL rpi_vkDestroyEvent(
	VkDevice                                    device,
	VkEvent                                     event,
	const VkAllocationCallbacks*                pAllocator)
{
	UNSUPPORTED(vkDestroyEvent);
}

//Stub for vkCmdResetEvent: events are not implemented.
//All parameters are ignored; the body only invokes the UNSUPPORTED macro
//(defined elsewhere).
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdResetEvent(
	VkCommandBuffer                             commandBuffer,
	VkEvent                                     event,
	VkPipelineStageFlags                        stageMask)
{
	UNSUPPORTED(vkCmdResetEvent);
}

//Stub for vkCreateEvent: events are not implemented.
//All parameters are ignored and *pEvent is not written; invokes the
//UNSUPPORTED macro and returns UNSUPPORTED_RETURN (both defined elsewhere).
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkCreateEvent(
	VkDevice                                    device,
	const VkEventCreateInfo*                    pCreateInfo,
	const VkAllocationCallbacks*                pAllocator,
	VkEvent*                                    pEvent)
{
	UNSUPPORTED(vkCreateEvent);
	return UNSUPPORTED_RETURN;
}

//Stub for vkResetEvent: events are not implemented.
//Both parameters are ignored; invokes the UNSUPPORTED macro and returns
//UNSUPPORTED_RETURN (both defined elsewhere).
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkResetEvent(
	VkDevice                                    device,
	VkEvent                                     event)
{
	UNSUPPORTED(vkResetEvent);
	return UNSUPPORTED_RETURN;
}

//Stub for vkSetEvent: events are not implemented.
//Both parameters are ignored; invokes the UNSUPPORTED macro and returns
//UNSUPPORTED_RETURN (both defined elsewhere).
VKAPI_ATTR VkResult VKAPI_CALL rpi_vkSetEvent(
	VkDevice                                    device,
	VkEvent                                     event)
{
	UNSUPPORTED(vkSetEvent);
	return UNSUPPORTED_RETURN;
}

//Stub for vkCmdSetEvent: events are not implemented.
//All parameters are ignored; the body only invokes the UNSUPPORTED macro
//(defined elsewhere).
VKAPI_ATTR void VKAPI_CALL rpi_vkCmdSetEvent(
	VkCommandBuffer                             commandBuffer,
	VkEvent                                     event,
	VkPipelineStageFlags                        stageMask)
{
	UNSUPPORTED(vkCmdSetEvent);
}