diff --git a/driver/AlignedAllocator.c b/driver/AlignedAllocator.c
new file mode 100644
index 0000000..7e7c6e8
--- /dev/null
+++ b/driver/AlignedAllocator.c
@@ -0,0 +1,31 @@
+#include "AlignedAllocator.h"
+
+void* alignedAlloc( unsigned bytes, unsigned alignment ) //returns bytes of memory aligned to alignment, or 0 on failure
+{
+	if( !bytes )
+	{
+		return 0; //zero sized requests are rejected
+	}
+
+	const unsigned maxBytes = 1024 * 1024 * 1024; //1GB is max on RPi
+
+	if( bytes > maxBytes )
+	{
+		return 0; //bad alloc
+	}
+
+	void* pv = 0;
+
+	if( posix_memalign( &pv, alignment, bytes ) ) //nonzero return is an error code (POSIX: alignment must be a power of two multiple of sizeof(void*))
+	{
+		pv = 0; //allocation failed
+	}
+
+	return pv;
+}
+
+void alignedFree( void* p ) //hands p back to the C allocator
+{
+	free( p ); //free accepts a null pointer, so p may be 0
+}
+
diff --git a/driver/AlignedAllocator.h b/driver/AlignedAllocator.h
index 8f9471c..dc71b76 100644
--- a/driver/AlignedAllocator.h
+++ b/driver/AlignedAllocator.h
@@ -6,34 +6,8 @@ extern "C" {
 
 #include <stdlib.h>
 
-void* alignedAlloc( unsigned bytes, unsigned alignment )
-{
-	if( !bytes )
-	{
-		return 0;
-	}
-
-	const unsigned maxBytes = 1024 * 1024 * 1024; //1GB is max on RPi
-
-	if( bytes > maxBytes )
-	{
-		return 0; //bad alloc
-	}
-
-	void* pv = 0;
-
-	if( posix_memalign( &pv, alignment, bytes ) )
-	{
-		pv = 0; //allocation failed
-	}
-
-	return pv;
-}
-
-void alignedFree( void* p )
-{
-	free( p );
-}
+void* alignedAlloc( unsigned bytes, unsigned alignment ); //returns 0 on failure; defined in AlignedAllocator.c
+void alignedFree( void* p ); //defined in AlignedAllocator.c
 
 #if defined (__cplusplus)
 }
diff --git a/driver/ConsecutivePoolAllocator.c b/driver/ConsecutivePoolAllocator.c
new file mode 100644
index 0000000..ef1ab75
--- /dev/null
+++ b/driver/ConsecutivePoolAllocator.c
@@ -0,0 +1,157 @@
+#include "ConsecutivePoolAllocator.h"
+
+#include "CustomAssert.h"
+
+#include <stdint.h>
+
+ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s) //builds a pool over caller supplied buffer b of s bytes, split into blocks of bs bytes
+{
+	assert(b); //only allocated memory
+	assert(bs >= sizeof(void*)); //we need to be able to store a next-free link inside each block
+	assert(s%bs==0); //we want a size that is the exact multiple of block size
+	assert(s > bs); //strictly greater: together with the check above this demands at least 2 blocks
+
+	ConsecutivePoolAllocator pa =
+	{
+		.buf = b,
+		.nextFreeBlock = (uint32_t*)b,
+		.blockSize = bs,
+		.size = s
+	};
+
+	//initialize linked list of free pointers
+	uint32_t* ptr = pa.nextFreeBlock;
+	unsigned last = s/bs - 1; //index of the final block
+	for(unsigned c = 0; c < last; ++c)
+	{
+		*ptr = (uint32_t)ptr + bs; //the link is stored as the 32 bit address of the following block
+		ptr += bs/4; //step one block; ptr counts in 4 byte units, so this assumes bs is a multiple of 4
+	}
+
+	*ptr = 0; //last element
+
+	return pa;
+}
+
+void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa) //resets every field of pa to 0
+{
+	//actual memory freeing is done by caller
+	pa->buf = 0;
+	pa->nextFreeBlock = 0;
+	pa->blockSize = 0;
+	pa->size = 0;
+}
+
+//allocate numBlocks consecutive blocks from the free list; returns the first block, or 0 if no such run is free
+void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks)
+{
+	assert(pa->buf);
+
+	if(!pa->nextFreeBlock || !numBlocks)
+	{
+		return 0; //no free blocks (or nothing requested, which would wrap numBlocks - 1 below)
+	}
+
+	void* ret = 0;
+	for(uint32_t* candidate = pa->nextFreeBlock; candidate; candidate = (uint32_t*)*candidate)
+	{
+		uint32_t found = 1;
+		uint32_t* prevBlock = candidate;
+		uint32_t* blockAfterCandidate = (uint32_t*)*candidate;
+		//check if there are enough consecutive free blocks
+		for(uint32_t c = 0; c < numBlocks - 1; ++c)
+		{
+			if((char*)blockAfterCandidate != (char*)prevBlock + pa->blockSize)
+			{
+				//signal if not consecutive (compared in bytes: subtracting uint32_t* would count 4 byte units, never blockSize)
+				found = 0;
+				break;
+			}
+			prevBlock = blockAfterCandidate;
+			blockAfterCandidate = (uint32_t*)*blockAfterCandidate;
+		}
+
+		//numblocks consecutive blocks found, blockAfterCandidate is now the free block following the run (or 0)
+		if(found)
+		{
+			ret = candidate;
+			if(pa->nextFreeBlock == candidate)
+			{
+				//candidate found immediately
+				pa->nextFreeBlock = blockAfterCandidate;
+			}
+			else
+			{
+				//somewhere the linked list would point to candidate, we need to correct this
+				for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
+				{
+					if((uint32_t*)*nextFreeBlockCandidate == candidate)
+					{
+						*nextFreeBlockCandidate = (uint32_t)blockAfterCandidate;
+						break;
+					}
+				}
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+//free numBlocks consecutive blocks starting at p, linking them back into the free list in address order
+void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks)
+{
+	assert(pa->buf);
+	assert(p);
+
+	if(!pa->nextFreeBlock || (void*)pa->nextFreeBlock > p) //free list is empty (pool fully allocated) or p lies before the current head
+	{
+		for(uint32_t c = 0; c < numBlocks - 1; ++c)
+		{
+			//set each allocated block to form a linked list
+			*(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
+		}
+		//set last block to point to the next free (0 when the list was empty)
+		*(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = (uint32_t)pa->nextFreeBlock;
+		//set next free to the newly freed block
+		pa->nextFreeBlock = p;
+		return;
+	}
+
+	//somewhere the linked list may point after the free block (or null), we need to correct this
+	for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
+	{
+		if((void*)*nextFreeBlockCandidate > p || !*nextFreeBlockCandidate) //first free block whose successor is past p, or the tail
+		{
+			for(uint32_t c = 0; c < numBlocks - 1; ++c)
+			{
+				//set each allocated block to form a linked list
+				*(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
+			}
+			//set last block to point to the next free
+			*(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = *nextFreeBlockCandidate;
+
+			*nextFreeBlockCandidate = (uint32_t)p; //splice the freed run in after this block
+			break;
+		}
+	}
+}
+
+//grows currentMem (currNumBlocks blocks) by one block and keeps its contents; returns 0 and leaves currentMem allocated if the pool can't fit it
+//if the head of the free list is the block right after currentMem it is claimed in place, else the data moves to a new run and the old one is freed
+void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks)
+{
+	if(pa->nextFreeBlock == (uint32_t*)((char*)currentMem + currNumBlocks * pa->blockSize))
+	{
+		//we have one more block after current one, so just expand current
+		pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock;
+		return currentMem;
+	}
+	//no adjacent block at the head of the free list: look for a fresh run of currNumBlocks + 1 blocks
+	void* ret = consecutivePoolAllocate(pa, currNumBlocks + 1);
+	if(!ret) return 0; //out of space: currentMem is NOT freed, so the caller still owns valid data
+	for(uint32_t c = 0; c < currNumBlocks * pa->blockSize; ++c) ((char*)ret)[c] = ((char*)currentMem)[c]; //copy first: the free below overwrites the start of every old block with a list link
+	consecutivePoolFree(pa, currentMem, currNumBlocks);
+	return ret;
+}
diff --git a/driver/ConsecutivePoolAllocator.h b/driver/ConsecutivePoolAllocator.h
index a90f751..112d57d 100644
--- a/driver/ConsecutivePoolAllocator.h
+++ b/driver/ConsecutivePoolAllocator.h
@@ -16,157 +16,11 @@ typedef struct ConsecutivePoolAllocator
 	unsigned size; //size is exact multiple of block size
 } ConsecutivePoolAllocator;
 
-ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s)
-{
-	assert(b); //only allocated memory
-	assert(bs >= sizeof(void*)); //we need to be able to store
-	assert(s%bs==0); //we want a size that is the exact multiple of block size
-	assert(s > bs); //at least 1 element
-
-	ConsecutivePoolAllocator pa =
-	{
-		.buf = b,
-		.nextFreeBlock = (uint32_t*)b,
-		.blockSize = bs,
-		.size = s
-	};
-
-	//initialize linked list of free pointers
-	uint32_t* ptr = pa.nextFreeBlock;
-	unsigned last = s/bs - 1;
-	for(unsigned c = 0; c < last; ++c)
-	{
-		*ptr = (uint32_t)ptr + bs;
-		ptr += bs/4;
-	}
-
-	*ptr = 0; //last element
-
-	return pa;
-}
-
-void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa)
-{
-	//actual memory freeing is done by caller
-	pa->buf = 0;
-	pa->nextFreeBlock = 0;
-	pa->blockSize = 0;
-	pa->size = 0;
-}
-
-//allocate numBlocks consecutive memory
-void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks)
-{
-	assert(pa->buf);
-
-	if(!pa->nextFreeBlock)
-	{
-		return 0; //no free blocks
-	}
-
-	void* ret = 0;
-	for(uint32_t* candidate = pa->nextFreeBlock; candidate; candidate = (uint32_t*)*candidate)
-	{
-		uint32_t found = 1;
-		uint32_t* prevBlock = candidate;
-		uint32_t* blockAfterCandidate = (uint32_t*)*candidate;
-		//check if there are enough consecutive free blocks
-		for(uint32_t c = 0; c < numBlocks - 1; ++c)
-		{
-			if(blockAfterCandidate - prevBlock != pa->blockSize)
-			{
-				//signal if not consecutive (ie. diff is greater than blocksize)
-				found = 0;
-				break;
-			}
-			prevBlock = blockAfterCandidate;
-			blockAfterCandidate = (uint32_t*)*blockAfterCandidate;
-		}
-
-		//numblocks consecutive blocks found
-		if(found)
-		{
-			ret = candidate;
-			if(pa->nextFreeBlock == candidate)
-			{
-				//candidate found immediately
-				pa->nextFreeBlock = blockAfterCandidate;
-			}
-			else
-			{
-				//somewhere the linked list would point to candidate, we need to correct this
-				for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
-				{
-					if((uint32_t*)*nextFreeBlockCandidate == candidate)
-					{
-						*nextFreeBlockCandidate = (uint32_t)blockAfterCandidate;
-						break;
-					}
-				}
-			}
-			break;
-		}
-	}
-
-	return ret;
-}
-
-//free numBlocks consecutive memory
-void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks)
-{
-	assert(pa->buf);
-	assert(p);
-
-	if((void*)pa->nextFreeBlock > p)
-	{
-		for(uint32_t c = 0; c < numBlocks - 1; ++c)
-		{
-			//set each allocated block to form a linked list
-			*(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
-		}
-		//set last block to point to the next free
-		*(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = (uint32_t)pa->nextFreeBlock;
-		//set next free to the newly freed block
-		pa->nextFreeBlock = p;
-		return;
-	}
-
-	//somewhere the linked list may point after the free block (or null), we need to correct this
-	for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
-	{
-		if((void*)*nextFreeBlockCandidate > p || !*nextFreeBlockCandidate)
-		{
-			for(uint32_t c = 0; c < numBlocks - 1; ++c)
-			{
-				//set each allocated block to form a linked list
-				*(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
-			}
-			//set last block to point to the next free
-			*(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = *nextFreeBlockCandidate;
-
-			*nextFreeBlockCandidate = (uint32_t)p;
-			break;
-		}
-	}
-}
-
-//if there's a block free after the current block, it just allocates one more block
-//else it frees current block and allocates a new one
-void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks)
-{
-	if(pa->nextFreeBlock == (uint32_t*)((char*)currentMem + currNumBlocks * pa->blockSize))
-	{
-		//we have one more block after current one, so just expand current
-		pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock;
-		return currentMem;
-	}
-	else
-	{
-		void* ret = consecutivePoolAllocate(pa, currNumBlocks + 1);
-		consecutivePoolFree(pa, currentMem, currNumBlocks);
-		return ret;
-	}
-}
+ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s); //b: buffer, bs: block size in bytes, s: buffer size in bytes
+void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa); //only zeroes the fields of pa
+void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks); //returns 0 when no run of numBlocks blocks is found
+void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks); //p: first block of the run being returned
+void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks); //asks for currNumBlocks + 1 blocks
 
 #if defined (__cplusplus)
 }
diff --git a/driver/ControlListUtil.c b/driver/ControlListUtil.c
new file mode 100644
index 0000000..0439d1c
--- /dev/null
+++ b/driver/ControlListUtil.c
@@ -0,0 +1,715 @@
+#include "ControlListUtil.h"
+
+#include <stdint.h>
+
+uint32_t divRoundUp(uint32_t n, uint32_t d) //n divided by d, rounded up; d must not be 0
+{
+	return (((n) + (d) - 1) / (d)); //n + d - 1 wraps around when it exceeds 32 bits
+}
+
+//move the low `bits` bits of d up to bit position offset, mask rest to 0 (bits >= 32 keeps every bit; offset must be below 32)
+uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset)
+{
+	return (d << offset) & ((bits >= 32 ? ~0u : ~(~0u << bits)) << offset); //unsigned mask: left-shifting the int ~0, or shifting by 32, is undefined in C
+}
+
+uint32_t clSize(ControlList* cl) //number of bytes between the start of the buffer and the next free byte
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	return cl->nextFreeByte - cl->buffer;
+}
+
+uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size) //1 if size more bytes stay below CONTROL_LIST_SIZE, else 0
+{
+	uint32_t currSize = clSize(cl);
+	if(currSize + size < CONTROL_LIST_SIZE) //strict compare: ending exactly at CONTROL_LIST_SIZE counts as not fitting; NOTE(review): cl->numBlocks is not taken into account — confirm
+	{
+		return 1; //fits!
+	}
+	else
+	{
+		return 0; //need to reallocate
+	}
+}
+
+
+void clInit(ControlList* cl, void* buffer) //points cl at buffer and marks it as empty
+{
+	assert(cl);
+	assert(buffer);
+	cl->buffer = buffer;
+	cl->numBlocks = 1; //a fresh list starts with a block count of 1
+	cl->nextFreeByte = &cl->buffer[0]; //writing starts at the first byte of the buffer
+}
+
+void clInsertHalt(ControlList* cl) //appends the HALT opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_HALT_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertNop(ControlList* cl) //appends the NOP opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_NOP_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertFlush(ControlList* cl) //appends the FLUSH opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_FLUSH_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertFlushAllState(ControlList* cl) //appends the FLUSH_ALL_STATE opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_FLUSH_ALL_STATE_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertStartTileBinning(ControlList* cl) //appends the START_TILE_BINNING opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_START_TILE_BINNING_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertIncrementSemaphore(ControlList* cl) //appends the INCREMENT_SEMAPHORE opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_INCREMENT_SEMAPHORE_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertWaitOnSemaphore(ControlList* cl) //appends the WAIT_ON_SEMAPHORE opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_WAIT_ON_SEMAPHORE_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+//input: 2 cls (cl, handles cl); appends the BRANCH opcode, a relocation for address and then address.offset
+void clInsertBranch(ControlList* cls, ControlListAddress address)
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_BRANCH_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address); //this call itself advances cls->nextFreeByte
+	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+}
+
+//input: 2 cls (cl, handles cl); appends the BRANCH_TO_SUB_LIST opcode, a relocation for address and then address.offset
+void clInsertBranchToSubList(ControlList* cls, ControlListAddress address)
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_BRANCH_TO_SUB_LIST_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address); //this call itself advances cls->nextFreeByte
+	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+}
+
+void clInsertReturnFromSubList(ControlList* cl) //appends the RETURN_FROM_SUB_LIST opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_RETURN_FROM_SUB_LIST_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl) //appends the STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl) //appends the STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF opcode (1 byte) to cl
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF_opcode;
+	cl->nextFreeByte++; //no space check is made here
+}
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertStoreFullResolutionTileBuffer(ControlList* cls,
+										   ControlListAddress address,
+										   uint32_t lastTile, //0/1
+										   uint32_t disableClearOnWrite, //0/1
+										   uint32_t disableZStencilBufferWrite, //0/1
+										   uint32_t disableColorBufferWrite) //0/1
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_STORE_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address);
+	*(uint32_t*)cls->nextFreeByte =
+			moveBits(disableColorBufferWrite, 1, 0) |
+			moveBits(disableZStencilBufferWrite, 1, 1) |
+			moveBits(disableClearOnWrite, 1, 2) |
+			moveBits(lastTile, 1, 3) |
+			moveBits(address.offset, 28, 4);
+	cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertReLoadFullResolutionTileBuffer(ControlList* cls,
+											ControlListAddress address,
+											uint32_t disableZStencilBufferRead, //0/1
+											uint32_t disableColorBufferRead) //0/1
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_RE_LOAD_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address);
+	*(uint32_t*)cls->nextFreeByte =
+			moveBits(disableColorBufferRead, 1, 0) |
+			moveBits(disableZStencilBufferRead, 1, 1) |
+			moveBits(address.offset, 28, 4);
+	cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertStoreTileBufferGeneral(ControlList* cls,
+									ControlListAddress address,
+									uint32_t lastTileOfFrame, //0/1
+									uint32_t disableZStencilBufferDump, //0/1
+									uint32_t disableColorBufferDump, //0/1
+									uint32_t disableZStencilBufferClearOnStoreDump, //0/1
+									uint32_t disableColorBufferClearOnStoreDump, //0/1
+									uint32_t disableDoubleBufferSwap, //0/1
+									uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
+									uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
+									uint32_t format, //0/1/2 raster/t/lt
+									uint32_t bufferToStore) //0/1/2/3/5 none/color/zstencil/z/full
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_STORE_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	*cls->nextFreeByte =
+			moveBits(bufferToStore, 3, 0) |
+			moveBits(format, 2, 4) |
+			moveBits(mode, 2, 6);
+	cls->nextFreeByte++;
+	*cls->nextFreeByte =
+			moveBits(pixelColorFormat, 2, 0) |
+			moveBits(disableDoubleBufferSwap, 1, 4) |
+			moveBits(disableColorBufferClearOnStoreDump, 1, 5) |
+			moveBits(disableZStencilBufferClearOnStoreDump, 1, 6) |
+			moveBits(1, 1, 7); //disable vg mask
+	cls->nextFreeByte++;
+	clEmitShaderRelocation(cls, &address);
+	*(uint32_t*)cls->nextFreeByte =
+			moveBits(disableColorBufferDump, 1, 0) |
+			moveBits(disableZStencilBufferDump, 1, 1) |
+			moveBits(1, 1, 2) | //disable vg mask
+			moveBits(lastTileOfFrame, 1, 3) |
+			moveBits(address.offset, 28, 4);
+	cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertLoadTileBufferGeneral(ControlList* cls,
+								   ControlListAddress address,
+								   uint32_t disableZStencilBufferLoad, //0/1
+								   uint32_t disableColorBufferLoad, //0/1
+								   uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
+								   uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
+								   uint32_t format, //0/1/2 raster/t/lt
+								   uint32_t bufferToLoad) //0/1/2/3/5 none/color/zstencil/z/full
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_LOAD_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	*cls->nextFreeByte =
+			moveBits(bufferToLoad, 3, 0) |
+			moveBits(format, 2, 4);
+	cls->nextFreeByte++;
+	*cls->nextFreeByte =
+			moveBits(pixelColorFormat, 2, 0);
+	cls->nextFreeByte++;
+	clEmitShaderRelocation(cls, &address);
+	*(uint32_t*)cls->nextFreeByte =
+			moveBits(disableColorBufferLoad, 1, 0) |
+			moveBits(disableZStencilBufferLoad, 1, 1) |
+			moveBits(1, 1, 2) | //disable vg mask
+			moveBits(address.offset, 28, 4);
+	cls->nextFreeByte += 4;
+
+}
+*/
+
+void clInsertIndexedPrimitiveList(ControlList* cl,
+								  uint32_t maxIndex,
+								  uint32_t indicesAddress,
+								  uint32_t length,
+								  uint32_t indexType, //0/1: 8 or 16 bit
+								  enum V3D21_Primitive primitiveMode)
+{ //appends the INDEXED_PRIMITIVE_LIST opcode, one mode byte and three 32 bit words
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_INDEXED_PRIMITIVE_LIST_opcode; cl->nextFreeByte++;
+	*cl->nextFreeByte = moveBits(indexType, 4, 4) | moveBits(primitiveMode, 4, 0); cl->nextFreeByte++; //indexType in bits 4-7, primitiveMode in bits 0-3
+	*(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = indicesAddress; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = maxIndex; cl->nextFreeByte += 4;
+}
+
+void clInsertVertexArrayPrimitives(ControlList* cl,
+								  uint32_t firstVertexIndex,
+								  uint32_t length,
+								  enum V3D21_Primitive primitiveMode)
+{ //appends the VERTEX_ARRAY_PRIMITIVES opcode, one mode byte and two 32 bit words
+	assert(cl);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_VERTEX_ARRAY_PRIMITIVES_opcode; cl->nextFreeByte++;
+	*cl->nextFreeByte = moveBits(primitiveMode, 8, 0); cl->nextFreeByte++; //primitiveMode fills the whole byte
+	*(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = firstVertexIndex; cl->nextFreeByte += 4;
+}
+
+void clInsertPrimitiveListFormat(ControlList* cl,
+								  uint32_t dataType, //1/3: 16 or 32 bit
+								  uint32_t primitiveType) //0/1/2/3: point/line/tri/rhy
+{ //appends the PRIMITIVE_LIST_FORMAT opcode and one format byte
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_PRIMITIVE_LIST_FORMAT_opcode; cl->nextFreeByte++;
+	*cl->nextFreeByte = moveBits(dataType, 4, 4) | moveBits(primitiveType, 4, 0); cl->nextFreeByte++; //dataType in bits 4-7, primitiveType in bits 0-3
+}
+
+void clInsertShaderState(ControlList* cl,
+						  uint32_t address,
+						  uint32_t extendedShaderRecord, //0/1: true/false
+						 uint32_t numberOfAttributeArrays)
+{ //appends the GL_SHADER_STATE opcode followed by one packed 32 bit word
+	assert(cl);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_GL_SHADER_STATE_opcode; cl->nextFreeByte++;
+	//TODO is this correct? NOTE(review): moveBits shifts address left by 4, so its top 4 bits are dropped — confirm what callers pass
+	*(uint32_t*)cl->nextFreeByte =
+			moveBits(address, 28, 4) |
+			moveBits(extendedShaderRecord, 1, 3) |
+			moveBits(numberOfAttributeArrays, 3, 0); cl->nextFreeByte += 4;
+}
+
+/*
+void clInsertClearColors(ControlList* cl,
+						uint32_t clearStencil,
+						uint32_t clearZ, //24 bit Z
+						uint64_t clearColor) //2x RGBA8 or 1x RGBA16
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_CLEAR_COLORS_opcode; cl->nextFreeByte++;
+	*(uint64_t*)cl->nextFreeByte = clearColor; cl->nextFreeByte += 8;
+	*(uint32_t*)cl->nextFreeByte = clearZ; cl->nextFreeByte += 4; //24 bits for Z, 8 bit for vg mask (unused)
+	*cl->nextFreeByte = clearStencil; cl->nextFreeByte++;
+}
+*/
+
+void clInsertConfigurationBits(ControlList* cl,
+						uint32_t earlyZUpdatesEnable, //0/1
+						uint32_t earlyZEnable, //0/1
+						uint32_t zUpdatesEnable, //0/1
+						enum V3D21_Compare_Function depthTestFunction,
+						uint32_t coverageReadMode, //0/1 clear/leave as is
+						uint32_t coveragePipeSelect, //0/1
+						uint32_t coverageUpdateMode, //0/1/2/3 nonzero, odd, or, zero
+						uint32_t coverageReadType, //0/1 4*8bit, 16 bit mask
+						uint32_t rasterizerOversampleMode, //0/1/2 none, 4x, 16x
+						uint32_t enableDepthOffset, //0/1
+						uint32_t clockwisePrimitives, //0/1
+						uint32_t enableReverseFacingPrimitive, //0/1
+						uint32_t enableForwardFacingPrimitive) //0/1
+{ //appends the CONFIGURATION_BITS opcode and a 32 bit word with the flags packed at the bit offsets below
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_CONFIGURATION_BITS_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte =
+			moveBits(enableForwardFacingPrimitive, 1, 0) |
+			moveBits(enableReverseFacingPrimitive, 1, 1) |
+			moveBits(clockwisePrimitives, 1, 2) |
+			moveBits(enableDepthOffset, 1, 3) |
+			moveBits(coverageReadType, 1, 5) | //bit 4 is left at 0
+			moveBits(rasterizerOversampleMode, 2, 6) |
+			moveBits(coveragePipeSelect, 1, 8) |
+			moveBits(coverageUpdateMode, 2, 9) |
+			moveBits(coverageReadMode, 1, 11) |
+			moveBits(depthTestFunction, 3, 12) |
+			moveBits(zUpdatesEnable, 1, 15) |
+			moveBits(earlyZEnable, 1, 16) |
+			moveBits(earlyZUpdatesEnable, 1, 17); cl->nextFreeByte += 4;
+}
+
+void clInsertFlatShadeFlags(ControlList* cl,
+						uint32_t flags)
+{ //appends the FLAT_SHADE_FLAGS opcode followed by flags as a 32 bit word
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_FLAT_SHADE_FLAGS_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = flags; cl->nextFreeByte += 4;
+}
+
+void clInsertPointSize(ControlList* cl,
+						float size)
+{ //appends the POINT_SIZE opcode followed by size stored as a float
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_POINT_SIZE_opcode; cl->nextFreeByte++;
+	*(float*)cl->nextFreeByte = size; cl->nextFreeByte += 4;
+}
+
+void clInsertLineWidth(ControlList* cl,
+						float width)
+{ //appends the LINE_WIDTH opcode followed by width stored as a float
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_LINE_WIDTH_opcode; cl->nextFreeByte++;
+	*(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
+}
+
+void clInsertRHTXBoundary(ControlList* cl,
+						uint32_t boundary) //sint16
+{ //appends the RHT_X_BOUNDARY opcode followed by the low 16 bits of boundary
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_RHT_X_BOUNDARY_opcode; cl->nextFreeByte++;
+	*(uint16_t*)cl->nextFreeByte = moveBits(boundary, 16, 0); cl->nextFreeByte += 2;
+}
+
+void clInsertDepthOffset(ControlList* cl,
+						uint32_t units, //float 187
+						 uint32_t factor) //float 187
+{ //appends the DEPTH_OFFSET opcode and one 32 bit word
+	assert(cl);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_DEPTH_OFFSET_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = moveBits(factor, 16, 0) | moveBits(units, 16, 16); cl->nextFreeByte += 4; //factor in the low half, units in the high half
+}
+
+void clInsertClipWindow(ControlList* cl,
+						uint32_t width, //uint16
+						uint32_t height, //uint16
+						uint32_t bottomPixelCoord, //uint16
+						uint32_t leftPixelCoord)  //uint16
+{ //appends the CLIP_WINDOW opcode and two 32 bit words, each holding two 16 bit values
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_CLIP_WINDOW_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = moveBits(leftPixelCoord, 16, 0) | moveBits(bottomPixelCoord, 16, 16); cl->nextFreeByte += 4; //left in the low half, bottom in the high half
+	*(uint32_t*)cl->nextFreeByte = moveBits(width, 16, 0) | moveBits(height, 16, 16); cl->nextFreeByte += 4; //width in the low half, height in the high half
+}
+
+void clInsertViewPortOffset(ControlList* cl,
+						uint32_t x, //sint16
+						uint32_t y //sint16
+						)
+{ //appends the VIEWPORT_OFFSET opcode and one 32 bit word
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_VIEWPORT_OFFSET_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = moveBits(x, 16, 0) | moveBits(y, 16, 16); cl->nextFreeByte += 4; //x in the low half, y in the high half
+}
+
+void clInsertZMinMaxClippingPlanes(ControlList* cl,
+						float minZw,
+						float maxZw
+						)
+{ //appends the Z_MIN_AND_MAX_CLIPPING_PLANES opcode followed by minZw, then maxZw, as floats
+	assert(cl);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_Z_MIN_AND_MAX_CLIPPING_PLANES_opcode; cl->nextFreeByte++;
+	*(float*)cl->nextFreeByte = minZw; cl->nextFreeByte += 4;
+	*(float*)cl->nextFreeByte = maxZw; cl->nextFreeByte += 4;
+}
+
+void clInsertClipperXYScaling(ControlList* cl,
+						float width, //half height in 1/16 of pixel (NOTE(review): comment says height for the width parameter — confirm)
+						float height //half width in 1/16 of pixel (NOTE(review): comment says width for the height parameter — confirm)
+						)
+{ //appends the CLIPPER_XY_SCALING opcode followed by width, then height, as floats
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_CLIPPER_XY_SCALING_opcode; cl->nextFreeByte++;
+	*(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
+	*(float*)cl->nextFreeByte = height; cl->nextFreeByte += 4;
+}
+
+void clInsertClipperZScaleOffset(ControlList* cl,
+						float zOffset, //zc to zs
+						float zScale //zc to zs
+						)
+{ //appends the CLIPPER_Z_SCALE_AND_OFFSET opcode followed by two floats
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_CLIPPER_Z_SCALE_AND_OFFSET_opcode; cl->nextFreeByte++;
+	*(float*)cl->nextFreeByte = zScale; cl->nextFreeByte += 4; //scale is written first although it is the second parameter
+	*(float*)cl->nextFreeByte = zOffset; cl->nextFreeByte += 4;
+}
+
+void clInsertTileBinningModeConfiguration(ControlList* cl,
+						uint32_t doubleBufferInNonMsMode, //0/1
+						uint32_t tileAllocationBlockSize, //0/1/2/3 32/64/128/256 bytes
+						uint32_t tileAllocationInitialBlockSize, //0/1/2/3 32/64/128/256 bytes
+						uint32_t autoInitializeTileStateDataArray, //0/1
+						uint32_t tileBuffer64BitColorDepth, //0/1
+						uint32_t multisampleMode4x, //0/1
+						uint32_t widthInPixels,
+						uint32_t heightInPixels,
+						uint32_t tileStateDataArrayAddress, //16 byte aligned, size of 48 bytes * num tiles
+						uint32_t tileAllocationMemorySize,
+						uint32_t tileAllocationMemoryAddress
+						)
+{ //appends the TILE_BINNING_MODE_CONFIGURATION opcode, three 32 bit words, the frame size in tiles and one flag byte
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_TILE_BINNING_MODE_CONFIGURATION_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = tileAllocationMemoryAddress; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = tileAllocationMemorySize; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = tileStateDataArrayAddress; cl->nextFreeByte += 4;
+	uint32_t tileSizeW = 64; //tiles start out as 64x64 pixels
+	uint32_t tileSizeH = 64;
+
+	if(multisampleMode4x)
+	{
+		tileSizeW >>= 1; //4x multisampling halves both tile dimensions
+		tileSizeH >>= 1;
+	}
+
+	if(tileBuffer64BitColorDepth)
+	{
+		tileSizeH >>= 1; //64 bit color depth halves the tile height once more
+	}
+
+	uint32_t widthInTiles = divRoundUp(widthInPixels, tileSizeW);
+	uint32_t heightInTiles = divRoundUp(heightInPixels, tileSizeH);
+	*(uint8_t*)cl->nextFreeByte = widthInTiles; cl->nextFreeByte++; //stored in a single byte, so only the low 8 bits are kept
+	*(uint8_t*)cl->nextFreeByte = heightInTiles; cl->nextFreeByte++; //stored in a single byte, so only the low 8 bits are kept
+	*cl->nextFreeByte =
+			moveBits(multisampleMode4x, 1, 0) |
+			moveBits(tileBuffer64BitColorDepth, 1, 1) |
+			moveBits(autoInitializeTileStateDataArray, 1, 2) |
+			moveBits(tileAllocationInitialBlockSize, 2, 3) |
+			moveBits(tileAllocationBlockSize, 2, 5) |
+			moveBits(doubleBufferInNonMsMode, 1, 7); cl->nextFreeByte++;
+}
+
+/*
+void clInsertTileRenderingModeConfiguration(ControlList* cls,
+						ControlListAddress address,
+						uint32_t doubleBufferInNonMsMode, //0/1
+						uint32_t earlyZEarlyCovDisable, //0/1
+						uint32_t earlyZUpdateDirection, //0/1 lt,le/gt,ge
+						uint32_t selectCoverageMode, //0/1
+						uint32_t memoryFormat, //0/1/2 linear/t/lt
+						uint32_t decimateMode, //0/1/2 0x/4x/16x
+						uint32_t nonHDRFrameFormatColorFormat, //0/1/2 bgr565dithered/rgba8/bgr565nodither
+						uint32_t tileBufferHDRMode, //0/1
+						uint32_t multisampleMode4x, //0/1
+						uint32_t widthPixels,
+						uint32_t heightPixels)
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	*cls->nextFreeByte = V3D21_TILE_RENDERING_MODE_CONFIGURATION_opcode; cls->nextFreeByte++;
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address);
+	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+	*(uint32_t*)cls->nextFreeByte = moveBits(widthPixels, 16, 0) | moveBits(heightPixels, 16, 16); cls->nextFreeByte += 4;
+	*(uint16_t*)cls->nextFreeByte =
+			moveBits(multisampleMode4x, 1, 0) |
+			moveBits(tileBufferHDRMode, 1, 1) |
+			moveBits(nonHDRFrameFormatColorFormat, 2, 2) |
+			moveBits(decimateMode, 2, 4) |
+			moveBits(memoryFormat, 2, 6) |
+			moveBits(0, 1, 8) | //vg buffer enable
+			moveBits(selectCoverageMode, 1, 9) |
+			moveBits(earlyZUpdateDirection, 1, 10) |
+			moveBits(earlyZEarlyCovDisable, 1, 11) |
+			moveBits(doubleBufferInNonMsMode, 1, 12); cls->nextFreeByte += 2;
+}
+*/
+
+/*
+void clInsertTileCoordinates(ControlList* cl,
+						uint32_t tileColumnNumber, //int8
+						uint32_t tileRowNumber) //int8
+{
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_TILE_COORDINATES_opcode; cl->nextFreeByte++;
+	*(uint16_t*)cl->nextFreeByte = moveBits(tileColumnNumber, 8, 0) | moveBits(tileRowNumber, 8, 8); cl->nextFreeByte += 2;
+}
+*/
+
+void clInsertGEMRelocations(ControlList* cl,
+							uint32_t buffer0,
+							uint32_t buffer1)
+{ //appends the GEM_RELOCATIONS opcode followed by buffer0, then buffer1, as 32 bit words
+	assert(cl);
+	assert(cl->buffer);
+	assert(cl->nextFreeByte);
+	*cl->nextFreeByte = V3D21_GEM_RELOCATIONS_opcode; cl->nextFreeByte++;
+	*(uint32_t*)cl->nextFreeByte = buffer0; cl->nextFreeByte += 4;
+	*(uint32_t*)cl->nextFreeByte = buffer1; cl->nextFreeByte += 4;
+}
+
+//input: 2 cls (cl, handles cl); appends a shader record (no opcode byte) with a fragment part and a vertex part
+void clInsertShaderRecord(ControlList* cls,
+						  uint32_t fragmentShaderIsSingleThreaded, //0/1
+						  uint32_t pointSizeIncludedInShadedVertexData, //0/1
+						  uint32_t enableClipping, //0/1
+						  uint32_t fragmentNumberOfUnusedUniforms,
+						  uint32_t fragmentNumberOfVaryings,
+						  uint32_t fragmentUniformsAddress,
+						  ControlListAddress fragmentCodeAddress,
+						  uint32_t vertexNumberOfUnusedUniforms,
+						  uint32_t vertexAttributeArraySelectBits,
+						  uint32_t vertexTotalAttributesSize,
+						  uint32_t vertexUniformsAddress,
+						  ControlListAddress vertexCodeAddress)
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	//TODO is this correct?
+	*cls->nextFreeByte =
+			moveBits(fragmentShaderIsSingleThreaded, 1, 0) |
+			moveBits(pointSizeIncludedInShadedVertexData, 1, 1) |
+			moveBits(enableClipping, 1, 2); cls->nextFreeByte++;
+	*cls->nextFreeByte = 0; cls->nextFreeByte++; //second byte is written as 0
+	*(uint16_t*)cls->nextFreeByte = moveBits(fragmentNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
+	*cls->nextFreeByte = fragmentNumberOfVaryings; cls->nextFreeByte++;
+	clEmitShaderRelocation(cls, &fragmentCodeAddress); //this call itself advances cls->nextFreeByte
+	*(uint32_t*)cls->nextFreeByte = fragmentCodeAddress.offset; cls->nextFreeByte += 4;
+	*(uint32_t*)cls->nextFreeByte = fragmentUniformsAddress; cls->nextFreeByte += 4;
+
+	*(uint16_t*)cls->nextFreeByte = moveBits(vertexNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
+	*cls->nextFreeByte = vertexAttributeArraySelectBits; cls->nextFreeByte++;
+	*cls->nextFreeByte = vertexTotalAttributesSize; cls->nextFreeByte++;
+	clEmitShaderRelocation(cls, &vertexCodeAddress); //this call itself advances cls->nextFreeByte
+	*(uint32_t*)cls->nextFreeByte = moveBits(vertexCodeAddress.offset, 32, 0) | moveBits(vertexUniformsAddress, 32, 0); cls->nextFreeByte += 4; //??? NOTE(review): ORs the code offset and the uniforms address into one word, unlike the fragment part which writes two words — confirm
+	cls->nextFreeByte += 4; //these 4 bytes are skipped without being written
+	//skip coordinate shader stuff
+	cls->nextFreeByte += 16; //16 more bytes are skipped without being written
+}
+
+//input: 2 cls (cl, handles cl); appends an attribute record (no opcode byte): relocation, address.offset and four single bytes
+void clInsertAttributeRecord(ControlList* cls,
+						  ControlListAddress address,
+						  uint32_t sizeBytes,
+						  uint32_t stride,
+						  uint32_t vertexVPMOffset)
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	uint32_t sizeBytesMinusOne = sizeBytes - 1; //wraps around if sizeBytes is 0
+	//TODO is this correct?
+	clEmitShaderRelocation(cls, &address); //this call itself advances cls->nextFreeByte
+	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+	*cls->nextFreeByte = sizeBytesMinusOne; cls->nextFreeByte++; //single byte stores: only the low 8 bits of each value are kept
+	*cls->nextFreeByte = stride; cls->nextFreeByte++;
+	*cls->nextFreeByte = vertexVPMOffset; cls->nextFreeByte++;
+	cls->nextFreeByte++; //skip coordinate shader stuff
+}
+
+uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle) //index of handle inside handlesCl; the handle is appended if it is not there yet
+{
+	uint32_t c = 0;
+
+	uint32_t numHandles = clSize(handlesCl) / 4; //the list is read as an array of 32 bit handles
+
+	for(; c < numHandles; ++c)
+	{
+		if(((uint32_t*)handlesCl->buffer)[c] == handle)
+		{
+			//found
+			return c;
+		}
+	}
+
+	//write handle to handles cl
+	*(uint32_t*)handlesCl->nextFreeByte = handle; //no space check is made here
+	handlesCl->nextFreeByte += 4;
+
+	return c; //c equals the old handle count here, i.e. the index of the entry just appended
+}
+
+//input: 2 cls (cl + handles cl): cls[0] receives the handle index, cls[1] is the list of handles
+void clEmitShaderRelocation(ControlList* cls, const ControlListAddress* address) //plain external definition to match the non-inline prototype in the header
+{
+	assert(cls);
+	assert(cls->buffer);
+	assert(cls->nextFreeByte);
+	assert(address);
+	assert(address->handle);
+
+	//search for handle in handles cl
+	//if found insert handle index
+
+	ControlList* cl = cls;
+	ControlList* handlesCl = cls + 1; //relies on the caller passing two adjacent ControlLists
+
+	//store offset within handles in cl
+	*(uint32_t*)cl->nextFreeByte = clGetHandleIndex(handlesCl, address->handle);
+	cl->nextFreeByte += 4;
+}
diff --git a/driver/ControlListUtil.h b/driver/ControlListUtil.h
index 161e37c..24b4d9b 100644
--- a/driver/ControlListUtil.h
+++ b/driver/ControlListUtil.h
@@ -21,7 +21,7 @@ typedef struct ControlList
 	uint8_t* nextFreeByte; //pointer to the next available free byte
 } ControlList;
 
-static inline void clEmitShaderRelocation(ControlList* cl, const ControlListAddress* address);
+void clEmitShaderRelocation(ControlList* cl, const ControlListAddress* address);
 
 #define __gen_user_data struct ControlList
 #define __gen_address_type ControlListAddress
@@ -30,356 +30,40 @@ static inline void clEmitShaderRelocation(ControlList* cl, const ControlListAddr
 
 #include "brcm/cle/v3d_packet_v21_pack.h"
 
-uint32_t divRoundUp(uint32_t n, uint32_t d)
-{
-	return (((n) + (d) - 1) / (d));
-}
-
-//move bits to offset, mask rest to 0
-uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset)
-{
-	return (d << offset) & (~(~0 << bits) << offset);
-}
-
-uint32_t clSize(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	return cl->nextFreeByte - cl->buffer;
-}
-
-uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size)
-{
-	uint32_t currSize = clSize(cl);
-	if(currSize + size < CONTROL_LIST_SIZE)
-	{
-		return 1; //fits!
-	}
-	else
-	{
-		return 0; //need to reallocate
-	}
-}
-
-void clInit(ControlList* cl, void* buffer)
-{
-	assert(cl);
-	assert(buffer);
-	cl->buffer = buffer;
-	cl->numBlocks = 1;
-	cl->nextFreeByte = &cl->buffer[0];
-}
-
-void clInsertHalt(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_HALT_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertNop(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_NOP_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertFlush(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_FLUSH_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertFlushAllState(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_FLUSH_ALL_STATE_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertStartTileBinning(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_START_TILE_BINNING_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertIncrementSemaphore(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_INCREMENT_SEMAPHORE_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertWaitOnSemaphore(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_WAIT_ON_SEMAPHORE_opcode;
-	cl->nextFreeByte++;
-}
-
-//input: 2 cls (cl, handles cl)
-void clInsertBranch(ControlList* cls, ControlListAddress address)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_BRANCH_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
-}
-
-//input: 2 cls (cl, handles cl)
-void clInsertBranchToSubList(ControlList* cls, ControlListAddress address)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_BRANCH_TO_SUB_LIST_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
-}
-
-void clInsertReturnFromSubList(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_RETURN_FROM_SUB_LIST_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_opcode;
-	cl->nextFreeByte++;
-}
-
-void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF_opcode;
-	cl->nextFreeByte++;
-}
-
-/*
-//input: 2 cls (cl, handles cl)
-void clInsertStoreFullResolutionTileBuffer(ControlList* cls,
-										   ControlListAddress address,
-										   uint32_t lastTile, //0/1
-										   uint32_t disableClearOnWrite, //0/1
-										   uint32_t disableZStencilBufferWrite, //0/1
-										   uint32_t disableColorBufferWrite) //0/1
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_STORE_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte =
-			moveBits(disableColorBufferWrite, 1, 0) |
-			moveBits(disableZStencilBufferWrite, 1, 1) |
-			moveBits(disableClearOnWrite, 1, 2) |
-			moveBits(lastTile, 1, 3) |
-			moveBits(address.offset, 28, 4);
-	cls->nextFreeByte += 4;
-}
-*/
-
-/*
-//input: 2 cls (cl, handles cl)
-void clInsertReLoadFullResolutionTileBuffer(ControlList* cls,
-											ControlListAddress address,
-											uint32_t disableZStencilBufferRead, //0/1
-											uint32_t disableColorBufferRead) //0/1
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_RE_LOAD_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte =
-			moveBits(disableColorBufferRead, 1, 0) |
-			moveBits(disableZStencilBufferRead, 1, 1) |
-			moveBits(address.offset, 28, 4);
-	cls->nextFreeByte += 4;
-}
-*/
-
-/*
-//input: 2 cls (cl, handles cl)
-void clInsertStoreTileBufferGeneral(ControlList* cls,
-									ControlListAddress address,
-									uint32_t lastTileOfFrame, //0/1
-									uint32_t disableZStencilBufferDump, //0/1
-									uint32_t disableColorBufferDump, //0/1
-									uint32_t disableZStencilBufferClearOnStoreDump, //0/1
-									uint32_t disableColorBufferClearOnStoreDump, //0/1
-									uint32_t disableDoubleBufferSwap, //0/1
-									uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
-									uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
-									uint32_t format, //0/1/2 raster/t/lt
-									uint32_t bufferToStore) //0/1/2/3/5 none/color/zstencil/z/full
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_STORE_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	*cls->nextFreeByte =
-			moveBits(bufferToStore, 3, 0) |
-			moveBits(format, 2, 4) |
-			moveBits(mode, 2, 6);
-	cls->nextFreeByte++;
-	*cls->nextFreeByte =
-			moveBits(pixelColorFormat, 2, 0) |
-			moveBits(disableDoubleBufferSwap, 1, 4) |
-			moveBits(disableColorBufferClearOnStoreDump, 1, 5) |
-			moveBits(disableZStencilBufferClearOnStoreDump, 1, 6) |
-			moveBits(1, 1, 7); //disable vg mask
-	cls->nextFreeByte++;
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte =
-			moveBits(disableColorBufferDump, 1, 0) |
-			moveBits(disableZStencilBufferDump, 1, 1) |
-			moveBits(1, 1, 2) | //disable vg mask
-			moveBits(lastTileOfFrame, 1, 3) |
-			moveBits(address.offset, 28, 4);
-	cls->nextFreeByte += 4;
-}
-*/
-
-/*
-//input: 2 cls (cl, handles cl)
-void clInsertLoadTileBufferGeneral(ControlList* cls,
-								   ControlListAddress address,
-								   uint32_t disableZStencilBufferLoad, //0/1
-								   uint32_t disableColorBufferLoad, //0/1
-								   uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
-								   uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
-								   uint32_t format, //0/1/2 raster/t/lt
-								   uint32_t bufferToLoad) //0/1/2/3/5 none/color/zstencil/z/full
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_LOAD_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	*cls->nextFreeByte =
-			moveBits(bufferToLoad, 3, 0) |
-			moveBits(format, 2, 4);
-	cls->nextFreeByte++;
-	*cls->nextFreeByte =
-			moveBits(pixelColorFormat, 2, 0);
-	cls->nextFreeByte++;
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte =
-			moveBits(disableColorBufferLoad, 1, 0) |
-			moveBits(disableZStencilBufferLoad, 1, 1) |
-			moveBits(1, 1, 2) | //disable vg mask
-			moveBits(address.offset, 28, 4);
-	cls->nextFreeByte += 4;
-
-}
-*/
-
+uint32_t divRoundUp(uint32_t n, uint32_t d);
+uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset);
+uint32_t clSize(ControlList* cl);
+uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size);
+void clInit(ControlList* cl, void* buffer);
+void clInsertHalt(ControlList* cl);
+void clInsertNop(ControlList* cl);
+void clInsertFlush(ControlList* cl);
+void clInsertFlushAllState(ControlList* cl);
+void clInsertStartTileBinning(ControlList* cl);
+void clInsertIncrementSemaphore(ControlList* cl);
+void clInsertWaitOnSemaphore(ControlList* cl);
+void clInsertBranch(ControlList* cls, ControlListAddress address);
+void clInsertBranchToSubList(ControlList* cls, ControlListAddress address);
+void clInsertReturnFromSubList(ControlList* cl);
+void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl);
+void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl);
 void clInsertIndexedPrimitiveList(ControlList* cl,
 								  uint32_t maxIndex,
 								  uint32_t indicesAddress,
 								  uint32_t length,
 								  uint32_t indexType, //0/1: 8 or 16 bit
-								  enum V3D21_Primitive primitiveMode)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_INDEXED_PRIMITIVE_LIST_opcode; cl->nextFreeByte++;
-	*cl->nextFreeByte = moveBits(indexType, 4, 4) | moveBits(primitiveMode, 4, 0); cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = indicesAddress; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = maxIndex; cl->nextFreeByte += 4;
-}
-
+								  enum V3D21_Primitive primitiveMode);
 void clInsertVertexArrayPrimitives(ControlList* cl,
 								  uint32_t firstVertexIndex,
 								  uint32_t length,
-								  enum V3D21_Primitive primitiveMode)
-{
-	assert(cl);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_VERTEX_ARRAY_PRIMITIVES_opcode; cl->nextFreeByte++;
-	*cl->nextFreeByte = moveBits(primitiveMode, 8, 0); cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = firstVertexIndex; cl->nextFreeByte += 4;
-}
-
+								  enum V3D21_Primitive primitiveMode);
 void clInsertPrimitiveListFormat(ControlList* cl,
 								  uint32_t dataType, //1/3: 16 or 32 bit
-								  uint32_t primitiveType) //0/1/2/3: point/line/tri/rhy
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_PRIMITIVE_LIST_FORMAT_opcode; cl->nextFreeByte++;
-	*cl->nextFreeByte = moveBits(dataType, 4, 4) | moveBits(primitiveType, 4, 0); cl->nextFreeByte++;
-}
-
+								  uint32_t primitiveType); //0/1/2/3: point/line/tri/rhy
 void clInsertShaderState(ControlList* cl,
 						  uint32_t address,
 						  uint32_t extendedShaderRecord, //0/1: true/false
-						 uint32_t numberOfAttributeArrays)
-{
-	assert(cl);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_GL_SHADER_STATE_opcode; cl->nextFreeByte++;
-	//TODO is this correct?
-	*(uint32_t*)cl->nextFreeByte =
-			moveBits(address, 28, 4) |
-			moveBits(extendedShaderRecord, 1, 3) |
-			moveBits(numberOfAttributeArrays, 3, 0); cl->nextFreeByte += 4;
-}
-
-/*
-void clInsertClearColors(ControlList* cl,
-						uint32_t clearStencil,
-						uint32_t clearZ, //24 bit Z
-						uint64_t clearColor) //2x RGBA8 or 1x RGBA16
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_CLEAR_COLORS_opcode; cl->nextFreeByte++;
-	*(uint64_t*)cl->nextFreeByte = clearColor; cl->nextFreeByte += 8;
-	*(uint32_t*)cl->nextFreeByte = clearZ; cl->nextFreeByte += 4; //24 bits for Z, 8 bit for vg mask (unused)
-	*cl->nextFreeByte = clearStencil; cl->nextFreeByte++;
-}
-*/
-
+						 uint32_t numberOfAttributeArrays);
 void clInsertConfigurationBits(ControlList* cl,
 						uint32_t earlyZUpdatesEnable, //0/1
 						uint32_t earlyZEnable, //0/1
@@ -393,142 +77,39 @@ void clInsertConfigurationBits(ControlList* cl,
 						uint32_t enableDepthOffset, //0/1
 						uint32_t clockwisePrimitives, //0/1
 						uint32_t enableReverseFacingPrimitive, //0/1
-						uint32_t enableForwardFacingPrimitive) //0/1
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_CONFIGURATION_BITS_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte =
-			moveBits(enableForwardFacingPrimitive, 1, 0) |
-			moveBits(enableReverseFacingPrimitive, 1, 1) |
-			moveBits(clockwisePrimitives, 1, 2) |
-			moveBits(enableDepthOffset, 1, 3) |
-			moveBits(coverageReadType, 1, 5) |
-			moveBits(rasterizerOversampleMode, 2, 6) |
-			moveBits(coveragePipeSelect, 1, 8) |
-			moveBits(coverageUpdateMode, 2, 9) |
-			moveBits(coverageReadMode, 1, 11) |
-			moveBits(depthTestFunction, 3, 12) |
-			moveBits(zUpdatesEnable, 1, 15) |
-			moveBits(earlyZEnable, 1, 16) |
-			moveBits(earlyZUpdatesEnable, 1, 17); cl->nextFreeByte += 4;
-}
-
+						uint32_t enableForwardFacingPrimitive); //0/1
 void clInsertFlatShadeFlags(ControlList* cl,
-						uint32_t flags)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_FLAT_SHADE_FLAGS_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = flags; cl->nextFreeByte += 4;
-}
-
+						uint32_t flags);
 void clInsertPointSize(ControlList* cl,
-						float size)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_POINT_SIZE_opcode; cl->nextFreeByte++;
-	*(float*)cl->nextFreeByte = size; cl->nextFreeByte += 4;
-}
-
+						float size);
 void clInsertLineWidth(ControlList* cl,
-						float width)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_LINE_WIDTH_opcode; cl->nextFreeByte++;
-	*(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
-}
-
+						float width);
 void clInsertRHTXBoundary(ControlList* cl,
-						uint32_t boundary) //sint16
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_RHT_X_BOUNDARY_opcode; cl->nextFreeByte++;
-	*(uint16_t*)cl->nextFreeByte = moveBits(boundary, 16, 0); cl->nextFreeByte += 2;
-}
-
+						uint32_t boundary); //sint16
 void clInsertDepthOffset(ControlList* cl,
 						uint32_t units, //float 187
-						 uint32_t factor) //float 187
-{
-	assert(cl);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_DEPTH_OFFSET_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = moveBits(factor, 16, 0) | moveBits(units, 16, 16); cl->nextFreeByte += 4;
-}
-
+						 uint32_t factor); //float 187
 void clInsertClipWindow(ControlList* cl,
 						uint32_t width, //uint16
 						uint32_t height, //uint16
 						uint32_t bottomPixelCoord, //uint16
-						uint32_t leftPixelCoord)  //uint16
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_CLIP_WINDOW_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = moveBits(leftPixelCoord, 16, 0) | moveBits(bottomPixelCoord, 16, 16); cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = moveBits(width, 16, 0) | moveBits(height, 16, 16); cl->nextFreeByte += 4;
-}
-
+						uint32_t leftPixelCoord);  //uint16
 void clInsertViewPortOffset(ControlList* cl,
 						uint32_t x, //sint16
 						uint32_t y //sint16
-						)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_VIEWPORT_OFFSET_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = moveBits(x, 16, 0) | moveBits(y, 16, 16); cl->nextFreeByte += 4;
-}
-
+						);
 void clInsertZMinMaxClippingPlanes(ControlList* cl,
 						float minZw,
 						float maxZw
-						)
-{
-	assert(cl);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_Z_MIN_AND_MAX_CLIPPING_PLANES_opcode; cl->nextFreeByte++;
-	*(float*)cl->nextFreeByte = minZw; cl->nextFreeByte += 4;
-	*(float*)cl->nextFreeByte = maxZw; cl->nextFreeByte += 4;
-}
-
+						);
 void clInsertClipperXYScaling(ControlList* cl,
 						float width, //half height in 1/16 of pixel
 						float height //half width in 1/16 of pixel
-						)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_CLIPPER_XY_SCALING_opcode; cl->nextFreeByte++;
-	*(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
-	*(float*)cl->nextFreeByte = height; cl->nextFreeByte += 4;
-}
-
+						);
 void clInsertClipperZScaleOffset(ControlList* cl,
 						float zOffset, //zc to zs
 						float zScale //zc to zs
-						)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_CLIPPER_Z_SCALE_AND_OFFSET_opcode; cl->nextFreeByte++;
-	*(float*)cl->nextFreeByte = zScale; cl->nextFreeByte += 4;
-	*(float*)cl->nextFreeByte = zOffset; cl->nextFreeByte += 4;
-}
-
+						);
 void clInsertTileBinningModeConfiguration(ControlList* cl,
 						uint32_t doubleBufferInNonMsMode, //0/1
 						uint32_t tileAllocationBlockSize, //0/1/2/3 32/64/128/256 bytes
@@ -541,105 +122,10 @@ void clInsertTileBinningModeConfiguration(ControlList* cl,
 						uint32_t tileStateDataArrayAddress, //16 byte aligned, size of 48 bytes * num tiles
 						uint32_t tileAllocationMemorySize,
 						uint32_t tileAllocationMemoryAddress
-						)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_TILE_BINNING_MODE_CONFIGURATION_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = tileAllocationMemoryAddress; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = tileAllocationMemorySize; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = tileStateDataArrayAddress; cl->nextFreeByte += 4;
-	uint32_t tileSizeW = 64;
-	uint32_t tileSizeH = 64;
-
-	if(multisampleMode4x)
-	{
-		tileSizeW >>= 1;
-		tileSizeH >>= 1;
-	}
-
-	if(tileBuffer64BitColorDepth)
-	{
-		tileSizeH >>= 1;
-	}
-
-	uint32_t widthInTiles = divRoundUp(widthInPixels, tileSizeW);
-	uint32_t heightInTiles = divRoundUp(heightInPixels, tileSizeH);
-	*(uint8_t*)cl->nextFreeByte = widthInTiles; cl->nextFreeByte++;
-	*(uint8_t*)cl->nextFreeByte = heightInTiles; cl->nextFreeByte++;
-	*cl->nextFreeByte =
-			moveBits(multisampleMode4x, 1, 0) |
-			moveBits(tileBuffer64BitColorDepth, 1, 1) |
-			moveBits(autoInitializeTileStateDataArray, 1, 2) |
-			moveBits(tileAllocationInitialBlockSize, 2, 3) |
-			moveBits(tileAllocationBlockSize, 2, 5) |
-			moveBits(doubleBufferInNonMsMode, 1, 7); cl->nextFreeByte++;
-}
-
-/*
-void clInsertTileRenderingModeConfiguration(ControlList* cls,
-						ControlListAddress address,
-						uint32_t doubleBufferInNonMsMode, //0/1
-						uint32_t earlyZEarlyCovDisable, //0/1
-						uint32_t earlyZUpdateDirection, //0/1 lt,le/gt,ge
-						uint32_t selectCoverageMode, //0/1
-						uint32_t memoryFormat, //0/1/2 linear/t/lt
-						uint32_t decimateMode, //0/1/2 0x/4x/16x
-						uint32_t nonHDRFrameFormatColorFormat, //0/1/2 bgr565dithered/rgba8/bgr565nodither
-						uint32_t tileBufferHDRMode, //0/1
-						uint32_t multisampleMode4x, //0/1
-						uint32_t widthPixels,
-						uint32_t heightPixels)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	*cls->nextFreeByte = V3D21_TILE_RENDERING_MODE_CONFIGURATION_opcode; cls->nextFreeByte++;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
-	*(uint32_t*)cls->nextFreeByte = moveBits(widthPixels, 16, 0) | moveBits(heightPixels, 16, 16); cls->nextFreeByte += 4;
-	*(uint16_t*)cls->nextFreeByte =
-			moveBits(multisampleMode4x, 1, 0) |
-			moveBits(tileBufferHDRMode, 1, 1) |
-			moveBits(nonHDRFrameFormatColorFormat, 2, 2) |
-			moveBits(decimateMode, 2, 4) |
-			moveBits(memoryFormat, 2, 6) |
-			moveBits(0, 1, 8) | //vg buffer enable
-			moveBits(selectCoverageMode, 1, 9) |
-			moveBits(earlyZUpdateDirection, 1, 10) |
-			moveBits(earlyZEarlyCovDisable, 1, 11) |
-			moveBits(doubleBufferInNonMsMode, 1, 12); cls->nextFreeByte += 2;
-}
-*/
-
-/*
-void clInsertTileCoordinates(ControlList* cl,
-						uint32_t tileColumnNumber, //int8
-						uint32_t tileRowNumber) //int8
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_TILE_COORDINATES_opcode; cl->nextFreeByte++;
-	*(uint16_t*)cl->nextFreeByte = moveBits(tileColumnNumber, 8, 0) | moveBits(tileRowNumber, 8, 8); cl->nextFreeByte += 2;
-}
-*/
-
+						);
 void clInsertGEMRelocations(ControlList* cl,
 							uint32_t buffer0,
-							uint32_t buffer1)
-{
-	assert(cl);
-	assert(cl->buffer);
-	assert(cl->nextFreeByte);
-	*cl->nextFreeByte = V3D21_GEM_RELOCATIONS_opcode; cl->nextFreeByte++;
-	*(uint32_t*)cl->nextFreeByte = buffer0; cl->nextFreeByte += 4;
-	*(uint32_t*)cl->nextFreeByte = buffer1; cl->nextFreeByte += 4;
-}
-
-//input: 2 cls (cl, handles cl)
+							uint32_t buffer1);
 void clInsertShaderRecord(ControlList* cls,
 						  uint32_t fragmentShaderIsSingleThreaded, //0/1
 						  uint32_t pointSizeIncludedInShadedVertexData, //0/1
@@ -652,94 +138,13 @@ void clInsertShaderRecord(ControlList* cls,
 						  uint32_t vertexAttributeArraySelectBits,
 						  uint32_t vertexTotalAttributesSize,
 						  uint32_t vertexUniformsAddress,
-						  ControlListAddress vertexCodeAddress)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	//TODO is this correct?
-	*cls->nextFreeByte =
-			moveBits(fragmentShaderIsSingleThreaded, 1, 0) |
-			moveBits(pointSizeIncludedInShadedVertexData, 1, 1) |
-			moveBits(enableClipping, 1, 2); cls->nextFreeByte++;
-	*cls->nextFreeByte = 0; cls->nextFreeByte++;
-	*(uint16_t*)cls->nextFreeByte = moveBits(fragmentNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
-	*cls->nextFreeByte = fragmentNumberOfVaryings; cls->nextFreeByte++;
-	clEmitShaderRelocation(cls, &fragmentCodeAddress);
-	*(uint32_t*)cls->nextFreeByte = fragmentCodeAddress.offset; cls->nextFreeByte += 4;
-	*(uint32_t*)cls->nextFreeByte = fragmentUniformsAddress; cls->nextFreeByte += 4;
-
-	*(uint16_t*)cls->nextFreeByte = moveBits(vertexNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
-	*cls->nextFreeByte = vertexAttributeArraySelectBits; cls->nextFreeByte++;
-	*cls->nextFreeByte = vertexTotalAttributesSize; cls->nextFreeByte++;
-	clEmitShaderRelocation(cls, &vertexCodeAddress);
-	*(uint32_t*)cls->nextFreeByte = moveBits(vertexCodeAddress.offset, 32, 0) | moveBits(vertexUniformsAddress, 32, 0); cls->nextFreeByte += 4; //???
-	cls->nextFreeByte += 4;
-	//skip coordinate shader stuff
-	cls->nextFreeByte += 16;
-}
-
-//input: 2 cls (cl, handles cl)
+						  ControlListAddress vertexCodeAddress);
 void clInsertAttributeRecord(ControlList* cls,
 						  ControlListAddress address,
 						  uint32_t sizeBytes,
 						  uint32_t stride,
-						  uint32_t vertexVPMOffset)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	uint32_t sizeBytesMinusOne = sizeBytes - 1;
-	//TODO is this correct?
-	clEmitShaderRelocation(cls, &address);
-	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
-	*cls->nextFreeByte = sizeBytesMinusOne; cls->nextFreeByte++;
-	*cls->nextFreeByte = stride; cls->nextFreeByte++;
-	*cls->nextFreeByte = vertexVPMOffset; cls->nextFreeByte++;
-	cls->nextFreeByte++; //skip coordinate shader stuff
-}
-
-uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle)
-{
-	uint32_t c = 0;
-
-	uint32_t numHandles = clSize(handlesCl) / 4;
-
-	for(; c < numHandles; ++c)
-	{
-		if(((uint32_t*)handlesCl->buffer)[c] == handle)
-		{
-			//found
-			return c;
-		}
-	}
-
-	//write handle to handles cl
-	*(uint32_t*)handlesCl->nextFreeByte = handle;
-	handlesCl->nextFreeByte += 4;
-
-	return c;
-}
-
-//input: 2 cls (cl + handles cl)
-static inline void clEmitShaderRelocation(ControlList* cls, const ControlListAddress* address)
-{
-	assert(cls);
-	assert(cls->buffer);
-	assert(cls->nextFreeByte);
-	assert(address);
-	assert(address->handle);
-
-	//search for handle in handles cl
-	//if found insert handle index
-
-	ControlList* cl = cls;
-	ControlList* handlesCl = cls + 1;
-
-	//store offset within handles in cl
-	*(uint32_t*)cl->nextFreeByte = clGetHandleIndex(handlesCl, address->handle);
-	cl->nextFreeByte += 4;
-}
+						  uint32_t vertexVPMOffset);
+uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle);
 
 #if defined (__cplusplus)
 }
diff --git a/driver/LinearAllocator.c b/driver/LinearAllocator.c
new file mode 100644
index 0000000..aee53c7
--- /dev/null
+++ b/driver/LinearAllocator.c
@@ -0,0 +1,48 @@
+#include "LinearAllocator.h"
+
+#include "CustomAssert.h"
+
+#include <stdint.h>
+
+LinearAllocator createLinearAllocator(char* b, unsigned s)
+{
+	//only records the pointer b and the size s, nothing is allocated or copied here
+	assert(b);
+	assert(s > 0);
+
+	//a fresh allocator hands out memory starting at offset 0
+	return (LinearAllocator)
+	{
+		.buf = b,
+		.offset = 0,
+		.size = s
+	};
+}
+
+void destroyLinearAllocator(LinearAllocator* la)
+{
+	//only clears the bookkeeping, the buffer itself is not freed here
+	la->buf = 0;
+	la->offset = la->size = 0;
+}
+
+void* linearAllocte(LinearAllocator* la, unsigned s)
+{
+	//hands out the next s unused bytes of the buffer, returns 0 when they do not fit
+	assert(la->buf);
+	assert(la->size > 0);
+	//compare against the remaining bytes: offset + s could wrap around, and an exact fit is fine
+	if(s > la->size - la->offset)
+	{
+		return 0; //no space left
+	}
+
+	char* p = la->buf + la->offset; //the block starts at the current offset, not past its own end
+	la->offset += s;
+	return p;
+}
+
+void linearFree(LinearAllocator* la, void* p)
+{
+	//intentionally does nothing (the assert(0) is left disabled): this shouldn't really happen, just destroy/reset the whole allocator
+}
diff --git a/driver/LinearAllocator.h b/driver/LinearAllocator.h
index 06ec0c8..941e6df 100644
--- a/driver/LinearAllocator.h
+++ b/driver/LinearAllocator.h
@@ -15,48 +15,10 @@ typedef struct LinearAllocator
 	unsigned size;
 } LinearAllocator;
 
-LinearAllocator createLinearAllocator(char* b, unsigned s)
-{
-	assert(b);
-	assert(s > 0);
-
-	LinearAllocator la =
-	{
-		.buf = b,
-		.offset = 0,
-		.size = s
-	};
-
-	return la;
-}
-
-void destroyLinearAllocator(LinearAllocator* la)
-{
-	la->buf = 0;
-	la->offset = 0;
-	la->size = 0;
-}
-
-void* linearAllocte(LinearAllocator* la, unsigned s)
-{
-	assert(la->buf);
-	assert(la->size > 0);
-
-	if(la->offset + s >= la->size)
-	{
-		return 0; //no space left
-	}
-
-	char* p = la->buf + la->offset + s;
-	la->offset += s;
-
-	return p;
-}
-
-void linearFree(LinearAllocator* la, void* p)
-{
-	//assert(0); //this shouldn't really happen, just destroy/reset the whole allocator
-}
+LinearAllocator createLinearAllocator(char* b, unsigned s);
+void destroyLinearAllocator(LinearAllocator* la);
+void* linearAllocte(LinearAllocator* la, unsigned s);
+void linearFree(LinearAllocator* la, void* p);
 
 #if defined (__cplusplus)
 }
diff --git a/driver/PoolAllocator.c b/driver/PoolAllocator.c
new file mode 100644
index 0000000..ab1a06b
--- /dev/null
+++ b/driver/PoolAllocator.c
@@ -0,0 +1,72 @@
+#include "PoolAllocator.h"
+
+#include "CustomAssert.h"
+
+#include <stdint.h>
+
+PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s)
+{
+	//builds a pool of s/bs blocks of bs bytes each inside b, every block starts out free
+	assert(b); //only allocated memory
+	assert(bs >= sizeof(void*)); //we need to be able to store the link to the next free block
+	assert(s%bs==0); //we want a size that is the exact multiple of block size
+	assert(s > bs); //NOTE: as written this demands at least 2 blocks
+
+	PoolAllocator pa =
+	{
+		.buf = b,
+		.nextFreeBlock = (uint32_t*)b,
+		.blockSize = bs,
+		.size = s
+	};
+
+	//initialize linked list of free pointers: each free block stores the 32 bit address of the next one
+	char* block = b;
+	for(unsigned c = 0; c < s/bs - 1; ++c)
+	{
+		*(uint32_t*)block = (uint32_t)(block + bs);
+		block += bs; //step in bytes: stepping a uint32_t* by bs skips sizeof(uint32_t)*bs bytes and leaves the buffer
+	}
+
+	*(uint32_t*)block = 0; //last element terminates the list
+	return pa;
+}
+
+void destroyPoolAllocator(PoolAllocator* pa)
+{
+	//actual memory freeing is done by caller, only the bookkeeping is cleared here
+	pa->nextFreeBlock = 0;
+	pa->buf = 0;
+	//with buf zeroed, assert(pa->buf) in poolAllocate/poolFree rejects further use
+	pa->blockSize = pa->size = 0;
+}
+
+void* poolAllocate(PoolAllocator* pa)
+{
+	//takes one block off the front of the free list
+	assert(pa->buf);
+
+	//the head of the free list is the block that gets handed out
+	uint32_t* head = pa->nextFreeBlock;
+
+	if(head)
+	{
+		//unlink it: the address stored inside the block becomes the new head
+		pa->nextFreeBlock = (uint32_t*)*head;
+	}
+
+	//a null head means no free blocks, 0 is returned in that case
+	return head;
+}
+
+void poolFree(PoolAllocator* pa, void* p)
+{
+	assert(pa->buf);
+	assert(p);
+
+	uint32_t* freed = p;
+
+	//push the block on the front of the free list: it stores the old head and becomes the new one
+	*freed = (uint32_t)pa->nextFreeBlock;
+	pa->nextFreeBlock = freed;
+}
diff --git a/driver/PoolAllocator.h b/driver/PoolAllocator.h
index 6e9e916..e8f9787 100644
--- a/driver/PoolAllocator.h
+++ b/driver/PoolAllocator.h
@@ -16,72 +16,10 @@ typedef struct PoolAllocator
 	unsigned size; //size is exact multiple of block size
 } PoolAllocator;
 
-PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s)
-{
-	assert(b); //only allocated memory
-	assert(bs >= sizeof(void*)); //we need to be able to store
-	assert(s%bs==0); //we want a size that is the exact multiple of block size
-	assert(s > bs); //at least 1 element
-
-	PoolAllocator pa =
-	{
-		.buf = b,
-		.nextFreeBlock = (uint32_t*)b,
-		.blockSize = bs,
-		.size = s
-	};
-
-	//initialize linked list of free pointers
-	uint32_t* ptr = pa.nextFreeBlock;
-	for(unsigned c = 0; c < s/bs - 1; ++c)
-	{
-		*ptr = (uint32_t)ptr + bs;
-		ptr += bs;
-	}
-
-	*ptr = 0; //last element
-
-	return pa;
-}
-
-void destroyPoolAllocator(PoolAllocator* pa)
-{
-	//actual memory freeing is done by caller
-	pa->buf = 0;
-	pa->nextFreeBlock = 0;
-	pa->blockSize = 0;
-	pa->size = 0;
-}
-
-void* poolAllocate(PoolAllocator* pa)
-{
-	assert(pa->buf);
-
-	if(!pa->nextFreeBlock)
-	{
-		return 0; //no free blocks
-	}
-
-	//next free block will be allocated
-	void* ret = pa->nextFreeBlock;
-
-	//set next free block to the one the current next points to
-	pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock;
-
-	return ret;
-}
-
-void poolFree(PoolAllocator* pa, void* p)
-{
-	assert(pa->buf);
-	assert(p);
-
-	//set block to be freed to point to the current next free block
-	*(uint32_t*)p = (uint32_t)pa->nextFreeBlock;
-
-	//set next free block to the freshly freed block
-	pa->nextFreeBlock = p;
-}
+PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s);
+void destroyPoolAllocator(PoolAllocator* pa);
+void* poolAllocate(PoolAllocator* pa);
+void poolFree(PoolAllocator* pa, void* p);
 
 #if defined (__cplusplus)
 }
diff --git a/driver/command.c b/driver/command.c
new file mode 100644
index 0000000..0262bb1
--- /dev/null
+++ b/driver/command.c
@@ -0,0 +1,449 @@
+#include "common.h"
+
+#include "kernel/vc4_packet.h"
+#include "../brcm/cle/v3d_decoder.h"
+#include "../brcm/clif/clif_dump.h"
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffers-pools
+ * Command pools are opaque objects that command buffer memory is allocated from, and which allow the implementation to amortize the
+ * cost of resource creation across multiple command buffers. Command pools are externally synchronized, meaning that a command pool must
+ * not be used concurrently in multiple threads. That includes use via recording commands on any command buffers allocated from the pool,
+ * as well as operations that allocate, free, and reset command buffers or the pool itself.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool(
+		VkDevice                                    device,
+		const VkCommandPoolCreateInfo*              pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkCommandPool*                              pCommandPool)
+{
+	assert(device);
+	assert(pCreateInfo);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	//VK_COMMAND_POOL_CREATE_TRANSIENT_BIT
+	//specifies that command buffers allocated from the pool will be short-lived, meaning that they will be reset or freed in a relatively short timeframe.
+	//This flag may be used by the implementation to control memory allocation behavior within the pool.
+	//--> definitely use pool allocator
+
+	//VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT
+	//allows any command buffer allocated from a pool to be individually reset to the initial state; either by calling vkResetCommandBuffer, or via the implicit reset when calling vkBeginCommandBuffer.
+	//If this flag is not set on a pool, then vkResetCommandBuffer must not be called for any command buffer allocated from that pool.
+
+	//TODO pool family ignored for now
+
+	_commandPool* cp = malloc(sizeof(_commandPool));
+
+	if(!cp)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	cp->queueFamilyIndex = pCreateInfo->queueFamilyIndex;
+
+	//initial number of command buffers to hold
+	int numCommandBufs = 100;
+	int controlListSize = ARM_PAGE_SIZE * 100;
+
+	//if(pCreateInfo->flags & VK_COMMAND_POOL_CREATE_TRANSIENT_BIT)
+	{
+		//use pool allocator: one buffer for the command buffer objects, one for their control list pages
+		void* pamem = malloc(numCommandBufs * sizeof(_commandBuffer));
+		void* cpamem = malloc(controlListSize);
+		if(!pamem || !cpamem)
+		{
+			//a failed create must not leak: give back whatever was obtained (free(0) does nothing)
+			free(cpamem);
+			free(pamem);
+			free(cp);
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+
+		cp->pa = createPoolAllocator(pamem, sizeof(_commandBuffer), numCommandBufs * sizeof(_commandBuffer));
+		cp->cpa = createConsecutivePoolAllocator(cpamem, ARM_PAGE_SIZE, controlListSize);
+	}
+
+	*pCommandPool = (VkCommandPool)cp;
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffer-allocation
+ * vkAllocateCommandBuffers can be used to create multiple command buffers. If the creation of any of those command buffers fails,
+ * the implementation must destroy all successfully created command buffer objects from this command, set all entries of the pCommandBuffers array to NULL and return the error.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers(
+		VkDevice                                    device,
+		const VkCommandBufferAllocateInfo*          pAllocateInfo,
+		VkCommandBuffer*                            pCommandBuffers)
+{
+	assert(device);
+	assert(pAllocateInfo);
+	assert(pCommandBuffers);
+
+	VkResult res = VK_SUCCESS;
+
+	_commandPool* cp = (_commandPool*)pAllocateInfo->commandPool;
+
+	//number of leading pCommandBuffers entries that hold an object taken from cp->pa;
+	//entries past it were never written, so the failure path must not look at them
+	uint32_t numCreated = 0;
+
+	//if(cp->usePoolAllocator)
+	{
+		for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
+		{
+			pCommandBuffers[c] = poolAllocate(&cp->pa);
+
+			if(!pCommandBuffers[c])
+			{
+				res = VK_ERROR_OUT_OF_HOST_MEMORY;
+				break;
+			}
+
+			numCreated = c + 1;
+			pCommandBuffers[c]->shaderRecCount = 0;
+			pCommandBuffers[c]->usageFlags = 0;
+			pCommandBuffers[c]->state = CMDBUF_STATE_INITIAL;
+			pCommandBuffers[c]->cp = cp;
+			clInit(&pCommandBuffers[c]->binCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->handlesCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->shaderRecCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->uniformsCl, consecutivePoolAllocate(&cp->cpa, 1));
+
+			//any control list without backing memory makes the whole call fail
+			if(!pCommandBuffers[c]->binCl.buffer || !pCommandBuffers[c]->handlesCl.buffer ||
+			   !pCommandBuffers[c]->shaderRecCl.buffer || !pCommandBuffers[c]->uniformsCl.buffer)
+			{
+				res = VK_ERROR_OUT_OF_HOST_MEMORY;
+				break;
+			}
+		}
+	}
+
+	if(res != VK_SUCCESS)
+	{
+		//if(cp->usePoolAllocator)
+		{
+			for(uint32_t c = 0; c < numCreated; ++c)
+			{
+				//the entry that failed may hold only some of its lists, so test each one;
+				//the pool gets the list's own buffer and block count back, not the ControlList struct
+				ControlList* lists[] = { &pCommandBuffers[c]->binCl, &pCommandBuffers[c]->handlesCl, &pCommandBuffers[c]->shaderRecCl, &pCommandBuffers[c]->uniformsCl };
+				for(unsigned l = 0; l < sizeof(lists) / sizeof(lists[0]); ++l)
+				{
+					if(lists[l]->buffer)
+					{
+						consecutivePoolFree(&cp->cpa, lists[l]->buffer, lists[l]->numBlocks);
+					}
+				}
+				poolFree(&cp->pa, pCommandBuffers[c]);
+			}
+
+			//the spec wants every entry of the output array NULL on failure
+			for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
+			{
+				pCommandBuffers[c] = 0;
+			}
+		}
+	}
+
+	return res;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkBeginCommandBuffer
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer(
+		VkCommandBuffer                             commandBuffer,
+		const VkCommandBufferBeginInfo*             pBeginInfo)
+{
+	assert(commandBuffer);
+	assert(pBeginInfo);
+
+	//TODO
+
+	//VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
+	//specifies that each recording of the command buffer will only be submitted once, and the command buffer will be reset and recorded again between each submission.
+
+	//VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT
+	//specifies that a secondary command buffer is considered to be entirely inside a render pass. If this is a primary command buffer, then this bit is ignored
+
+	//VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT
+	//specifies that a command buffer can be resubmitted to a queue while it is in the pending state, and recorded into multiple primary command buffers
+
+	//When a command buffer begins recording, all state in that command buffer is undefined
+
+	struct drm_vc4_submit_cl submitCl = //fresh submit parameters: fields not named below are zero-initialised
+	{
+		.color_read.hindex = ~0, //every surface index starts as ~0; presumably the "no surface" marker -- confirm against vc4_drm.h
+		.zs_read.hindex = ~0,
+		.color_write.hindex = ~0,
+		.msaa_color_write.hindex = ~0,
+		.zs_write.hindex = ~0,
+		.msaa_zs_write.hindex = ~0,
+	};
+
+	commandBuffer->usageFlags = pBeginInfo->flags; //kept so vkQueueSubmit can test ONE_TIME_SUBMIT later
+	commandBuffer->shaderRecCount = 0;
+	commandBuffer->state = CMDBUF_STATE_RECORDING; //NOTE(review): the previous state is not checked, and the control lists are not rewound here -- confirm re-recording is handled elsewhere
+	commandBuffer->submitCl = submitCl; //replaces whatever submit parameters were stored before
+
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEndCommandBuffer
+ * If there was an error during recording, the application will be notified by an unsuccessful return code returned by vkEndCommandBuffer.
+ * If the application wishes to further use the command buffer, the command buffer must be reset. The command buffer must have been in the recording state,
+ * and is moved to the executable state.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer(
+		VkCommandBuffer                             commandBuffer)
+{
+	assert(commandBuffer);
+
+	//Increment the semaphore indicating that binning is done and
+	//unblocking the render thread.  Note that this doesn't act
+	//until the FLUSH completes.
+	//The FLUSH caps all of our bin lists with a
+	//VC4_PACKET_RETURN.
+	clFit(commandBuffer, &commandBuffer->binCl, V3D21_INCREMENT_SEMAPHORE_length); //grow binCl first if the packet would not fit
+	clInsertIncrementSemaphore(&commandBuffer->binCl);
+	clFit(commandBuffer, &commandBuffer->binCl, V3D21_FLUSH_length);
+	clInsertFlush(&commandBuffer->binCl);
+
+	commandBuffer->state = CMDBUF_STATE_EXECUTABLE; //NOTE(review): set unconditionally; the current state is not checked and no recording error is reported from here
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueSubmit
+ * vkQueueSubmit is a queue submission command, with each batch defined by an element of pSubmits as an instance of the VkSubmitInfo structure.
+ * Batches begin execution in the order they appear in pSubmits, but may complete out of order.
+ * Fence and semaphore operations submitted with vkQueueSubmit have additional ordering constraints compared to other submission commands,
+ * with dependencies involving previous and subsequent queue operations. Information about these additional constraints can be found in the semaphore and
+ * fence sections of the synchronization chapter.
+ * Details on the interaction of pWaitDstStageMask with synchronization are described in the semaphore wait operation section of the synchronization chapter.
+ * The order that batches appear in pSubmits is used to determine submission order, and thus all the implicit ordering guarantees that respect it.
+ * Other than these implicit ordering guarantees and any explicit synchronization primitives, these batches may overlap or otherwise execute out of order.
+ * If any command buffer submitted to this queue is in the executable state, it is moved to the pending state. Once execution of all submissions of a command buffer complete,
+ * it moves from the pending state, back to the executable state. If a command buffer was recorded with the VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT flag,
+ * it instead moves back to the invalid state.
+ * If vkQueueSubmit fails, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
+ * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced by the submitted command buffers and any semaphores
+ * referenced by pSubmits is unaffected by the call or its failure. If vkQueueSubmit fails in such a way that the implementation is unable to make that guarantee,
+ * the implementation must return VK_ERROR_DEVICE_LOST. See Lost Device.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(
+		VkQueue                                     queue,
+		uint32_t                                    submitCount,
+		const VkSubmitInfo*                         pSubmits,
+		VkFence                                     fence)
+{
+	assert(queue);
+
+	for(uint32_t s = 0; s < submitCount; ++s, ++pSubmits) //pSubmits is an array of submitCount batches: handle each in order, none when the count is 0
+	{
+		for(uint32_t c = 0; c < pSubmits->waitSemaphoreCount; ++c)
+		{
+			sem_wait((sem_t*)pSubmits->pWaitSemaphores[c]);
+		}
+		//TODO: deal with pSubmits->pWaitDstStageMask
+		//TODO wait for fence??
+
+		for(uint32_t c = 0; c < pSubmits->commandBufferCount; ++c)
+		{
+			if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_EXECUTABLE)
+			{
+				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_PENDING;
+			}
+		}
+
+		for(uint32_t c = 0; c < pSubmits->commandBufferCount; ++c)
+		{
+			VkCommandBuffer cmdbuf = pSubmits->pCommandBuffers[c];
+
+			cmdbuf->submitCl.bo_handles = cmdbuf->handlesCl.buffer;
+			cmdbuf->submitCl.bo_handle_count = clSize(&cmdbuf->handlesCl) / 4;
+			cmdbuf->submitCl.bin_cl = cmdbuf->binCl.buffer;
+			cmdbuf->submitCl.bin_cl_size = clSize(&cmdbuf->binCl);
+			cmdbuf->submitCl.shader_rec = cmdbuf->shaderRecCl.buffer;
+			cmdbuf->submitCl.shader_rec_size = clSize(&cmdbuf->shaderRecCl);
+			cmdbuf->submitCl.shader_rec_count = cmdbuf->shaderRecCount;
+			cmdbuf->submitCl.uniforms = cmdbuf->uniformsCl.buffer;
+			cmdbuf->submitCl.uniforms_size = clSize(&cmdbuf->uniformsCl);
+
+			printf("BCL:\n");
+			clDump(cmdbuf->submitCl.bin_cl, cmdbuf->submitCl.bin_cl_size);
+			printf("BO handles: ");
+			for(uint32_t d = 0; d < cmdbuf->submitCl.bo_handle_count; ++d)
+			{
+				printf("%u ", *((uint32_t*)(cmdbuf->submitCl.bo_handles)+d));
+			}
+			printf("\nwidth height: %u, %u\n", cmdbuf->submitCl.width, cmdbuf->submitCl.height);
+			printf("tile min/max: %u,%u %u,%u\n", cmdbuf->submitCl.min_x_tile, cmdbuf->submitCl.min_y_tile, cmdbuf->submitCl.max_x_tile, cmdbuf->submitCl.max_y_tile);
+			printf("color read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_read.hindex, cmdbuf->submitCl.color_read.offset, cmdbuf->submitCl.color_read.bits, cmdbuf->submitCl.color_read.flags);
+			printf("color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_write.hindex, cmdbuf->submitCl.color_write.offset, cmdbuf->submitCl.color_write.bits, cmdbuf->submitCl.color_write.flags);
+			printf("zs read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_read.hindex, cmdbuf->submitCl.zs_read.offset, cmdbuf->submitCl.zs_read.bits, cmdbuf->submitCl.zs_read.flags);
+			printf("zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_write.hindex, cmdbuf->submitCl.zs_write.offset, cmdbuf->submitCl.zs_write.bits, cmdbuf->submitCl.zs_write.flags);
+			printf("msaa color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_color_write.hindex, cmdbuf->submitCl.msaa_color_write.offset, cmdbuf->submitCl.msaa_color_write.bits, cmdbuf->submitCl.msaa_color_write.flags);
+			printf("msaa zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_zs_write.hindex, cmdbuf->submitCl.msaa_zs_write.offset, cmdbuf->submitCl.msaa_zs_write.bits, cmdbuf->submitCl.msaa_zs_write.flags);
+			printf("clear color packed rgba %u %u\n", cmdbuf->submitCl.clear_color[0], cmdbuf->submitCl.clear_color[1]);
+			printf("clear z %u\n", cmdbuf->submitCl.clear_z);
+			printf("clear s %u\n", cmdbuf->submitCl.clear_s);
+			printf("flags %u\n", cmdbuf->submitCl.flags);
+
+			//submit ioctl
+			static uint64_t lastFinishedSeqno = 0;
+			vc4_cl_submit(controlFd, &cmdbuf->submitCl, &queue->lastEmitSeqno, &lastFinishedSeqno);
+		}
+
+		for(uint32_t c = 0; c < pSubmits->commandBufferCount; ++c)
+		{
+			if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_PENDING)
+			{
+				if(pSubmits->pCommandBuffers[c]->usageFlags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)
+				{
+					pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_INVALID;
+				}
+				else
+				{
+					pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_EXECUTABLE;
+				}
+			}
+		}
+
+		for(uint32_t c = 0; c < pSubmits->signalSemaphoreCount; ++c)
+		{
+			sem_post((sem_t*)pSubmits->pSignalSemaphores[c]);
+		}
+	}
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkFreeCommandBuffers
+ * Any primary command buffer that is in the recording or executable state and has any element of pCommandBuffers recorded into it, becomes invalid.
+ */
+VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers(
+		VkDevice                                    device,
+		VkCommandPool                               commandPool,
+		uint32_t                                    commandBufferCount,
+		const VkCommandBuffer*                      pCommandBuffers)
+{
+	assert(device);
+	assert(commandPool);
+	assert(pCommandBuffers);
+
+	_commandPool* cp = (_commandPool*)commandPool;
+
+	for(uint32_t c = 0; c < commandBufferCount; ++c)
+	{
+		if(pCommandBuffers[c]) //the array may contain NULL entries, which are skipped
+		{
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->binCl.buffer, pCommandBuffers[c]->binCl.numBlocks); //free the list's buffer and its own block count, not the ControlList struct
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->handlesCl.buffer, pCommandBuffers[c]->handlesCl.numBlocks);
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->shaderRecCl.buffer, pCommandBuffers[c]->shaderRecCl.numBlocks);
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->uniformsCl.buffer, pCommandBuffers[c]->uniformsCl.numBlocks);
+			poolFree(&cp->pa, pCommandBuffers[c]);
+		}
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyCommandPool
+ * When a pool is destroyed, all command buffers allocated from the pool are freed.
+ * Any primary command buffer allocated from another VkCommandPool that is in the recording or executable state and has a secondary command buffer
+ * allocated from commandPool recorded into it, becomes invalid.
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool(
+		VkDevice                                    device,
+		VkCommandPool                               commandPool,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(device);
+	assert(commandPool);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	_commandPool* cp = (_commandPool*)commandPool;
+
+	//if(cp->usePoolAllocator)
+	{
+		free(cp->pa.buf); //the malloc'ed backing store that vkCreateCommandPool handed to createPoolAllocator
+		free(cp->cpa.buf); //likewise for the control list pages
+		destroyPoolAllocator(&cp->pa); //the allocators only receive the struct here; their buffers are already gone
+		destroyConsecutivePoolAllocator(&cp->cpa);
+	}
+
+	free(cp); //command buffers still allocated from this pool lived in the buffers freed above
+}
+
+void clFit(VkCommandBuffer cb, ControlList* cl, uint32_t commandSize) //make sure cl can take commandSize more bytes, re-allocating it from cb's pool if it cannot
+{
+	if(!clHasEnoughSpace(cl, commandSize))
+	{
+		uint32_t currSize = clSize(cl); //bytes used so far, read before the buffer pointer is replaced
+		cl->buffer = consecutivePoolReAllocate(&cb->cp->cpa, cl->buffer, cl->numBlocks); assert(cl->buffer); //NOTE(review): cl->numBlocks is not changed here -- confirm where it is kept in step with the new buffer
+		cl->nextFreeByte = cl->buffer + currSize; //re-base the write position onto the returned buffer
+	}
+}
+
+void clDump(void* cl, uint32_t size) //decode the size bytes of control list at cl, packet by packet, and print them
+{
+		struct v3d_device_info devinfo = {
+				/* While the driver supports V3D 2.1 and 2.6, we haven't split
+				 * off a 2.6 XML yet (there are a couple of fields different
+				 * in render target formatting)
+				 */
+				.ver = 21,
+		};
+		struct v3d_spec* spec = v3d_spec_load(&devinfo);
+
+		struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true);
+
+		uint32_t offset = 0, hw_offset = 0;
+		uint8_t *p = cl;
+
+		while (offset < size) {
+				struct v3d_group *inst = v3d_spec_find_instruction(spec, p);
+				uint8_t header = *p; //first byte of a packet is its opcode
+				uint32_t length;
+
+				if (inst == NULL) {
+						printf("0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
+								offset, hw_offset, header, header);
+						goto done; //cannot size an unknown packet, stop but still release the dumper
+				}
+
+				length = v3d_group_get_length(inst);
+
+				printf("0x%08x 0x%08x: 0x%02x %s\n",
+						offset, hw_offset, header, v3d_group_get_name(inst));
+
+				v3d_print_group(clif, inst, offset, p);
+
+				switch (header) {
+				case VC4_PACKET_HALT:
+				case VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF:
+						goto done; //these packets end the dump; leave through the cleanup instead of returning
+				default:
+						break;
+				}
+
+				offset += length;
+				if (header != VC4_PACKET_GEM_HANDLES)
+						hw_offset += length;
+				p += length;
+		}
+done:
+		clif_dump_destroy(clif);
+}
diff --git a/driver/common.c b/driver/common.c
new file mode 100644
index 0000000..58a578d
--- /dev/null
+++ b/driver/common.c
@@ -0,0 +1,328 @@
+#include "common.h"
+
+#include "kernel/vc4_packet.h"
+
+void createImageBO(_image* i) //compute the padded size, stride and tiling of image i and allocate its buffer object
+{
+	assert(i);
+	assert(i->format);
+	assert(i->width);
+	assert(i->height);
+
+	uint32_t bpp = getFormatBpp(i->format);
+	uint32_t pixelSizeBytes = bpp / 8; //whole bytes per pixel
+	uint32_t nonPaddedSize = i->width * i->height * pixelSizeBytes;
+	i->paddedWidth = i->width;
+	i->paddedHeight = i->height;
+
+	//need to pad to T format, as HW automatically chooses that
+	if(nonPaddedSize > 4096)
+	{
+		getPaddedTextureDimensionsT(i->width, i->height, bpp, &i->paddedWidth, &i->paddedHeight);
+	}
+
+	i->size = i->paddedWidth * i->paddedHeight * pixelSizeBytes;
+	i->stride = i->paddedWidth * pixelSizeBytes; //bytes per padded row
+	i->handle = vc4_bo_alloc(controlFd, i->size, "swapchain image"); assert(i->handle);
+
+	//set tiling to T if size > 4KB
+	if(nonPaddedSize > 4096)
+	{
+		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED); assert(ret); //a zero result is treated as failure
+		i->tiling = VC4_TILING_FORMAT_T;
+	}
+	else
+	{
+		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_LINEAR); assert(ret);
+		i->tiling = VC4_TILING_FORMAT_LT; //small images use the linear modifier and are recorded as LT
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdClearColorImage
+ * Color and depth/stencil images can be cleared outside a render pass instance using vkCmdClearColorImage or vkCmdClearDepthStencilImage, respectively.
+ * These commands are only allowed outside of a render pass instance.
+ */
+VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage(
+		VkCommandBuffer                             commandBuffer,
+		VkImage                                     image,
+		VkImageLayout                               imageLayout,
+		const VkClearColorValue*                    pColor,
+		uint32_t                                    rangeCount,
+		const VkImageSubresourceRange*              pRanges)
+{
+	assert(commandBuffer);
+	assert(image);
+	assert(pColor);
+
+	//TODO this should only flag an image for clearing. This can only be called outside a renderpass
+	//actual clearing would only happen:
+	// -if image is rendered to (insert clear before first draw call)
+	// -if the image is bound for sampling (submit a CL with a clear)
+	// -if a command buffer is submitted without any rendering (insert clear)
+	// -etc.
+	//we shouldn't clear an image if no one uses it
+
+	//TODO ranges support
+
+	assert(imageLayout == VK_IMAGE_LAYOUT_GENERAL ||
+		   imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR ||
+		   imageLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+	assert(commandBuffer->state	 == CMDBUF_STATE_RECORDING);
+	assert(_queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_GRAPHICS_BIT || _queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_COMPUTE_BIT);
+
+	_image* i = image;
+
+	assert(i->usageBits & VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+
+	//TODO externally sync cmdbuf, cmdpool
+
+	i->needToClear = 1; //nothing is written to the command buffer here; the image is only flagged (rangeCount and pRanges are unused)
+	i->clearColor[0] = i->clearColor[1] = packVec4IntoABGR8(pColor->float32); //both words get the same packed colour, always read from the float32 member
+}
+
+int findInstanceExtension(char* name)
+{
+	int found = -1; //index of the first instanceExtensions entry named name, -1 while none matched
+	for(int idx = 0; found < 0 && idx < numInstanceExtensions; ++idx)
+	{
+		if(!strcmp(instanceExtensions[idx].extensionName, name))
+		{
+			found = idx;
+		}
+	}
+	return found;
+}
+
+int findDeviceExtension(char* name)
+{
+	int found = -1; //index of the first deviceExtensions entry named name, -1 while none matched
+	for(int idx = 0; found < 0 && idx < numDeviceExtensions; ++idx)
+	{
+		if(!strcmp(deviceExtensions[idx].extensionName, name))
+		{
+			found = idx;
+		}
+	}
+	return found;
+}
+
+//Textures in T format:
+//formed out of 4KB tiles, which have 1KB subtiles (see page 105 in VC4 arch guide)
+//1KB subtiles have 512b microtiles.
+//Width/height of the 512b microtiles is the following:
+// 64bpp: 2x4
+// 32bpp: 4x4
+// 16bpp: 8x4
+// 8bpp:  8x8
+// 4bpp:  16x8
+// 1bpp:  32x16
+//Therefore width/height of 1KB subtiles is the following:
+// 64bpp: 8x16
+// 32bpp: 16x16
+// 16bpp: 32x16
+// 8bpp:  32x32
+// 4bpp:  64x32
+// 1bpp:  128x64
+//Finally width/height of the 4KB tiles:
+// 64bpp: 16x32
+// 32bpp: 32x32
+// 16bpp: 64x32
+// 8bpp:  64x64
+// 4bpp:  128x64
+// 1bpp:  256x128
+void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight) //round width and height up to whole tiles for the given bits per pixel
+{
+	assert(paddedWidth);
+	assert(paddedHeight);
+	uint32_t tileW = 0; //tile size in pixels, chosen by bpp below
+	uint32_t tileH = 0;
+
+	switch(bpp)
+	{
+	case 64:
+	{
+		tileW = 16;
+		tileH = 32;
+		break;
+	}
+	case 32:
+	{
+		tileW = 32;
+		tileH = 32;
+		break;
+	}
+	case 16:
+	{
+		tileW = 64;
+		tileH = 32;
+		break;
+	}
+	case 8:
+	{
+		tileW = 64;
+		tileH = 64;
+		break;
+	}
+	case 4:
+	{
+		tileW = 128;
+		tileH = 64;
+		break;
+	}
+	case 1:
+	{
+		tileW = 256;
+		tileH = 128;
+		break;
+	}
+	default: //NOTE(review): if assert compiles to nothing, tileW and tileH stay 0 and the modulo below divides by zero
+	{
+		assert(0); //unsupported
+	}
+	}
+
+	*paddedWidth = ((tileW - (width % tileW)) % tileW) + width; //adds the distance to the next multiple of tileW, which is 0 when width already is one
+	*paddedHeight = ((tileH - (height % tileH)) % tileH) + height;
+}
+
+uint32_t getFormatBpp(VkFormat f) //bits per pixel as stored for format f; asserts and yields 0 for formats not listed
+{
+	switch(f)
+	{
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		return 64;
+	case VK_FORMAT_R8G8B8_UNORM: //padded to 32
+	case VK_FORMAT_R8G8B8A8_UNORM:
+		return 32;
+	//formats reported as 16 bits per pixel (an unreachable second "return 32" used to sit here)
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R16_SFLOAT:
+	case VK_FORMAT_R16_SINT:
+		return 16;
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_R8_SINT:
+		return 8;
+	default:
+		assert(0);
+		return 0;
+	}
+}
+
+uint32_t packVec4IntoABGR8(const float rgba[4])
+{
+	//result is 0xAABBGGRR: component c of rgba (r, g, b, a) lands in bits 8*c .. 8*c+7
+	uint32_t res = 0;
+
+	for(int c = 0; c < 4; ++c)
+	{
+		//clamp to [0, 1] first (NaN becomes 0): converting an out-of-range value to an integer is undefined
+		//values inside the range are scaled and truncated exactly as before
+		double v = !(rgba[c] > 0.0f) ? 0.0 : (rgba[c] < 1.0f ? rgba[c] : 1.0);
+		//shift as uint32_t: shifting a promoted int left by 24 is undefined once the top bit is reached
+		res |= (uint32_t)(v * 255.0) << (8 * c);
+	}
+
+	return res;
+}
+
+/*static inline void util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
+{
+   ubyte r = 0;
+   ubyte g = 0;
+   ubyte b = 0;
+   ubyte a = 0;
+
+   if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) <= 8) {
+	  r = float_to_ubyte(rgba[0]);
+	  g = float_to_ubyte(rgba[1]);
+	  b = float_to_ubyte(rgba[2]);
+	  a = float_to_ubyte(rgba[3]);
+   }
+
+   switch (format) {
+   case PIPE_FORMAT_ABGR8888_UNORM:
+	  {
+		 uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | a;
+	  }
+	  return;
+   case PIPE_FORMAT_XBGR8888_UNORM:
+	  {
+		 uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | 0xff;
+	  }
+	  return;
+   case PIPE_FORMAT_BGRA8888_UNORM:
+	  {
+		 uc->ui[0] = (a << 24) | (r << 16) | (g << 8) | b;
+	  }
+	  return;
+   case PIPE_FORMAT_BGRX8888_UNORM:
+	  {
+		 uc->ui[0] = (0xffu << 24) | (r << 16) | (g << 8) | b;
+	  }
+	  return;
+   case PIPE_FORMAT_ARGB8888_UNORM:
+	  {
+		 uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | a;
+	  }
+	  return;
+   case PIPE_FORMAT_XRGB8888_UNORM:
+	  {
+		 uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | 0xff;
+	  }
+	  return;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+	  {
+		 uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+	  }
+	  return;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+	  {
+		 uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+	  }
+	  return;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+	  {
+		 uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+	  }
+	  return;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+	  {
+		 uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
+	  }
+	  return;
+   case PIPE_FORMAT_A8_UNORM:
+	  {
+		 uc->ub = a;
+	  }
+	  return;
+   case PIPE_FORMAT_L8_UNORM:
+   case PIPE_FORMAT_I8_UNORM:
+	  {
+		 uc->ub = r;
+	  }
+	  return;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	  {
+		 uc->f[0] = rgba[0];
+		 uc->f[1] = rgba[1];
+		 uc->f[2] = rgba[2];
+		 uc->f[3] = rgba[3];
+	  }
+	  return;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+	  {
+		 uc->f[0] = rgba[0];
+		 uc->f[1] = rgba[1];
+		 uc->f[2] = rgba[2];
+	  }
+	  return;
+
+   default:
+	  util_format_write_4f(format, rgba, 0, uc, 0, 0, 0, 1, 1);
+   }
+}*/
diff --git a/driver/common.h b/driver/common.h
new file mode 100644
index 0000000..40eda28
--- /dev/null
+++ b/driver/common.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <drm/drm.h>
+#include <drm/drm_fourcc.h>
+#include <drm/vc4_drm.h>
+
+#include <vulkan/vulkan.h>
+#include "vkExt.h"
+
+#include "AlignedAllocator.h"
+#include "PoolAllocator.h"
+#include "ConsecutivePoolAllocator.h"
+#include "LinearAllocator.h"
+
+#include <stdio.h>
+#include "CustomAssert.h"
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <semaphore.h>
+
+#include "modeset.h"
+#include "kernelInterface.h"
+#include "ControlListUtil.h"
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b)) //fully parenthesised; an argument is still evaluated twice, so keep side effects out
+#endif
+
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b)) //same caveat as min
+#endif
+
+#include "vkCaps.h"
+
+typedef struct VkPhysicalDevice_T
+{
+	//hardware id?
+	int dummy; //placeholder member; presumably only there so the struct is not empty -- confirm
+} _physicalDevice;
+
+typedef struct VkQueue_T
+{
+	uint64_t lastEmitSeqno; //vkQueueSubmit passes its address to vc4_cl_submit; presumably the seqno of the last submitted job -- confirm
+} _queue;
+
+typedef struct VkCommandPool_T
+{
+	PoolAllocator pa; //hands out the _commandBuffer objects themselves
+	ConsecutivePoolAllocator cpa; //hands out ARM_PAGE_SIZE blocks that back the command buffers' control lists
+	uint32_t queueFamilyIndex; //copied from VkCommandPoolCreateInfo in vkCreateCommandPool
+} _commandPool;
+
+typedef enum commandBufferState
+{
+	CMDBUF_STATE_INITIAL = 0, //set by vkAllocateCommandBuffers
+	CMDBUF_STATE_RECORDING, //set by vkBeginCommandBuffer
+	CMDBUF_STATE_EXECUTABLE, //set by vkEndCommandBuffer, and by vkQueueSubmit once a reusable buffer was submitted
+	CMDBUF_STATE_PENDING, //held only while vkQueueSubmit is working on the buffer
+	CMDBUF_STATE_INVALID, //set by vkQueueSubmit for buffers recorded with ONE_TIME_SUBMIT
+	CMDBUF_STATE_LAST //presumably a count sentinel rather than a real state -- confirm
+} commandBufferState;
+
+typedef struct VkCommandBuffer_T
+{
+	//Recorded commands include commands to bind pipelines and descriptor sets to the command buffer, commands to modify dynamic state, commands to draw (for graphics rendering),
+	//commands to dispatch (for compute), commands to execute secondary command buffers (for primary command buffers only), commands to copy buffers and images, and other commands
+
+	struct drm_vc4_submit_cl submitCl; //submit parameters: reset by vkBeginCommandBuffer, completed and handed to vc4_cl_submit by vkQueueSubmit
+
+	ControlList binCl; //becomes submitCl.bin_cl
+	ControlList shaderRecCl; //becomes submitCl.shader_rec
+	uint32_t shaderRecCount; //becomes submitCl.shader_rec_count
+	ControlList uniformsCl; //becomes submitCl.uniforms
+	ControlList handlesCl; //becomes submitCl.bo_handles; vkQueueSubmit counts 4 bytes per handle
+	commandBufferState state;
+	VkCommandBufferUsageFlags usageFlags; //flags given to vkBeginCommandBuffer
+	_commandPool* cp; //pool this buffer and its control lists were allocated from
+} _commandBuffer;
+
+typedef struct VkInstance_T
+{
+	//supposedly this should contain all the enabled layers?
+	int enabledExtensions[numInstanceExtensions];
+	int numEnabledExtensions;
+	_physicalDevice dev; //vkEnumeratePhysicalDevices hands out the address of this member as the physical device
+	int chipVersion;
+	int hasTiling; //presumably this and the has* members below are capability flags filled in at instance creation -- confirm
+	int hasControlFlow;
+	int hasEtc1;
+	int hasThreadedFs;
+	int hasMadvise;
+} _instance;
+
+typedef struct VkDevice_T
+{
+	int enabledExtensions[numDeviceExtensions];
+	int numEnabledExtensions;
+	VkPhysicalDeviceFeatures enabledFeatures;
+	_physicalDevice* dev;
+	_queue* queues[numQueueFamilies]; //presumably one array of queues per family, with numQueues[] giving each array's length -- confirm at device creation
+	int numQueues[numQueueFamilies];
+} _device;
+
+typedef struct VkSwapchain_T
+{
+	_image* images; //presumably an array of numImages images -- confirm at swapchain creation
+	uint32_t numImages;
+	uint32_t backbufferIdx; //presumably the index into images of the current back buffer -- confirm
+	VkSurfaceKHR surface;
+} _swapchain;
+
+void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight); //rounds width and height up to whole tiles for bpp
+uint32_t getFormatBpp(VkFormat f); //bits per pixel of f; asserts on formats it does not list
+uint32_t packVec4IntoABGR8(const float rgba[4]); //packs four floats into one word, red in the lowest byte and alpha in the highest
+int findInstanceExtension(char* name); //index into instanceExtensions, or -1 when name is not found
+int findDeviceExtension(char* name); //index into deviceExtensions, or -1 when name is not found
+void createImageBO(_image* i); //fills in padded size, stride and tiling of i and allocates its buffer object
diff --git a/driver/device.c b/driver/device.c
new file mode 100644
index 0000000..89be256
--- /dev/null
+++ b/driver/device.c
@@ -0,0 +1,314 @@
+#include "common.h"
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#devsandqueues-physical-device-enumeration
+ * If pPhysicalDevices is NULL, then the number of physical devices available is returned in pPhysicalDeviceCount. Otherwise, pPhysicalDeviceCount must point to a
+ * variable set by the user to the number of elements in the pPhysicalDevices array, and on return the variable is overwritten with the number of handles actually
+ * written to pPhysicalDevices. If pPhysicalDeviceCount is less than the number of physical devices available, at most pPhysicalDeviceCount structures will be written.
+ * If pPhysicalDeviceCount is smaller than the number of physical devices available, VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the
+ * available physical devices were returned.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices(
+		VkInstance                                  instance,
+		uint32_t*                                   pPhysicalDeviceCount,
+		VkPhysicalDevice*                           pPhysicalDevices)
+{
+	assert(instance);
+
+	//TODO is there a way to check if there's a gpu (and it's the rPi)?
+	int gpuExists = access( "/dev/dri/card0", F_OK ) != -1;
+
+	uint32_t numGPUs = gpuExists; //0 or 1: the instance embeds exactly one physical device
+
+	assert(pPhysicalDeviceCount);
+
+	if(!pPhysicalDevices)
+	{
+		*pPhysicalDeviceCount = numGPUs; //count query only, nothing is written
+		return VK_SUCCESS;
+	}
+
+	uint32_t arraySize = *pPhysicalDeviceCount; //kept unsigned so a huge caller value cannot turn negative
+	uint32_t elementsWritten = min(numGPUs, arraySize);
+
+	for(uint32_t c = 0; c < elementsWritten; ++c)
+	{
+		pPhysicalDevices[c] = &instance->dev;
+	}
+
+	*pPhysicalDeviceCount = elementsWritten;
+	//VK_INCOMPLETE means the caller's array was too small for every device, not that it had room to spare
+	if(elementsWritten < numGPUs)
+	{
+		return VK_INCOMPLETE;
+	}
+	else
+	{
+		return VK_SUCCESS;
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceProperties
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties(
+		VkPhysicalDevice                            physicalDevice,
+		VkPhysicalDeviceProperties*                 pProperties)
+{
+	assert(physicalDevice);
+	assert(pProperties);
+
+	VkPhysicalDeviceSparseProperties sparseProps =
+	{
+		.residencyStandard2DBlockShape = 1,
+		.residencyStandard2DMultisampleBlockShape = 1,
+		.residencyStandard3DBlockShape = 1,
+		.residencyAlignedMipSize = 1,
+		.residencyNonResidentStrict = 1
+	};
+
+	pProperties->apiVersion = VK_MAKE_VERSION(1,1,0);
+	pProperties->driverVersion = 1; //we'll simply call this v1
+	pProperties->vendorID = 0x14E4; //Broadcom
+	pProperties->deviceID = 0; //TODO dunno?
+	pProperties->deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
+	strcpy(pProperties->deviceName, "VideoCore IV HW");
+	memset(pProperties->pipelineCacheUUID, 0, sizeof(pProperties->pipelineCacheUUID)); //TODO real UUID; zeroed so the caller never reads indeterminate bytes
+	pProperties->limits = _limits;
+	pProperties->sparseProperties = sparseProps;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceFeatures
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures(
+		VkPhysicalDevice                            physicalDevice,
+		VkPhysicalDeviceFeatures*                   pFeatures)
+{
+	assert(physicalDevice);
+	assert(pFeatures);
+	//the handle is only checked for non-null: the answer is always a plain copy of the global _features table
+	*pFeatures = _features;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateDeviceExtensionProperties
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties(
+		VkPhysicalDevice                            physicalDevice,
+		const char*                                 pLayerName,
+		uint32_t*                                   pPropertyCount,
+		VkExtensionProperties*                      pProperties)
+{
+	assert(physicalDevice);
+	assert(!pLayerName); //layers ignored for now
+	assert(pPropertyCount);
+
+	if(!pProperties)
+	{
+		*pPropertyCount = numDeviceExtensions;
+		return VK_SUCCESS; //a pure count query is a complete answer
+	}
+
+	uint32_t arraySize = *pPropertyCount; //kept unsigned so a huge caller value cannot turn negative
+	uint32_t elementsWritten = min(numDeviceExtensions, arraySize);
+
+	for(uint32_t c = 0; c < elementsWritten; ++c)
+	{
+		pProperties[c] = deviceExtensions[c];
+	}
+
+	*pPropertyCount = elementsWritten;
+
+	return elementsWritten < numDeviceExtensions ? VK_INCOMPLETE : VK_SUCCESS; //incomplete only when the caller's array was too small
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceQueueFamilyProperties
+ * If pQueueFamilyProperties is NULL, then the number of queue families available is returned in pQueueFamilyPropertyCount.
+ * Otherwise, pQueueFamilyPropertyCount must point to a variable set by the user to the number of elements in the pQueueFamilyProperties array,
+ * and on return the variable is overwritten with the number of structures actually written to pQueueFamilyProperties. If pQueueFamilyPropertyCount
+ * is less than the number of queue families available, at most pQueueFamilyPropertyCount structures will be written.
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties(
+		VkPhysicalDevice                            physicalDevice,
+		uint32_t*                                   pQueueFamilyPropertyCount,
+		VkQueueFamilyProperties*                    pQueueFamilyProperties)
+{
+	assert(physicalDevice);
+	assert(pQueueFamilyPropertyCount);
+
+	if(!pQueueFamilyProperties)
+	{
+		*pQueueFamilyPropertyCount = numQueueFamilies; //count query: report the same constant the copy below is clamped to
+		return;
+	}
+
+	uint32_t arraySize = *pQueueFamilyPropertyCount; //kept unsigned so a huge caller value cannot turn negative
+	uint32_t elementsWritten = min(numQueueFamilies, arraySize);
+
+	for(uint32_t c = 0; c < elementsWritten; ++c)
+	{
+		pQueueFamilyProperties[c] = _queueFamilyProperties[c];
+	}
+
+	*pQueueFamilyPropertyCount = elementsWritten;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceSupportKHR
+ * does this queue family support presentation to this surface?
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR(
+		VkPhysicalDevice                            physicalDevice,
+		uint32_t                                    queueFamilyIndex,
+		VkSurfaceKHR                                surface,
+		VkBool32*                                   pSupported)
+{
+	assert(pSupported);
+	assert(surface);
+	assert(physicalDevice);
+
+	assert(queueFamilyIndex < numQueueFamilies); //only the index range is validated; the family itself is not inspected
+
+	//TODO if we plan to support headless rendering, there should be 2 families
+	//one using /dev/dri/card0 which has modesetting
+	//other using /dev/dri/renderD128 which does not support modesetting, this would say false here
+	*pSupported = VK_TRUE; //unconditional: every valid family is reported as able to present to any surface
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateDevice
+ * vkCreateDevice verifies that extensions and features requested in the ppEnabledExtensionNames and pEnabledFeatures
+ * members of pCreateInfo, respectively, are supported by the implementation. If any requested extension is not supported,
+ * vkCreateDevice must return VK_ERROR_EXTENSION_NOT_PRESENT. If any requested feature is not supported, vkCreateDevice must return
+ * VK_ERROR_FEATURE_NOT_PRESENT. Support for extensions can be checked before creating a device by querying vkEnumerateDeviceExtensionProperties
+ * After verifying and enabling the extensions the VkDevice object is created and returned to the application.
+ * If a requested extension is only supported by a layer, both the layer and the extension need to be specified at vkCreateInstance
+ * time for the creation to succeed. Multiple logical devices can be created from the same physical device. Logical device creation may
+ * fail due to lack of device-specific resources (in addition to the other errors). If that occurs, vkCreateDevice will return VK_ERROR_TOO_MANY_OBJECTS.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice(
+		VkPhysicalDevice                            physicalDevice,
+		const VkDeviceCreateInfo*                   pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkDevice*                                   pDevice)
+{
+	assert(physicalDevice);
+	assert(pDevice);
+	assert(pCreateInfo);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	VkResult res = VK_SUCCESS;
+	*pDevice = 0; //the handle is only published once creation has fully succeeded
+	_device* device = calloc(1, sizeof(_device)); //zero-filled: counters start at 0, features disabled, queue slots null
+	if(!device)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	device->dev = physicalDevice;
+
+	for(uint32_t c = 0; c < pCreateInfo->enabledExtensionCount; ++c)
+	{
+		int findres = findDeviceExtension(pCreateInfo->ppEnabledExtensionNames[c]);
+		if(findres < 0)
+		{
+			res = VK_ERROR_EXTENSION_NOT_PRESENT;
+			goto fail;
+		}
+		device->enabledExtensions[device->numEnabledExtensions++] = findres;
+	}
+
+	const VkBool32* requestedFeatures = (const VkBool32*)pCreateInfo->pEnabledFeatures; //feature struct walked as a flat VkBool32 array
+	const VkBool32* supportedFeatures = (const VkBool32*)&_features;
+
+	if(requestedFeatures)
+	{
+		for(int c = 0; c < numFeatures; ++c)
+		{
+			if(requestedFeatures[c] && !supportedFeatures[c])
+			{
+				res = VK_ERROR_FEATURE_NOT_PRESENT;
+				goto fail;
+			}
+		}
+
+		device->enabledFeatures = *pCreateInfo->pEnabledFeatures;
+	}
+
+	//layers ignored per spec
+	//pCreateInfo->enabledLayerCount
+
+	for(uint32_t c = 0; c < pCreateInfo->queueCreateInfoCount; ++c)
+	{
+		const VkDeviceQueueCreateInfo* qci = &pCreateInfo->pQueueCreateInfos[c];
+		assert(qci->queueFamilyIndex < numQueueFamilies); //would otherwise index past queues[]/numQueues[]
+		_queue* queues = malloc(sizeof(_queue) * qci->queueCount);
+		if(!queues)
+		{
+			res = VK_ERROR_OUT_OF_HOST_MEMORY;
+			goto fail;
+		}
+
+		for(uint32_t d = 0; d < qci->queueCount; ++d)
+		{
+			queues[d].lastEmitSeqno = 0;
+		}
+
+		device->queues[qci->queueFamilyIndex] = queues;
+		device->numQueues[qci->queueFamilyIndex] = qci->queueCount;
+	}
+
+	*pDevice = device;
+	return VK_SUCCESS;
+
+fail: //single exit for every error after the device allocation: release the queue arrays made so far, then the device
+	for(int c = 0; c < numQueueFamilies; ++c)
+	{
+		free(device->queues[c]);
+	}
+	free(device);
+	return res;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetDeviceQueue
+ * vkGetDeviceQueue must only be used to get queues that were created with the flags parameter of VkDeviceQueueCreateInfo set to zero.
+ * To get queues that were created with a non-zero flags parameter use vkGetDeviceQueue2.
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue(
+		VkDevice                                    device,
+		uint32_t                                    queueFamilyIndex,
+		uint32_t                                    queueIndex,
+		VkQueue*                                    pQueue)
+{
+	assert(device);
+	assert(pQueue);
+
+	assert(queueFamilyIndex < numQueueFamilies);
+	assert(queueIndex < device->numQueues[queueFamilyIndex]); //numQueues[] is filled in by vkCreateDevice
+
+	*pQueue = &device->queues[queueFamilyIndex][queueIndex]; //the handle is a pointer into the device-owned per-family array
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyDevice
+ * To ensure that no work is active on the device, vkDeviceWaitIdle can be used to gate the destruction of the device.
+ * Prior to destroying a device, an application is responsible for destroying/freeing any Vulkan objects that were created using that device as the
+ * first parameter of the corresponding vkCreate* or vkAllocate* command
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroyDevice(
+		VkDevice                                    device,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(device);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	//TODO: nothing is released yet; the device and the queue arrays allocated by vkCreateDevice stay allocated
+}
diff --git a/driver/driver.c b/driver/driver.c
deleted file mode 100644
index f9c473d..0000000
--- a/driver/driver.c
+++ /dev/null
@@ -1,1938 +0,0 @@
-#include <stdio.h>
-#include "CustomAssert.h"
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <pthread.h>
-#include <semaphore.h>
-
-#include <vulkan/vulkan.h>
-#include "vkExt.h"
-
-#include "modeset.h"
-#include "kernelInterface.h"
-#include "ControlListUtil.h"
-
-#include "AlignedAllocator.h"
-#include "PoolAllocator.h"
-#include "ConsecutivePoolAllocator.h"
-#include "LinearAllocator.h"
-
-#include "kernel/vc4_packet.h"
-#include "../brcm/cle/v3d_decoder.h"
-#include "../brcm/clif/clif_dump.h"
-
-#ifndef min
-#define min(a, b) (a < b ? a : b)
-#endif
-
-#ifndef max
-#define max(a, b) (a > b ? a : b)
-#endif
-
-#include "vkCaps.h"
-
-typedef struct VkPhysicalDevice_T
-{
-	//hardware id?
-	int dummy;
-} _physicalDevice;
-
-typedef struct VkQueue_T
-{
-	uint64_t lastEmitSeqno;
-} _queue;
-
-typedef struct VkCommandPool_T
-{
-	PoolAllocator pa;
-	ConsecutivePoolAllocator cpa;
-	uint32_t queueFamilyIndex;
-} _commandPool;
-
-typedef enum commandBufferState
-{
-	CMDBUF_STATE_INITIAL = 0,
-	CMDBUF_STATE_RECORDING,
-	CMDBUF_STATE_EXECUTABLE,
-	CMDBUF_STATE_PENDING,
-	CMDBUF_STATE_INVALID,
-	CMDBUF_STATE_LAST
-} commandBufferState;
-
-typedef struct VkCommandBuffer_T
-{
-	//Recorded commands include commands to bind pipelines and descriptor sets to the command buffer, commands to modify dynamic state, commands to draw (for graphics rendering),
-	//commands to dispatch (for compute), commands to execute secondary command buffers (for primary command buffers only), commands to copy buffers and images, and other commands
-
-	struct drm_vc4_submit_cl submitCl;
-
-	ControlList binCl;
-	ControlList shaderRecCl;
-	uint32_t shaderRecCount;
-	ControlList uniformsCl;
-	ControlList handlesCl;
-	commandBufferState state;
-	VkCommandBufferUsageFlags usageFlags;
-	_commandPool* cp;
-} _commandBuffer;
-
-typedef struct VkInstance_T
-{
-	//supposedly this should contain all the enabled layers?
-	int enabledExtensions[numInstanceExtensions];
-	int numEnabledExtensions;
-	_physicalDevice dev;
-	int chipVersion;
-	int hasTiling;
-	int hasControlFlow;
-	int hasEtc1;
-	int hasThreadedFs;
-	int hasMadvise;
-} _instance;
-
-typedef struct VkDevice_T
-{
-	int enabledExtensions[numDeviceExtensions];
-	int numEnabledExtensions;
-	VkPhysicalDeviceFeatures enabledFeatures;
-	_physicalDevice* dev;
-	_queue* queues[numQueueFamilies];
-	int numQueues[numQueueFamilies];
-} _device;
-
-typedef struct VkSwapchain_T
-{
-	_image* images;
-	uint32_t numImages;
-	uint32_t backbufferIdx;
-	VkSurfaceKHR surface;
-} _swapchain;
-
-void clFit(VkCommandBuffer cb, ControlList* cl, uint32_t commandSize)
-{
-	if(!clHasEnoughSpace(cl, commandSize))
-	{
-		uint32_t currSize = clSize(cl);
-		cl->buffer = consecutivePoolReAllocate(&cb->cp->cpa, cl->buffer, cl->numBlocks); assert(cl->buffer);
-		cl->nextFreeByte = cl->buffer + currSize;
-	}
-}
-
-void clDump(void* cl, uint32_t size)
-{
-		struct v3d_device_info devinfo = {
-				/* While the driver supports V3D 2.1 and 2.6, we haven't split
-				 * off a 2.6 XML yet (there are a couple of fields different
-				 * in render target formatting)
-				 */
-				.ver = 21,
-		};
-		struct v3d_spec* spec = v3d_spec_load(&devinfo);
-
-		struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true);
-
-		uint32_t offset = 0, hw_offset = 0;
-		uint8_t *p = cl;
-
-		while (offset < size) {
-				struct v3d_group *inst = v3d_spec_find_instruction(spec, p);
-				uint8_t header = *p;
-				uint32_t length;
-
-				if (inst == NULL) {
-						printf("0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
-								offset, hw_offset, header, header);
-						return;
-				}
-
-				length = v3d_group_get_length(inst);
-
-				printf("0x%08x 0x%08x: 0x%02x %s\n",
-						offset, hw_offset, header, v3d_group_get_name(inst));
-
-				v3d_print_group(clif, inst, offset, p);
-
-				switch (header) {
-				case VC4_PACKET_HALT:
-				case VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF:
-						return;
-				default:
-						break;
-				}
-
-				offset += length;
-				if (header != VC4_PACKET_GEM_HANDLES)
-						hw_offset += length;
-				p += length;
-		}
-
-		clif_dump_destroy(clif);
-}
-
-//Textures in T format:
-//formed out of 4KB tiles, which have 1KB subtiles (see page 105 in VC4 arch guide)
-//1KB subtiles have 512b microtiles.
-//Width/height of the 512b microtiles is the following:
-// 64bpp: 2x4
-// 32bpp: 4x4
-// 16bpp: 8x4
-// 8bpp:  8x8
-// 4bpp:  16x8
-// 1bpp:  32x16
-//Therefore width/height of 1KB subtiles is the following:
-// 64bpp: 8x16
-// 32bpp: 16x16
-// 16bpp: 32x16
-// 8bpp:  32x32
-// 4bpp:  64x32
-// 1bpp:  128x64
-//Finally width/height of the 4KB tiles:
-// 64bpp: 16x32
-// 32bpp: 32x32
-// 16bpp: 64x32
-// 8bpp:  64x64
-// 4bpp:  128x64
-// 1bpp:  256x128
-void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight)
-{
-	assert(paddedWidth);
-	assert(paddedHeight);
-	uint32_t tileW = 0;
-	uint32_t tileH = 0;
-
-	switch(bpp)
-	{
-	case 64:
-	{
-		tileW = 16;
-		tileH = 32;
-		break;
-	}
-	case 32:
-	{
-		tileW = 32;
-		tileH = 32;
-		break;
-	}
-	case 16:
-	{
-		tileW = 64;
-		tileH = 32;
-		break;
-	}
-	case 8:
-	{
-		tileW = 64;
-		tileH = 64;
-		break;
-	}
-	case 4:
-	{
-		tileW = 128;
-		tileH = 64;
-		break;
-	}
-	case 1:
-	{
-		tileW = 256;
-		tileH = 128;
-		break;
-	}
-	default:
-	{
-		assert(0); //unsupported
-	}
-	}
-
-	*paddedWidth = ((tileW - (width % tileW)) % tileW) + width;
-	*paddedHeight = ((tileH - (height % tileH)) % tileH) + height;
-}
-
-uint32_t getFormatBpp(VkFormat f)
-{
-	switch(f)
-	{
-	case VK_FORMAT_R16G16B16A16_SFLOAT:
-		return 64;
-	case VK_FORMAT_R8G8B8_UNORM: //padded to 32
-	case VK_FORMAT_R8G8B8A8_UNORM:
-		return 32;
-		return 32;
-	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
-	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
-	case VK_FORMAT_R5G6B5_UNORM_PACK16:
-	case VK_FORMAT_R8G8_UNORM:
-	case VK_FORMAT_R16_SFLOAT:
-	case VK_FORMAT_R16_SINT:
-		return 16;
-	case VK_FORMAT_R8_UNORM:
-	case VK_FORMAT_R8_SINT:
-		return 8;
-	default:
-		assert(0);
-		return 0;
-	}
-}
-
-void createImageBO(_image* i)
-{
-	assert(i);
-	assert(i->format);
-	assert(i->width);
-	assert(i->height);
-
-	uint32_t bpp = getFormatBpp(i->format);
-	uint32_t pixelSizeBytes = bpp / 8;
-	uint32_t nonPaddedSize = i->width * i->height * pixelSizeBytes;
-	i->paddedWidth = i->width;
-	i->paddedHeight = i->height;
-
-	//need to pad to T format, as HW automatically chooses that
-	if(nonPaddedSize > 4096)
-	{
-		getPaddedTextureDimensionsT(i->width, i->height, bpp, &i->paddedWidth, &i->paddedHeight);
-	}
-
-	i->size = i->paddedWidth * i->paddedHeight * pixelSizeBytes;
-	i->stride = i->paddedWidth * pixelSizeBytes;
-	i->handle = vc4_bo_alloc(controlFd, i->size, "swapchain image"); assert(i->handle);
-
-	//set tiling to T if size > 4KB
-	if(nonPaddedSize > 4096)
-	{
-		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED); assert(ret);
-		i->tiling = VC4_TILING_FORMAT_T;
-	}
-	else
-	{
-		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_LINEAR); assert(ret);
-		i->tiling = VC4_TILING_FORMAT_LT;
-	}
-}
-
-/*static inline void util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
-{
-   ubyte r = 0;
-   ubyte g = 0;
-   ubyte b = 0;
-   ubyte a = 0;
-
-   if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) <= 8) {
-	  r = float_to_ubyte(rgba[0]);
-	  g = float_to_ubyte(rgba[1]);
-	  b = float_to_ubyte(rgba[2]);
-	  a = float_to_ubyte(rgba[3]);
-   }
-
-   switch (format) {
-   case PIPE_FORMAT_ABGR8888_UNORM:
-	  {
-		 uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | a;
-	  }
-	  return;
-   case PIPE_FORMAT_XBGR8888_UNORM:
-	  {
-		 uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | 0xff;
-	  }
-	  return;
-   case PIPE_FORMAT_BGRA8888_UNORM:
-	  {
-		 uc->ui[0] = (a << 24) | (r << 16) | (g << 8) | b;
-	  }
-	  return;
-   case PIPE_FORMAT_BGRX8888_UNORM:
-	  {
-		 uc->ui[0] = (0xffu << 24) | (r << 16) | (g << 8) | b;
-	  }
-	  return;
-   case PIPE_FORMAT_ARGB8888_UNORM:
-	  {
-		 uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | a;
-	  }
-	  return;
-   case PIPE_FORMAT_XRGB8888_UNORM:
-	  {
-		 uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | 0xff;
-	  }
-	  return;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-	  {
-		 uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
-	  }
-	  return;
-   case PIPE_FORMAT_B5G5R5X1_UNORM:
-	  {
-		 uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
-	  }
-	  return;
-   case PIPE_FORMAT_B5G5R5A1_UNORM:
-	  {
-		 uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
-	  }
-	  return;
-   case PIPE_FORMAT_B4G4R4A4_UNORM:
-	  {
-		 uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
-	  }
-	  return;
-   case PIPE_FORMAT_A8_UNORM:
-	  {
-		 uc->ub = a;
-	  }
-	  return;
-   case PIPE_FORMAT_L8_UNORM:
-   case PIPE_FORMAT_I8_UNORM:
-	  {
-		 uc->ub = r;
-	  }
-	  return;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-	  {
-		 uc->f[0] = rgba[0];
-		 uc->f[1] = rgba[1];
-		 uc->f[2] = rgba[2];
-		 uc->f[3] = rgba[3];
-	  }
-	  return;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-	  {
-		 uc->f[0] = rgba[0];
-		 uc->f[1] = rgba[1];
-		 uc->f[2] = rgba[2];
-	  }
-	  return;
-
-   default:
-	  util_format_write_4f(format, rgba, 0, uc, 0, 0, 0, 1, 1);
-   }
-}*/
-
-uint32_t packVec4IntoABGR8(const float rgba[4])
-{
-	uint8_t r, g, b, a;
-	r = rgba[0] * 255.0;
-	g = rgba[1] * 255.0;
-	b = rgba[2] * 255.0;
-	a = rgba[3] * 255.0;
-
-	uint32_t res = 0 |
-			(a << 24) |
-			(b << 16) |
-			(g << 8) |
-			(r << 0);
-
-	return res;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateInstanceExtensionProperties
- * When pLayerName parameter is NULL, only extensions provided by the Vulkan implementation or by implicitly enabled layers are returned. When pLayerName is the name of a layer,
- * the instance extensions provided by that layer are returned.
- * If pProperties is NULL, then the number of extensions properties available is returned in pPropertyCount. Otherwise, pPropertyCount must point to a variable set by the user
- * to the number of elements in the pProperties array, and on return the variable is overwritten with the number of structures actually written to pProperties.
- * If pPropertyCount is less than the number of extension properties available, at most pPropertyCount structures will be written. If pPropertyCount is smaller than the number of extensions available,
- * VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the available properties were returned.
- * Because the list of available layers may change externally between calls to vkEnumerateInstanceExtensionProperties,
- * two calls may retrieve different results if a pLayerName is available in one call but not in another. The extensions supported by a layer may also change between two calls,
- * e.g. if the layer implementation is replaced by a different version between those calls.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties(
-		const char*                                 pLayerName,
-		uint32_t*                                   pPropertyCount,
-		VkExtensionProperties*                      pProperties)
-{
-	assert(!pLayerName); //TODO layers ignored for now
-	assert(pPropertyCount);
-
-	if(!pProperties)
-	{
-		*pPropertyCount = numInstanceExtensions;
-		return VK_INCOMPLETE;
-	}
-
-	int arraySize = *pPropertyCount;
-	int elementsWritten = min(numInstanceExtensions, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pProperties[c] = instanceExtensions[c];
-	}
-
-	*pPropertyCount = elementsWritten;
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateInstance
- * There is no global state in Vulkan and all per-application state is stored in a VkInstance object. Creating a VkInstance object initializes the Vulkan library
- * vkCreateInstance verifies that the requested layers exist. If not, vkCreateInstance will return VK_ERROR_LAYER_NOT_PRESENT. Next vkCreateInstance verifies that
- * the requested extensions are supported (e.g. in the implementation or in any enabled instance layer) and if any requested extension is not supported,
- * vkCreateInstance must return VK_ERROR_EXTENSION_NOT_PRESENT. After verifying and enabling the instance layers and extensions the VkInstance object is
- * created and returned to the application.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance(
-		const VkInstanceCreateInfo*                 pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkInstance*                                 pInstance)
-{
-	assert(pInstance);
-	assert(pCreateInfo);
-
-	*pInstance = malloc(sizeof(_instance));
-
-	if(!*pInstance)
-	{
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
-	}
-
-	(*pInstance)->numEnabledExtensions = 0;
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//TODO: possibly we need to load layers here
-	//and store them in pInstance
-	assert(pCreateInfo->enabledLayerCount == 0);
-
-	if(pCreateInfo->enabledExtensionCount)
-	{
-		assert(pCreateInfo->ppEnabledExtensionNames);
-	}
-
-	for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c)
-	{
-		int findres = findInstanceExtension(pCreateInfo->ppEnabledExtensionNames[c]);
-		if(findres > -1)
-		{
-			(*pInstance)->enabledExtensions[(*pInstance)->numEnabledExtensions] = findres;
-			(*pInstance)->numEnabledExtensions++;
-		}
-		else
-		{
-			return VK_ERROR_EXTENSION_NOT_PRESENT;
-		}
-	}
-
-	//TODO ignored for now
-	//pCreateInfo->pApplicationInfo
-
-	int ret = openIoctl(); assert(!ret);
-
-	(*pInstance)->chipVersion = vc4_get_chip_info(controlFd);
-	(*pInstance)->hasTiling = vc4_test_tiling(controlFd);
-
-	(*pInstance)->hasControlFlow = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_BRANCHES);
-	(*pInstance)->hasEtc1 = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_ETC1);
-	(*pInstance)->hasThreadedFs = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_THREADED_FS);
-	(*pInstance)->hasMadvise = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_MADVISE);
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#devsandqueues-physical-device-enumeration
- * If pPhysicalDevices is NULL, then the number of physical devices available is returned in pPhysicalDeviceCount. Otherwise, pPhysicalDeviceCount must point to a
- * variable set by the user to the number of elements in the pPhysicalDevices array, and on return the variable is overwritten with the number of handles actually
- * written to pPhysicalDevices. If pPhysicalDeviceCount is less than the number of physical devices available, at most pPhysicalDeviceCount structures will be written.
- * If pPhysicalDeviceCount is smaller than the number of physical devices available, VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the
- * available physical devices were returned.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices(
-		VkInstance                                  instance,
-		uint32_t*                                   pPhysicalDeviceCount,
-		VkPhysicalDevice*                           pPhysicalDevices)
-{
-	assert(instance);
-
-	//TODO is there a way to check if there's a gpu (and it's the rPi)?
-	int gpuExists = access( "/dev/dri/card0", F_OK ) != -1;
-
-	int numGPUs = gpuExists;
-
-	assert(pPhysicalDeviceCount);
-
-	if(!pPhysicalDevices)
-	{
-		*pPhysicalDeviceCount = numGPUs;
-		return VK_SUCCESS;
-	}
-
-	int arraySize = *pPhysicalDeviceCount;
-	int elementsWritten = min(numGPUs, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pPhysicalDevices[c] = &instance->dev;
-	}
-
-	*pPhysicalDeviceCount = elementsWritten;
-
-	if(elementsWritten < arraySize)
-	{
-		return VK_INCOMPLETE;
-	}
-	else
-	{
-		return VK_SUCCESS;
-	}
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceProperties
- */
-VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties(
-		VkPhysicalDevice                            physicalDevice,
-		VkPhysicalDeviceProperties*                 pProperties)
-{
-	assert(physicalDevice);
-	assert(pProperties);
-
-	VkPhysicalDeviceSparseProperties sparseProps =
-	{
-		.residencyStandard2DBlockShape = 1,
-		.residencyStandard2DMultisampleBlockShape = 1,
-		.residencyStandard3DBlockShape = 1,
-		.residencyAlignedMipSize = 1,
-		.residencyNonResidentStrict = 1
-	};
-
-	pProperties->apiVersion = VK_MAKE_VERSION(1,1,0);
-	pProperties->driverVersion = 1; //we'll simply call this v1
-	pProperties->vendorID = 0x14E4; //Broadcom
-	pProperties->deviceID = 0; //TODO dunno?
-	pProperties->deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
-	strcpy(pProperties->deviceName, "VideoCore IV HW");
-	//pProperties->pipelineCacheUUID
-	pProperties->limits = _limits;
-	pProperties->sparseProperties = sparseProps;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceFeatures
- */
-VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures(
-		VkPhysicalDevice                            physicalDevice,
-		VkPhysicalDeviceFeatures*                   pFeatures)
-{
-	assert(physicalDevice);
-	assert(pFeatures);
-
-	*pFeatures = _features;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateDeviceExtensionProperties
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties(
-		VkPhysicalDevice                            physicalDevice,
-		const char*                                 pLayerName,
-		uint32_t*                                   pPropertyCount,
-		VkExtensionProperties*                      pProperties)
-{
-	assert(physicalDevice);
-	assert(!pLayerName); //layers ignored for now
-	assert(pPropertyCount);
-
-	if(!pProperties)
-	{
-		*pPropertyCount = numDeviceExtensions;
-		return VK_INCOMPLETE;
-	}
-
-	int arraySize = *pPropertyCount;
-	int elementsWritten = min(numDeviceExtensions, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pProperties[c] = deviceExtensions[c];
-	}
-
-	*pPropertyCount = elementsWritten;
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceQueueFamilyProperties
- * If pQueueFamilyProperties is NULL, then the number of queue families available is returned in pQueueFamilyPropertyCount.
- * Otherwise, pQueueFamilyPropertyCount must point to a variable set by the user to the number of elements in the pQueueFamilyProperties array,
- * and on return the variable is overwritten with the number of structures actually written to pQueueFamilyProperties. If pQueueFamilyPropertyCount
- * is less than the number of queue families available, at most pQueueFamilyPropertyCount structures will be written.
- */
-VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties(
-		VkPhysicalDevice                            physicalDevice,
-		uint32_t*                                   pQueueFamilyPropertyCount,
-		VkQueueFamilyProperties*                    pQueueFamilyProperties)
-{
-	assert(physicalDevice);
-	assert(pQueueFamilyPropertyCount);
-
-	if(!pQueueFamilyProperties)
-	{
-		*pQueueFamilyPropertyCount = 1;
-		return;
-	}
-
-	int arraySize = *pQueueFamilyPropertyCount;
-	int elementsWritten = min(numQueueFamilies, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pQueueFamilyProperties[c] = _queueFamilyProperties[c];
-	}
-
-	*pQueueFamilyPropertyCount = elementsWritten;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceSupportKHR
- * does this queue family support presentation to this surface?
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR(
-		VkPhysicalDevice                            physicalDevice,
-		uint32_t                                    queueFamilyIndex,
-		VkSurfaceKHR                                surface,
-		VkBool32*                                   pSupported)
-{
-	assert(pSupported);
-	assert(surface);
-	assert(physicalDevice);
-
-	assert(queueFamilyIndex < numQueueFamilies);
-
-	//TODO if we plan to support headless rendering, there should be 2 families
-	//one using /dev/dri/card0 which has modesetting
-	//other using /dev/dri/renderD128 which does not support modesetting, this would say false here
-	*pSupported = VK_TRUE;
-	return VK_SUCCESS;
-}
-
-/*
- * Implementation of our RPI specific "extension"
- */
-VkResult vkCreateRpiSurfaceKHR(
-		VkInstance                                  instance,
-		const VkRpiSurfaceCreateInfoKHR*            pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkSurfaceKHR*                               pSurface)
-{
-	assert(instance);
-	//assert(pCreateInfo); //ignored for now
-	assert(pSurface);
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	*pSurface = (VkSurfaceKHR)modeset_create(controlFd);
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySurfaceKHR
- * Destroying a VkSurfaceKHR merely severs the connection between Vulkan and the native surface,
- * and does not imply destroying the native surface, closing a window, or similar behavior
- * (but we'll do so anyways...)
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR(
-		VkInstance                                  instance,
-		VkSurfaceKHR                                surface,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(instance);
-	assert(surface);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	modeset_destroy(controlFd, (modeset_dev*)surface);
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateDevice
- * vkCreateDevice verifies that extensions and features requested in the ppEnabledExtensionNames and pEnabledFeatures
- * members of pCreateInfo, respectively, are supported by the implementation. If any requested extension is not supported,
- * vkCreateDevice must return VK_ERROR_EXTENSION_NOT_PRESENT. If any requested feature is not supported, vkCreateDevice must return
- * VK_ERROR_FEATURE_NOT_PRESENT. Support for extensions can be checked before creating a device by querying vkEnumerateDeviceExtensionProperties
- * After verifying and enabling the extensions the VkDevice object is created and returned to the application.
- * If a requested extension is only supported by a layer, both the layer and the extension need to be specified at vkCreateInstance
- * time for the creation to succeed. Multiple logical devices can be created from the same physical device. Logical device creation may
- * fail due to lack of device-specific resources (in addition to the other errors). If that occurs, vkCreateDevice will return VK_ERROR_TOO_MANY_OBJECTS.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice(
-		VkPhysicalDevice                            physicalDevice,
-		const VkDeviceCreateInfo*                   pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkDevice*                                   pDevice)
-{
-	assert(physicalDevice);
-	assert(pDevice);
-	assert(pCreateInfo);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	*pDevice = malloc(sizeof(_device));
-	if(!pDevice)
-	{
-		return VK_ERROR_TOO_MANY_OBJECTS;
-	}
-
-	(*pDevice)->dev = physicalDevice;
-
-	for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c)
-	{
-		int findres = findDeviceExtension(pCreateInfo->ppEnabledExtensionNames[c]);
-		if(findres > -1)
-		{
-			(*pDevice)->enabledExtensions[(*pDevice)->numEnabledExtensions] = findres;
-			(*pDevice)->numEnabledExtensions++;
-		}
-		else
-		{
-			return VK_ERROR_EXTENSION_NOT_PRESENT;
-		}
-	}
-
-	VkBool32* requestedFeatures = pCreateInfo->pEnabledFeatures;
-	VkBool32* supportedFeatures = &_features;
-
-	if(requestedFeatures)
-	{
-		for(int c = 0; c < numFeatures; ++c)
-		{
-			if(requestedFeatures[c] && !supportedFeatures[c])
-			{
-				return VK_ERROR_FEATURE_NOT_PRESENT;
-			}
-		}
-
-		(*pDevice)->enabledFeatures = *pCreateInfo->pEnabledFeatures;
-	}
-	else
-	{
-		memset(&(*pDevice)->enabledFeatures, 0, sizeof((*pDevice)->enabledFeatures)); //just disable everything
-	}
-
-	//layers ignored per spec
-	//pCreateInfo->enabledLayerCount
-
-	for(int c = 0; c < numQueueFamilies; ++c)
-	{
-		(*pDevice)->queues[c] = 0;
-	}
-
-	if(pCreateInfo->queueCreateInfoCount > 0)
-	{
-		for(int c = 0; c < pCreateInfo->queueCreateInfoCount; ++c)
-		{
-			(*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = malloc(sizeof(_queue)*pCreateInfo->pQueueCreateInfos[c].queueCount);
-
-			if(!(*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex])
-			{
-				return VK_ERROR_OUT_OF_HOST_MEMORY;
-			}
-
-			for(int d = 0; d < pCreateInfo->pQueueCreateInfos[c].queueCount; ++d)
-			{
-				(*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex][d].lastEmitSeqno = 0;
-			}
-
-			(*pDevice)->numQueues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = pCreateInfo->pQueueCreateInfos[c].queueCount;
-		}
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetDeviceQueue
- * vkGetDeviceQueue must only be used to get queues that were created with the flags parameter of VkDeviceQueueCreateInfo set to zero.
- * To get queues that were created with a non-zero flags parameter use vkGetDeviceQueue2.
- */
-VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue(
-		VkDevice                                    device,
-		uint32_t                                    queueFamilyIndex,
-		uint32_t                                    queueIndex,
-		VkQueue*                                    pQueue)
-{
-	assert(device);
-	assert(pQueue);
-
-	assert(queueFamilyIndex < numQueueFamilies);
-	assert(queueIndex < device->numQueues[queueFamilyIndex]);
-
-	*pQueue = &device->queues[queueFamilyIndex][queueIndex];
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSemaphore
- * Semaphores are a synchronization primitive that can be used to insert a dependency between batches submitted to queues.
- * Semaphores have two states - signaled and unsignaled. The state of a semaphore can be signaled after execution of a batch of commands is completed.
- * A batch can wait for a semaphore to become signaled before it begins execution, and the semaphore is also unsignaled before the batch begins execution.
- * As with most objects in Vulkan, semaphores are an interface to internal data which is typically opaque to applications.
- * This internal data is referred to as a semaphore’s payload. However, in order to enable communication with agents outside of the current device,
- * it is necessary to be able to export that payload to a commonly understood format, and subsequently import from that format as well.
- * The internal data of a semaphore may include a reference to any resources and pending work associated with signal or unsignal operations performed on that semaphore object.
- * Mechanisms to import and export that internal data to and from semaphores are provided below.
- * These mechanisms indirectly enable applications to share semaphore state between two or more semaphores and other synchronization primitives across process and API boundaries.
- * When created, the semaphore is in the unsignaled state.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore(
-		VkDevice                                    device,
-		const VkSemaphoreCreateInfo*                pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkSemaphore*                                pSemaphore)
-{
-	assert(device);
-	assert(pSemaphore);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//we'll probably just use an IOCTL to wait for a GPU sequence number to complete.
-	sem_t* s = malloc(sizeof(sem_t));
-	if(!s)
-	{
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
-	}
-	sem_init(s, 0, 0); //create semaphore unsignalled, shared between threads
-
-	*pSemaphore = (VkSemaphore)s;
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceCapabilitiesKHR
- * The capabilities of a swapchain targetting a surface are the intersection of the capabilities of the WSI platform,
- * the native window or display, and the physical device. The resulting capabilities can be obtained with the queries listed
- * below in this section. Capabilities that correspond to image creation parameters are not independent of each other:
- * combinations of parameters that are not supported as reported by vkGetPhysicalDeviceImageFormatProperties are not supported
- * by the surface on that physical device, even if the capabilities taken individually are supported as part of some other parameter combinations.
- *
- * capabilities the specified device supports for a swapchain created for the surface
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR(
-		VkPhysicalDevice                            physicalDevice,
-		VkSurfaceKHR                                surface,
-		VkSurfaceCapabilitiesKHR*                   pSurfaceCapabilities)
-{
-	assert(physicalDevice);
-	assert(surface);
-	assert(pSurfaceCapabilities);
-
-	pSurfaceCapabilities->minImageCount = 1; //min 1
-	pSurfaceCapabilities->maxImageCount = 2; //TODO max 2 for double buffering for now...
-	pSurfaceCapabilities->currentExtent.width = ((modeset_dev*)surface)->width;
-	pSurfaceCapabilities->currentExtent.height = ((modeset_dev*)surface)->height;
-	pSurfaceCapabilities->minImageExtent.width = ((modeset_dev*)surface)->width; //TODO
-	pSurfaceCapabilities->minImageExtent.height = ((modeset_dev*)surface)->height; //TODO
-	pSurfaceCapabilities->maxImageExtent.width = ((modeset_dev*)surface)->width; //TODO
-	pSurfaceCapabilities->maxImageExtent.height = ((modeset_dev*)surface)->height; //TODO
-	pSurfaceCapabilities->maxImageArrayLayers = 1; //TODO maybe more layers for cursor etc.
-	pSurfaceCapabilities->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO no rotation for now
-	pSurfaceCapabilities->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO get this from dev
-	pSurfaceCapabilities->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; //TODO no alpha compositing for now
-	pSurfaceCapabilities->supportedUsageFlags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; //well we want to draw on the screen right
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceFormatsKHR
- * If pSurfaceFormats is NULL, then the number of format pairs supported for the given surface is returned in pSurfaceFormatCount.
- * The number of format pairs supported will be greater than or equal to 1. Otherwise, pSurfaceFormatCount must point to a variable
- * set by the user to the number of elements in the pSurfaceFormats array, and on return the variable is overwritten with the number
- * of structures actually written to pSurfaceFormats. If the value of pSurfaceFormatCount is less than the number of format pairs supported,
- * at most pSurfaceFormatCount structures will be written. If pSurfaceFormatCount is smaller than the number of format pairs supported for the given surface,
- * VK_INCOMPLETE will be returned instead of VK_SUCCESS to indicate that not all the available values were returned.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR(
-		VkPhysicalDevice                            physicalDevice,
-		VkSurfaceKHR                                surface,
-		uint32_t*                                   pSurfaceFormatCount,
-		VkSurfaceFormatKHR*                         pSurfaceFormats)
-{
-	assert(physicalDevice);
-	assert(surface);
-	assert(pSurfaceFormatCount);
-
-	const int numFormats = 1;
-
-	if(!pSurfaceFormats)
-	{
-		*pSurfaceFormatCount = numFormats;
-		return VK_SUCCESS;
-	}
-
-	int arraySize = *pSurfaceFormatCount;
-	int elementsWritten = min(numFormats, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pSurfaceFormats[c] = supportedSurfaceFormats[c];
-	}
-
-	*pSurfaceFormatCount = elementsWritten;
-
-	if(elementsWritten < numFormats)
-	{
-		return VK_INCOMPLETE;
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfacePresentModesKHR
- * If pPresentModes is NULL, then the number of presentation modes supported for the given surface is returned in pPresentModeCount.
- * Otherwise, pPresentModeCount must point to a variable set by the user to the number of elements in the pPresentModes array,
- * and on return the variable is overwritten with the number of values actually written to pPresentModes.
- * If the value of pPresentModeCount is less than the number of presentation modes supported, at most pPresentModeCount values will be written.
- * If pPresentModeCount is smaller than the number of presentation modes supported for the given surface, VK_INCOMPLETE will be returned instead of
- * VK_SUCCESS to indicate that not all the available values were returned.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR(
-		VkPhysicalDevice                            physicalDevice,
-		VkSurfaceKHR                                surface,
-		uint32_t*                                   pPresentModeCount,
-		VkPresentModeKHR*                           pPresentModes)
-{
-	assert(physicalDevice);
-	assert(surface);
-	assert(pPresentModeCount);
-
-	const int numModes = 1;
-
-	if(!pPresentModes)
-	{
-		*pPresentModeCount = numModes;
-		return VK_SUCCESS;
-	}
-
-	int arraySize = *pPresentModeCount;
-	int elementsWritten = min(numModes, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		//TODO
-		pPresentModes[c] = VK_PRESENT_MODE_FIFO_KHR;
-	}
-
-	*pPresentModeCount = elementsWritten;
-
-	if(elementsWritten < numModes)
-	{
-		return VK_INCOMPLETE;
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSwapchainKHR
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR(
-		VkDevice                                    device,
-		const VkSwapchainCreateInfoKHR*             pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkSwapchainKHR*                             pSwapchain)
-{
-	assert(device);
-	assert(pCreateInfo);
-	assert(pSwapchain);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	*pSwapchain = malloc(sizeof(_swapchain));
-	if(!*pSwapchain)
-	{
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
-	}
-
-	_swapchain* s = *pSwapchain;
-
-	//TODO flags, layers, queue sharing, pretransform, composite alpha, present mode..., clipped, oldswapchain
-	//TODO external sync on surface, oldswapchain
-
-	s->images = malloc(sizeof(_image) * pCreateInfo->minImageCount);
-	if(!s->images)
-	{
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
-	}
-
-	s->backbufferIdx = 0;
-	s->numImages = pCreateInfo->minImageCount;
-	s->surface = pCreateInfo->surface;
-
-	for(int c = 0; c < pCreateInfo->minImageCount; ++c)
-	{
-		s->images[c].width = pCreateInfo->imageExtent.width;
-		s->images[c].height = pCreateInfo->imageExtent.height;
-		s->images[c].depth = 1;
-		s->images[c].layers = pCreateInfo->imageArrayLayers;
-		s->images[c].miplevels = 1;
-		s->images[c].samples = 1; //TODO
-		s->images[c].usageBits = pCreateInfo->imageUsage;
-		s->images[c].format = pCreateInfo->imageFormat;
-		s->images[c].imageSpace = pCreateInfo->imageColorSpace;
-		s->images[c].concurrentAccess = pCreateInfo->imageSharingMode;
-		s->images[c].numQueueFamiliesWithAccess = pCreateInfo->queueFamilyIndexCount;
-		if(s->images[c].concurrentAccess)
-		{
-			s->images[c].queueFamiliesWithAccess = malloc(sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess);
-			memcpy(s->images[c].queueFamiliesWithAccess, pCreateInfo->pQueueFamilyIndices, sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess);
-		}
-		s->images[c].preTransformMode = pCreateInfo->preTransform;
-		s->images[c].compositeAlpha = pCreateInfo->compositeAlpha;
-		s->images[c].presentMode = pCreateInfo->presentMode;
-		s->images[c].clipped = pCreateInfo->clipped;
-
-		createImageBO(&s->images[c]);
-		int res = modeset_create_fb(controlFd, &s->images[c]); assert(res == 0);
-	}
-
-	//defer to first swapbuffer (or at least later, getting swapchain != presenting immediately)
-	//int res = modeset_fb_for_dev(controlFd, s->surface, &s->images[s->backbufferIdx]); assert(res == 0);
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetSwapchainImagesKHR
- * If pSwapchainImages is NULL, then the number of presentable images for swapchain is returned in pSwapchainImageCount.
- * Otherwise, pSwapchainImageCount must point to a variable set by the user to the number of elements in the pSwapchainImages array,
- * and on return the variable is overwritten with the number of structures actually written to pSwapchainImages.
- * If the value of pSwapchainImageCount is less than the number of presentable images for swapchain, at most pSwapchainImageCount structures will be written.
- * If pSwapchainImageCount is smaller than the number of presentable images for swapchain, VK_INCOMPLETE will be returned instead of VK_SUCCESS to
- * indicate that not all the available values were returned.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR(
-		VkDevice                                    device,
-		VkSwapchainKHR                              swapchain,
-		uint32_t*                                   pSwapchainImageCount,
-		VkImage*                                    pSwapchainImages)
-{
-	assert(device);
-	assert(swapchain);
-	assert(pSwapchainImageCount);
-
-	_swapchain* s = swapchain;
-
-	if(!pSwapchainImages)
-	{
-		*pSwapchainImageCount = s->numImages;
-		return VK_SUCCESS;
-	}
-
-	int arraySize = *pSwapchainImageCount;
-	int elementsWritten = min(s->numImages, arraySize);
-
-	for(int c = 0; c < elementsWritten; ++c)
-	{
-		pSwapchainImages[c] = &s->images[c];
-	}
-
-	*pSwapchainImageCount = elementsWritten;
-
-	if(elementsWritten < s->numImages)
-	{
-		return VK_INCOMPLETE;
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffers-pools
- * Command pools are opaque objects that command buffer memory is allocated from, and which allow the implementation to amortize the
- * cost of resource creation across multiple command buffers. Command pools are externally synchronized, meaning that a command pool must
- * not be used concurrently in multiple threads. That includes use via recording commands on any command buffers allocated from the pool,
- * as well as operations that allocate, free, and reset command buffers or the pool itself.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool(
-		VkDevice                                    device,
-		const VkCommandPoolCreateInfo*              pCreateInfo,
-		const VkAllocationCallbacks*                pAllocator,
-		VkCommandPool*                              pCommandPool)
-{
-	assert(device);
-	assert(pCreateInfo);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//VK_COMMAND_POOL_CREATE_TRANSIENT_BIT
-	//specifies that command buffers allocated from the pool will be short-lived, meaning that they will be reset or freed in a relatively short timeframe.
-	//This flag may be used by the implementation to control memory allocation behavior within the pool.
-	//--> definitely use pool allocator
-
-	//VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT
-	//allows any command buffer allocated from a pool to be individually reset to the initial state; either by calling vkResetCommandBuffer, or via the implicit reset when calling vkBeginCommandBuffer.
-	//If this flag is not set on a pool, then vkResetCommandBuffer must not be called for any command buffer allocated from that pool.
-
-	//TODO pool family ignored for now
-
-	_commandPool* cp = malloc(sizeof(_commandPool));
-
-	if(!cp)
-	{
-		return VK_ERROR_OUT_OF_HOST_MEMORY;
-	}
-
-	cp->queueFamilyIndex = pCreateInfo->queueFamilyIndex;
-
-	//initial number of command buffers to hold
-	int numCommandBufs = 100;
-	int controlListSize = ARM_PAGE_SIZE * 100;
-
-	//if(pCreateInfo->flags & VK_COMMAND_POOL_CREATE_TRANSIENT_BIT)
-	{
-		//use pool allocator
-		void* pamem = malloc(numCommandBufs * sizeof(_commandBuffer));
-		if(!pamem)
-		{
-			return VK_ERROR_OUT_OF_HOST_MEMORY;
-		}
-		cp->pa = createPoolAllocator(pamem, sizeof(_commandBuffer), numCommandBufs * sizeof(_commandBuffer));
-
-		void* cpamem = malloc(controlListSize);
-		if(!cpamem)
-		{
-			return VK_ERROR_OUT_OF_HOST_MEMORY;
-		}
-		cp->cpa = createConsecutivePoolAllocator(cpamem, ARM_PAGE_SIZE, controlListSize);
-	}
-
-	*pCommandPool = (VkCommandPool)cp;
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffer-allocation
- * vkAllocateCommandBuffers can be used to create multiple command buffers. If the creation of any of those command buffers fails,
- * the implementation must destroy all successfully created command buffer objects from this command, set all entries of the pCommandBuffers array to NULL and return the error.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers(
-		VkDevice                                    device,
-		const VkCommandBufferAllocateInfo*          pAllocateInfo,
-		VkCommandBuffer*                            pCommandBuffers)
-{
-	assert(device);
-	assert(pAllocateInfo);
-	assert(pCommandBuffers);
-
-	VkResult res = VK_SUCCESS;
-
-	_commandPool* cp = (_commandPool*)pAllocateInfo->commandPool;
-
-	//if(cp->usePoolAllocator)
-	{
-		for(int c = 0; c < pAllocateInfo->commandBufferCount; ++c)
-		{
-			pCommandBuffers[c] = poolAllocate(&cp->pa);
-
-			if(!pCommandBuffers[c])
-			{
-				res = VK_ERROR_OUT_OF_HOST_MEMORY;
-				break;
-			}
-
-			pCommandBuffers[c]->shaderRecCount = 0;
-			pCommandBuffers[c]->usageFlags = 0;
-			pCommandBuffers[c]->state = CMDBUF_STATE_INITIAL;
-			pCommandBuffers[c]->cp = cp;
-			clInit(&pCommandBuffers[c]->binCl, consecutivePoolAllocate(&cp->cpa, 1));
-			clInit(&pCommandBuffers[c]->handlesCl, consecutivePoolAllocate(&cp->cpa, 1));
-			clInit(&pCommandBuffers[c]->shaderRecCl, consecutivePoolAllocate(&cp->cpa, 1));
-			clInit(&pCommandBuffers[c]->uniformsCl, consecutivePoolAllocate(&cp->cpa, 1));
-
-			if(!pCommandBuffers[c]->binCl.buffer)
-			{
-				res = VK_ERROR_OUT_OF_HOST_MEMORY;
-				break;
-			}
-
-			if(!pCommandBuffers[c]->handlesCl.buffer)
-			{
-				res = VK_ERROR_OUT_OF_HOST_MEMORY;
-				break;
-			}
-
-			if(!pCommandBuffers[c]->shaderRecCl.buffer)
-			{
-				res = VK_ERROR_OUT_OF_HOST_MEMORY;
-				break;
-			}
-
-			if(!pCommandBuffers[c]->uniformsCl.buffer)
-			{
-				res = VK_ERROR_OUT_OF_HOST_MEMORY;
-				break;
-			}
-		}
-	}
-
-	if(res != VK_SUCCESS)
-	{
-		//if(cp->usePoolAllocator)
-		{
-			for(int c = 0; c < pAllocateInfo->commandBufferCount; ++c)
-			{
-				consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->binCl, pCommandBuffers[c]->binCl.numBlocks);
-				consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->handlesCl, pCommandBuffers[c]->binCl.numBlocks);
-				consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->shaderRecCl, pCommandBuffers[c]->binCl.numBlocks);
-				consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->uniformsCl, pCommandBuffers[c]->binCl.numBlocks);
-				poolFree(&cp->pa, pCommandBuffers[c]);
-				pCommandBuffers[c] = 0;
-			}
-		}
-	}
-
-	return res;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkBeginCommandBuffer
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer(
-		VkCommandBuffer                             commandBuffer,
-		const VkCommandBufferBeginInfo*             pBeginInfo)
-{
-	assert(commandBuffer);
-	assert(pBeginInfo);
-
-	//TODO
-
-	//VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
-	//specifies that each recording of the command buffer will only be submitted once, and the command buffer will be reset and recorded again between each submission.
-
-	//VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT
-	//specifies that a secondary command buffer is considered to be entirely inside a render pass. If this is a primary command buffer, then this bit is ignored
-
-	//VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT
-	//specifies that a command buffer can be resubmitted to a queue while it is in the pending state, and recorded into multiple primary command buffers
-
-	//When a command buffer begins recording, all state in that command buffer is undefined
-
-	struct drm_vc4_submit_cl submitCl =
-	{
-		.color_read.hindex = ~0,
-		.zs_read.hindex = ~0,
-		.color_write.hindex = ~0,
-		.msaa_color_write.hindex = ~0,
-		.zs_write.hindex = ~0,
-		.msaa_zs_write.hindex = ~0,
-	};
-
-	commandBuffer->usageFlags = pBeginInfo->flags;
-	commandBuffer->shaderRecCount = 0;
-	commandBuffer->state = CMDBUF_STATE_RECORDING;
-	commandBuffer->submitCl = submitCl;
-
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdPipelineBarrier
- * vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass.
- * When vkCmdPipelineBarrier is submitted to a queue, it defines a memory dependency between commands that were submitted before it, and those submitted after it.
- * If vkCmdPipelineBarrier was recorded outside a render pass instance, the first synchronization scope includes all commands that occur earlier in submission order.
- * If vkCmdPipelineBarrier was recorded inside a render pass instance, the first synchronization scope includes only commands that occur earlier in submission order within the same subpass.
- * In either case, the first synchronization scope is limited to operations on the pipeline stages determined by the source stage mask specified by srcStageMask.
- *
- * If vkCmdPipelineBarrier was recorded outside a render pass instance, the second synchronization scope includes all commands that occur later in submission order.
- * If vkCmdPipelineBarrier was recorded inside a render pass instance, the second synchronization scope includes only commands that occur later in submission order within the same subpass.
- * In either case, the second synchronization scope is limited to operations on the pipeline stages determined by the destination stage mask specified by dstStageMask.
- *
- * The first access scope is limited to access in the pipeline stages determined by the source stage mask specified by srcStageMask.
- * Within that, the first access scope only includes the first access scopes defined by elements of the pMemoryBarriers,
- * pBufferMemoryBarriers and pImageMemoryBarriers arrays, which each define a set of memory barriers. If no memory barriers are specified,
- * then the first access scope includes no accesses.
- *
- * The second access scope is limited to access in the pipeline stages determined by the destination stage mask specified by dstStageMask.
- * Within that, the second access scope only includes the second access scopes defined by elements of the pMemoryBarriers, pBufferMemoryBarriers and pImageMemoryBarriers arrays,
- * which each define a set of memory barriers. If no memory barriers are specified, then the second access scope includes no accesses.
- *
- * If dependencyFlags includes VK_DEPENDENCY_BY_REGION_BIT, then any dependency between framebuffer-space pipeline stages is framebuffer-local - otherwise it is framebuffer-global.
- */
-VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier(
-		VkCommandBuffer                             commandBuffer,
-		VkPipelineStageFlags                        srcStageMask,
-		VkPipelineStageFlags                        dstStageMask,
-		VkDependencyFlags                           dependencyFlags,
-		uint32_t                                    memoryBarrierCount,
-		const VkMemoryBarrier*                      pMemoryBarriers,
-		uint32_t                                    bufferMemoryBarrierCount,
-		const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
-		uint32_t                                    imageMemoryBarrierCount,
-		const VkImageMemoryBarrier*                 pImageMemoryBarriers)
-{
-	assert(commandBuffer);
-
-	//TODO pipeline stage flags
-	//VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
-	//VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT
-	//VK_PIPELINE_STAGE_VERTEX_INPUT_BIT
-	//VK_PIPELINE_STAGE_VERTEX_SHADER_BIT
-	//VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT
-	//VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT
-	//VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT
-	//VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT
-	//VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT
-	//VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
-	//VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
-	//VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
-	//VK_PIPELINE_STAGE_TRANSFER_BIT
-	//VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
-	//VK_PIPELINE_STAGE_HOST_BIT
-	//VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT
-	//VK_PIPELINE_STAGE_ALL_COMMANDS_BIT
-
-	//TODO dependency flags
-	//VK_DEPENDENCY_BY_REGION_BIT,
-	//VK_DEPENDENCY_DEVICE_GROUP_BIT,
-	//VK_DEPENDENCY_VIEW_LOCAL_BIT
-
-	//TODO access flags
-	//VK_ACCESS_INDIRECT_COMMAND_READ_BIT
-	//VK_ACCESS_INDEX_READ_BIT
-	//VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT
-	//VK_ACCESS_UNIFORM_READ_BIT
-	//VK_ACCESS_INPUT_ATTACHMENT_READ_BIT
-	//VK_ACCESS_SHADER_READ_BIT
-	//VK_ACCESS_SHADER_WRITE_BIT
-	//VK_ACCESS_COLOR_ATTACHMENT_READ_BIT
-	//VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
-	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT
-	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT
-	//VK_ACCESS_TRANSFER_READ_BIT
-	//VK_ACCESS_TRANSFER_WRITE_BIT
-	//VK_ACCESS_HOST_READ_BIT
-	//VK_ACCESS_HOST_WRITE_BIT
-	//VK_ACCESS_MEMORY_READ_BIT
-	//VK_ACCESS_MEMORY_WRITE_BIT
-	//VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX
-	//VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX
-
-	//TODO Layout transition flags
-	//VK_IMAGE_LAYOUT_UNDEFINED
-	//VK_IMAGE_LAYOUT_GENERAL
-	//VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
-	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
-	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
-	//VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
-	//VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
-	//VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
-	//VK_IMAGE_LAYOUT_PREINITIALIZED
-	//VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
-	//VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
-	//VK_IMAGE_LAYOUT_PRESENT_SRC_KHR
-	//VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR
-
-	for(int c = 0; c < memoryBarrierCount; ++c)
-	{
-		//TODO
-	}
-
-	for(int c = 0; c < bufferMemoryBarrierCount; ++c)
-	{
-		//TODO
-	}
-
-	for(int c = 0; c < imageMemoryBarrierCount; ++c)
-	{
-		_image* i = pImageMemoryBarriers[c].image;
-
-		assert(i->layout == pImageMemoryBarriers[c].oldLayout || i->layout == VK_IMAGE_LAYOUT_UNDEFINED);
-
-		if(srcStageMask & VK_PIPELINE_STAGE_TRANSFER_BIT &&
-		   pImageMemoryBarriers[c].srcAccessMask & VK_ACCESS_TRANSFER_WRITE_BIT &&
-		   i->needToClear)
-		{
-			//insert CRs to clear the image
-
-			assert(i->layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-			clFit(commandBuffer, &commandBuffer->binCl, V3D21_TILE_BINNING_MODE_CONFIGURATION_length);
-			clInsertTileBinningModeConfiguration(&commandBuffer->binCl,
-												 0, 0, 0, 0,
-												 getFormatBpp(i->format) == 64, //64 bit color mode
-												 i->samples > 1, //msaa
-												 i->width, i->height, 0, 0, 0);
-
-			//START_TILE_BINNING resets the statechange counters in the hardware,
-			//which are what is used when a primitive is binned to a tile to
-			//figure out what new state packets need to be written to that tile's
-			//command list.
-			clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length);
-			clInsertStartTileBinning(&commandBuffer->binCl);
-
-			//Reset the current compressed primitives format.  This gets modified
-			//by VC4_PACKET_GL_INDEXED_PRIMITIVE and
-			//VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
-			//of every tile.
-			clFit(commandBuffer, &commandBuffer->binCl, V3D21_PRIMITIVE_LIST_FORMAT_length);
-			clInsertPrimitiveListFormat(&commandBuffer->binCl,
-										1, //16 bit
-										2); //tris
-
-			clFit(commandBuffer, &commandBuffer->handlesCl, 4);
-			uint32_t idx = clGetHandleIndex(&commandBuffer->handlesCl, i->handle);
-			commandBuffer->submitCl.color_write.hindex = idx;
-			commandBuffer->submitCl.color_write.offset = 0;
-			commandBuffer->submitCl.color_write.flags = 0;
-			//TODO format
-			commandBuffer->submitCl.color_write.bits =
-					VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_RGBA8888, VC4_RENDER_CONFIG_FORMAT) |
-					VC4_SET_FIELD(i->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
-
-			commandBuffer->submitCl.clear_color[0] = i->clearColor[0];
-			commandBuffer->submitCl.clear_color[1] = i->clearColor[1];
-
-			//TODO ranges
-			commandBuffer->submitCl.min_x_tile = 0;
-			commandBuffer->submitCl.min_y_tile = 0;
-
-			uint32_t tileSizeW = 64;
-			uint32_t tileSizeH = 64;
-
-			if(i->samples > 1)
-			{
-				tileSizeW >>= 1;
-				tileSizeH >>= 1;
-			}
-
-			if(getFormatBpp(i->format) == 64)
-			{
-				tileSizeH >>= 1;
-			}
-
-			uint32_t widthInTiles = divRoundUp(i->width, tileSizeW);
-			uint32_t heightInTiles = divRoundUp(i->height, tileSizeH);
-
-			commandBuffer->submitCl.max_x_tile = widthInTiles - 1;
-			commandBuffer->submitCl.max_y_tile = heightInTiles - 1;
-			commandBuffer->submitCl.width = i->width;
-			commandBuffer->submitCl.height = i->height;
-			commandBuffer->submitCl.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR;
-			commandBuffer->submitCl.clear_z = 0; //TODO
-			commandBuffer->submitCl.clear_s = 0;
-		}
-
-		//transition to new layout
-		i->layout = pImageMemoryBarriers[c].newLayout;
-	}
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdClearColorImage
- * Color and depth/stencil images can be cleared outside a render pass instance using vkCmdClearColorImage or vkCmdClearDepthStencilImage, respectively.
- * These commands are only allowed outside of a render pass instance.
- */
-VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage(
-		VkCommandBuffer                             commandBuffer,
-		VkImage                                     image,
-		VkImageLayout                               imageLayout,
-		const VkClearColorValue*                    pColor,
-		uint32_t                                    rangeCount,
-		const VkImageSubresourceRange*              pRanges)
-{
-	assert(commandBuffer);
-	assert(image);
-	assert(pColor);
-
-	//TODO this should only flag an image for clearing. This can only be called outside a renderpass
-	//actual clearing would only happen:
-	// -if image is rendered to (insert clear before first draw call)
-	// -if the image is bound for sampling (submit a CL with a clear)
-	// -if a command buffer is submitted without any rendering (insert clear)
-	// -etc.
-	//we shouldn't clear an image if noone uses it
-
-	//TODO ranges support
-
-	assert(imageLayout == VK_IMAGE_LAYOUT_GENERAL ||
-		   imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR ||
-		   imageLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
-
-	assert(commandBuffer->state	 == CMDBUF_STATE_RECORDING);
-	assert(_queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_GRAPHICS_BIT || _queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_COMPUTE_BIT);
-
-	_image* i = image;
-
-	assert(i->usageBits & VK_IMAGE_USAGE_TRANSFER_DST_BIT);
-
-	//TODO externally sync cmdbuf, cmdpool
-
-	i->needToClear = 1;
-	i->clearColor[0] = i->clearColor[1] = packVec4IntoABGR8(pColor->float32);
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEndCommandBuffer
- * If there was an error during recording, the application will be notified by an unsuccessful return code returned by vkEndCommandBuffer.
- * If the application wishes to further use the command buffer, the command buffer must be reset. The command buffer must have been in the recording state,
- * and is moved to the executable state.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer(
-		VkCommandBuffer                             commandBuffer)
-{
-	assert(commandBuffer);
-
-	//Increment the semaphore indicating that binning is done and
-	//unblocking the render thread.  Note that this doesn't act
-	//until the FLUSH completes.
-	//The FLUSH caps all of our bin lists with a
-	//VC4_PACKET_RETURN.
-	clFit(commandBuffer, &commandBuffer->binCl, V3D21_INCREMENT_SEMAPHORE_length);
-	clInsertIncrementSemaphore(&commandBuffer->binCl);
-	clFit(commandBuffer, &commandBuffer->binCl, V3D21_FLUSH_length);
-	clInsertFlush(&commandBuffer->binCl);
-
-	commandBuffer->state = CMDBUF_STATE_EXECUTABLE;
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkAcquireNextImageKHR
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR(
-		VkDevice                                    device,
-		VkSwapchainKHR                              swapchain,
-		uint64_t                                    timeout,
-		VkSemaphore                                 semaphore,
-		VkFence                                     fence,
-		uint32_t*                                   pImageIndex)
-{
-	assert(device);
-	assert(swapchain);
-
-	assert(semaphore != VK_NULL_HANDLE || fence != VK_NULL_HANDLE);
-
-	sem_t* s = semaphore;
-
-	//TODO we need to keep track of currently acquired images?
-
-	//TODO wait timeout?
-
-	*pImageIndex = ((_swapchain*)swapchain)->backbufferIdx; //return back buffer index
-
-	//signal semaphore
-	int semVal; sem_getvalue(s, &semVal); assert(semVal <= 0); //make sure semaphore is unsignalled
-	sem_post(s);
-
-	//TODO signal fence
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueSubmit
- * vkQueueSubmit is a queue submission command, with each batch defined by an element of pSubmits as an instance of the VkSubmitInfo structure.
- * Batches begin execution in the order they appear in pSubmits, but may complete out of order.
- * Fence and semaphore operations submitted with vkQueueSubmit have additional ordering constraints compared to other submission commands,
- * with dependencies involving previous and subsequent queue operations. Information about these additional constraints can be found in the semaphore and
- * fence sections of the synchronization chapter.
- * Details on the interaction of pWaitDstStageMask with synchronization are described in the semaphore wait operation section of the synchronization chapter.
- * The order that batches appear in pSubmits is used to determine submission order, and thus all the implicit ordering guarantees that respect it.
- * Other than these implicit ordering guarantees and any explicit synchronization primitives, these batches may overlap or otherwise execute out of order.
- * If any command buffer submitted to this queue is in the executable state, it is moved to the pending state. Once execution of all submissions of a command buffer complete,
- * it moves from the pending state, back to the executable state. If a command buffer was recorded with the VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT flag,
- * it instead moves back to the invalid state.
- * If vkQueueSubmit fails, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
- * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced by the submitted command buffers and any semaphores
- * referenced by pSubmits is unaffected by the call or its failure. If vkQueueSubmit fails in such a way that the implementation is unable to make that guarantee,
- * the implementation must return VK_ERROR_DEVICE_LOST. See Lost Device.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(
-		VkQueue                                     queue,
-		uint32_t                                    submitCount,
-		const VkSubmitInfo*                         pSubmits,
-		VkFence                                     fence)
-{
-	assert(queue);
-
-	for(int c = 0; c < pSubmits->waitSemaphoreCount; ++c)
-	{
-		sem_wait((sem_t*)pSubmits->pWaitSemaphores[c]);
-	}
-
-	//TODO: deal with pSubmits->pWaitDstStageMask
-
-	//TODO wait for fence??
-
-	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
-	{
-		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_EXECUTABLE)
-		{
-			pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_PENDING;
-		}
-	}
-
-	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
-	{
-		VkCommandBuffer cmdbuf = pSubmits->pCommandBuffers[c];
-
-		cmdbuf->submitCl.bo_handles = cmdbuf->handlesCl.buffer;
-		cmdbuf->submitCl.bo_handle_count = clSize(&cmdbuf->handlesCl) / 4;
-		cmdbuf->submitCl.bin_cl = cmdbuf->binCl.buffer;
-		cmdbuf->submitCl.bin_cl_size = clSize(&cmdbuf->binCl);
-		cmdbuf->submitCl.shader_rec = cmdbuf->shaderRecCl.buffer;
-		cmdbuf->submitCl.shader_rec_size = clSize(&cmdbuf->shaderRecCl);
-		cmdbuf->submitCl.shader_rec_count = cmdbuf->shaderRecCount;
-		cmdbuf->submitCl.uniforms = cmdbuf->uniformsCl.buffer;
-		cmdbuf->submitCl.uniforms_size = clSize(&cmdbuf->uniformsCl);
-
-		printf("BCL:\n");
-		clDump(cmdbuf->submitCl.bin_cl, cmdbuf->submitCl.bin_cl_size);
-		printf("BO handles: ");
-		for(int d = 0; d < cmdbuf->submitCl.bo_handle_count; ++d)
-		{
-			printf("%u ", *((uint32_t*)(cmdbuf->submitCl.bo_handles)+d));
-		}
-		printf("\nwidth height: %u, %u\n", cmdbuf->submitCl.width, cmdbuf->submitCl.height);
-		printf("tile min/max: %u,%u %u,%u\n", cmdbuf->submitCl.min_x_tile, cmdbuf->submitCl.min_y_tile, cmdbuf->submitCl.max_x_tile, cmdbuf->submitCl.max_y_tile);
-		printf("color read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_read.hindex, cmdbuf->submitCl.color_read.offset, cmdbuf->submitCl.color_read.bits, cmdbuf->submitCl.color_read.flags);
-		printf("color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_write.hindex, cmdbuf->submitCl.color_write.offset, cmdbuf->submitCl.color_write.bits, cmdbuf->submitCl.color_write.flags);
-		printf("zs read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_read.hindex, cmdbuf->submitCl.zs_read.offset, cmdbuf->submitCl.zs_read.bits, cmdbuf->submitCl.zs_read.flags);
-		printf("zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_write.hindex, cmdbuf->submitCl.zs_write.offset, cmdbuf->submitCl.zs_write.bits, cmdbuf->submitCl.zs_write.flags);
-		printf("msaa color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_color_write.hindex, cmdbuf->submitCl.msaa_color_write.offset, cmdbuf->submitCl.msaa_color_write.bits, cmdbuf->submitCl.msaa_color_write.flags);
-		printf("msaa zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_zs_write.hindex, cmdbuf->submitCl.msaa_zs_write.offset, cmdbuf->submitCl.msaa_zs_write.bits, cmdbuf->submitCl.msaa_zs_write.flags);
-		printf("clear color packed rgba %u %u\n", cmdbuf->submitCl.clear_color[0], cmdbuf->submitCl.clear_color[1]);
-		printf("clear z %u\n", cmdbuf->submitCl.clear_z);
-		printf("clear s %u\n", cmdbuf->submitCl.clear_s);
-		printf("flags %u\n", cmdbuf->submitCl.flags);
-
-
-		//submit ioctl
-		static uint64_t lastFinishedSeqno = 0;
-		vc4_cl_submit(controlFd, &cmdbuf->submitCl, &queue->lastEmitSeqno, &lastFinishedSeqno);
-	}
-
-	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
-	{
-		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_PENDING)
-		{
-			if(pSubmits->pCommandBuffers[c]->usageFlags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)
-			{
-				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_INVALID;
-			}
-			else
-			{
-				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_EXECUTABLE;
-			}
-		}
-	}
-
-	for(int c = 0; c < pSubmits->signalSemaphoreCount; ++c)
-	{
-		sem_post((sem_t*)pSubmits->pSignalSemaphores[c]);
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueuePresentKHR
- * Any writes to memory backing the images referenced by the pImageIndices and pSwapchains members of pPresentInfo,
- * that are available before vkQueuePresentKHR is executed, are automatically made visible to the read access performed by the presentation engine.
- * This automatic visibility operation for an image happens-after the semaphore signal operation, and happens-before the presentation engine accesses the image.
- * Queueing an image for presentation defines a set of queue operations, including waiting on the semaphores and submitting a presentation request to the presentation engine.
- * However, the scope of this set of queue operations does not include the actual processing of the image by the presentation engine.
- * If vkQueuePresentKHR fails to enqueue the corresponding set of queue operations, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
- * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced is unaffected by the call or its failure.
- * If vkQueuePresentKHR fails in such a way that the implementation is unable to make that guarantee, the implementation must return VK_ERROR_DEVICE_LOST.
- * However, if the presentation request is rejected by the presentation engine with an error VK_ERROR_OUT_OF_DATE_KHR or VK_ERROR_SURFACE_LOST_KHR,
- * the set of queue operations are still considered to be enqueued and thus any semaphore to be waited on gets unsignaled when the corresponding queue operation is complete.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR(
-		VkQueue                                     queue,
-		const VkPresentInfoKHR*                     pPresentInfo)
-{
-	assert(queue);
-	assert(pPresentInfo);
-
-	//wait for semaphore in present info set by submit ioctl to make sure cls are flushed
-	for(int c = 0; c < pPresentInfo->waitSemaphoreCount; ++c)
-	{
-		sem_wait((sem_t*)pPresentInfo->pWaitSemaphores[c]);
-	}
-
-	for(int c = 0; c < pPresentInfo->swapchainCount; ++c)
-	{
-		_swapchain* s = pPresentInfo->pSwapchains[c];
-		modeset_present_buffer(controlFd, (modeset_dev*)s->surface, &s->images[s->backbufferIdx]);
-		s->backbufferIdx = (s->backbufferIdx + 1) % s->numImages;
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDeviceWaitIdle
- * vkDeviceWaitIdle is equivalent to calling vkQueueWaitIdle for all queues owned by device.
- */
-VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle(
-		VkDevice									device)
-{
-	assert(device);
-
-	for(int c = 0; c < numQueueFamilies; ++c)
-	{
-		for(int d = 0; d < device->numQueues[c]; ++d)
-		{
-			uint64_t lastFinishedSeqno;
-			vc4_seqno_wait(controlFd, &lastFinishedSeqno, device->queues[c][d].lastEmitSeqno, WAIT_TIMEOUT_INFINITE);
-		}
-	}
-
-	return VK_SUCCESS;
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkFreeCommandBuffers
- * Any primary command buffer that is in the recording or executable state and has any element of pCommandBuffers recorded into it, becomes invalid.
- */
-VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers(
-		VkDevice                                    device,
-		VkCommandPool                               commandPool,
-		uint32_t                                    commandBufferCount,
-		const VkCommandBuffer*                      pCommandBuffers)
-{
-	assert(device);
-	assert(commandPool);
-	assert(pCommandBuffers);
-
-	_commandPool* cp = (_commandPool*)commandPool;
-
-	for(int c = 0; c < commandBufferCount; ++c)
-	{
-		//if(cp->usePoolAllocator)
-		{
-			consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->binCl, pCommandBuffers[c]->binCl.numBlocks);
-			consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->handlesCl, pCommandBuffers[c]->binCl.numBlocks);
-			consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->shaderRecCl, pCommandBuffers[c]->binCl.numBlocks);
-			consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->uniformsCl, pCommandBuffers[c]->binCl.numBlocks);
-			poolFree(&cp->pa, pCommandBuffers[c]);
-		}
-	}
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyCommandPool
- * When a pool is destroyed, all command buffers allocated from the pool are freed.
- * Any primary command buffer allocated from another VkCommandPool that is in the recording or executable state and has a secondary command buffer
- * allocated from commandPool recorded into it, becomes invalid.
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool(
-		VkDevice                                    device,
-		VkCommandPool                               commandPool,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(device);
-	assert(commandPool);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	_commandPool* cp = (_commandPool*)commandPool;
-
-	//if(cp->usePoolAllocator)
-	{
-		free(cp->pa.buf);
-		free(cp->cpa.buf);
-		destroyPoolAllocator(&cp->pa);
-		destroyConsecutivePoolAllocator(&cp->cpa);
-	}
-
-	free(cp);
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySemaphore
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore(
-		VkDevice                                    device,
-		VkSemaphore                                 semaphore,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(device);
-	assert(semaphore);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	sem_destroy((sem_t*)semaphore);
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySwapchainKHR
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR(
-		VkDevice                                    device,
-		VkSwapchainKHR                              swapchain,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(device);
-	assert(swapchain);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//TODO flush all ops
-
-	_swapchain* s = swapchain;
-
-	for(int c = 0; c < s->numImages; ++c)
-	{
-		vc4_bo_free(controlFd, s->images[c].handle, 0, s->images->size);
-		modeset_destroy_fb(controlFd, &s->images[c]);
-	}
-
-	free(s->images);
-	free(s);
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyDevice
- * To ensure that no work is active on the device, vkDeviceWaitIdle can be used to gate the destruction of the device.
- * Prior to destroying a device, an application is responsible for destroying/freeing any Vulkan objects that were created using that device as the
- * first parameter of the corresponding vkCreate* or vkAllocate* command
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroyDevice(
-		VkDevice                                    device,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(device);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//TODO
-}
-
-/*
- * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyInstance
- *
- */
-VKAPI_ATTR void VKAPI_CALL vkDestroyInstance(
-		VkInstance                                  instance,
-		const VkAllocationCallbacks*                pAllocator)
-{
-	assert(instance);
-
-	//TODO: allocator is ignored for now
-	assert(pAllocator == 0);
-
-	//TODO
-	closeIoctl();
-}
-
diff --git a/driver/instance.c b/driver/instance.c
new file mode 100644
index 0000000..413475f
--- /dev/null
+++ b/driver/instance.c
@@ -0,0 +1,124 @@
+#include "common.h"
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateInstanceExtensionProperties
+ * When pLayerName parameter is NULL, only extensions provided by the Vulkan implementation or by implicitly enabled layers are returned. When pLayerName is the name of a layer,
+ * the instance extensions provided by that layer are returned.
+ * If pProperties is NULL, then the number of extensions properties available is returned in pPropertyCount. Otherwise, pPropertyCount must point to a variable set by the user
+ * to the number of elements in the pProperties array, and on return the variable is overwritten with the number of structures actually written to pProperties.
+ * If pPropertyCount is less than the number of extension properties available, at most pPropertyCount structures will be written. If pPropertyCount is smaller than the number of extensions available,
+ * VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the available properties were returned.
+ * Because the list of available layers may change externally between calls to vkEnumerateInstanceExtensionProperties,
+ * two calls may retrieve different results if a pLayerName is available in one call but not in another. The extensions supported by a layer may also change between two calls,
+ * e.g. if the layer implementation is replaced by a different version between those calls.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties(
+		const char*                                 pLayerName,
+		uint32_t*                                   pPropertyCount,
+		VkExtensionProperties*                      pProperties)
+{
+	assert(!pLayerName); //TODO layers ignored for now
+	assert(pPropertyCount);
+
+	if(!pProperties)
+	{
+		*pPropertyCount = numInstanceExtensions; //count query: only report how many extensions are available
+		return VK_SUCCESS; //nothing was truncated, so a pure count query succeeds
+	}
+
+	int arraySize = *pPropertyCount; //capacity of the caller's array
+	int elementsWritten = min(numInstanceExtensions, arraySize); //never write past the caller's array
+
+	for(int c = 0; c < elementsWritten; ++c)
+	{
+		pProperties[c] = instanceExtensions[c];
+	}
+
+	*pPropertyCount = elementsWritten; //report the number of structures actually written
+
+	return elementsWritten < numInstanceExtensions ? VK_INCOMPLETE : VK_SUCCESS; //VK_INCOMPLETE only if the array was too small
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateInstance
+ * There is no global state in Vulkan and all per-application state is stored in a VkInstance object. Creating a VkInstance object initializes the Vulkan library and allows the application to pass information about itself to the implementation.
+ * vkCreateInstance verifies that the requested layers exist. If not, vkCreateInstance will return VK_ERROR_LAYER_NOT_PRESENT. Next vkCreateInstance verifies that
+ * the requested extensions are supported (e.g. in the implementation or in any enabled instance layer) and if any requested extension is not supported,
+ * vkCreateInstance must return VK_ERROR_EXTENSION_NOT_PRESENT. After verifying and enabling the instance layers and extensions the VkInstance object is
+ * created and returned to the application.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance(
+		const VkInstanceCreateInfo*                 pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkInstance*                                 pInstance)
+{
+	assert(pInstance);
+	assert(pCreateInfo);
+
+	*pInstance = malloc(sizeof(_instance)); //released again on the error path below
+
+	if(!*pInstance)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	(*pInstance)->numEnabledExtensions = 0;
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	//TODO: possibly we need to load layers here
+	//and store them in pInstance
+	assert(pCreateInfo->enabledLayerCount == 0);
+
+	if(pCreateInfo->enabledExtensionCount)
+	{
+		assert(pCreateInfo->ppEnabledExtensionNames);
+	}
+
+	for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c)
+	{
+		int findres = findInstanceExtension(pCreateInfo->ppEnabledExtensionNames[c]); //negative result is treated as "not supported"
+		if(findres < 0)
+		{
+			free(*pInstance); //don't leak the half-initialised instance on failure
+			*pInstance = 0; //don't hand a dangling handle back to the caller
+			return VK_ERROR_EXTENSION_NOT_PRESENT;
+		}
+
+		(*pInstance)->enabledExtensions[(*pInstance)->numEnabledExtensions] = findres; //remember the index of the enabled extension
+		(*pInstance)->numEnabledExtensions++;
+	}
+
+	//TODO ignored for now
+	//pCreateInfo->pApplicationInfo
+
+	int ret = openIoctl(); assert(!ret); //NOTE(review): failure is only caught by the assert - confirm behaviour when asserts are compiled out
+
+	(*pInstance)->chipVersion = vc4_get_chip_info(controlFd);
+	(*pInstance)->hasTiling = vc4_test_tiling(controlFd);
+
+	(*pInstance)->hasControlFlow = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_BRANCHES);
+	(*pInstance)->hasEtc1 = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_ETC1);
+	(*pInstance)->hasThreadedFs = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_THREADED_FS);
+	(*pInstance)->hasMadvise = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_MADVISE);
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyInstance
+ * Calls closeIoctl() and then frees the instance object that vkCreateInstance allocated with malloc.
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroyInstance(
+		VkInstance                                  instance,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(instance);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	closeIoctl();
+	free(instance); //allocated with malloc in vkCreateInstance, previously never released
+}
diff --git a/driver/sync.c b/driver/sync.c
new file mode 100644
index 0000000..1c641f2
--- /dev/null
+++ b/driver/sync.c
@@ -0,0 +1,273 @@
+#include "common.h"
+
+#include "kernel/vc4_packet.h"
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSemaphore
+ * Semaphores are a synchronization primitive that can be used to insert a dependency between batches submitted to queues.
+ * Semaphores have two states - signaled and unsignaled. The state of a semaphore can be signaled after execution of a batch of commands is completed.
+ * A batch can wait for a semaphore to become signaled before it begins execution, and the semaphore is also unsignaled before the batch begins execution.
+ * As with most objects in Vulkan, semaphores are an interface to internal data which is typically opaque to applications.
+ * This internal data is referred to as a semaphore’s payload. However, in order to enable communication with agents outside of the current device,
+ * it is necessary to be able to export that payload to a commonly understood format, and subsequently import from that format as well.
+ * The internal data of a semaphore may include a reference to any resources and pending work associated with signal or unsignal operations performed on that semaphore object.
+ * Mechanisms to import and export that internal data to and from semaphores are provided below.
+ * These mechanisms indirectly enable applications to share semaphore state between two or more semaphores and other synchronization primitives across process and API boundaries.
+ * When created, the semaphore is in the unsignaled state.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore(
+		VkDevice                                    device,
+		const VkSemaphoreCreateInfo*                pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkSemaphore*                                pSemaphore)
+{
+	assert(device);
+	assert(pSemaphore);
+
+	//TODO: custom allocation callbacks are not supported yet
+	assert(pAllocator == 0);
+
+	//host-side POSIX semaphore for now, an IOCTL waiting on a GPU sequence number will probably replace it
+	sem_t* sem = malloc(sizeof(sem_t));
+	if(sem == 0)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	//pshared = 0 (threads of this process only), initial value = 0 (unsignalled)
+	sem_init(sem, 0, 0);
+	*pSemaphore = (VkSemaphore)sem;
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdPipelineBarrier
+ * vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass.
+ * When vkCmdPipelineBarrier is submitted to a queue, it defines a memory dependency between commands that were submitted before it, and those submitted after it.
+ * If vkCmdPipelineBarrier was recorded outside a render pass instance, the first synchronization scope includes all commands that occur earlier in submission order.
+ * If vkCmdPipelineBarrier was recorded inside a render pass instance, the first synchronization scope includes only commands that occur earlier in submission order within the same subpass.
+ * In either case, the first synchronization scope is limited to operations on the pipeline stages determined by the source stage mask specified by srcStageMask.
+ *
+ * If vkCmdPipelineBarrier was recorded outside a render pass instance, the second synchronization scope includes all commands that occur later in submission order.
+ * If vkCmdPipelineBarrier was recorded inside a render pass instance, the second synchronization scope includes only commands that occur later in submission order within the same subpass.
+ * In either case, the second synchronization scope is limited to operations on the pipeline stages determined by the destination stage mask specified by dstStageMask.
+ *
+ * The first access scope is limited to access in the pipeline stages determined by the source stage mask specified by srcStageMask.
+ * Within that, the first access scope only includes the first access scopes defined by elements of the pMemoryBarriers,
+ * pBufferMemoryBarriers and pImageMemoryBarriers arrays, which each define a set of memory barriers. If no memory barriers are specified,
+ * then the first access scope includes no accesses.
+ *
+ * The second access scope is limited to access in the pipeline stages determined by the destination stage mask specified by dstStageMask.
+ * Within that, the second access scope only includes the second access scopes defined by elements of the pMemoryBarriers, pBufferMemoryBarriers and pImageMemoryBarriers arrays,
+ * which each define a set of memory barriers. If no memory barriers are specified, then the second access scope includes no accesses.
+ *
+ * If dependencyFlags includes VK_DEPENDENCY_BY_REGION_BIT, then any dependency between framebuffer-space pipeline stages is framebuffer-local - otherwise it is framebuffer-global.
+ */
+VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier(
+		VkCommandBuffer                             commandBuffer,
+		VkPipelineStageFlags                        srcStageMask,
+		VkPipelineStageFlags                        dstStageMask,
+		VkDependencyFlags                           dependencyFlags,
+		uint32_t                                    memoryBarrierCount,
+		const VkMemoryBarrier*                      pMemoryBarriers,
+		uint32_t                                    bufferMemoryBarrierCount,
+		const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+		uint32_t                                    imageMemoryBarrierCount,
+		const VkImageMemoryBarrier*                 pImageMemoryBarriers)
+{
+	assert(commandBuffer);
+
+	//TODO pipeline stage flags
+	//VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT
+	//VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT
+	//VK_PIPELINE_STAGE_VERTEX_INPUT_BIT
+	//VK_PIPELINE_STAGE_VERTEX_SHADER_BIT
+	//VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT
+	//VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT
+	//VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT
+	//VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT
+	//VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT
+	//VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT
+	//VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT
+	//VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT
+	//VK_PIPELINE_STAGE_TRANSFER_BIT
+	//VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT
+	//VK_PIPELINE_STAGE_HOST_BIT
+	//VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT
+	//VK_PIPELINE_STAGE_ALL_COMMANDS_BIT
+
+	//TODO dependency flags
+	//VK_DEPENDENCY_BY_REGION_BIT,
+	//VK_DEPENDENCY_DEVICE_GROUP_BIT,
+	//VK_DEPENDENCY_VIEW_LOCAL_BIT
+
+	//TODO access flags
+	//VK_ACCESS_INDIRECT_COMMAND_READ_BIT
+	//VK_ACCESS_INDEX_READ_BIT
+	//VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT
+	//VK_ACCESS_UNIFORM_READ_BIT
+	//VK_ACCESS_INPUT_ATTACHMENT_READ_BIT
+	//VK_ACCESS_SHADER_READ_BIT
+	//VK_ACCESS_SHADER_WRITE_BIT
+	//VK_ACCESS_COLOR_ATTACHMENT_READ_BIT
+	//VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT
+	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT
+	//VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT
+	//VK_ACCESS_TRANSFER_READ_BIT
+	//VK_ACCESS_TRANSFER_WRITE_BIT
+	//VK_ACCESS_HOST_READ_BIT
+	//VK_ACCESS_HOST_WRITE_BIT
+	//VK_ACCESS_MEMORY_READ_BIT
+	//VK_ACCESS_MEMORY_WRITE_BIT
+	//VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX
+	//VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX
+
+	//TODO Layout transition flags
+	//VK_IMAGE_LAYOUT_UNDEFINED
+	//VK_IMAGE_LAYOUT_GENERAL
+	//VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
+	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
+	//VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL
+	//VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL
+	//VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL
+	//VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
+	//VK_IMAGE_LAYOUT_PREINITIALIZED
+	//VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
+	//VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL
+	//VK_IMAGE_LAYOUT_PRESENT_SRC_KHR
+	//VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR
+
+	for(uint32_t c = 0; c < memoryBarrierCount; ++c)
+	{
+		//TODO
+	}
+
+	for(uint32_t c = 0; c < bufferMemoryBarrierCount; ++c)
+	{
+		//TODO
+	}
+
+	for(uint32_t c = 0; c < imageMemoryBarrierCount; ++c)
+	{
+		_image* i = pImageMemoryBarriers[c].image;
+
+		assert(i->layout == pImageMemoryBarriers[c].oldLayout || i->layout == VK_IMAGE_LAYOUT_UNDEFINED);
+
+		if(srcStageMask & VK_PIPELINE_STAGE_TRANSFER_BIT &&
+		   pImageMemoryBarriers[c].srcAccessMask & VK_ACCESS_TRANSFER_WRITE_BIT &&
+		   i->needToClear)
+		{
+			//insert CRs to clear the image
+
+			assert(i->layout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); //compare, an assignment here could never fail
+
+			clFit(commandBuffer, &commandBuffer->binCl, V3D21_TILE_BINNING_MODE_CONFIGURATION_length);
+			clInsertTileBinningModeConfiguration(&commandBuffer->binCl,
+												 0, 0, 0, 0,
+												 getFormatBpp(i->format) == 64, //64 bit color mode
+												 i->samples > 1, //msaa
+												 i->width, i->height, 0, 0, 0);
+
+			//START_TILE_BINNING resets the statechange counters in the hardware,
+			//which are what is used when a primitive is binned to a tile to
+			//figure out what new state packets need to be written to that tile's
+			//command list.
+			clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length);
+			clInsertStartTileBinning(&commandBuffer->binCl);
+
+			//Reset the current compressed primitives format.  This gets modified
+			//by VC4_PACKET_GL_INDEXED_PRIMITIVE and
+			//VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
+			//of every tile.
+			clFit(commandBuffer, &commandBuffer->binCl, V3D21_PRIMITIVE_LIST_FORMAT_length);
+			clInsertPrimitiveListFormat(&commandBuffer->binCl,
+										1, //16 bit
+										2); //tris
+
+			clFit(commandBuffer, &commandBuffer->handlesCl, 4);
+			uint32_t idx = clGetHandleIndex(&commandBuffer->handlesCl, i->handle);
+			commandBuffer->submitCl.color_write.hindex = idx;
+			commandBuffer->submitCl.color_write.offset = 0;
+			commandBuffer->submitCl.color_write.flags = 0;
+			//TODO format
+			commandBuffer->submitCl.color_write.bits =
+					VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_RGBA8888, VC4_RENDER_CONFIG_FORMAT) |
+					VC4_SET_FIELD(i->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
+
+			commandBuffer->submitCl.clear_color[0] = i->clearColor[0];
+			commandBuffer->submitCl.clear_color[1] = i->clearColor[1];
+
+			//TODO ranges
+			commandBuffer->submitCl.min_x_tile = 0;
+			commandBuffer->submitCl.min_y_tile = 0;
+
+			uint32_t tileSizeW = 64;
+			uint32_t tileSizeH = 64;
+
+			if(i->samples > 1)
+			{
+				tileSizeW >>= 1;
+				tileSizeH >>= 1;
+			}
+
+			if(getFormatBpp(i->format) == 64)
+			{
+				tileSizeH >>= 1;
+			}
+
+			uint32_t widthInTiles = divRoundUp(i->width, tileSizeW);
+			uint32_t heightInTiles = divRoundUp(i->height, tileSizeH);
+
+			commandBuffer->submitCl.max_x_tile = widthInTiles - 1;
+			commandBuffer->submitCl.max_y_tile = heightInTiles - 1;
+			commandBuffer->submitCl.width = i->width;
+			commandBuffer->submitCl.height = i->height;
+			commandBuffer->submitCl.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR;
+			commandBuffer->submitCl.clear_z = 0; //TODO
+			commandBuffer->submitCl.clear_s = 0;
+		}
+
+		//transition to new layout
+		i->layout = pImageMemoryBarriers[c].newLayout;
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDeviceWaitIdle
+ * vkDeviceWaitIdle is equivalent to calling vkQueueWaitIdle for all queues owned by device.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle(
+		VkDevice									device)
+{
+	assert(device);
+
+	//wait (infinite timeout) for the last sequence number emitted on every queue of every family
+	for(int family = 0; family < numQueueFamilies; ++family)
+	{
+		for(int queueIdx = 0; queueIdx < device->numQueues[family]; ++queueIdx)
+		{
+			uint64_t finishedSeqno; //out-parameter of the wait, its value is not used here
+			vc4_seqno_wait(controlFd, &finishedSeqno, device->queues[family][queueIdx].lastEmitSeqno, WAIT_TIMEOUT_INFINITE);
+		}
+	}
+	return VK_SUCCESS;
+}
+
+
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySemaphore
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore(
+		VkDevice                                    device,
+		VkSemaphore                                 semaphore,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(device);
+	assert(semaphore);
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	sem_destroy((sem_t*)semaphore);
+	free((sem_t*)semaphore); //vkCreateSemaphore malloc'd the sem_t, destroying it alone leaks the storage
+}
diff --git a/driver/vkCaps.h b/driver/vkCaps.h
index 4901b32..e35d27a 100644
--- a/driver/vkCaps.h
+++ b/driver/vkCaps.h
@@ -2,7 +2,7 @@
 
 #include <vulkan/vulkan.h>
 
-VkPhysicalDeviceLimits _limits =
+static VkPhysicalDeviceLimits _limits =
 {
 	//TODO these values might change
 	.maxImageDimension1D = 16384,
@@ -113,7 +113,7 @@ VkPhysicalDeviceLimits _limits =
 	.nonCoherentAtomSize = 0x40
 };
 
-VkPhysicalDeviceFeatures _features =
+static VkPhysicalDeviceFeatures _features =
 {
 	//TODO this might change
 	.robustBufferAccess = 1,
@@ -174,7 +174,7 @@ VkPhysicalDeviceFeatures _features =
 };
 #define numFeatures (sizeof(_features)/sizeof(VkBool32))
 
-VkQueueFamilyProperties _queueFamilyProperties[] =
+static VkQueueFamilyProperties _queueFamilyProperties[] =
 {
 	{
 		.queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT,
@@ -185,7 +185,7 @@ VkQueueFamilyProperties _queueFamilyProperties[] =
 };
 #define numQueueFamilies (sizeof(_queueFamilyProperties)/sizeof(VkQueueFamilyProperties))
 
-VkSurfaceFormatKHR supportedSurfaceFormats[] =
+static VkSurfaceFormatKHR supportedSurfaceFormats[] =
 {
 	{
 		.format = VK_FORMAT_R8G8B8A8_UNORM,
@@ -259,29 +259,3 @@ static VkExtensionProperties deviceExtensions[] =
 	}
 };
 #define numDeviceExtensions (sizeof(deviceExtensions) / sizeof(VkExtensionProperties))
-
-int findInstanceExtension(char* name)
-{
-	for(int c = 0; c < numInstanceExtensions; ++c)
-	{
-		if(strcmp(instanceExtensions[c].extensionName, name) == 0)
-		{
-			return c;
-		}
-	}
-
-	return -1;
-}
-
-int findDeviceExtension(char* name)
-{
-	for(int c = 0; c < numDeviceExtensions; ++c)
-	{
-		if(strcmp(deviceExtensions[c].extensionName, name) == 0)
-		{
-			return c;
-		}
-	}
-
-	return -1;
-}
diff --git a/driver/wsi.c b/driver/wsi.c
new file mode 100644
index 0000000..3aa8d69
--- /dev/null
+++ b/driver/wsi.c
@@ -0,0 +1,381 @@
+#include "common.h"
+
+/*
+ * Implementation of our RPI specific "extension": stores the result of modeset_create(controlFd) in *pSurface as the surface handle
+ */
+VkResult vkCreateRpiSurfaceKHR(
+		VkInstance                                  instance,
+		const VkRpiSurfaceCreateInfoKHR*            pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkSurfaceKHR*                               pSurface)
+{
+	assert(instance);
+	//assert(pCreateInfo); //ignored for now
+	assert(pSurface);
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	*pSurface = (VkSurfaceKHR)modeset_create(controlFd); //NOTE(review): the result is not checked, VK_SUCCESS is returned regardless
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySurfaceKHR
+ * Destroying a VkSurfaceKHR merely severs the connection between Vulkan and the native surface,
+ * and does not imply destroying the native surface, closing a window, or similar behavior
+ * (but we do so anyway: the surface handle is handed to modeset_destroy as a modeset_dev*)
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR(
+		VkInstance                                  instance,
+		VkSurfaceKHR                                surface,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(instance);
+	assert(surface);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	modeset_destroy(controlFd, (modeset_dev*)surface); //the handle is the value vkCreateRpiSurfaceKHR got from modeset_create
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceCapabilitiesKHR
+ * The capabilities of a swapchain targeting a surface are the intersection of the capabilities of the WSI platform,
+ * the native window or display, and the physical device. The resulting capabilities can be obtained with the queries listed
+ * below in this section. Capabilities that correspond to image creation parameters are not independent of each other:
+ * combinations of parameters that are not supported as reported by vkGetPhysicalDeviceImageFormatProperties are not supported
+ * by the surface on that physical device, even if the capabilities taken individually are supported as part of some other parameter combinations.
+ *
+ * capabilities the specified device supports for a swapchain created for the surface
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR(
+		VkPhysicalDevice                            physicalDevice,
+		VkSurfaceKHR                                surface,
+		VkSurfaceCapabilitiesKHR*                   pSurfaceCapabilities)
+{
+	assert(physicalDevice);
+	assert(surface);
+	assert(pSurfaceCapabilities);
+
+	//the surface handle is a modeset_dev, its width/height is the one extent reported as current, min and max
+	const modeset_dev* dev = (modeset_dev*)surface;
+	const VkExtent2D extent = { .width = dev->width, .height = dev->height };
+	pSurfaceCapabilities->minImageCount = 1; //min 1
+	pSurfaceCapabilities->maxImageCount = 2; //TODO max 2 for double buffering for now...
+	pSurfaceCapabilities->currentExtent = extent;
+	pSurfaceCapabilities->minImageExtent = extent; //TODO
+	pSurfaceCapabilities->maxImageExtent = extent; //TODO
+	pSurfaceCapabilities->maxImageArrayLayers = 1; //TODO maybe more layers for cursor etc.
+	pSurfaceCapabilities->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO no rotation for now
+	pSurfaceCapabilities->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO get this from dev
+	pSurfaceCapabilities->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; //TODO no alpha compositing for now
+	pSurfaceCapabilities->supportedUsageFlags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; //well we want to draw on the screen right
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceFormatsKHR
+ * If pSurfaceFormats is NULL, then the number of format pairs supported for the given surface is returned in pSurfaceFormatCount.
+ * The number of format pairs supported will be greater than or equal to 1. Otherwise, pSurfaceFormatCount must point to a variable
+ * set by the user to the number of elements in the pSurfaceFormats array, and on return the variable is overwritten with the number
+ * of structures actually written to pSurfaceFormats. If the value of pSurfaceFormatCount is less than the number of format pairs supported,
+ * at most pSurfaceFormatCount structures will be written. If pSurfaceFormatCount is smaller than the number of format pairs supported for the given surface,
+ * VK_INCOMPLETE will be returned instead of VK_SUCCESS to indicate that not all the available values were returned.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR(
+		VkPhysicalDevice                            physicalDevice,
+		VkSurfaceKHR                                surface,
+		uint32_t*                                   pSurfaceFormatCount,
+		VkSurfaceFormatKHR*                         pSurfaceFormats)
+{
+	assert(physicalDevice);
+	assert(surface);
+	assert(pSurfaceFormatCount);
+
+	const int supported = 1; //number of supportedSurfaceFormats entries that get reported
+
+	if(pSurfaceFormats == 0)
+	{
+		//count query only, nothing is copied
+		*pSurfaceFormatCount = supported;
+		return VK_SUCCESS;
+	}
+
+	//copy as many formats as fit into the caller's array
+	int capacity = *pSurfaceFormatCount;
+	int written = min(supported, capacity);
+
+	int idx = 0;
+	while(idx < written)
+	{
+		pSurfaceFormats[idx] = supportedSurfaceFormats[idx];
+		++idx;
+	}
+
+	*pSurfaceFormatCount = written;
+
+	//tell the caller when some formats did not fit
+	return (written < supported) ? VK_INCOMPLETE : VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfacePresentModesKHR
+ * If pPresentModes is NULL, then the number of presentation modes supported for the given surface is returned in pPresentModeCount.
+ * Otherwise, pPresentModeCount must point to a variable set by the user to the number of elements in the pPresentModes array,
+ * and on return the variable is overwritten with the number of values actually written to pPresentModes.
+ * If the value of pPresentModeCount is less than the number of presentation modes supported, at most pPresentModeCount values will be written.
+ * If pPresentModeCount is smaller than the number of presentation modes supported for the given surface, VK_INCOMPLETE will be returned instead of
+ * VK_SUCCESS to indicate that not all the available values were returned.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR(
+		VkPhysicalDevice                            physicalDevice,
+		VkSurfaceKHR                                surface,
+		uint32_t*                                   pPresentModeCount,
+		VkPresentModeKHR*                           pPresentModes)
+{
+	assert(physicalDevice);
+	assert(surface);
+	assert(pPresentModeCount);
+
+	const int supported = 1; //VK_PRESENT_MODE_FIFO_KHR is the only mode reported
+
+	if(pPresentModes == 0)
+	{
+		//count query only, nothing is written
+		*pPresentModeCount = supported;
+		return VK_SUCCESS;
+	}
+
+	//write as many modes as fit into the caller's array
+	int capacity = *pPresentModeCount;
+	int written = min(supported, capacity);
+
+	int idx = 0;
+	while(idx < written)
+	{
+		//TODO
+		pPresentModes[idx] = VK_PRESENT_MODE_FIFO_KHR;
+		++idx;
+	}
+
+	*pPresentModeCount = written;
+
+	//tell the caller when some modes did not fit
+	return (written < supported) ? VK_INCOMPLETE : VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSwapchainKHR
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR(
+		VkDevice                                    device,
+		const VkSwapchainCreateInfoKHR*             pCreateInfo,
+		const VkAllocationCallbacks*                pAllocator,
+		VkSwapchainKHR*                             pSwapchain)
+{
+	assert(device);
+	assert(pCreateInfo);
+	assert(pSwapchain);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	*pSwapchain = malloc(sizeof(_swapchain));
+	if(!*pSwapchain)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	_swapchain* s = *pSwapchain;
+
+	//TODO flags, layers, queue sharing, pretransform, composite alpha, present mode..., clipped, oldswapchain
+	//TODO external sync on surface, oldswapchain
+	s->images = malloc(sizeof(_image) * pCreateInfo->minImageCount);
+	if(!s->images)
+	{
+		free(s); //the swapchain object itself would leak otherwise
+		*pSwapchain = 0; //don't hand a dangling handle back to the caller
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	s->backbufferIdx = 0;
+	s->numImages = pCreateInfo->minImageCount;
+	s->surface = pCreateInfo->surface;
+
+	for(int c = 0; c < pCreateInfo->minImageCount; ++c)
+	{
+		s->images[c].width = pCreateInfo->imageExtent.width;
+		s->images[c].height = pCreateInfo->imageExtent.height;
+		s->images[c].depth = 1;
+		s->images[c].layers = pCreateInfo->imageArrayLayers;
+		s->images[c].miplevels = 1;
+		s->images[c].samples = 1; //TODO
+		s->images[c].usageBits = pCreateInfo->imageUsage;
+		s->images[c].format = pCreateInfo->imageFormat;
+		s->images[c].imageSpace = pCreateInfo->imageColorSpace;
+		s->images[c].concurrentAccess = pCreateInfo->imageSharingMode;
+		s->images[c].numQueueFamiliesWithAccess = pCreateInfo->queueFamilyIndexCount;
+		s->images[c].queueFamiliesWithAccess = 0; //malloc'd memory is uninitialised, keep the pointer well defined
+		if(s->images[c].concurrentAccess)
+		{
+			s->images[c].queueFamiliesWithAccess = malloc(sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess);
+			assert(s->images[c].queueFamiliesWithAccess); //TODO unwind and return VK_ERROR_OUT_OF_HOST_MEMORY instead
+			memcpy(s->images[c].queueFamiliesWithAccess, pCreateInfo->pQueueFamilyIndices, sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess);
+		}
+		s->images[c].preTransformMode = pCreateInfo->preTransform;
+		s->images[c].compositeAlpha = pCreateInfo->compositeAlpha;
+		s->images[c].presentMode = pCreateInfo->presentMode;
+		s->images[c].clipped = pCreateInfo->clipped;
+		createImageBO(&s->images[c]);
+		int res = modeset_create_fb(controlFd, &s->images[c]); assert(res == 0);
+	}
+
+	//presenting is deferred to the first swapbuffer (getting a swapchain != presenting immediately), so no modeset_fb_for_dev call here
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetSwapchainImagesKHR
+ * If pSwapchainImages is NULL, then the number of presentable images for swapchain is returned in pSwapchainImageCount.
+ * Otherwise, pSwapchainImageCount must point to a variable set by the user to the number of elements in the pSwapchainImages array,
+ * and on return the variable is overwritten with the number of structures actually written to pSwapchainImages.
+ * If the value of pSwapchainImageCount is less than the number of presentable images for swapchain, at most pSwapchainImageCount structures will be written.
+ * If pSwapchainImageCount is smaller than the number of presentable images for swapchain, VK_INCOMPLETE will be returned instead of VK_SUCCESS to
+ * indicate that not all the available values were returned.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR(
+		VkDevice                                    device,
+		VkSwapchainKHR                              swapchain,
+		uint32_t*                                   pSwapchainImageCount,
+		VkImage*                                    pSwapchainImages)
+{
+	assert(device);
+	assert(swapchain);
+	assert(pSwapchainImageCount);
+
+	_swapchain* sc = swapchain;
+
+	if(pSwapchainImages == 0)
+	{
+		//count query only, no handles are written
+		*pSwapchainImageCount = sc->numImages;
+		return VK_SUCCESS;
+	}
+
+	//hand out as many image handles as fit into the caller's array
+	int capacity = *pSwapchainImageCount;
+	int written = min(sc->numImages, capacity);
+
+	int idx = 0;
+	while(idx < written)
+	{
+		pSwapchainImages[idx] = &sc->images[idx];
+		++idx;
+	}
+
+	*pSwapchainImageCount = written;
+
+	//tell the caller when some images did not fit
+	return (written < sc->numImages) ? VK_INCOMPLETE : VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkAcquireNextImageKHR
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR(
+		VkDevice                                    device,
+		VkSwapchainKHR                              swapchain,
+		uint64_t                                    timeout,
+		VkSemaphore                                 semaphore,
+		VkFence                                     fence,
+		uint32_t*                                   pImageIndex)
+{
+	assert(device);
+	assert(swapchain);
+	assert(pImageIndex);
+	assert(semaphore != VK_NULL_HANDLE || fence != VK_NULL_HANDLE);
+
+	sem_t* s = semaphore;
+
+	//TODO we need to keep track of currently acquired images? wait timeout?
+
+	*pImageIndex = ((_swapchain*)swapchain)->backbufferIdx; //return back buffer index
+
+	//the semaphore is optional (a fence alone passes the assert above), so only signal it when one was given
+	if(s)
+	{
+		int semVal; sem_getvalue(s, &semVal); assert(semVal <= 0); //make sure semaphore is unsignalled
+		sem_post(s);
+	}
+
+	//TODO signal fence
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueuePresentKHR
+ * Any writes to memory backing the images referenced by the pImageIndices and pSwapchains members of pPresentInfo,
+ * that are available before vkQueuePresentKHR is executed, are automatically made visible to the read access performed by the presentation engine.
+ * This automatic visibility operation for an image happens-after the semaphore signal operation, and happens-before the presentation engine accesses the image.
+ * Queueing an image for presentation defines a set of queue operations, including waiting on the semaphores and submitting a presentation request to the presentation engine.
+ * However, the scope of this set of queue operations does not include the actual processing of the image by the presentation engine.
+ * If vkQueuePresentKHR fails to enqueue the corresponding set of queue operations, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
+ * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced is unaffected by the call or its failure.
+ * If vkQueuePresentKHR fails in such a way that the implementation is unable to make that guarantee, the implementation must return VK_ERROR_DEVICE_LOST.
+ * However, if the presentation request is rejected by the presentation engine with an error VK_ERROR_OUT_OF_DATE_KHR or VK_ERROR_SURFACE_LOST_KHR,
+ * the set of queue operations are still considered to be enqueued and thus any semaphore to be waited on gets unsignaled when the corresponding queue operation is complete.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR(
+		VkQueue                                     queue,
+		const VkPresentInfoKHR*                     pPresentInfo)
+{
+	assert(queue);
+	assert(pPresentInfo);
+
+	//block on every wait semaphore of the present info (set by submit ioctl) to make sure cls are flushed
+	for(int semIdx = 0; semIdx < pPresentInfo->waitSemaphoreCount; ++semIdx)
+	{
+		sem_wait((sem_t*)pPresentInfo->pWaitSemaphores[semIdx]);
+	}
+	//show the current back buffer of each swapchain, then advance its index (wrapping around)
+	for(int scIdx = 0; scIdx < pPresentInfo->swapchainCount; ++scIdx)
+	{
+		_swapchain* sc = pPresentInfo->pSwapchains[scIdx];
+		modeset_present_buffer(controlFd, (modeset_dev*)sc->surface, &sc->images[sc->backbufferIdx]);
+		sc->backbufferIdx = (sc->backbufferIdx + 1) % sc->numImages;
+	}
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySwapchainKHR
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR(
+		VkDevice                                    device,
+		VkSwapchainKHR                              swapchain,
+		const VkAllocationCallbacks*                pAllocator)
+{
+	assert(device);
+	assert(swapchain);
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	//TODO flush all ops
+	_swapchain* s = swapchain;
+	for(int c = 0; c < s->numImages; ++c)
+	{
+		if(s->images[c].concurrentAccess)
+		{
+			free(s->images[c].queueFamiliesWithAccess); //vkCreateSwapchainKHR only allocates it for concurrent sharing
+		}
+		vc4_bo_free(controlFd, s->images[c].handle, 0, s->images[c].size); //this image's own size, not image 0's
+		modeset_destroy_fb(controlFd, &s->images[c]);
+	}
+	free(s->images);
+	free(s);
+}
+