#include "ControlListUtil.h"
#include "ConsecutivePoolAllocator.h"

#include <stdint.h>

uint32_t divRoundUp(uint32_t n, uint32_t d)
{
	return (((n) + (d) - 1) / (d));
}

//move bits to offset, mask rest to 0
uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset)
{
	return (d << offset) & (~(~0 << bits) << offset);
}

uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size)
{
	assert(cl);
	assert(cl->CPA);
	uint32_t currSize = cl->nextFreeByteOffset - cl->offset;
	if(currSize + size < cl->numBlocks * cl->blockSize - 4)
	{
		return 1; //fits!
	}
	else
	{
		return 0; //need to reallocate
	}
}

void clInit(ControlList* cl, void* CPA, uint32_t offset, uint32_t blockSize)
{
	assert(cl);
	assert(CPA);
	cl->offset = offset;
	cl->numBlocks = 1;
	cl->blockSize = blockSize;
	cl->nextFreeByteOffset = offset;
	cl->currMarkerOffset = -1;
	cl->CPA = CPA;
}

void clInsertNewCLMarker(ControlList* cl,
						 ControlList* handlesCL,
						 ControlList* shaderRecCL,
						 uint32_t shaderRecCount,
						 ControlList* uniformsCL)
{
	//to be inserted when you'd insert tile binning mode config
	assert(cl);
	assert(cl->CPA);
	assert(handlesCL);
	assert(shaderRecCL);
	assert(uniformsCL);

	CLMarker marker = {};
	marker.memGuard = 0xDDDDDDDD;
	marker.handlesBufOffset = 0;
	marker.shaderRecBufOffset = 0;
	marker.uniformsBufOffset = 0;
	marker.nextMarkerOffset = -1;

	//close current marker
	if(cl->currMarkerOffset != -1 && !((CLMarker*)getCPAptrFromOffset(cl->CPA, cl->currMarkerOffset))->size)
	{
		clCloseCurrentMarker(cl, handlesCL, shaderRecCL, shaderRecCount, uniformsCL);
	}

	//if this is not the first marker
	if(cl->currMarkerOffset != -1)
	{
		CLMarker* currMarker = getCPAptrFromOffset(cl->CPA, cl->currMarkerOffset);
		marker.handlesBufOffset = currMarker->handlesBufOffset + currMarker->handlesSize;
		marker.shaderRecBufOffset = currMarker->shaderRecBufOffset + currMarker->shaderRecSize;
		marker.uniformsBufOffset = currMarker->uniformsBufOffset + currMarker->uniformsSize;
		marker.shaderRecCount = currMarker->shaderRecCount; //initialize with previous marker's data
	}

	*(CLMarker*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = marker;
	if(cl->currMarkerOffset != -1)
	{
		((CLMarker*)getCPAptrFromOffset(cl->CPA, cl->currMarkerOffset))->nextMarkerOffset = cl->nextFreeByteOffset;
	}
	cl->currMarkerOffset = cl->nextFreeByteOffset;
	cl->nextFreeByteOffset += sizeof(CLMarker);
}

void clCloseCurrentMarker(ControlList* cl, ControlList* handlesCL, ControlList* shaderRecCL, uint32_t shaderRecCount, ControlList* uniformsCL)
{
	assert(cl);
	assert(cl->CPA);
	assert(handlesCL);
	assert(shaderRecCL);
	assert(uniformsCL);
	CLMarker* currMarker = getCPAptrFromOffset(cl->CPA, cl->currMarkerOffset);
	currMarker->size = cl->nextFreeByteOffset - (cl->currMarkerOffset + sizeof(CLMarker));
	currMarker->handlesSize = handlesCL->nextFreeByteOffset - (currMarker->handlesBufOffset + handlesCL->offset);
	currMarker->shaderRecSize = shaderRecCL->nextFreeByteOffset - (currMarker->shaderRecBufOffset + shaderRecCL->offset);
	currMarker->uniformsSize = uniformsCL->nextFreeByteOffset - (currMarker->uniformsBufOffset + uniformsCL->offset);
	currMarker->shaderRecCount = shaderRecCount - currMarker->shaderRecCount; //update shader rec count to reflect added shader recs
}

void clInsertData(ControlList* cl, uint32_t size, uint8_t* data)
{
	assert(cl);
	assert(cl->CPA);
	memcpy(getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset), data, size);
	cl->nextFreeByteOffset += size;
}

void clInsertUniformConstant(ControlList* cl, uint32_t data)
{
	assert(cl);
	assert(cl->CPA);
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = data;
	cl->nextFreeByteOffset += 4;
}

void clInsertUniformXYScale(ControlList* cl, float data)
{
	assert(cl);
	assert(cl->CPA);
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = data;
	cl->nextFreeByteOffset += 4;
}

void clInsertUniformZOffset(ControlList* cl, float data)
{
	assert(cl);
	assert(cl->CPA);
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = data;
	cl->nextFreeByteOffset += 4;
}

void clInsertHalt(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_HALT_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertNop(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_NOP_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertFlush(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_FLUSH_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertFlushAllState(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_FLUSH_ALL_STATE_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertStartTileBinning(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_START_TILE_BINNING_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertIncrementSemaphore(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_INCREMENT_SEMAPHORE_opcode;
	cl->nextFreeByteOffset++;
}

void clInsertWaitOnSemaphore(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_WAIT_ON_SEMAPHORE_opcode;
	cl->nextFreeByteOffset++;
}

//input: 2 cls (cl, handles cl)
void clInsertBranch(ControlList* cls, ControlListAddress address)
{
	assert(cls);
	assert(cls->CPA);
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = V3D21_BRANCH_opcode; cls->nextFreeByteOffset++;
	//TODO is this correct?
	//clEmitShaderRelocation(cls, &address);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = address.offset; cls->nextFreeByteOffset += 4;
}

//input: 2 cls (cl, handles cl)
void clInsertBranchToSubList(ControlList* cls, ControlListAddress address)
{
	assert(cls);
	assert(cls->CPA);
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = V3D21_BRANCH_TO_SUB_LIST_opcode; cls->nextFreeByteOffset++;
	//TODO is this correct?
	//clEmitShaderRelocation(cls, &address);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = address.offset; cls->nextFreeByteOffset += 4;
}

void clInsertReturnFromSubList(ControlList* cl)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_RETURN_FROM_SUB_LIST_opcode;
	cl->nextFreeByteOffset++;
}

/*void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl)
{
	assert(cl);
	assert(cl->buffer);
	assert(cl->nextFreeByte);
	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_opcode;
	cl->nextFreeByte++;
}*/

/*void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl)
{
	assert(cl);
	assert(cl->buffer);
	assert(cl->nextFreeByte);
	*cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF_opcode;
	cl->nextFreeByte++;
}*/

/*
//input: 2 cls (cl, handles cl)
void clInsertStoreFullResolutionTileBuffer(ControlList* cls,
										   ControlListAddress address,
										   uint32_t lastTile, //0/1
										   uint32_t disableClearOnWrite, //0/1
										   uint32_t disableZStencilBufferWrite, //0/1
										   uint32_t disableColorBufferWrite) //0/1
{
	assert(cls);
	assert(cls->buffer);
	assert(cls->nextFreeByte);
	*cls->nextFreeByte = V3D21_STORE_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
	//is this correct?
	clEmitShaderRelocation(cls, &address);
	*(uint32_t*)cls->nextFreeByte =
			moveBits(disableColorBufferWrite, 1, 0) |
			moveBits(disableZStencilBufferWrite, 1, 1) |
			moveBits(disableClearOnWrite, 1, 2) |
			moveBits(lastTile, 1, 3) |
			moveBits(address.offset, 28, 4);
	cls->nextFreeByte += 4;
}
*/

/*
//input: 2 cls (cl, handles cl)
void clInsertReLoadFullResolutionTileBuffer(ControlList* cls,
											ControlListAddress address,
											uint32_t disableZStencilBufferRead, //0/1
											uint32_t disableColorBufferRead) //0/1
{
	assert(cls);
	assert(cls->buffer);
	assert(cls->nextFreeByte);
	*cls->nextFreeByte = V3D21_RE_LOAD_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
	//is this correct?
	clEmitShaderRelocation(cls, &address);
	*(uint32_t*)cls->nextFreeByte =
			moveBits(disableColorBufferRead, 1, 0) |
			moveBits(disableZStencilBufferRead, 1, 1) |
			moveBits(address.offset, 28, 4);
	cls->nextFreeByte += 4;
}
*/

/*
//input: 2 cls (cl, handles cl)
void clInsertStoreTileBufferGeneral(ControlList* cls,
									ControlListAddress address,
									uint32_t lastTileOfFrame, //0/1
									uint32_t disableZStencilBufferDump, //0/1
									uint32_t disableColorBufferDump, //0/1
									uint32_t disableZStencilBufferClearOnStoreDump, //0/1
									uint32_t disableColorBufferClearOnStoreDump, //0/1
									uint32_t disableDoubleBufferSwap, //0/1
									uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
									uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
									uint32_t format, //0/1/2 raster/t/lt
									uint32_t bufferToStore) //0/1/2/3/5 none/color/zstencil/z/full
{
	assert(cls);
	assert(cls->buffer);
	assert(cls->nextFreeByte);
	*cls->nextFreeByte = V3D21_STORE_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
	//is this correct?
	*cls->nextFreeByte =
			moveBits(bufferToStore, 3, 0) |
			moveBits(format, 2, 4) |
			moveBits(mode, 2, 6);
	cls->nextFreeByte++;
	*cls->nextFreeByte =
			moveBits(pixelColorFormat, 2, 0) |
			moveBits(disableDoubleBufferSwap, 1, 4) |
			moveBits(disableColorBufferClearOnStoreDump, 1, 5) |
			moveBits(disableZStencilBufferClearOnStoreDump, 1, 6) |
			moveBits(1, 1, 7); //disable vg mask
	cls->nextFreeByte++;
	clEmitShaderRelocation(cls, &address);
	*(uint32_t*)cls->nextFreeByte =
			moveBits(disableColorBufferDump, 1, 0) |
			moveBits(disableZStencilBufferDump, 1, 1) |
			moveBits(1, 1, 2) | //disable vg mask
			moveBits(lastTileOfFrame, 1, 3) |
			moveBits(address.offset, 28, 4);
	cls->nextFreeByte += 4;
}
*/

/*
//input: 2 cls (cl, handles cl)
void clInsertLoadTileBufferGeneral(ControlList* cls,
								   ControlListAddress address,
								   uint32_t disableZStencilBufferLoad, //0/1
								   uint32_t disableColorBufferLoad, //0/1
								   uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
								   uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
								   uint32_t format, //0/1/2 raster/t/lt
								   uint32_t bufferToLoad) //0/1/2/3/5 none/color/zstencil/z/full
{
	assert(cls);
	assert(cls->buffer);
	assert(cls->nextFreeByte);
	*cls->nextFreeByte = V3D21_LOAD_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
	//is this correct?
	*cls->nextFreeByte =
			moveBits(bufferToLoad, 3, 0) |
			moveBits(format, 2, 4);
	cls->nextFreeByte++;
	*cls->nextFreeByte =
			moveBits(pixelColorFormat, 2, 0);
	cls->nextFreeByte++;
	clEmitShaderRelocation(cls, &address);
	*(uint32_t*)cls->nextFreeByte =
			moveBits(disableColorBufferLoad, 1, 0) |
			moveBits(disableZStencilBufferLoad, 1, 1) |
			moveBits(1, 1, 2) | //disable vg mask
			moveBits(address.offset, 28, 4);
	cls->nextFreeByte += 4;

}
*/

void clInsertIndexedPrimitiveList(ControlList* cl,
								  uint32_t maxIndex,
								  uint32_t indicesAddress,
								  uint32_t length,
								  uint32_t indexType, //0/1: 8 or 16 bit
								  enum V3D21_Primitive primitiveMode)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_INDEXED_PRIMITIVE_LIST_opcode; cl->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(indexType, 4, 4) | moveBits(primitiveMode, 4, 0); cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = length; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = indicesAddress; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = maxIndex; cl->nextFreeByteOffset += 4;
}

void clInsertVertexArrayPrimitives(ControlList* cl,
								  uint32_t firstVertexIndex,
								  uint32_t length,
								  enum V3D21_Primitive primitiveMode)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_VERTEX_ARRAY_PRIMITIVES_opcode; cl->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(primitiveMode, 8, 0); cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = length; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = firstVertexIndex; cl->nextFreeByteOffset += 4;
}

/*void clInsertPrimitiveListFormat(ControlList* cl,
								  uint32_t dataType, //1/3: 16 or 32 bit
								  uint32_t primitiveType) //0/1/2/3: point/line/tri/rhy
{
	assert(cl);
	assert(cl->buffer);
	assert(cl->nextFreeByte);
	*cl->nextFreeByte = V3D21_PRIMITIVE_LIST_FORMAT_opcode; cl->nextFreeByte++;
	*cl->nextFreeByte = moveBits(dataType, 4, 4) | moveBits(primitiveType, 4, 0); cl->nextFreeByte++;
}*/

void clInsertShaderState(ControlList* cl,
						  uint32_t address,
						  uint32_t extendedShaderRecord, //0/1: true/false
						 uint32_t numberOfAttributeArrays)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_GL_SHADER_STATE_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) =
			moveBits(address, 28, 4) |
			moveBits(extendedShaderRecord, 1, 3) |
			moveBits(numberOfAttributeArrays, 3, 0); cl->nextFreeByteOffset += 4;
}

/*
void clInsertClearColors(ControlList* cl,
						uint32_t clearStencil,
						uint32_t clearZ, //24 bit Z
						uint64_t clearColor) //2x RGBA8 or 1x RGBA16
{
	assert(cl);
	assert(cl->buffer);
	assert(cl->nextFreeByte);
	*cl->nextFreeByte = V3D21_CLEAR_COLORS_opcode; cl->nextFreeByte++;
	*(uint64_t*)cl->nextFreeByte = clearColor; cl->nextFreeByte += 8;
	*(uint32_t*)cl->nextFreeByte = clearZ; cl->nextFreeByte += 4; //24 bits for Z, 8 bit for vg mask (unused)
	*cl->nextFreeByte = clearStencil; cl->nextFreeByte++;
}
*/

void clInsertConfigurationBits(ControlList* cl,
						uint32_t earlyZUpdatesEnable, //0/1
						uint32_t earlyZEnable, //0/1
						uint32_t zUpdatesEnable, //0/1
						enum V3D21_Compare_Function depthTestFunction,
						uint32_t coverageReadMode, //0/1 clear/leave as is
						uint32_t coveragePipeSelect, //0/1
						uint32_t coverageUpdateMode, //0/1/2/3 nonzero, odd, or, zero
						uint32_t coverageReadType, //0/1 4*8bit, 16 bit mask
						uint32_t rasterizerOversampleMode, //0/1/2 none, 4x, 16x
						uint32_t enableDepthOffset, //0/1
						uint32_t clockwisePrimitives, //0/1
						uint32_t enableReverseFacingPrimitive, //0/1
						uint32_t enableForwardFacingPrimitive) //0/1
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_CONFIGURATION_BITS_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) =
			moveBits(enableForwardFacingPrimitive, 1, 0) |
			moveBits(enableReverseFacingPrimitive, 1, 1) |
			moveBits(clockwisePrimitives, 1, 2) |
			moveBits(enableDepthOffset, 1, 3) |
			moveBits(coverageReadType, 1, 5) |
			moveBits(rasterizerOversampleMode, 2, 6) |
			moveBits(coveragePipeSelect, 1, 8) |
			moveBits(coverageUpdateMode, 2, 9) |
			moveBits(coverageReadMode, 1, 11) |
			moveBits(depthTestFunction, 3, 12) |
			moveBits(zUpdatesEnable, 1, 15) |
			moveBits(earlyZEnable, 1, 16) |
			moveBits(earlyZUpdatesEnable, 1, 17); cl->nextFreeByteOffset += 3;
}

void clInsertFlatShadeFlags(ControlList* cl,
						uint32_t flags)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_FLAT_SHADE_FLAGS_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = flags; cl->nextFreeByteOffset += 4;
}

void clInsertPointSize(ControlList* cl,
						float size)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_POINT_SIZE_opcode; cl->nextFreeByteOffset++;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = size; cl->nextFreeByteOffset += 4;
}

void clInsertLineWidth(ControlList* cl,
						float width)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_LINE_WIDTH_opcode; cl->nextFreeByteOffset++;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = width; cl->nextFreeByteOffset += 4;
}

void clInsertRHTXBoundary(ControlList* cl,
						uint32_t boundary) //sint16
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_RHT_X_BOUNDARY_opcode; cl->nextFreeByteOffset++;
	*(uint16_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(boundary, 16, 0); cl->nextFreeByteOffset += 2;
}

uint32_t f32_to_f187(float f32)
{
   uint32_t bits = *(uint32_t*)&f32;
   return bits >> 16;
}

void clInsertDepthOffset(ControlList* cl,
						float units,
						 float factor)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_DEPTH_OFFSET_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(f32_to_f187(factor), 16, 0) | moveBits(f32_to_f187(units), 16, 16); cl->nextFreeByteOffset += 4;
}

void clInsertClipWindow(ControlList* cl,
						uint32_t width, //uint16
						uint32_t height, //uint16
						uint32_t bottomPixelCoord, //uint16
						uint32_t leftPixelCoord)  //uint16
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_CLIP_WINDOW_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(leftPixelCoord, 16, 0) | moveBits(bottomPixelCoord, 16, 16); cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = moveBits(width, 16, 0) | moveBits(height, 16, 16); cl->nextFreeByteOffset += 4;
}

uint16_t get16bitSignedFixedNumber(float x)
{
	int32_t integerPart = roundf(x * 16.0f);
	return integerPart & 0xffff;
}

//viewport centre x/y coordinate
void clInsertViewPortOffset(ControlList* cl,
						float x,
						float y
						)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_VIEWPORT_OFFSET_opcode; cl->nextFreeByteOffset++;
	//expects 16 bit signed fixed point number with 4 fractional bits
	*(uint16_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = get16bitSignedFixedNumber(x); cl->nextFreeByteOffset += 2;
	*(uint16_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = get16bitSignedFixedNumber(y); cl->nextFreeByteOffset += 2;
}

void clInsertZMinMaxClippingPlanes(ControlList* cl,
						float minZw,
						float maxZw
						)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_Z_MIN_AND_MAX_CLIPPING_PLANES_opcode; cl->nextFreeByteOffset++;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = minZw; cl->nextFreeByteOffset += 4;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = maxZw; cl->nextFreeByteOffset += 4;
}

void clInsertClipperXYScaling(ControlList* cl,
						float width, //half width in 1/16 of pixel
						float height //half height in 1/16 of pixel
						)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_CLIPPER_XY_SCALING_opcode; cl->nextFreeByteOffset++;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = width; cl->nextFreeByteOffset += 4;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = height; cl->nextFreeByteOffset += 4;
}

void clInsertClipperZScaleOffset(ControlList* cl,
						float zOffset, //zc to zs
						float zScale //zc to zs
						)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_CLIPPER_Z_SCALE_AND_OFFSET_opcode; cl->nextFreeByteOffset++;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = zScale; cl->nextFreeByteOffset += 4;
	*(float*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = zOffset; cl->nextFreeByteOffset += 4;
}

void clInsertTileBinningModeConfiguration(ControlList* cl,
						uint32_t doubleBufferInNonMsMode, //0/1
						uint32_t tileAllocationBlockSize, //0/1/2/3 32/64/128/256 bytes
						uint32_t tileAllocationInitialBlockSize, //0/1/2/3 32/64/128/256 bytes
						uint32_t autoInitializeTileStateDataArray, //0/1
						uint32_t tileBuffer64BitColorDepth, //0/1
						uint32_t multisampleMode4x, //0/1
						uint32_t widthInPixels,
						uint32_t heightInPixels,
						uint32_t tileStateDataArrayAddress, //16 byte aligned, size of 48 bytes * num tiles
						uint32_t tileAllocationMemorySize,
						uint32_t tileAllocationMemoryAddress
						)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_TILE_BINNING_MODE_CONFIGURATION_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = tileAllocationMemoryAddress; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = tileAllocationMemorySize; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = tileStateDataArrayAddress; cl->nextFreeByteOffset += 4;
	uint32_t tileSizeW = 64;
	uint32_t tileSizeH = 64;

	if(multisampleMode4x)
	{
		tileSizeW >>= 1;
		tileSizeH >>= 1;
	}

	if(tileBuffer64BitColorDepth)
	{
		tileSizeH >>= 1;
	}

	uint32_t widthInTiles = divRoundUp(widthInPixels, tileSizeW);
	uint32_t heightInTiles = divRoundUp(heightInPixels, tileSizeH);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = widthInTiles; cl->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = heightInTiles; cl->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) =
			moveBits(multisampleMode4x, 1, 0) |
			moveBits(tileBuffer64BitColorDepth, 1, 1) |
			moveBits(autoInitializeTileStateDataArray, 1, 2) |
			moveBits(tileAllocationInitialBlockSize, 2, 3) |
			moveBits(tileAllocationBlockSize, 2, 5) |
			moveBits(doubleBufferInNonMsMode, 1, 7); cl->nextFreeByteOffset++;
}

/*
void clInsertTileRenderingModeConfiguration(ControlList* cls,
						ControlListAddress address,
						uint32_t doubleBufferInNonMsMode, //0/1
						uint32_t earlyZEarlyCovDisable, //0/1
						uint32_t earlyZUpdateDirection, //0/1 lt,le/gt,ge
						uint32_t selectCoverageMode, //0/1
						uint32_t memoryFormat, //0/1/2 linear/t/lt
						uint32_t decimateMode, //0/1/2 0x/4x/16x
						uint32_t nonHDRFrameFormatColorFormat, //0/1/2 bgr565dithered/rgba8/bgr565nodither
						uint32_t tileBufferHDRMode, //0/1
						uint32_t multisampleMode4x, //0/1
						uint32_t widthPixels,
						uint32_t heightPixels)
{
	assert(cls);
	assert(cls->buffer);
	assert(cls->nextFreeByte);
	*cls->nextFreeByte = V3D21_TILE_RENDERING_MODE_CONFIGURATION_opcode; cls->nextFreeByte++;
	//is this correct?
	clEmitShaderRelocation(cls, &address);
	*(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
	*(uint32_t*)cls->nextFreeByte = moveBits(widthPixels, 16, 0) | moveBits(heightPixels, 16, 16); cls->nextFreeByte += 4;
	*(uint16_t*)cls->nextFreeByte =
			moveBits(multisampleMode4x, 1, 0) |
			moveBits(tileBufferHDRMode, 1, 1) |
			moveBits(nonHDRFrameFormatColorFormat, 2, 2) |
			moveBits(decimateMode, 2, 4) |
			moveBits(memoryFormat, 2, 6) |
			moveBits(0, 1, 8) | //vg buffer enable
			moveBits(selectCoverageMode, 1, 9) |
			moveBits(earlyZUpdateDirection, 1, 10) |
			moveBits(earlyZEarlyCovDisable, 1, 11) |
			moveBits(doubleBufferInNonMsMode, 1, 12); cls->nextFreeByte += 2;
}
*/

/*
void clInsertTileCoordinates(ControlList* cl,
						uint32_t tileColumnNumber, //int8
						uint32_t tileRowNumber) //int8
{
	assert(cl);
	assert(cl->buffer);
	assert(cl->nextFreeByte);
	*cl->nextFreeByte = V3D21_TILE_COORDINATES_opcode; cl->nextFreeByte++;
	*(uint16_t*)cl->nextFreeByte = moveBits(tileColumnNumber, 8, 0) | moveBits(tileRowNumber, 8, 8); cl->nextFreeByte += 2;
}
*/

void clInsertGEMRelocations(ControlList* cl,
							uint32_t buffer0,
							uint32_t buffer1)
{
	assert(cl);
	assert(cl->CPA);
	*(uint8_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = V3D21_GEM_RELOCATIONS_opcode; cl->nextFreeByteOffset++;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = buffer0; cl->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cl->CPA, cl->nextFreeByteOffset) = buffer1; cl->nextFreeByteOffset += 4;
}

//input: 2 cls (cl, handles cl)
void clInsertShaderRecord(ControlList* cls,
						  ControlList* relocCl,
						  ControlList* handlesCl,
						  uint32_t handlesOffset,
						  uint32_t handlesSize,
						  uint32_t fragmentShaderIsSingleThreaded, //0/1
						  uint32_t pointSizeIncludedInShadedVertexData, //0/1
						  uint32_t enableClipping, //0/1
						  uint32_t fragmentNumberOfUsedUniforms,
						  uint32_t fragmentNumberOfVaryings,
						  uint32_t fragmentUniformsAddress,
						  ControlListAddress fragmentCodeAddress,
						  uint32_t vertexNumberOfUsedUniforms,
						  uint32_t vertexAttributeArraySelectBits,
						  uint32_t vertexTotalAttributesSize,
						  uint32_t vertexUniformsAddress,
						  ControlListAddress vertexCodeAddress,
						  uint32_t coordinateNumberOfUsedUniforms,
						  uint32_t coordinateAttributeArraySelectBits,
						  uint32_t coordinateTotalAttributesSize,
						  uint32_t coordinateUniformsAddress,
						  ControlListAddress coordinateCodeAddress)
{
	assert(cls);
	assert(cls->CPA);
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) =
			moveBits(fragmentShaderIsSingleThreaded, 1, 0) |
			moveBits(pointSizeIncludedInShadedVertexData, 1, 1) |
			moveBits(enableClipping, 1, 2); cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = 0; cls->nextFreeByteOffset++;
	*(uint16_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = moveBits(fragmentNumberOfUsedUniforms, 16, 0); cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) |= fragmentNumberOfVaryings; cls->nextFreeByteOffset++;
	clEmitShaderRelocation(relocCl, handlesCl, handlesOffset, handlesSize, &fragmentCodeAddress);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = fragmentCodeAddress.offset; cls->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = fragmentUniformsAddress; cls->nextFreeByteOffset += 4;

	*(uint16_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = moveBits(vertexNumberOfUsedUniforms, 16, 0); cls->nextFreeByteOffset += 2;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = vertexAttributeArraySelectBits; cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = vertexTotalAttributesSize; cls->nextFreeByteOffset++;
	clEmitShaderRelocation(relocCl, handlesCl, handlesOffset, handlesSize, &vertexCodeAddress);
	//wtf??? --> shader code will always have an offset of 0 so this is fine
	uint32_t offset = moveBits(vertexCodeAddress.offset, 32, 0) | moveBits(vertexUniformsAddress, 32, 0);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = offset; cls->nextFreeByteOffset += 4;
	cls->nextFreeByteOffset += 4;

	*(uint16_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = moveBits(coordinateNumberOfUsedUniforms, 16, 0); cls->nextFreeByteOffset += 2;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = coordinateAttributeArraySelectBits; cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = coordinateTotalAttributesSize; cls->nextFreeByteOffset++;
	clEmitShaderRelocation(relocCl, handlesCl, handlesOffset, handlesSize, &coordinateCodeAddress);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = coordinateCodeAddress.offset; cls->nextFreeByteOffset += 4;
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = coordinateUniformsAddress; cls->nextFreeByteOffset += 4;
}

//input: 2 cls (cl, handles cl)
void clInsertAttributeRecord(ControlList* cls,
							 ControlList* relocCl,
							 ControlList* handlesCl,
							 uint32_t handlesOffset, uint32_t handlesSize,
						  ControlListAddress address,
						  uint32_t sizeBytes,
						  uint32_t stride,
						  uint32_t vertexVPMOffset,
						  uint32_t coordinateVPMOffset)
{
	assert(cls);
	assert(cls->CPA);
	uint32_t sizeBytesMinusOne = sizeBytes - 1;
	clEmitShaderRelocation(relocCl, handlesCl, handlesOffset, handlesSize, &address);
	*(uint32_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = address.offset; cls->nextFreeByteOffset += 4;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = sizeBytesMinusOne; cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = stride; cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = vertexVPMOffset; cls->nextFreeByteOffset++;
	*(uint8_t*)getCPAptrFromOffset(cls->CPA, cls->nextFreeByteOffset) = coordinateVPMOffset; cls->nextFreeByteOffset++;
}

uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handlesOffset, uint32_t handlesSize, uint32_t handle)
{
	uint32_t c = 0;

	//if curr marker is closed already we need to work with the stored size
	uint32_t numHandles = (handlesSize ? handlesSize : (handlesCl->nextFreeByteOffset - handlesOffset)) / 4;

	for(; c < numHandles; ++c)
	{
		if(((uint32_t*)getCPAptrFromOffset(handlesCl->CPA, handlesOffset))[c] == handle)
		{
			//found
			return c;
		}
	}

	//write handle to handles cl
	*(uint32_t*)getCPAptrFromOffset(handlesCl->CPA, handlesCl->nextFreeByteOffset) = handle;
	handlesCl->nextFreeByteOffset += 4;

	assert(handlesCl->nextFreeByteOffset < handlesCl->offset + handlesCl->blockSize * handlesCl->numBlocks);

	return c;
}

//input: 2 cls (cl + handles cl)
inline void clEmitShaderRelocation(ControlList* relocCl, ControlList* handlesCl, uint32_t handlesOffset, uint32_t handlesSize, const ControlListAddress* address)
{
	assert(relocCl);
	assert(relocCl->CPA);
	assert(handlesCl);
	assert(handlesCl->CPA);
	assert(address);
	assert(address->handle);

	//store offset within handles in cl
	*(uint32_t*)getCPAptrFromOffset(relocCl->CPA, relocCl->nextFreeByteOffset) = clGetHandleIndex(handlesCl, handlesOffset, handlesSize, address->handle);
	relocCl->nextFreeByteOffset += 4;
}

inline void clDummyRelocation(ControlList* relocCl, const ControlListAddress* address)
{}