diff --git a/driver/AlignedAllocator.c b/driver/AlignedAllocator.c
new file mode 100644
index 0000000..7e7c6e8
--- /dev/null
+++ b/driver/AlignedAllocator.c
@@ -0,0 +1,35 @@
+#include "AlignedAllocator.h"
+
+//allocates bytes bytes of memory aligned to alignment using posix_memalign
+//returns 0 if bytes is 0 or more than 1GB, or if posix_memalign fails
+//memory is released with alignedFree
+void* alignedAlloc( unsigned bytes, unsigned alignment )
+{
+    if( !bytes )
+    {
+        return 0;
+    }
+
+    const unsigned maxBytes = 1024 * 1024 * 1024; //1GB is max on RPi
+
+    if( bytes > maxBytes )
+    {
+        return 0; //bad alloc
+    }
+
+    void* pv = 0;
+
+    if( posix_memalign( &pv, alignment, bytes ) )
+    {
+        pv = 0; //allocation failed
+    }
+
+    return pv;
+}
+
+//releases memory that was returned by alignedAlloc
+void alignedFree( void* p )
+{
+    free( p );
+}
+
diff --git a/driver/AlignedAllocator.h b/driver/AlignedAllocator.h
index 8f9471c..dc71b76 100644
--- a/driver/AlignedAllocator.h
+++ b/driver/AlignedAllocator.h
@@ -6,34 +6,8 @@ extern "C" {
 
 #include <stdlib.h>
 
-void* alignedAlloc( unsigned bytes, unsigned alignment )
-{
-    if( !bytes )
-    {
-        return 0;
-    }
-
-    const unsigned maxBytes = 1024 * 1024 * 1024; //1GB is max on RPi
-
-    if( bytes > maxBytes )
-    {
-        return 0; //bad alloc
-    }
-
-    void* pv = 0;
-
-    if( posix_memalign( &pv, alignment, bytes ) )
-    {
-        pv = 0; //allocation failed
-    }
-
-    return pv;
-}
-
-void alignedFree( void* p )
-{
-    free( p );
-}
+void* alignedAlloc( unsigned bytes, unsigned alignment );
+void alignedFree( void* p );
 
 #if defined (__cplusplus)
 }
diff --git a/driver/ConsecutivePoolAllocator.c b/driver/ConsecutivePoolAllocator.c
new file mode 100644
index 0000000..ef1ab75
--- /dev/null
+++ b/driver/ConsecutivePoolAllocator.c
@@ -0,0 +1,181 @@
+#include "ConsecutivePoolAllocator.h"
+
+#include "CustomAssert.h"
+
+#include <stdint.h>
+#include <string.h>
+
+//builds a pool allocator over the caller owned buffer b of s bytes, split into blocks of bs bytes
+//free blocks form a singly linked list: the first 4 bytes of each free block hold the
+//32 bit address of the next free block, 0 terminates the list
+ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s)
+{
+    assert(b); //only allocated memory
+    assert(bs >= sizeof(void*)); //we need to be able to store a next pointer in each free block
+    assert(bs % sizeof(uint32_t) == 0); //the list is initialized in uint32_t steps (bs/4)
+    assert(s%bs==0); //we want a size that is the exact multiple of block size
+    assert(s >= bs); //at least 1 element
+
+    ConsecutivePoolAllocator pa =
+    {
+        .buf = b,
+        .nextFreeBlock = (uint32_t*)b,
+        .blockSize = bs,
+        .size = s
+    };
+
+    //initialize linked list of free pointers
+    uint32_t* ptr = pa.nextFreeBlock;
+    unsigned last = s/bs - 1;
+    for(unsigned c = 0; c < last; ++c)
+    {
+        *ptr = (uint32_t)ptr + bs;
+        ptr += bs/4;
+    }
+
+    *ptr = 0; //last element
+
+    return pa;
+}
+
+//resets the allocator's bookkeeping, the underlying buffer is not touched
+void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa)
+{
+    //actual memory freeing is done by caller
+    pa->buf = 0;
+    pa->nextFreeBlock = 0;
+    pa->blockSize = 0;
+    pa->size = 0;
+}
+
+//allocate numBlocks consecutive memory
+//returns 0 if numBlocks is 0 or if there is no run of numBlocks adjacent free blocks
+void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks)
+{
+    assert(pa->buf);
+
+    if(!pa->nextFreeBlock || !numBlocks)
+    {
+        return 0; //no free blocks, or nothing was requested
+    }
+
+    void* ret = 0;
+    for(uint32_t* candidate = pa->nextFreeBlock; candidate; candidate = (uint32_t*)*candidate)
+    {
+        uint32_t found = 1;
+        uint32_t* prevBlock = candidate;
+        uint32_t* blockAfterCandidate = (uint32_t*)*candidate;
+        //check if there are enough consecutive free blocks
+        for(uint32_t c = 0; c < numBlocks - 1; ++c)
+        {
+            //blockSize is in bytes, so compare byte addresses (uint32_t* arithmetic counts in 4 byte units)
+            if((char*)blockAfterCandidate != (char*)prevBlock + pa->blockSize)
+            {
+                //signal if not consecutive (ie. the next free block is not directly adjacent, or the list ended)
+                found = 0;
+                break;
+            }
+            prevBlock = blockAfterCandidate;
+            blockAfterCandidate = (uint32_t*)*blockAfterCandidate;
+        }
+
+        //numblocks consecutive blocks found
+        if(found)
+        {
+            ret = candidate;
+            if(pa->nextFreeBlock == candidate)
+            {
+                //candidate found immediately
+                pa->nextFreeBlock = blockAfterCandidate;
+            }
+            else
+            {
+                //somewhere the linked list would point to candidate, we need to correct this
+                for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
+                {
+                    if((uint32_t*)*nextFreeBlockCandidate == candidate)
+                    {
+                        *nextFreeBlockCandidate = (uint32_t)blockAfterCandidate;
+                        break;
+                    }
+                }
+            }
+            break;
+        }
+    }
+
+    return ret;
+}
+
+//writes the free list links into numBlocks blocks starting at p: each block points to the
+//one after it and the last block points to next (numBlocks must be at least 1)
+static void linkFreedBlocks(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks, uint32_t next)
+{
+    for(uint32_t c = 0; c < numBlocks - 1; ++c)
+    {
+        //set each allocated block to form a linked list
+        *(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
+    }
+    //set last block to point to the next free
+    *(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = next;
+}
+
+//free numBlocks consecutive memory
+void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks)
+{
+    assert(pa->buf);
+    assert(p);
+
+    if(!numBlocks)
+    {
+        return; //nothing to free, numBlocks - 1 would wrap around
+    }
+
+    //the freed run becomes the new head if it lies before the current head, or if the
+    //list is empty (pool was fully allocated), otherwise the blocks would never be reusable
+    if(!pa->nextFreeBlock || (void*)pa->nextFreeBlock > p)
+    {
+        linkFreedBlocks(pa, p, numBlocks, (uint32_t)pa->nextFreeBlock);
+        //set next free to the newly freed block
+        pa->nextFreeBlock = p;
+        return;
+    }
+
+    //somewhere the linked list may point after the free block (or null), we need to correct this
+    for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
+    {
+        if((void*)*nextFreeBlockCandidate > p || !*nextFreeBlockCandidate)
+        {
+            linkFreedBlocks(pa, p, numBlocks, *nextFreeBlockCandidate);
+
+            *nextFreeBlockCandidate = (uint32_t)p;
+            break;
+        }
+    }
+}
+
+//if there's a block free after the current block, it just allocates one more block
+//else it allocates currNumBlocks + 1 new blocks, copies the old contents and frees the old blocks
+//returns 0 if that allocation fails, currentMem then stays allocated and untouched
+void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks)
+{
+    if(pa->nextFreeBlock == (uint32_t*)((char*)currentMem + currNumBlocks * pa->blockSize))
+    {
+        //we have one more block after current one, so just expand current
+        pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock;
+        return currentMem;
+    }
+    else
+    {
+        void* ret = consecutivePoolAllocate(pa, currNumBlocks + 1);
+        if(!ret)
+        {
+            return 0; //keep the old allocation valid, same contract as realloc
+        }
+
+        //copy before freeing, consecutivePoolFree overwrites the start of each block with list links
+        memcpy(ret, currentMem, currNumBlocks * pa->blockSize);
+        consecutivePoolFree(pa, currentMem, currNumBlocks);
+        return ret;
+    }
+}
diff --git a/driver/ConsecutivePoolAllocator.h b/driver/ConsecutivePoolAllocator.h
index a90f751..112d57d 100644
--- a/driver/ConsecutivePoolAllocator.h
+++ b/driver/ConsecutivePoolAllocator.h
@@ -16,157 +16,11 @@ typedef struct ConsecutivePoolAllocator
     unsigned size; //size is exact multiple of block size
 } ConsecutivePoolAllocator;
 
-ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s)
-{
-    assert(b); //only allocated memory
-    assert(bs >= sizeof(void*)); //we need to be able to store
-    assert(s%bs==0); //we want a size that is the exact multiple of block size
-    assert(s > bs); //at least 1 element
-
-    ConsecutivePoolAllocator pa =
-    {
-        .buf = b,
-        .nextFreeBlock = (uint32_t*)b,
-        .blockSize = bs,
-        .size = s
-    };
-
-    //initialize linked list of free pointers
-    uint32_t* ptr = pa.nextFreeBlock;
-    unsigned last = s/bs - 1;
-    for(unsigned c = 0; c < last; ++c)
-    {
-        *ptr = (uint32_t)ptr + bs;
-        ptr += bs/4;
-    }
-
-    *ptr = 0; //last element
-
-    return pa;
-}
-
-void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa)
-{
-    //actual memory freeing is done by caller
-    pa->buf = 0;
-    pa->nextFreeBlock = 0;
-    pa->blockSize = 0;
-    pa->size = 0;
-}
-
-//allocate numBlocks consecutive memory
-void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks)
-{
-    assert(pa->buf);
-
-    if(!pa->nextFreeBlock)
-    {
-        return 0; //no free blocks
-    }
-
-    void* ret = 0;
-    for(uint32_t* candidate = pa->nextFreeBlock; candidate; candidate = (uint32_t*)*candidate)
-    {
-        uint32_t found = 1;
-        uint32_t* prevBlock = candidate;
-        uint32_t* blockAfterCandidate = (uint32_t*)*candidate;
-        //check if there are enough consecutive free blocks
-        for(uint32_t c = 0; c < numBlocks - 1; ++c)
-        {
-            if(blockAfterCandidate - prevBlock != pa->blockSize)
-            {
-                //signal if not consecutive (ie. diff is greater than blocksize)
-                found = 0;
-                break;
-            }
-            prevBlock = blockAfterCandidate;
-            blockAfterCandidate = (uint32_t*)*blockAfterCandidate;
-        }
-
-        //numblocks consecutive blocks found
-        if(found)
-        {
-            ret = candidate;
-            if(pa->nextFreeBlock == candidate)
-            {
-                //candidate found immediately
-                pa->nextFreeBlock = blockAfterCandidate;
-            }
-            else
-            {
-                //somewhere the linked list would point to candidate, we need to correct this
-                for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
-                {
-                    if((uint32_t*)*nextFreeBlockCandidate == candidate)
-                    {
-                        *nextFreeBlockCandidate = (uint32_t)blockAfterCandidate;
-                        break;
-                    }
-                }
-            }
-            break;
-        }
-    }
-
-    return ret;
-}
-
-//free numBlocks consecutive memory
-void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks)
-{
-    assert(pa->buf);
-    assert(p);
-
-    if((void*)pa->nextFreeBlock > p)
-    {
-        for(uint32_t c = 0; c < numBlocks - 1; ++c)
-        {
-            //set each allocated block to form a linked list
-            *(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
-        }
-        //set last block to point to the next free
-        *(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = (uint32_t)pa->nextFreeBlock;
-        //set next free to the newly freed block
-        pa->nextFreeBlock = p;
-        return;
-    }
-
-    //somewhere the linked list may point after the free block (or null), we need to correct this
-    for(uint32_t* nextFreeBlockCandidate = pa->nextFreeBlock; nextFreeBlockCandidate; nextFreeBlockCandidate = (uint32_t*)*nextFreeBlockCandidate)
-    {
-        if((void*)*nextFreeBlockCandidate > p || !*nextFreeBlockCandidate)
-        {
-            for(uint32_t c = 0; c < numBlocks - 1; ++c)
-            {
-                //set each allocated block to form a linked list
-                *(uint32_t*)((char*)p + c * pa->blockSize) = (uint32_t)((char*)p + (c + 1) * pa->blockSize);
-            }
-            //set last block to point to the next free
-            *(uint32_t*)((char*)p + (numBlocks - 1) * pa->blockSize) = *nextFreeBlockCandidate;
-
-            *nextFreeBlockCandidate = (uint32_t)p;
-            break;
-        }
-    }
-}
-
-//if there's a block free after the current block, it just allocates one more block
-//else it frees current block and allocates a new one
-void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks)
-{
-    if(pa->nextFreeBlock == (uint32_t*)((char*)currentMem + currNumBlocks * pa->blockSize))
-    {
-        //we have one more block after current one, so just expand current
-        pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock;
-        return currentMem;
-    }
-    else
-    {
-        void* ret = consecutivePoolAllocate(pa, currNumBlocks + 1);
-        consecutivePoolFree(pa, currentMem, currNumBlocks);
-        return ret;
-    }
-}
+ConsecutivePoolAllocator createConsecutivePoolAllocator(char* b, unsigned bs, unsigned s);
+void destroyConsecutivePoolAllocator(ConsecutivePoolAllocator* pa);
+void* consecutivePoolAllocate(ConsecutivePoolAllocator* pa, uint32_t numBlocks);
+void consecutivePoolFree(ConsecutivePoolAllocator* pa, void* p, uint32_t numBlocks);
+void* consecutivePoolReAllocate(ConsecutivePoolAllocator* pa, void* currentMem, uint32_t currNumBlocks);
 
 #if defined (__cplusplus)
 }
diff --git a/driver/ControlListUtil.c b/driver/ControlListUtil.c
new file mode 100644
index 0000000..0439d1c
--- /dev/null
+++ b/driver/ControlListUtil.c
@@ -0,0 +1,715 @@
+#include "ControlListUtil.h"
+
+#include <stdint.h>
+
+//integer division of n by d, rounded up (d must not be 0)
+uint32_t divRoundUp(uint32_t n, uint32_t d)
+{
+    return (((n) + (d) - 1) / (d));
+}
+
+//move the low "bits" bits of d to bit position offset, mask rest to 0
+//the mask is built unsigned and bits >= 32 keeps all of d, shifting by the full width is undefined
+uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset)
+{
+    const uint32_t mask = bits >= 32 ? 0xFFFFFFFFu : ((1u << bits) - 1u);
+    return (d & mask) << offset;
+}
+
+//number of bytes between the start of the buffer and the next free byte
+uint32_t clSize(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    return cl->nextFreeByte - cl->buffer;
+}
+
+//returns 1 if size more bytes still fit into the control list, 0 if it needs to be reallocated
+uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size)
+{
+    uint32_t currSize = clSize(cl);
+    return currSize + size < CONTROL_LIST_SIZE;
+}
+
+//starts an empty control list in buffer: one block, next free byte at the start of buffer
+void clInit(ControlList* cl, void* buffer)
+{
+    assert(cl);
+    assert(buffer);
+    cl->buffer = buffer;
+    cl->numBlocks = 1;
+    cl->nextFreeByte = &cl->buffer[0];
+}
+
+//the clInsert* functions write at nextFreeByte and advance it, they do not check for free space
+void clInsertHalt(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_HALT_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertNop(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_NOP_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertFlush(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_FLUSH_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertFlushAllState(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_FLUSH_ALL_STATE_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertStartTileBinning(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_START_TILE_BINNING_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertIncrementSemaphore(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_INCREMENT_SEMAPHORE_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertWaitOnSemaphore(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_WAIT_ON_SEMAPHORE_opcode;
+    cl->nextFreeByte++;
+}
+
+//input: 2 cls (cl, handles cl)
+void clInsertBranch(ControlList* cls, ControlListAddress address)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_BRANCH_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+}
+
+//input: 2 cls (cl, handles cl)
+void clInsertBranchToSubList(ControlList* cls, ControlListAddress address)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_BRANCH_TO_SUB_LIST_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+}
+
+void clInsertReturnFromSubList(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_RETURN_FROM_SUB_LIST_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_opcode;
+    cl->nextFreeByte++;
+}
+
+void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF_opcode;
+    cl->nextFreeByte++;
+}
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertStoreFullResolutionTileBuffer(ControlList* cls,
+        ControlListAddress address,
+        uint32_t lastTile, //0/1
+        uint32_t disableClearOnWrite, //0/1
+        uint32_t disableZStencilBufferWrite, //0/1
+        uint32_t disableColorBufferWrite) //0/1
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_STORE_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte =
+            moveBits(disableColorBufferWrite, 1, 0) |
+            moveBits(disableZStencilBufferWrite, 1, 1) |
+            moveBits(disableClearOnWrite, 1, 2) |
+            moveBits(lastTile, 1, 3) |
+            moveBits(address.offset, 28, 4);
+    cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertReLoadFullResolutionTileBuffer(ControlList* cls,
+        ControlListAddress address,
+        uint32_t disableZStencilBufferRead, //0/1
+        uint32_t disableColorBufferRead) //0/1
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_RE_LOAD_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte =
+            moveBits(disableColorBufferRead, 1, 0) |
+            moveBits(disableZStencilBufferRead, 1, 1) |
+            moveBits(address.offset, 28, 4);
+    cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertStoreTileBufferGeneral(ControlList* cls,
+        ControlListAddress address,
+        uint32_t lastTileOfFrame, //0/1
+        uint32_t disableZStencilBufferDump, //0/1
+        uint32_t disableColorBufferDump, //0/1
+        uint32_t disableZStencilBufferClearOnStoreDump, //0/1
+        uint32_t disableColorBufferClearOnStoreDump, //0/1
+        uint32_t disableDoubleBufferSwap, //0/1
+        uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
+        uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
+        uint32_t format, //0/1/2 raster/t/lt
+        uint32_t bufferToStore) //0/1/2/3/5 none/color/zstencil/z/full
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_STORE_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    *cls->nextFreeByte =
+            moveBits(bufferToStore, 3, 0) |
+            moveBits(format, 2, 4) |
+            moveBits(mode, 2, 6);
+    cls->nextFreeByte++;
+    *cls->nextFreeByte =
+            moveBits(pixelColorFormat, 2, 0) |
+            moveBits(disableDoubleBufferSwap, 1, 4) |
+            moveBits(disableColorBufferClearOnStoreDump, 1, 5) |
+            moveBits(disableZStencilBufferClearOnStoreDump, 1, 6) |
+            moveBits(1, 1, 7); //disable vg mask
+    cls->nextFreeByte++;
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte =
+            moveBits(disableColorBufferDump, 1, 0) |
+            moveBits(disableZStencilBufferDump, 1, 1) |
+            moveBits(1, 1, 2) | //disable vg mask
+            moveBits(lastTileOfFrame, 1, 3) |
+            moveBits(address.offset, 28, 4);
+    cls->nextFreeByte += 4;
+}
+*/
+
+/*
+//input: 2 cls (cl, handles cl)
+void clInsertLoadTileBufferGeneral(ControlList* cls,
+        ControlListAddress address,
+        uint32_t disableZStencilBufferLoad, //0/1
+        uint32_t disableColorBufferLoad, //0/1
+        uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither
+        uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x
+        uint32_t format, //0/1/2 raster/t/lt
+        uint32_t bufferToLoad) //0/1/2/3/5 none/color/zstencil/z/full
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_LOAD_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    *cls->nextFreeByte =
+            moveBits(bufferToLoad, 3, 0) |
+            moveBits(format, 2, 4);
+    cls->nextFreeByte++;
+    *cls->nextFreeByte =
+            moveBits(pixelColorFormat, 2, 0);
+    cls->nextFreeByte++;
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte =
+            moveBits(disableColorBufferLoad, 1, 0) |
+            moveBits(disableZStencilBufferLoad, 1, 1) |
+            moveBits(1, 1, 2) | //disable vg mask
+            moveBits(address.offset, 28, 4);
+    cls->nextFreeByte += 4;
+
+}
+*/
+
+void clInsertIndexedPrimitiveList(ControlList* cl,
+        uint32_t maxIndex,
+        uint32_t indicesAddress,
+        uint32_t length,
+        uint32_t indexType, //0/1: 8 or 16 bit
+        enum V3D21_Primitive primitiveMode)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_INDEXED_PRIMITIVE_LIST_opcode; cl->nextFreeByte++;
+    *cl->nextFreeByte = moveBits(indexType, 4, 4) | moveBits(primitiveMode, 4, 0); cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = indicesAddress; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = maxIndex; cl->nextFreeByte += 4;
+}
+
+void clInsertVertexArrayPrimitives(ControlList* cl,
+        uint32_t firstVertexIndex,
+        uint32_t length,
+        enum V3D21_Primitive primitiveMode)
+{
+    assert(cl);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_VERTEX_ARRAY_PRIMITIVES_opcode; cl->nextFreeByte++;
+    *cl->nextFreeByte = moveBits(primitiveMode, 8, 0); cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = firstVertexIndex; cl->nextFreeByte += 4;
+}
+
+void clInsertPrimitiveListFormat(ControlList* cl,
+        uint32_t dataType, //1/3: 16 or 32 bit
+        uint32_t primitiveType) //0/1/2/3: point/line/tri/rhy
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_PRIMITIVE_LIST_FORMAT_opcode; cl->nextFreeByte++;
+    *cl->nextFreeByte = moveBits(dataType, 4, 4) | moveBits(primitiveType, 4, 0); cl->nextFreeByte++;
+}
+
+void clInsertShaderState(ControlList* cl,
+        uint32_t address,
+        uint32_t extendedShaderRecord, //0/1: true/false
+        uint32_t numberOfAttributeArrays)
+{
+    assert(cl);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_GL_SHADER_STATE_opcode; cl->nextFreeByte++;
+    //TODO is this correct?
+    *(uint32_t*)cl->nextFreeByte =
+            moveBits(address, 28, 4) |
+            moveBits(extendedShaderRecord, 1, 3) |
+            moveBits(numberOfAttributeArrays, 3, 0); cl->nextFreeByte += 4;
+}
+
+/*
+void clInsertClearColors(ControlList* cl,
+        uint32_t clearStencil,
+        uint32_t clearZ, //24 bit Z
+        uint64_t clearColor) //2x RGBA8 or 1x RGBA16
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_CLEAR_COLORS_opcode; cl->nextFreeByte++;
+    *(uint64_t*)cl->nextFreeByte = clearColor; cl->nextFreeByte += 8;
+    *(uint32_t*)cl->nextFreeByte = clearZ; cl->nextFreeByte += 4; //24 bits for Z, 8 bit for vg mask (unused)
+    *cl->nextFreeByte = clearStencil; cl->nextFreeByte++;
+}
+*/
+
+void clInsertConfigurationBits(ControlList* cl,
+        uint32_t earlyZUpdatesEnable, //0/1
+        uint32_t earlyZEnable, //0/1
+        uint32_t zUpdatesEnable, //0/1
+        enum V3D21_Compare_Function depthTestFunction,
+        uint32_t coverageReadMode, //0/1 clear/leave as is
+        uint32_t coveragePipeSelect, //0/1
+        uint32_t coverageUpdateMode, //0/1/2/3 nonzero, odd, or, zero
+        uint32_t coverageReadType, //0/1 4*8bit, 16 bit mask
+        uint32_t rasterizerOversampleMode, //0/1/2 none, 4x, 16x
+        uint32_t enableDepthOffset, //0/1
+        uint32_t clockwisePrimitives, //0/1
+        uint32_t enableReverseFacingPrimitive, //0/1
+        uint32_t enableForwardFacingPrimitive) //0/1
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_CONFIGURATION_BITS_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte =
+            moveBits(enableForwardFacingPrimitive, 1, 0) |
+            moveBits(enableReverseFacingPrimitive, 1, 1) |
+            moveBits(clockwisePrimitives, 1, 2) |
+            moveBits(enableDepthOffset, 1, 3) |
+            moveBits(coverageReadType, 1, 5) |
+            moveBits(rasterizerOversampleMode, 2, 6) |
+            moveBits(coveragePipeSelect, 1, 8) |
+            moveBits(coverageUpdateMode, 2, 9) |
+            moveBits(coverageReadMode, 1, 11) |
+            moveBits(depthTestFunction, 3, 12) |
+            moveBits(zUpdatesEnable, 1, 15) |
+            moveBits(earlyZEnable, 1, 16) |
+            moveBits(earlyZUpdatesEnable, 1, 17); cl->nextFreeByte += 4;
+}
+
+void clInsertFlatShadeFlags(ControlList* cl,
+        uint32_t flags)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_FLAT_SHADE_FLAGS_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = flags; cl->nextFreeByte += 4;
+}
+
+void clInsertPointSize(ControlList* cl,
+        float size)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_POINT_SIZE_opcode; cl->nextFreeByte++;
+    *(float*)cl->nextFreeByte = size; cl->nextFreeByte += 4;
+}
+
+void clInsertLineWidth(ControlList* cl,
+        float width)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_LINE_WIDTH_opcode; cl->nextFreeByte++;
+    *(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
+}
+
+void clInsertRHTXBoundary(ControlList* cl,
+        uint32_t boundary) //sint16
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_RHT_X_BOUNDARY_opcode; cl->nextFreeByte++;
+    *(uint16_t*)cl->nextFreeByte = moveBits(boundary, 16, 0); cl->nextFreeByte += 2;
+}
+
+void clInsertDepthOffset(ControlList* cl,
+        uint32_t units, //float 187
+        uint32_t factor) //float 187
+{
+    assert(cl);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_DEPTH_OFFSET_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = moveBits(factor, 16, 0) | moveBits(units, 16, 16); cl->nextFreeByte += 4;
+}
+
+void clInsertClipWindow(ControlList* cl,
+        uint32_t width, //uint16
+        uint32_t height, //uint16
+        uint32_t bottomPixelCoord, //uint16
+        uint32_t leftPixelCoord) //uint16
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_CLIP_WINDOW_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = moveBits(leftPixelCoord, 16, 0) | moveBits(bottomPixelCoord, 16, 16); cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = moveBits(width, 16, 0) | moveBits(height, 16, 16); cl->nextFreeByte += 4;
+}
+
+void clInsertViewPortOffset(ControlList* cl,
+        uint32_t x, //sint16
+        uint32_t y //sint16
+        )
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_VIEWPORT_OFFSET_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = moveBits(x, 16, 0) | moveBits(y, 16, 16); cl->nextFreeByte += 4;
+}
+
+void clInsertZMinMaxClippingPlanes(ControlList* cl,
+        float minZw,
+        float maxZw
+        )
+{
+    assert(cl);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_Z_MIN_AND_MAX_CLIPPING_PLANES_opcode; cl->nextFreeByte++;
+    *(float*)cl->nextFreeByte = minZw; cl->nextFreeByte += 4;
+    *(float*)cl->nextFreeByte = maxZw; cl->nextFreeByte += 4;
+}
+
+void clInsertClipperXYScaling(ControlList* cl,
+        float width, //half height in 1/16 of pixel
+        float height //half width in 1/16 of pixel
+        )
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_CLIPPER_XY_SCALING_opcode; cl->nextFreeByte++;
+    *(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4;
+    *(float*)cl->nextFreeByte = height; cl->nextFreeByte += 4;
+}
+
+void clInsertClipperZScaleOffset(ControlList* cl,
+        float zOffset, //zc to zs
+        float zScale //zc to zs
+        )
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_CLIPPER_Z_SCALE_AND_OFFSET_opcode; cl->nextFreeByte++;
+    *(float*)cl->nextFreeByte = zScale; cl->nextFreeByte += 4;
+    *(float*)cl->nextFreeByte = zOffset; cl->nextFreeByte += 4;
+}
+
+void clInsertTileBinningModeConfiguration(ControlList* cl,
+        uint32_t doubleBufferInNonMsMode, //0/1
+        uint32_t tileAllocationBlockSize, //0/1/2/3 32/64/128/256 bytes
+        uint32_t tileAllocationInitialBlockSize, //0/1/2/3 32/64/128/256 bytes
+        uint32_t autoInitializeTileStateDataArray, //0/1
+        uint32_t tileBuffer64BitColorDepth, //0/1
+        uint32_t multisampleMode4x, //0/1
+        uint32_t widthInPixels,
+        uint32_t heightInPixels,
+        uint32_t tileStateDataArrayAddress, //16 byte aligned, size of 48 bytes * num tiles
+        uint32_t tileAllocationMemorySize,
+        uint32_t tileAllocationMemoryAddress
+        )
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_TILE_BINNING_MODE_CONFIGURATION_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = tileAllocationMemoryAddress; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = tileAllocationMemorySize; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = tileStateDataArrayAddress; cl->nextFreeByte += 4;
+    uint32_t tileSizeW = 64;
+    uint32_t tileSizeH = 64;
+
+    if(multisampleMode4x)
+    {
+        tileSizeW >>= 1;
+        tileSizeH >>= 1;
+    }
+
+    if(tileBuffer64BitColorDepth)
+    {
+        tileSizeH >>= 1;
+    }
+
+    uint32_t widthInTiles = divRoundUp(widthInPixels, tileSizeW);
+    uint32_t heightInTiles = divRoundUp(heightInPixels, tileSizeH);
+    *(uint8_t*)cl->nextFreeByte = widthInTiles; cl->nextFreeByte++;
+    *(uint8_t*)cl->nextFreeByte = heightInTiles; cl->nextFreeByte++;
+    *cl->nextFreeByte =
+            moveBits(multisampleMode4x, 1, 0) |
+            moveBits(tileBuffer64BitColorDepth, 1, 1) |
+            moveBits(autoInitializeTileStateDataArray, 1, 2) |
+            moveBits(tileAllocationInitialBlockSize, 2, 3) |
+            moveBits(tileAllocationBlockSize, 2, 5) |
+            moveBits(doubleBufferInNonMsMode, 1, 7); cl->nextFreeByte++;
+}
+
+/*
+void clInsertTileRenderingModeConfiguration(ControlList* cls,
+        ControlListAddress address,
+        uint32_t doubleBufferInNonMsMode, //0/1
+        uint32_t earlyZEarlyCovDisable, //0/1
+        uint32_t earlyZUpdateDirection, //0/1 lt,le/gt,ge
+        uint32_t selectCoverageMode, //0/1
+        uint32_t memoryFormat, //0/1/2 linear/t/lt
+        uint32_t decimateMode, //0/1/2 0x/4x/16x
+        uint32_t nonHDRFrameFormatColorFormat, //0/1/2 bgr565dithered/rgba8/bgr565nodither
+        uint32_t tileBufferHDRMode, //0/1
+        uint32_t multisampleMode4x, //0/1
+        uint32_t widthPixels,
+        uint32_t heightPixels)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    *cls->nextFreeByte = V3D21_TILE_RENDERING_MODE_CONFIGURATION_opcode; cls->nextFreeByte++;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+    *(uint32_t*)cls->nextFreeByte = moveBits(widthPixels, 16, 0) | moveBits(heightPixels, 16, 16); cls->nextFreeByte += 4;
+    *(uint16_t*)cls->nextFreeByte =
+            moveBits(multisampleMode4x, 1, 0) |
+            moveBits(tileBufferHDRMode, 1, 1) |
+            moveBits(nonHDRFrameFormatColorFormat, 2, 2) |
+            moveBits(decimateMode, 2, 4) |
+            moveBits(memoryFormat, 2, 6) |
+            moveBits(0, 1, 8) | //vg buffer enable
+            moveBits(selectCoverageMode, 1, 9) |
+            moveBits(earlyZUpdateDirection, 1, 10) |
+            moveBits(earlyZEarlyCovDisable, 1, 11) |
+            moveBits(doubleBufferInNonMsMode, 1, 12); cls->nextFreeByte += 2;
+}
+*/
+
+/*
+void clInsertTileCoordinates(ControlList* cl,
+        uint32_t tileColumnNumber, //int8
+        uint32_t tileRowNumber) //int8
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_TILE_COORDINATES_opcode; cl->nextFreeByte++;
+    *(uint16_t*)cl->nextFreeByte = moveBits(tileColumnNumber, 8, 0) | moveBits(tileRowNumber, 8, 8); cl->nextFreeByte += 2;
+}
+*/
+
+void clInsertGEMRelocations(ControlList* cl,
+        uint32_t buffer0,
+        uint32_t buffer1)
+{
+    assert(cl);
+    assert(cl->buffer);
+    assert(cl->nextFreeByte);
+    *cl->nextFreeByte = V3D21_GEM_RELOCATIONS_opcode; cl->nextFreeByte++;
+    *(uint32_t*)cl->nextFreeByte = buffer0; cl->nextFreeByte += 4;
+    *(uint32_t*)cl->nextFreeByte = buffer1; cl->nextFreeByte += 4;
+}
+
+//input: 2 cls (cl, handles cl)
+void clInsertShaderRecord(ControlList* cls,
+        uint32_t fragmentShaderIsSingleThreaded, //0/1
+        uint32_t pointSizeIncludedInShadedVertexData, //0/1
+        uint32_t enableClipping, //0/1
+        uint32_t fragmentNumberOfUnusedUniforms,
+        uint32_t fragmentNumberOfVaryings,
+        uint32_t fragmentUniformsAddress,
+        ControlListAddress fragmentCodeAddress,
+        uint32_t vertexNumberOfUnusedUniforms,
+        uint32_t vertexAttributeArraySelectBits,
+        uint32_t vertexTotalAttributesSize,
+        uint32_t vertexUniformsAddress,
+        ControlListAddress vertexCodeAddress)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    //TODO is this correct?
+    *cls->nextFreeByte =
+            moveBits(fragmentShaderIsSingleThreaded, 1, 0) |
+            moveBits(pointSizeIncludedInShadedVertexData, 1, 1) |
+            moveBits(enableClipping, 1, 2); cls->nextFreeByte++;
+    *cls->nextFreeByte = 0; cls->nextFreeByte++;
+    *(uint16_t*)cls->nextFreeByte = moveBits(fragmentNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
+    *cls->nextFreeByte = fragmentNumberOfVaryings; cls->nextFreeByte++;
+    clEmitShaderRelocation(cls, &fragmentCodeAddress);
+    *(uint32_t*)cls->nextFreeByte = fragmentCodeAddress.offset; cls->nextFreeByte += 4;
+    *(uint32_t*)cls->nextFreeByte = fragmentUniformsAddress; cls->nextFreeByte += 4;
+
+    *(uint16_t*)cls->nextFreeByte = moveBits(vertexNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2;
+    *cls->nextFreeByte = vertexAttributeArraySelectBits; cls->nextFreeByte++;
+    *cls->nextFreeByte = vertexTotalAttributesSize; cls->nextFreeByte++;
+    clEmitShaderRelocation(cls, &vertexCodeAddress);
+    *(uint32_t*)cls->nextFreeByte = moveBits(vertexCodeAddress.offset, 32, 0) | moveBits(vertexUniformsAddress, 32, 0); cls->nextFreeByte += 4; //???
+    cls->nextFreeByte += 4;
+    //skip coordinate shader stuff
+    cls->nextFreeByte += 16;
+}
+
+//input: 2 cls (cl, handles cl)
+void clInsertAttributeRecord(ControlList* cls,
+        ControlListAddress address,
+        uint32_t sizeBytes,
+        uint32_t stride,
+        uint32_t vertexVPMOffset)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    uint32_t sizeBytesMinusOne = sizeBytes - 1;
+    //TODO is this correct?
+    clEmitShaderRelocation(cls, &address);
+    *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4;
+    *cls->nextFreeByte = sizeBytesMinusOne; cls->nextFreeByte++;
+    *cls->nextFreeByte = stride; cls->nextFreeByte++;
+    *cls->nextFreeByte = vertexVPMOffset; cls->nextFreeByte++;
+    cls->nextFreeByte++; //skip coordinate shader stuff
+}
+
+//returns the index of handle in the handles cl, appending handle at the end if it is not stored yet
+uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle)
+{
+    uint32_t c = 0;
+
+    uint32_t numHandles = clSize(handlesCl) / 4;
+
+    for(; c < numHandles; ++c)
+    {
+        if(((uint32_t*)handlesCl->buffer)[c] == handle)
+        {
+            //found
+            return c;
+        }
+    }
+
+    //write handle to handles cl
+    *(uint32_t*)handlesCl->nextFreeByte = handle;
+    handlesCl->nextFreeByte += 4;
+
+    return c;
+}
+
+//input: 2 cls (cl + handles cl)
+inline void clEmitShaderRelocation(ControlList* cls, const ControlListAddress* address)
+{
+    assert(cls);
+    assert(cls->buffer);
+    assert(cls->nextFreeByte);
+    assert(address);
+    assert(address->handle);
+
+    //search for handle in handles cl
+    //if found insert handle index
+
+    ControlList* cl = cls;
+    ControlList* handlesCl = cls + 1;
+
+    //store offset within handles in cl
+    *(uint32_t*)cl->nextFreeByte = clGetHandleIndex(handlesCl, address->handle);
+    cl->nextFreeByte += 4;
+}
diff --git a/driver/ControlListUtil.h b/driver/ControlListUtil.h
index 161e37c..24b4d9b 100644
--- a/driver/ControlListUtil.h
+++ b/driver/ControlListUtil.h
@@ -21,7 +21,7 @@ typedef struct ControlList
     uint8_t* nextFreeByte; //pointer to the next available free byte
 } ControlList;
 
-static inline void clEmitShaderRelocation(ControlList* cl, const ControlListAddress* address);
+void clEmitShaderRelocation(ControlList* cl, const ControlListAddress* address);
 
 #define __gen_user_data struct ControlList
 #define __gen_address_type ControlListAddress
@@ -30,356 +30,40 @@ static inline void clEmitShaderRelocation(ControlList* cl, const ControlListAddr
 
 #include "brcm/cle/v3d_packet_v21_pack.h"
 
-uint32_t divRoundUp(uint32_t n, uint32_t d)
-{
- return (((n) + (d) - 1) / (d)); -} - -//move bits to offset, mask rest to 0 -uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset) -{ - return (d << offset) & (~(~0 << bits) << offset); -} - -uint32_t clSize(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - return cl->nextFreeByte - cl->buffer; -} - -uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size) -{ - uint32_t currSize = clSize(cl); - if(currSize + size < CONTROL_LIST_SIZE) - { - return 1; //fits! - } - else - { - return 0; //need to reallocate - } -} - -void clInit(ControlList* cl, void* buffer) -{ - assert(cl); - assert(buffer); - cl->buffer = buffer; - cl->numBlocks = 1; - cl->nextFreeByte = &cl->buffer[0]; -} - -void clInsertHalt(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_HALT_opcode; - cl->nextFreeByte++; -} - -void clInsertNop(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_NOP_opcode; - cl->nextFreeByte++; -} - -void clInsertFlush(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_FLUSH_opcode; - cl->nextFreeByte++; -} - -void clInsertFlushAllState(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_FLUSH_ALL_STATE_opcode; - cl->nextFreeByte++; -} - -void clInsertStartTileBinning(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_START_TILE_BINNING_opcode; - cl->nextFreeByte++; -} - -void clInsertIncrementSemaphore(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_INCREMENT_SEMAPHORE_opcode; - cl->nextFreeByte++; -} - -void clInsertWaitOnSemaphore(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_WAIT_ON_SEMAPHORE_opcode; 
- cl->nextFreeByte++; -} - -//input: 2 cls (cl, handles cl) -void clInsertBranch(ControlList* cls, ControlListAddress address) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_BRANCH_opcode; cls->nextFreeByte++; - //TODO is this correct? - clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4; -} - -//input: 2 cls (cl, handles cl) -void clInsertBranchToSubList(ControlList* cls, ControlListAddress address) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_BRANCH_TO_SUB_LIST_opcode; cls->nextFreeByte++; - //TODO is this correct? - clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4; -} - -void clInsertReturnFromSubList(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_RETURN_FROM_SUB_LIST_opcode; - cl->nextFreeByte++; -} - -void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_opcode; - cl->nextFreeByte++; -} - -void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_AND_EOF_opcode; - cl->nextFreeByte++; -} - -/* -//input: 2 cls (cl, handles cl) -void clInsertStoreFullResolutionTileBuffer(ControlList* cls, - ControlListAddress address, - uint32_t lastTile, //0/1 - uint32_t disableClearOnWrite, //0/1 - uint32_t disableZStencilBufferWrite, //0/1 - uint32_t disableColorBufferWrite) //0/1 -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_STORE_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++; - //TODO is this correct? 
- clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = - moveBits(disableColorBufferWrite, 1, 0) | - moveBits(disableZStencilBufferWrite, 1, 1) | - moveBits(disableClearOnWrite, 1, 2) | - moveBits(lastTile, 1, 3) | - moveBits(address.offset, 28, 4); - cls->nextFreeByte += 4; -} -*/ - -/* -//input: 2 cls (cl, handles cl) -void clInsertReLoadFullResolutionTileBuffer(ControlList* cls, - ControlListAddress address, - uint32_t disableZStencilBufferRead, //0/1 - uint32_t disableColorBufferRead) //0/1 -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_RE_LOAD_FULL_RESOLUTION_TILE_BUFFER_opcode; cls->nextFreeByte++; - //TODO is this correct? - clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = - moveBits(disableColorBufferRead, 1, 0) | - moveBits(disableZStencilBufferRead, 1, 1) | - moveBits(address.offset, 28, 4); - cls->nextFreeByte += 4; -} -*/ - -/* -//input: 2 cls (cl, handles cl) -void clInsertStoreTileBufferGeneral(ControlList* cls, - ControlListAddress address, - uint32_t lastTileOfFrame, //0/1 - uint32_t disableZStencilBufferDump, //0/1 - uint32_t disableColorBufferDump, //0/1 - uint32_t disableZStencilBufferClearOnStoreDump, //0/1 - uint32_t disableColorBufferClearOnStoreDump, //0/1 - uint32_t disableDoubleBufferSwap, //0/1 - uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither - uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x - uint32_t format, //0/1/2 raster/t/lt - uint32_t bufferToStore) //0/1/2/3/5 none/color/zstencil/z/full -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_STORE_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++; - //TODO is this correct? 
- *cls->nextFreeByte = - moveBits(bufferToStore, 3, 0) | - moveBits(format, 2, 4) | - moveBits(mode, 2, 6); - cls->nextFreeByte++; - *cls->nextFreeByte = - moveBits(pixelColorFormat, 2, 0) | - moveBits(disableDoubleBufferSwap, 1, 4) | - moveBits(disableColorBufferClearOnStoreDump, 1, 5) | - moveBits(disableZStencilBufferClearOnStoreDump, 1, 6) | - moveBits(1, 1, 7); //disable vg mask - cls->nextFreeByte++; - clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = - moveBits(disableColorBufferDump, 1, 0) | - moveBits(disableZStencilBufferDump, 1, 1) | - moveBits(1, 1, 2) | //disable vg mask - moveBits(lastTileOfFrame, 1, 3) | - moveBits(address.offset, 28, 4); - cls->nextFreeByte += 4; -} -*/ - -/* -//input: 2 cls (cl, handles cl) -void clInsertLoadTileBufferGeneral(ControlList* cls, - ControlListAddress address, - uint32_t disableZStencilBufferLoad, //0/1 - uint32_t disableColorBufferLoad, //0/1 - uint32_t pixelColorFormat, //0/1/2 RGBA8/BGR565dither/BGR565nodither - uint32_t mode, //0/1/2 sample0/decimate4x/decimate16x - uint32_t format, //0/1/2 raster/t/lt - uint32_t bufferToLoad) //0/1/2/3/5 none/color/zstencil/z/full -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_LOAD_TILE_BUFFER_GENERAL_opcode; cls->nextFreeByte++; - //TODO is this correct? 
- *cls->nextFreeByte = - moveBits(bufferToLoad, 3, 0) | - moveBits(format, 2, 4); - cls->nextFreeByte++; - *cls->nextFreeByte = - moveBits(pixelColorFormat, 2, 0); - cls->nextFreeByte++; - clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = - moveBits(disableColorBufferLoad, 1, 0) | - moveBits(disableZStencilBufferLoad, 1, 1) | - moveBits(1, 1, 2) | //disable vg mask - moveBits(address.offset, 28, 4); - cls->nextFreeByte += 4; - -} -*/ - +uint32_t divRoundUp(uint32_t n, uint32_t d); +uint32_t moveBits(uint32_t d, uint32_t bits, uint32_t offset); +uint32_t clSize(ControlList* cl); +uint32_t clHasEnoughSpace(ControlList* cl, uint32_t size); +void clInit(ControlList* cl, void* buffer); +void clInsertHalt(ControlList* cl); +void clInsertNop(ControlList* cl); +void clInsertFlush(ControlList* cl); +void clInsertFlushAllState(ControlList* cl); +void clInsertStartTileBinning(ControlList* cl); +void clInsertIncrementSemaphore(ControlList* cl); +void clInsertWaitOnSemaphore(ControlList* cl); +void clInsertBranch(ControlList* cls, ControlListAddress address); +void clInsertBranchToSubList(ControlList* cls, ControlListAddress address); +void clInsertReturnFromSubList(ControlList* cl); +void clInsertStoreMultiSampleResolvedTileColorBuffer(ControlList* cl); +void clInsertStoreMultiSampleResolvedTileColorBufferAndEOF(ControlList* cl); void clInsertIndexedPrimitiveList(ControlList* cl, uint32_t maxIndex, uint32_t indicesAddress, uint32_t length, uint32_t indexType, //0/1: 8 or 16 bit - enum V3D21_Primitive primitiveMode) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_INDEXED_PRIMITIVE_LIST_opcode; cl->nextFreeByte++; - *cl->nextFreeByte = moveBits(indexType, 4, 4) | moveBits(primitiveMode, 4, 0); cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = indicesAddress; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = maxIndex; cl->nextFreeByte += 
4; -} - + enum V3D21_Primitive primitiveMode); void clInsertVertexArrayPrimitives(ControlList* cl, uint32_t firstVertexIndex, uint32_t length, - enum V3D21_Primitive primitiveMode) -{ - assert(cl); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_VERTEX_ARRAY_PRIMITIVES_opcode; cl->nextFreeByte++; - *cl->nextFreeByte = moveBits(primitiveMode, 8, 0); cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = length; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = firstVertexIndex; cl->nextFreeByte += 4; -} - + enum V3D21_Primitive primitiveMode); void clInsertPrimitiveListFormat(ControlList* cl, uint32_t dataType, //1/3: 16 or 32 bit - uint32_t primitiveType) //0/1/2/3: point/line/tri/rhy -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_PRIMITIVE_LIST_FORMAT_opcode; cl->nextFreeByte++; - *cl->nextFreeByte = moveBits(dataType, 4, 4) | moveBits(primitiveType, 4, 0); cl->nextFreeByte++; -} - + uint32_t primitiveType); //0/1/2/3: point/line/tri/rhy void clInsertShaderState(ControlList* cl, uint32_t address, uint32_t extendedShaderRecord, //0/1: true/false - uint32_t numberOfAttributeArrays) -{ - assert(cl); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_GL_SHADER_STATE_opcode; cl->nextFreeByte++; - //TODO is this correct? 
- *(uint32_t*)cl->nextFreeByte = - moveBits(address, 28, 4) | - moveBits(extendedShaderRecord, 1, 3) | - moveBits(numberOfAttributeArrays, 3, 0); cl->nextFreeByte += 4; -} - -/* -void clInsertClearColors(ControlList* cl, - uint32_t clearStencil, - uint32_t clearZ, //24 bit Z - uint64_t clearColor) //2x RGBA8 or 1x RGBA16 -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_CLEAR_COLORS_opcode; cl->nextFreeByte++; - *(uint64_t*)cl->nextFreeByte = clearColor; cl->nextFreeByte += 8; - *(uint32_t*)cl->nextFreeByte = clearZ; cl->nextFreeByte += 4; //24 bits for Z, 8 bit for vg mask (unused) - *cl->nextFreeByte = clearStencil; cl->nextFreeByte++; -} -*/ - + uint32_t numberOfAttributeArrays); void clInsertConfigurationBits(ControlList* cl, uint32_t earlyZUpdatesEnable, //0/1 uint32_t earlyZEnable, //0/1 @@ -393,142 +77,39 @@ void clInsertConfigurationBits(ControlList* cl, uint32_t enableDepthOffset, //0/1 uint32_t clockwisePrimitives, //0/1 uint32_t enableReverseFacingPrimitive, //0/1 - uint32_t enableForwardFacingPrimitive) //0/1 -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_CONFIGURATION_BITS_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = - moveBits(enableForwardFacingPrimitive, 1, 0) | - moveBits(enableReverseFacingPrimitive, 1, 1) | - moveBits(clockwisePrimitives, 1, 2) | - moveBits(enableDepthOffset, 1, 3) | - moveBits(coverageReadType, 1, 5) | - moveBits(rasterizerOversampleMode, 2, 6) | - moveBits(coveragePipeSelect, 1, 8) | - moveBits(coverageUpdateMode, 2, 9) | - moveBits(coverageReadMode, 1, 11) | - moveBits(depthTestFunction, 3, 12) | - moveBits(zUpdatesEnable, 1, 15) | - moveBits(earlyZEnable, 1, 16) | - moveBits(earlyZUpdatesEnable, 1, 17); cl->nextFreeByte += 4; -} - + uint32_t enableForwardFacingPrimitive); //0/1 void clInsertFlatShadeFlags(ControlList* cl, - uint32_t flags) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - 
*cl->nextFreeByte = V3D21_FLAT_SHADE_FLAGS_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = flags; cl->nextFreeByte += 4; -} - + uint32_t flags); void clInsertPointSize(ControlList* cl, - float size) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_POINT_SIZE_opcode; cl->nextFreeByte++; - *(float*)cl->nextFreeByte = size; cl->nextFreeByte += 4; -} - + float size); void clInsertLineWidth(ControlList* cl, - float width) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_LINE_WIDTH_opcode; cl->nextFreeByte++; - *(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4; -} - + float width); void clInsertRHTXBoundary(ControlList* cl, - uint32_t boundary) //sint16 -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_RHT_X_BOUNDARY_opcode; cl->nextFreeByte++; - *(uint16_t*)cl->nextFreeByte = moveBits(boundary, 16, 0); cl->nextFreeByte += 2; -} - + uint32_t boundary); //sint16 void clInsertDepthOffset(ControlList* cl, uint32_t units, //float 187 - uint32_t factor) //float 187 -{ - assert(cl); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_DEPTH_OFFSET_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = moveBits(factor, 16, 0) | moveBits(units, 16, 16); cl->nextFreeByte += 4; -} - + uint32_t factor); //float 187 void clInsertClipWindow(ControlList* cl, uint32_t width, //uint16 uint32_t height, //uint16 uint32_t bottomPixelCoord, //uint16 - uint32_t leftPixelCoord) //uint16 -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_CLIP_WINDOW_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = moveBits(leftPixelCoord, 16, 0) | moveBits(bottomPixelCoord, 16, 16); cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = moveBits(width, 16, 0) | moveBits(height, 16, 16); cl->nextFreeByte += 4; -} - + uint32_t leftPixelCoord); //uint16 void clInsertViewPortOffset(ControlList* 
cl, uint32_t x, //sint16 uint32_t y //sint16 - ) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_VIEWPORT_OFFSET_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = moveBits(x, 16, 0) | moveBits(y, 16, 16); cl->nextFreeByte += 4; -} - + ); void clInsertZMinMaxClippingPlanes(ControlList* cl, float minZw, float maxZw - ) -{ - assert(cl); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_Z_MIN_AND_MAX_CLIPPING_PLANES_opcode; cl->nextFreeByte++; - *(float*)cl->nextFreeByte = minZw; cl->nextFreeByte += 4; - *(float*)cl->nextFreeByte = maxZw; cl->nextFreeByte += 4; -} - + ); void clInsertClipperXYScaling(ControlList* cl, float width, //half height in 1/16 of pixel float height //half width in 1/16 of pixel - ) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_CLIPPER_XY_SCALING_opcode; cl->nextFreeByte++; - *(float*)cl->nextFreeByte = width; cl->nextFreeByte += 4; - *(float*)cl->nextFreeByte = height; cl->nextFreeByte += 4; -} - + ); void clInsertClipperZScaleOffset(ControlList* cl, float zOffset, //zc to zs float zScale //zc to zs - ) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_CLIPPER_Z_SCALE_AND_OFFSET_opcode; cl->nextFreeByte++; - *(float*)cl->nextFreeByte = zScale; cl->nextFreeByte += 4; - *(float*)cl->nextFreeByte = zOffset; cl->nextFreeByte += 4; -} - + ); void clInsertTileBinningModeConfiguration(ControlList* cl, uint32_t doubleBufferInNonMsMode, //0/1 uint32_t tileAllocationBlockSize, //0/1/2/3 32/64/128/256 bytes @@ -541,105 +122,10 @@ void clInsertTileBinningModeConfiguration(ControlList* cl, uint32_t tileStateDataArrayAddress, //16 byte aligned, size of 48 bytes * num tiles uint32_t tileAllocationMemorySize, uint32_t tileAllocationMemoryAddress - ) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_TILE_BINNING_MODE_CONFIGURATION_opcode; cl->nextFreeByte++; - 
*(uint32_t*)cl->nextFreeByte = tileAllocationMemoryAddress; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = tileAllocationMemorySize; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = tileStateDataArrayAddress; cl->nextFreeByte += 4; - uint32_t tileSizeW = 64; - uint32_t tileSizeH = 64; - - if(multisampleMode4x) - { - tileSizeW >>= 1; - tileSizeH >>= 1; - } - - if(tileBuffer64BitColorDepth) - { - tileSizeH >>= 1; - } - - uint32_t widthInTiles = divRoundUp(widthInPixels, tileSizeW); - uint32_t heightInTiles = divRoundUp(heightInPixels, tileSizeH); - *(uint8_t*)cl->nextFreeByte = widthInTiles; cl->nextFreeByte++; - *(uint8_t*)cl->nextFreeByte = heightInTiles; cl->nextFreeByte++; - *cl->nextFreeByte = - moveBits(multisampleMode4x, 1, 0) | - moveBits(tileBuffer64BitColorDepth, 1, 1) | - moveBits(autoInitializeTileStateDataArray, 1, 2) | - moveBits(tileAllocationInitialBlockSize, 2, 3) | - moveBits(tileAllocationBlockSize, 2, 5) | - moveBits(doubleBufferInNonMsMode, 1, 7); cl->nextFreeByte++; -} - -/* -void clInsertTileRenderingModeConfiguration(ControlList* cls, - ControlListAddress address, - uint32_t doubleBufferInNonMsMode, //0/1 - uint32_t earlyZEarlyCovDisable, //0/1 - uint32_t earlyZUpdateDirection, //0/1 lt,le/gt,ge - uint32_t selectCoverageMode, //0/1 - uint32_t memoryFormat, //0/1/2 linear/t/lt - uint32_t decimateMode, //0/1/2 0x/4x/16x - uint32_t nonHDRFrameFormatColorFormat, //0/1/2 bgr565dithered/rgba8/bgr565nodither - uint32_t tileBufferHDRMode, //0/1 - uint32_t multisampleMode4x, //0/1 - uint32_t widthPixels, - uint32_t heightPixels) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - *cls->nextFreeByte = V3D21_TILE_RENDERING_MODE_CONFIGURATION_opcode; cls->nextFreeByte++; - //TODO is this correct? 
- clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4; - *(uint32_t*)cls->nextFreeByte = moveBits(widthPixels, 16, 0) | moveBits(heightPixels, 16, 16); cls->nextFreeByte += 4; - *(uint16_t*)cls->nextFreeByte = - moveBits(multisampleMode4x, 1, 0) | - moveBits(tileBufferHDRMode, 1, 1) | - moveBits(nonHDRFrameFormatColorFormat, 2, 2) | - moveBits(decimateMode, 2, 4) | - moveBits(memoryFormat, 2, 6) | - moveBits(0, 1, 8) | //vg buffer enable - moveBits(selectCoverageMode, 1, 9) | - moveBits(earlyZUpdateDirection, 1, 10) | - moveBits(earlyZEarlyCovDisable, 1, 11) | - moveBits(doubleBufferInNonMsMode, 1, 12); cls->nextFreeByte += 2; -} -*/ - -/* -void clInsertTileCoordinates(ControlList* cl, - uint32_t tileColumnNumber, //int8 - uint32_t tileRowNumber) //int8 -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_TILE_COORDINATES_opcode; cl->nextFreeByte++; - *(uint16_t*)cl->nextFreeByte = moveBits(tileColumnNumber, 8, 0) | moveBits(tileRowNumber, 8, 8); cl->nextFreeByte += 2; -} -*/ - + ); void clInsertGEMRelocations(ControlList* cl, uint32_t buffer0, - uint32_t buffer1) -{ - assert(cl); - assert(cl->buffer); - assert(cl->nextFreeByte); - *cl->nextFreeByte = V3D21_GEM_RELOCATIONS_opcode; cl->nextFreeByte++; - *(uint32_t*)cl->nextFreeByte = buffer0; cl->nextFreeByte += 4; - *(uint32_t*)cl->nextFreeByte = buffer1; cl->nextFreeByte += 4; -} - -//input: 2 cls (cl, handles cl) + uint32_t buffer1); void clInsertShaderRecord(ControlList* cls, uint32_t fragmentShaderIsSingleThreaded, //0/1 uint32_t pointSizeIncludedInShadedVertexData, //0/1 @@ -652,94 +138,13 @@ void clInsertShaderRecord(ControlList* cls, uint32_t vertexAttributeArraySelectBits, uint32_t vertexTotalAttributesSize, uint32_t vertexUniformsAddress, - ControlListAddress vertexCodeAddress) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - //TODO is this correct? 
- *cls->nextFreeByte = - moveBits(fragmentShaderIsSingleThreaded, 1, 0) | - moveBits(pointSizeIncludedInShadedVertexData, 1, 1) | - moveBits(enableClipping, 1, 2); cls->nextFreeByte++; - *cls->nextFreeByte = 0; cls->nextFreeByte++; - *(uint16_t*)cls->nextFreeByte = moveBits(fragmentNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2; - *cls->nextFreeByte = fragmentNumberOfVaryings; cls->nextFreeByte++; - clEmitShaderRelocation(cls, &fragmentCodeAddress); - *(uint32_t*)cls->nextFreeByte = fragmentCodeAddress.offset; cls->nextFreeByte += 4; - *(uint32_t*)cls->nextFreeByte = fragmentUniformsAddress; cls->nextFreeByte += 4; - - *(uint16_t*)cls->nextFreeByte = moveBits(vertexNumberOfUnusedUniforms, 16, 0); cls->nextFreeByte += 2; - *cls->nextFreeByte = vertexAttributeArraySelectBits; cls->nextFreeByte++; - *cls->nextFreeByte = vertexTotalAttributesSize; cls->nextFreeByte++; - clEmitShaderRelocation(cls, &vertexCodeAddress); - *(uint32_t*)cls->nextFreeByte = moveBits(vertexCodeAddress.offset, 32, 0) | moveBits(vertexUniformsAddress, 32, 0); cls->nextFreeByte += 4; //??? - cls->nextFreeByte += 4; - //skip coordinate shader stuff - cls->nextFreeByte += 16; -} - -//input: 2 cls (cl, handles cl) + ControlListAddress vertexCodeAddress); void clInsertAttributeRecord(ControlList* cls, ControlListAddress address, uint32_t sizeBytes, uint32_t stride, - uint32_t vertexVPMOffset) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - uint32_t sizeBytesMinusOne = sizeBytes - 1; - //TODO is this correct? 
- clEmitShaderRelocation(cls, &address); - *(uint32_t*)cls->nextFreeByte = address.offset; cls->nextFreeByte += 4; - *cls->nextFreeByte = sizeBytesMinusOne; cls->nextFreeByte++; - *cls->nextFreeByte = stride; cls->nextFreeByte++; - *cls->nextFreeByte = vertexVPMOffset; cls->nextFreeByte++; - cls->nextFreeByte++; //skip coordinate shader stuff -} - -uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle) -{ - uint32_t c = 0; - - uint32_t numHandles = clSize(handlesCl) / 4; - - for(; c < numHandles; ++c) - { - if(((uint32_t*)handlesCl->buffer)[c] == handle) - { - //found - return c; - } - } - - //write handle to handles cl - *(uint32_t*)handlesCl->nextFreeByte = handle; - handlesCl->nextFreeByte += 4; - - return c; -} - -//input: 2 cls (cl + handles cl) -static inline void clEmitShaderRelocation(ControlList* cls, const ControlListAddress* address) -{ - assert(cls); - assert(cls->buffer); - assert(cls->nextFreeByte); - assert(address); - assert(address->handle); - - //search for handle in handles cl - //if found insert handle index - - ControlList* cl = cls; - ControlList* handlesCl = cls + 1; - - //store offset within handles in cl - *(uint32_t*)cl->nextFreeByte = clGetHandleIndex(handlesCl, address->handle); - cl->nextFreeByte += 4; -} + uint32_t vertexVPMOffset); +uint32_t clGetHandleIndex(ControlList* handlesCl, uint32_t handle); #if defined (__cplusplus) } diff --git a/driver/LinearAllocator.c b/driver/LinearAllocator.c new file mode 100644 index 0000000..aee53c7 --- /dev/null +++ b/driver/LinearAllocator.c @@ -0,0 +1,48 @@ +#include "LinearAllocator.h" + +#include "CustomAssert.h" + +#include + +LinearAllocator createLinearAllocator(char* b, unsigned s) +{ + assert(b); + assert(s > 0); + + LinearAllocator la = + { + .buf = b, + .offset = 0, + .size = s + }; + + return la; +} + +void destroyLinearAllocator(LinearAllocator* la) +{ + la->buf = 0; + la->offset = 0; + la->size = 0; +} + +void* linearAllocte(LinearAllocator* la, unsigned s) +{ + 
assert(la->buf); + assert(la->size > 0); + + if(la->offset + s >= la->size) + { + return 0; //no space left + } + + char* p = la->buf + la->offset; //block starts at the current offset, not s bytes past it + la->offset += s; + + return p; +} + +void linearFree(LinearAllocator* la, void* p) +{ + //assert(0); //this shouldn't really happen, just destroy/reset the whole allocator +} diff --git a/driver/LinearAllocator.h b/driver/LinearAllocator.h index 06ec0c8..941e6df 100644 --- a/driver/LinearAllocator.h +++ b/driver/LinearAllocator.h @@ -15,48 +15,10 @@ typedef struct LinearAllocator unsigned size; } LinearAllocator; -LinearAllocator createLinearAllocator(char* b, unsigned s) -{ - assert(b); - assert(s > 0); - - LinearAllocator la = - { - .buf = b, - .offset = 0, - .size = s - }; - - return la; -} - -void destroyLinearAllocator(LinearAllocator* la) -{ - la->buf = 0; - la->offset = 0; - la->size = 0; -} - -void* linearAllocte(LinearAllocator* la, unsigned s) -{ - assert(la->buf); - assert(la->size > 0); - - if(la->offset + s >= la->size) - { - return 0; //no space left - } - - char* p = la->buf + la->offset + s; - la->offset += s; - - return p; -} - -void linearFree(LinearAllocator* la, void* p) -{ - //assert(0); //this shouldn't really happen, just destroy/reset the whole allocator -} +LinearAllocator createLinearAllocator(char* b, unsigned s); +void destroyLinearAllocator(LinearAllocator* la); +void* linearAllocte(LinearAllocator* la, unsigned s); +void linearFree(LinearAllocator* la, void* p); #if defined (__cplusplus) } diff --git a/driver/PoolAllocator.c b/driver/PoolAllocator.c new file mode 100644 index 0000000..ab1a06b --- /dev/null +++ b/driver/PoolAllocator.c @@ -0,0 +1,72 @@ +#include "PoolAllocator.h" + +#include "CustomAssert.h" + +#include + +PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s) +{ + assert(b); //only allocated memory + assert(bs >= sizeof(void*)); //we need to be able to store + assert(s%bs==0); //we want a size that is the exact multiple of block size + assert(s > bs);
//at least 1 element + + PoolAllocator pa = + { + .buf = b, + .nextFreeBlock = (uint32_t*)b, + .blockSize = bs, + .size = s + }; + + //initialize linked list of free pointers + uint32_t* ptr = pa.nextFreeBlock; + for(unsigned c = 0; c < s/bs - 1; ++c) + { + *ptr = (uint32_t)ptr + bs; + ptr += bs; + } + + *ptr = 0; //last element + + return pa; +} + +void destroyPoolAllocator(PoolAllocator* pa) +{ + //actual memory freeing is done by caller + pa->buf = 0; + pa->nextFreeBlock = 0; + pa->blockSize = 0; + pa->size = 0; +} + +void* poolAllocate(PoolAllocator* pa) +{ + assert(pa->buf); + + if(!pa->nextFreeBlock) + { + return 0; //no free blocks + } + + //next free block will be allocated + void* ret = pa->nextFreeBlock; + + //set next free block to the one the current next points to + pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock; + + return ret; +} + +void poolFree(PoolAllocator* pa, void* p) +{ + assert(pa->buf); + assert(p); + + //set block to be freed to point to the current next free block + *(uint32_t*)p = (uint32_t)pa->nextFreeBlock; + + //set next free block to the freshly freed block + pa->nextFreeBlock = p; +} diff --git a/driver/PoolAllocator.h b/driver/PoolAllocator.h index 6e9e916..e8f9787 100644 --- a/driver/PoolAllocator.h +++ b/driver/PoolAllocator.h @@ -16,72 +16,10 @@ typedef struct PoolAllocator unsigned size; //size is exact multiple of block size } PoolAllocator; -PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s) -{ - assert(b); //only allocated memory - assert(bs >= sizeof(void*)); //we need to be able to store - assert(s%bs==0); //we want a size that is the exact multiple of block size - assert(s > bs); //at least 1 element - - PoolAllocator pa = - { - .buf = b, - .nextFreeBlock = (uint32_t*)b, - .blockSize = bs, - .size = s - }; - - //initialize linked list of free pointers - uint32_t* ptr = pa.nextFreeBlock; - for(unsigned c = 0; c < s/bs - 1; ++c) - { - *ptr = (uint32_t)ptr + bs; - ptr += bs; - } - - *ptr = 0; //last 
element - - return pa; -} - -void destroyPoolAllocator(PoolAllocator* pa) -{ - //actual memory freeing is done by caller - pa->buf = 0; - pa->nextFreeBlock = 0; - pa->blockSize = 0; - pa->size = 0; -} - -void* poolAllocate(PoolAllocator* pa) -{ - assert(pa->buf); - - if(!pa->nextFreeBlock) - { - return 0; //no free blocks - } - - //next free block will be allocated - void* ret = pa->nextFreeBlock; - - //set next free block to the one the current next points to - pa->nextFreeBlock = (uint32_t*)*pa->nextFreeBlock; - - return ret; -} - -void poolFree(PoolAllocator* pa, void* p) -{ - assert(pa->buf); - assert(p); - - //set block to be freed to point to the current next free block - *(uint32_t*)p = (uint32_t)pa->nextFreeBlock; - - //set next free block to the freshly freed block - pa->nextFreeBlock = p; -} +PoolAllocator createPoolAllocator(char* b, unsigned bs, unsigned s); +void destroyPoolAllocator(PoolAllocator* pa); +void* poolAllocate(PoolAllocator* pa); +void poolFree(PoolAllocator* pa, void* p); #if defined (__cplusplus) } diff --git a/driver/command.c b/driver/command.c new file mode 100644 index 0000000..0262bb1 --- /dev/null +++ b/driver/command.c @@ -0,0 +1,449 @@ +#include "common.h" + +#include "kernel/vc4_packet.h" +#include "../brcm/cle/v3d_decoder.h" +#include "../brcm/clif/clif_dump.h" + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffers-pools + * Command pools are opaque objects that command buffer memory is allocated from, and which allow the implementation to amortize the + * cost of resource creation across multiple command buffers. Command pools are externally synchronized, meaning that a command pool must + * not be used concurrently in multiple threads. That includes use via recording commands on any command buffers allocated from the pool, + * as well as operations that allocate, free, and reset command buffers or the pool itself. 
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool(
+		VkDevice device,
+		const VkCommandPoolCreateInfo* pCreateInfo,
+		const VkAllocationCallbacks* pAllocator,
+		VkCommandPool* pCommandPool)
+{
+	assert(device);
+	assert(pCreateInfo);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	//VK_COMMAND_POOL_CREATE_TRANSIENT_BIT
+	//specifies that command buffers allocated from the pool will be short-lived, meaning that they will be reset or freed in a relatively short timeframe.
+	//This flag may be used by the implementation to control memory allocation behavior within the pool.
+	//--> definitely use pool allocator
+
+	//VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT
+	//allows any command buffer allocated from a pool to be individually reset to the initial state; either by calling vkResetCommandBuffer, or via the implicit reset when calling vkBeginCommandBuffer.
+	//If this flag is not set on a pool, then vkResetCommandBuffer must not be called for any command buffer allocated from that pool.
+
+	//TODO pool family ignored for now
+
+	_commandPool* cp = malloc(sizeof(_commandPool));
+
+	if(!cp)
+	{
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+	}
+
+	cp->queueFamilyIndex = pCreateInfo->queueFamilyIndex;
+
+	//initial number of command buffers to hold
+	int numCommandBufs = 100;
+	int controlListSize = ARM_PAGE_SIZE * 100;
+
+	//if(pCreateInfo->flags & VK_COMMAND_POOL_CREATE_TRANSIENT_BIT)
+	{
+		//use pool allocator
+		void* pamem = malloc(numCommandBufs * sizeof(_commandBuffer)); //backing store for the command buffer objects
+		void* cpamem = malloc(controlListSize); //backing store for the control lists, handed out in ARM_PAGE_SIZE blocks
+		if(!pamem || !cpamem)
+		{
+			//release everything acquired so far, a failed create must not leak (free(0) is a no-op)
+			free(pamem);
+			free(cpamem);
+			free(cp);
+			return VK_ERROR_OUT_OF_HOST_MEMORY;
+		}
+
+		cp->pa = createPoolAllocator(pamem, sizeof(_commandBuffer), numCommandBufs * sizeof(_commandBuffer));
+		cp->cpa = createConsecutivePoolAllocator(cpamem, ARM_PAGE_SIZE, controlListSize);
+	}
+
+	*pCommandPool = (VkCommandPool)cp;
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffer-allocation
+ * vkAllocateCommandBuffers can be used to create multiple command buffers. If the creation of any of those command buffers fails,
+ * the implementation must destroy all successfully created command buffer objects from this command, set all entries of the pCommandBuffers array to NULL and return the error.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers(
+		VkDevice device,
+		const VkCommandBufferAllocateInfo* pAllocateInfo,
+		VkCommandBuffer* pCommandBuffers)
+{
+	assert(device);
+	assert(pAllocateInfo);
+	assert(pCommandBuffers);
+
+	VkResult res = VK_SUCCESS;
+
+	_commandPool* cp = (_commandPool*)pAllocateInfo->commandPool;
+
+	//null the whole output array first so the failure path can tell which entries this call created
+	for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
+	{
+		pCommandBuffers[c] = 0;
+	}
+
+	//if(cp->usePoolAllocator)
+	{
+		for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
+		{
+			pCommandBuffers[c] = poolAllocate(&cp->pa);
+
+			if(!pCommandBuffers[c])
+			{
+				res = VK_ERROR_OUT_OF_HOST_MEMORY;
+				break;
+			}
+
+			pCommandBuffers[c]->shaderRecCount = 0;
+			pCommandBuffers[c]->usageFlags = 0;
+			pCommandBuffers[c]->state = CMDBUF_STATE_INITIAL;
+			pCommandBuffers[c]->cp = cp;
+			clInit(&pCommandBuffers[c]->binCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->handlesCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->shaderRecCl, consecutivePoolAllocate(&cp->cpa, 1));
+			clInit(&pCommandBuffers[c]->uniformsCl, consecutivePoolAllocate(&cp->cpa, 1));
+
+			//a control list without backing memory makes the whole command buffer unusable
+			if(!pCommandBuffers[c]->binCl.buffer || !pCommandBuffers[c]->handlesCl.buffer ||
+			   !pCommandBuffers[c]->shaderRecCl.buffer || !pCommandBuffers[c]->uniformsCl.buffer)
+			{
+				res = VK_ERROR_OUT_OF_HOST_MEMORY;
+				break;
+			}
+		}
+	}
+
+	if(res != VK_SUCCESS)
+	{
+		//if(cp->usePoolAllocator)
+		{
+			for(uint32_t c = 0; c < pAllocateInfo->commandBufferCount; ++c)
+			{
+				_commandBuffer* cb = pCommandBuffers[c];
+
+				if(!cb)
+				{
+					continue; //nothing was allocated for this entry
+				}
+
+				//give back the pool blocks themselves (cl.buffer), not the address of the ControlList struct;
+				//a list whose allocation failed has no block to give back (assumes clInit records numBlocks per list - TODO confirm)
+				if(cb->binCl.buffer) consecutivePoolFree(&cp->cpa, cb->binCl.buffer, cb->binCl.numBlocks);
+				if(cb->handlesCl.buffer) consecutivePoolFree(&cp->cpa, cb->handlesCl.buffer, cb->handlesCl.numBlocks);
+				if(cb->shaderRecCl.buffer) consecutivePoolFree(&cp->cpa, cb->shaderRecCl.buffer, cb->shaderRecCl.numBlocks);
+				if(cb->uniformsCl.buffer) consecutivePoolFree(&cp->cpa, cb->uniformsCl.buffer, cb->uniformsCl.numBlocks);
+
+				poolFree(&cp->pa, cb);
+				pCommandBuffers[c] = 0;
+			}
+		}
+	}
+
+	return res;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkBeginCommandBuffer
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer(
+		VkCommandBuffer commandBuffer,
+		const VkCommandBufferBeginInfo* pBeginInfo)
+{
+	assert(commandBuffer);
+	assert(pBeginInfo);
+
+	//TODO
+
+	//VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT
+	//specifies that each recording of the command buffer will only be submitted once, and the command buffer will be reset and recorded again between each submission.
+
+	//VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT
+	//specifies that a secondary command buffer is considered to be entirely inside a render pass. If this is a primary command buffer, then this bit is ignored
+
+	//VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT
+	//specifies that a command buffer can be resubmitted to a queue while it is in the pending state, and recorded into multiple primary command buffers
+
+	//When a command buffer begins recording, all state in that command buffer is undefined
+
+	struct drm_vc4_submit_cl submitCl =
+	{
+		.color_read.hindex = ~0,
+		.zs_read.hindex = ~0,
+		.color_write.hindex = ~0,
+		.msaa_color_write.hindex = ~0,
+		.zs_write.hindex = ~0,
+		.msaa_zs_write.hindex = ~0,
+	};
+
+	commandBuffer->usageFlags = pBeginInfo->flags;
+	commandBuffer->shaderRecCount = 0;
+	commandBuffer->state = CMDBUF_STATE_RECORDING;
+	commandBuffer->submitCl = submitCl;
+
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEndCommandBuffer
+ * If there was an error during recording, the application will be notified by an unsuccessful return code returned by vkEndCommandBuffer.
+ * If the application wishes to further use the command buffer, the command buffer must be reset. The command buffer must have been in the recording state,
+ * and is moved to the executable state.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer(
+		VkCommandBuffer commandBuffer)
+{
+	assert(commandBuffer);
+
+	//Increment the semaphore indicating that binning is done and
+	//unblocking the render thread.  Note that this doesn't act
+	//until the FLUSH completes.
+	//The FLUSH caps all of our bin lists with a
+	//VC4_PACKET_RETURN.
+	clFit(commandBuffer, &commandBuffer->binCl, V3D21_INCREMENT_SEMAPHORE_length); //grow binCl first if the packet would not fit
+	clInsertIncrementSemaphore(&commandBuffer->binCl);
+	clFit(commandBuffer, &commandBuffer->binCl, V3D21_FLUSH_length); //same for the flush packet
+	clInsertFlush(&commandBuffer->binCl);
+
+	commandBuffer->state = CMDBUF_STATE_EXECUTABLE; //recording is finished, the buffer may now be submitted
+
+	return VK_SUCCESS; //NOTE(review): no recording error is tracked here, so success is always reported
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueSubmit
+ * vkQueueSubmit is a queue submission command, with each batch defined by an element of pSubmits as an instance of the VkSubmitInfo structure.
+ * Batches begin execution in the order they appear in pSubmits, but may complete out of order.
+ * Fence and semaphore operations submitted with vkQueueSubmit have additional ordering constraints compared to other submission commands,
+ * with dependencies involving previous and subsequent queue operations. Information about these additional constraints can be found in the semaphore and
+ * fence sections of the synchronization chapter.
+ * Details on the interaction of pWaitDstStageMask with synchronization are described in the semaphore wait operation section of the synchronization chapter.
+ * The order that batches appear in pSubmits is used to determine submission order, and thus all the implicit ordering guarantees that respect it.
+ * Other than these implicit ordering guarantees and any explicit synchronization primitives, these batches may overlap or otherwise execute out of order.
+ * If any command buffer submitted to this queue is in the executable state, it is moved to the pending state.
Once execution of all submissions of a command buffer complete,
+ * it moves from the pending state, back to the executable state. If a command buffer was recorded with the VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT flag,
+ * it instead moves back to the invalid state.
+ * If vkQueueSubmit fails, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY.
+ * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced by the submitted command buffers and any semaphores
+ * referenced by pSubmits is unaffected by the call or its failure. If vkQueueSubmit fails in such a way that the implementation is unable to make that guarantee,
+ * the implementation must return VK_ERROR_DEVICE_LOST. See Lost Device.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit(
+		VkQueue queue,
+		uint32_t submitCount,
+		const VkSubmitInfo* pSubmits,
+		VkFence fence)
+{
+	assert(queue); //NOTE(review): submitCount and fence are never read below; only the first element of pSubmits is processed
+
+	for(int c = 0; c < pSubmits->waitSemaphoreCount; ++c)
+	{
+		sem_wait((sem_t*)pSubmits->pWaitSemaphores[c]); //blocks the calling thread until each wait semaphore is posted
+	}
+
+	//TODO: deal with pSubmits->pWaitDstStageMask
+
+	//TODO wait for fence??
+
+	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
+	{
+		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_EXECUTABLE)
+		{
+			pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_PENDING; //executable -> pending for the duration of the submit
+		}
+	}
+
+	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
+	{
+		VkCommandBuffer cmdbuf = pSubmits->pCommandBuffers[c];
+
+		cmdbuf->submitCl.bo_handles = cmdbuf->handlesCl.buffer;
+		cmdbuf->submitCl.bo_handle_count = clSize(&cmdbuf->handlesCl) / 4; //handles are stored as 4 byte entries
+		cmdbuf->submitCl.bin_cl = cmdbuf->binCl.buffer;
+		cmdbuf->submitCl.bin_cl_size = clSize(&cmdbuf->binCl);
+		cmdbuf->submitCl.shader_rec = cmdbuf->shaderRecCl.buffer;
+		cmdbuf->submitCl.shader_rec_size = clSize(&cmdbuf->shaderRecCl);
+		cmdbuf->submitCl.shader_rec_count = cmdbuf->shaderRecCount;
+		cmdbuf->submitCl.uniforms = cmdbuf->uniformsCl.buffer;
+		cmdbuf->submitCl.uniforms_size = clSize(&cmdbuf->uniformsCl);
+
+		printf("BCL:\n"); //everything from here to the ioctl is debug output of the submit structure
+		clDump(cmdbuf->submitCl.bin_cl, cmdbuf->submitCl.bin_cl_size);
+		printf("BO handles: ");
+		for(int d = 0; d < cmdbuf->submitCl.bo_handle_count; ++d)
+		{
+			printf("%u ", *((uint32_t*)(cmdbuf->submitCl.bo_handles)+d));
+		}
+		printf("\nwidth height: %u, %u\n", cmdbuf->submitCl.width, cmdbuf->submitCl.height);
+		printf("tile min/max: %u,%u %u,%u\n", cmdbuf->submitCl.min_x_tile, cmdbuf->submitCl.min_y_tile, cmdbuf->submitCl.max_x_tile, cmdbuf->submitCl.max_y_tile);
+		printf("color read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_read.hindex, cmdbuf->submitCl.color_read.offset, cmdbuf->submitCl.color_read.bits, cmdbuf->submitCl.color_read.flags);
+		printf("color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_write.hindex, cmdbuf->submitCl.color_write.offset, cmdbuf->submitCl.color_write.bits, cmdbuf->submitCl.color_write.flags);
+		printf("zs read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_read.hindex, cmdbuf->submitCl.zs_read.offset, cmdbuf->submitCl.zs_read.bits, cmdbuf->submitCl.zs_read.flags);
+		printf("zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_write.hindex, cmdbuf->submitCl.zs_write.offset, cmdbuf->submitCl.zs_write.bits, cmdbuf->submitCl.zs_write.flags);
+		printf("msaa color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_color_write.hindex, cmdbuf->submitCl.msaa_color_write.offset, cmdbuf->submitCl.msaa_color_write.bits, cmdbuf->submitCl.msaa_color_write.flags);
+		printf("msaa zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_zs_write.hindex, cmdbuf->submitCl.msaa_zs_write.offset, cmdbuf->submitCl.msaa_zs_write.bits, cmdbuf->submitCl.msaa_zs_write.flags);
+		printf("clear color packed rgba %u %u\n", cmdbuf->submitCl.clear_color[0], cmdbuf->submitCl.clear_color[1]);
+		printf("clear z %u\n", cmdbuf->submitCl.clear_z);
+		printf("clear s %u\n", cmdbuf->submitCl.clear_s);
+		printf("flags %u\n", cmdbuf->submitCl.flags);
+
+
+		//submit ioctl
+		static uint64_t lastFinishedSeqno = 0; //function-local static: one value shared by every call and every queue
+		vc4_cl_submit(controlFd, &cmdbuf->submitCl, &queue->lastEmitSeqno, &lastFinishedSeqno); //NOTE(review): any result of the submit is not checked
+	}
+
+	for(int c = 0; c < pSubmits->commandBufferCount; ++c)
+	{
+		if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_PENDING)
+		{
+			if(pSubmits->pCommandBuffers[c]->usageFlags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT)
+			{
+				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_INVALID; //one-time buffers can't be submitted again
+			}
+			else
+			{
+				pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_EXECUTABLE; //reusable buffers go back to executable
+			}
+		}
+	}
+
+	for(int c = 0; c < pSubmits->signalSemaphoreCount; ++c)
+	{
+		sem_post((sem_t*)pSubmits->pSignalSemaphores[c]); //signal semaphores are posted right after the submit calls return
+	}
+
+	return VK_SUCCESS;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkFreeCommandBuffers
+ * Any primary command buffer that is in the recording or executable state and has any element of pCommandBuffers recorded into it, becomes invalid.
+ */
+VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers(
+		VkDevice device,
+		VkCommandPool commandPool,
+		uint32_t commandBufferCount,
+		const VkCommandBuffer* pCommandBuffers)
+{
+	assert(device);
+	assert(commandPool);
+	assert(pCommandBuffers);
+
+	_commandPool* cp = (_commandPool*)commandPool;
+
+	for(uint32_t c = 0; c < commandBufferCount; ++c)
+	{
+		//if(cp->usePoolAllocator)
+		{
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->binCl.buffer, pCommandBuffers[c]->binCl.numBlocks); //free the pool blocks (cl.buffer), not the ControlList struct
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->handlesCl.buffer, pCommandBuffers[c]->handlesCl.numBlocks); //each list may have grown separately in clFit, so use its own block count
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->shaderRecCl.buffer, pCommandBuffers[c]->shaderRecCl.numBlocks);
+			consecutivePoolFree(&cp->cpa, pCommandBuffers[c]->uniformsCl.buffer, pCommandBuffers[c]->uniformsCl.numBlocks);
+			poolFree(&cp->pa, pCommandBuffers[c]); //finally return the command buffer object itself
+		}
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyCommandPool
+ * When a pool is destroyed, all command buffers allocated from the pool are freed.
+ * Any primary command buffer allocated from another VkCommandPool that is in the recording or executable state and has a secondary command buffer
+ * allocated from commandPool recorded into it, becomes invalid.
+ */
+VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool(
+		VkDevice device,
+		VkCommandPool commandPool,
+		const VkAllocationCallbacks* pAllocator)
+{
+	assert(device);
+	assert(commandPool);
+
+	//TODO: allocator is ignored for now
+	assert(pAllocator == 0);
+
+	_commandPool* cp = (_commandPool*)commandPool;
+
+	//if(cp->usePoolAllocator)
+	{
+		free(cp->pa.buf); //the backing memory was malloc'ed in vkCreateCommandPool, so it is released here
+		free(cp->cpa.buf);
+		destroyPoolAllocator(&cp->pa); //only clears the allocator bookkeeping
+		destroyConsecutivePoolAllocator(&cp->cpa);
+	}
+
+	free(cp);
+}
+
+void clFit(VkCommandBuffer cb, ControlList* cl, uint32_t commandSize) //makes sure cl can take commandSize more bytes, growing it from the pool if not
+{
+	if(!clHasEnoughSpace(cl, commandSize))
+	{
+		uint32_t currSize = clSize(cl); //remember the fill level, the buffer may move
+		cl->buffer = consecutivePoolReAllocate(&cb->cp->cpa, cl->buffer, cl->numBlocks); assert(cl->buffer); //NOTE(review): cl->numBlocks is not updated here - confirm the reallocation does it
+		cl->nextFreeByte = cl->buffer + currSize; //re-anchor the write cursor in the (possibly new) buffer
+	}
+}
+
+void clDump(void* cl, uint32_t size) //decodes and prints the first size bytes of control list cl, packet by packet
+{
+	struct v3d_device_info devinfo = {
+		/* While the driver supports V3D 2.1 and 2.6, we haven't split
+		 * off a 2.6 XML yet (there are a couple of fields different
+		 * in render target formatting)
+		 */
+		.ver = 21,
+	};
+	struct v3d_spec* spec = v3d_spec_load(&devinfo); //NOTE(review): spec is never released in this function - confirm whether it needs to be
+
+	struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true);
+
+	uint32_t offset = 0, hw_offset = 0;
+	uint8_t *p = cl;
+
+	while (offset < size) {
+		struct v3d_group *inst = v3d_spec_find_instruction(spec, p);
+		uint8_t header = *p;
+		uint32_t length;
+
+		if (inst == NULL) {
+			printf("0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n",
+				offset, hw_offset, header, header);
+			goto done; //can't know the packet length, stop decoding but still release clif
+		}
+
+		length = v3d_group_get_length(inst);
+
+		printf("0x%08x 0x%08x: 0x%02x %s\n",
+			offset, hw_offset, header, v3d_group_get_name(inst));
+
+		v3d_print_group(clif, inst, offset, p);
+
+		switch (header) {
+		case VC4_PACKET_HALT:
+		case VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF:
+			goto done; //end-of-list packets: stop, but go through the cleanup (a plain return leaked clif)
+		default:
+			break;
+		}
+
+		offset += length;
+		if (header != VC4_PACKET_GEM_HANDLES)
+			hw_offset += length;
+		p += length;
+	}
+done:
+	clif_dump_destroy(clif);
+}
diff --git a/driver/common.c b/driver/common.c
new
file mode 100644
index 0000000..58a578d
--- /dev/null
+++ b/driver/common.c
@@ -0,0 +1,328 @@
+#include "common.h"
+
+#include "kernel/vc4_packet.h"
+
+void createImageBO(_image* i) //fills in padded size, stride and tiling of image i and allocates its buffer object
+{
+	assert(i);
+	assert(i->format);
+	assert(i->width);
+	assert(i->height);
+
+	uint32_t bpp = getFormatBpp(i->format);
+	uint32_t pixelSizeBytes = bpp / 8; //whole bytes per pixel (integer division)
+	uint32_t nonPaddedSize = i->width * i->height * pixelSizeBytes;
+	i->paddedWidth = i->width;
+	i->paddedHeight = i->height;
+
+	//need to pad to T format, as HW automatically chooses that
+	if(nonPaddedSize > 4096)
+	{
+		getPaddedTextureDimensionsT(i->width, i->height, bpp, &i->paddedWidth, &i->paddedHeight);
+	}
+
+	i->size = i->paddedWidth * i->paddedHeight * pixelSizeBytes;
+	i->stride = i->paddedWidth * pixelSizeBytes;
+	i->handle = vc4_bo_alloc(controlFd, i->size, "swapchain image"); assert(i->handle);
+
+	//set tiling to T if size > 4KB
+	if(nonPaddedSize > 4096)
+	{
+		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED); assert(ret);
+		i->tiling = VC4_TILING_FORMAT_T;
+	}
+	else
+	{
+		int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_LINEAR); assert(ret);
+		i->tiling = VC4_TILING_FORMAT_LT;
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdClearColorImage
+ * Color and depth/stencil images can be cleared outside a render pass instance using vkCmdClearColorImage or vkCmdClearDepthStencilImage, respectively.
+ * These commands are only allowed outside of a render pass instance.
+ */
+VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage(
+		VkCommandBuffer commandBuffer,
+		VkImage image,
+		VkImageLayout imageLayout,
+		const VkClearColorValue* pColor,
+		uint32_t rangeCount,
+		const VkImageSubresourceRange* pRanges)
+{
+	assert(commandBuffer);
+	assert(image);
+	assert(pColor);
+
+	//TODO this should only flag an image for clearing. This can only be called outside a renderpass
+	//actual clearing would only happen:
+	// -if image is rendered to (insert clear before first draw call)
+	// -if the image is bound for sampling (submit a CL with a clear)
+	// -if a command buffer is submitted without any rendering (insert clear)
+	// -etc.
+	//we shouldn't clear an image if no one uses it
+
+	//TODO ranges support (rangeCount and pRanges are not read below)
+
+	assert(imageLayout == VK_IMAGE_LAYOUT_GENERAL ||
+		   imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR ||
+		   imageLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+
+	assert(commandBuffer->state == CMDBUF_STATE_RECORDING);
+	assert(_queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_GRAPHICS_BIT || _queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_COMPUTE_BIT);
+
+	_image* i = image;
+
+	assert(i->usageBits & VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+
+	//TODO externally sync cmdbuf, cmdpool
+
+	i->needToClear = 1; //nothing is emitted into the command buffer here, the image is only flagged
+	i->clearColor[0] = i->clearColor[1] = packVec4IntoABGR8(pColor->float32); //the same packed colour goes into both slots
+}
+
+int findInstanceExtension(char* name) //index of name in instanceExtensions, or -1 when it is not listed
+{
+	for(int c = 0; c < numInstanceExtensions; ++c)
+	{
+		if(strcmp(instanceExtensions[c].extensionName, name) == 0)
+		{
+			return c;
+		}
+	}
+
+	return -1;
+}
+
+int findDeviceExtension(char* name) //index of name in deviceExtensions, or -1 when it is not listed
+{
+	for(int c = 0; c < numDeviceExtensions; ++c)
+	{
+		if(strcmp(deviceExtensions[c].extensionName, name) == 0)
+		{
+			return c;
+		}
+	}
+
+	return -1;
+}
+
+//Textures in T format:
+//formed out of 4KB tiles, which have 1KB subtiles (see page 105 in VC4 arch guide)
+//1KB subtiles have 512b microtiles.
+//Width/height of the 512b microtiles is the following:
+// 64bpp: 2x4
+// 32bpp: 4x4
+// 16bpp: 8x4
+// 8bpp: 8x8
+// 4bpp: 16x8
+// 1bpp: 32x16
+//Therefore width/height of 1KB subtiles is the following:
+// 64bpp: 8x16
+// 32bpp: 16x16
+// 16bpp: 32x16
+// 8bpp: 32x32
+// 4bpp: 64x32
+// 1bpp: 128x64
+//Finally width/height of the 4KB tiles:
+// 64bpp: 16x32
+// 32bpp: 32x32
+// 16bpp: 64x32
+// 8bpp: 64x64
+// 4bpp: 128x64
+// 1bpp: 256x128
+void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight)
+{
+	assert(paddedWidth);
+	assert(paddedHeight);
+	uint32_t tileW = 0;
+	uint32_t tileH = 0;
+
+	switch(bpp) //pick the 4KB tile dimensions from the table above
+	{
+	case 64:
+	{
+		tileW = 16;
+		tileH = 32;
+		break;
+	}
+	case 32:
+	{
+		tileW = 32;
+		tileH = 32;
+		break;
+	}
+	case 16:
+	{
+		tileW = 64;
+		tileH = 32;
+		break;
+	}
+	case 8:
+	{
+		tileW = 64;
+		tileH = 64;
+		break;
+	}
+	case 4:
+	{
+		tileW = 128;
+		tileH = 64;
+		break;
+	}
+	case 1:
+	{
+		tileW = 256;
+		tileH = 128;
+		break;
+	}
+	default:
+	{
+		assert(0); //unsupported
+	}
+	}
+
+	*paddedWidth = ((tileW - (width % tileW)) % tileW) + width; //round up to a whole number of tiles
+	*paddedHeight = ((tileH - (height % tileH)) % tileH) + height;
+}
+
+uint32_t getFormatBpp(VkFormat f) //bits per pixel as stored for format f; asserts on formats not listed
+{
+	switch(f)
+	{
+	case VK_FORMAT_R16G16B16A16_SFLOAT:
+		return 64;
+	case VK_FORMAT_R8G8B8_UNORM:
+		//24 bit RGB is padded to 32
+	case VK_FORMAT_R8G8B8A8_UNORM:
+		return 32;
+	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
+	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
+	case VK_FORMAT_R5G6B5_UNORM_PACK16:
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R16_SFLOAT:
+	case VK_FORMAT_R16_SINT:
+		return 16;
+	case VK_FORMAT_R8_UNORM:
+	case VK_FORMAT_R8_SINT:
+		return 8;
+	default:
+		assert(0);
+		return 0;
+	}
+}
+
+uint32_t packVec4IntoABGR8(const float rgba[4]) //packs four floats into 0xAABBGGRR; components are expected in [0, 1]
+{
+	uint8_t r, g, b, a;
+	r = rgba[0] * 255.0;
+	g = rgba[1] * 255.0;
+	b = rgba[2] * 255.0;
+	a = rgba[3] * 255.0;
+
+	uint32_t res = 0 |
+			((uint32_t)a << 24) | //widen before shifting: a uint8_t promotes to int and (a << 24) overflows it for a >= 128
+			((uint32_t)b << 16) |
+			((uint32_t)g << 8) |
+			((uint32_t)r << 0);
+
+	return res;
+}
+
+/*static inline void util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc)
+{
+	ubyte r = 0;
+	ubyte g = 0;
+	ubyte b = 0;
+	ubyte a = 0;
+
+	if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) <= 8) {
+		r = float_to_ubyte(rgba[0]);
+		g = float_to_ubyte(rgba[1]);
+		b = float_to_ubyte(rgba[2]);
+		a = float_to_ubyte(rgba[3]);
+	}
+
+	switch (format) {
+	case PIPE_FORMAT_ABGR8888_UNORM:
+	{
+		uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | a;
+	}
+		return;
+	case PIPE_FORMAT_XBGR8888_UNORM:
+	{
+		uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | 0xff;
+	}
+		return;
+	case PIPE_FORMAT_BGRA8888_UNORM:
+	{
+		uc->ui[0] = (a << 24) | (r << 16) | (g << 8) | b;
+	}
+		return;
+	case PIPE_FORMAT_BGRX8888_UNORM:
+	{
+		uc->ui[0] = (0xffu << 24) | (r << 16) | (g << 8) | b;
+	}
+		return;
+	case PIPE_FORMAT_ARGB8888_UNORM:
+	{
+		uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | a;
+	}
+		return;
+	case PIPE_FORMAT_XRGB8888_UNORM:
+	{
+		uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | 0xff;
+	}
+		return;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	{
+		uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3);
+	}
+		return;
+	case PIPE_FORMAT_B5G5R5X1_UNORM:
+	{
+		uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+	}
+		return;
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+	{
+		uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3);
+	}
+		return;
+	case PIPE_FORMAT_B4G4R4A4_UNORM:
+	{
+		uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4);
+	}
+		return;
+	case PIPE_FORMAT_A8_UNORM:
+	{
+		uc->ub = a;
+	}
+		return;
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+	{
+		uc->ub = r;
+	}
+		return;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	{
+		uc->f[0] = rgba[0];
+		uc->f[1] = rgba[1];
+		uc->f[2] = rgba[2];
+		uc->f[3] = rgba[3];
+	}
+		return;
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	{
+		uc->f[0] = rgba[0];
+		uc->f[1] = rgba[1];
+		uc->f[2] = rgba[2];
+	}
+		return;
+
+
+	default:
+		util_format_write_4f(format, rgba, 0, uc, 0, 0, 0, 1, 1);
+	}
+}*/
diff --git a/driver/common.h b/driver/common.h
new file mode 100644
index 0000000..40eda28
--- /dev/null
+++ b/driver/common.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include
+#include
+#include
+
+#include
+#include "vkExt.h"
+
+#include "AlignedAllocator.h"
+#include "PoolAllocator.h"
+#include "ConsecutivePoolAllocator.h"
+#include "LinearAllocator.h"
+
+#include
+#include "CustomAssert.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "modeset.h"
+#include "kernelInterface.h"
+#include "ControlListUtil.h"
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b)) //fully parenthesized so operator arguments expand correctly; each argument may still be evaluated twice
+#endif
+
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b)) //same caveat as min: don't pass expressions with side effects
+#endif
+
+#include "vkCaps.h"
+
+typedef struct VkPhysicalDevice_T
+{
+	//hardware id?
+	int dummy;
+} _physicalDevice;
+
+typedef struct VkQueue_T
+{
+	uint64_t lastEmitSeqno; //passed to vc4_cl_submit on every submit
+} _queue;
+
+typedef struct VkCommandPool_T
+{
+	PoolAllocator pa; //hands out the _commandBuffer objects
+	ConsecutivePoolAllocator cpa; //hands out the control list memory in ARM_PAGE_SIZE blocks
+	uint32_t queueFamilyIndex;
+} _commandPool;
+
+typedef enum commandBufferState
+{
+	CMDBUF_STATE_INITIAL = 0,
+	CMDBUF_STATE_RECORDING,
+	CMDBUF_STATE_EXECUTABLE,
+	CMDBUF_STATE_PENDING,
+	CMDBUF_STATE_INVALID,
+	CMDBUF_STATE_LAST
+} commandBufferState;
+
+typedef struct VkCommandBuffer_T
+{
+	//Recorded commands include commands to bind pipelines and descriptor sets to the command buffer, commands to modify dynamic state, commands to draw (for graphics rendering),
+	//commands to dispatch (for compute), commands to execute secondary command buffers (for primary command buffers only), commands to copy buffers and images, and other commands
+
+	struct drm_vc4_submit_cl submitCl; //filled in by vkBeginCommandBuffer / vkQueueSubmit and handed to the submit ioctl
+
+	ControlList binCl;
+	ControlList shaderRecCl;
+	uint32_t shaderRecCount;
+	ControlList uniformsCl;
+	ControlList handlesCl;
+	commandBufferState state;
+	VkCommandBufferUsageFlags usageFlags;
+	_commandPool* cp; //pool this buffer was allocated from
+} _commandBuffer;
+
+typedef struct VkInstance_T
+{
+	//supposedly this should contain all the enabled layers?
+	int enabledExtensions[numInstanceExtensions];
+	int numEnabledExtensions;
+	_physicalDevice dev;
+	int chipVersion;
+	int hasTiling;
+	int hasControlFlow;
+	int hasEtc1;
+	int hasThreadedFs;
+	int hasMadvise;
+} _instance;
+
+typedef struct VkDevice_T
+{
+	int enabledExtensions[numDeviceExtensions];
+	int numEnabledExtensions;
+	VkPhysicalDeviceFeatures enabledFeatures;
+	_physicalDevice* dev;
+	_queue* queues[numQueueFamilies];
+	int numQueues[numQueueFamilies];
+} _device;
+
+typedef struct VkSwapchain_T
+{
+	_image* images;
+	uint32_t numImages;
+	uint32_t backbufferIdx;
+	VkSurfaceKHR surface;
+} _swapchain;
+
+void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight);
+uint32_t getFormatBpp(VkFormat f);
+uint32_t packVec4IntoABGR8(const float rgba[4]);
+int findInstanceExtension(char* name);
+int findDeviceExtension(char* name);
+void createImageBO(_image* i);
diff --git a/driver/device.c b/driver/device.c
new file mode 100644
index 0000000..89be256
--- /dev/null
+++ b/driver/device.c
@@ -0,0 +1,314 @@
+#include "common.h"
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#devsandqueues-physical-device-enumeration
+ * If pPhysicalDevices is NULL, then the number of physical devices available is returned in pPhysicalDeviceCount. Otherwise, pPhysicalDeviceCount must point to a
+ * variable set by the user to the number of elements in the pPhysicalDevices array, and on return the variable is overwritten with the number of handles actually
+ * written to pPhysicalDevices. If pPhysicalDeviceCount is less than the number of physical devices available, at most pPhysicalDeviceCount structures will be written.
+ * If pPhysicalDeviceCount is smaller than the number of physical devices available, VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the
+ * available physical devices were returned.
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices(
+		VkInstance instance,
+		uint32_t* pPhysicalDeviceCount,
+		VkPhysicalDevice* pPhysicalDevices)
+{
+	assert(instance);
+
+	//TODO is there a way to check if there's a gpu (and it's the rPi)?
+	int gpuExists = access( "/dev/dri/card0", F_OK ) != -1;
+
+	int numGPUs = gpuExists; //0 or 1
+
+	assert(pPhysicalDeviceCount);
+
+	if(!pPhysicalDevices)
+	{
+		*pPhysicalDeviceCount = numGPUs; //count query only
+		return VK_SUCCESS;
+	}
+
+	int arraySize = *pPhysicalDeviceCount;
+	int elementsWritten = min(numGPUs, arraySize);
+
+	for(int c = 0; c < elementsWritten; ++c)
+	{
+		pPhysicalDevices[c] = &instance->dev;
+	}
+
+	*pPhysicalDeviceCount = elementsWritten;
+
+	if(elementsWritten < numGPUs) //incomplete means the array was too small for all devices, not that it had room to spare
+	{
+		return VK_INCOMPLETE;
+	}
+	else
+	{
+		return VK_SUCCESS;
+	}
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceProperties
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties(
+		VkPhysicalDevice physicalDevice,
+		VkPhysicalDeviceProperties* pProperties)
+{
+	assert(physicalDevice);
+	assert(pProperties);
+
+	VkPhysicalDeviceSparseProperties sparseProps =
+	{
+		.residencyStandard2DBlockShape = 1,
+		.residencyStandard2DMultisampleBlockShape = 1,
+		.residencyStandard3DBlockShape = 1,
+		.residencyAlignedMipSize = 1,
+		.residencyNonResidentStrict = 1
+	};
+
+	pProperties->apiVersion = VK_MAKE_VERSION(1,1,0);
+	pProperties->driverVersion = 1; //we'll simply call this v1
+	pProperties->vendorID = 0x14E4; //Broadcom
+	pProperties->deviceID = 0; //TODO dunno?
+	pProperties->deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU;
+	strcpy(pProperties->deviceName, "VideoCore IV HW");
+	//pProperties->pipelineCacheUUID
+	pProperties->limits = _limits;
+	pProperties->sparseProperties = sparseProps;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceFeatures
+ */
+VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures(
+		VkPhysicalDevice physicalDevice,
+		VkPhysicalDeviceFeatures* pFeatures)
+{
+	assert(physicalDevice);
+	assert(pFeatures);
+
+	*pFeatures = _features;
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateDeviceExtensionProperties
+ */
+VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties(
+		VkPhysicalDevice physicalDevice,
+		const char* pLayerName,
+		uint32_t* pPropertyCount,
+		VkExtensionProperties* pProperties)
+{
+	assert(physicalDevice);
+	assert(!pLayerName); //layers ignored for now
+	assert(pPropertyCount);
+
+	if(!pProperties)
+	{
+		*pPropertyCount = numDeviceExtensions;
+		return VK_SUCCESS; //a pure count query is complete by definition
+	}
+
+	int arraySize = *pPropertyCount;
+	int elementsWritten = min(numDeviceExtensions, arraySize);
+
+	for(int c = 0; c < elementsWritten; ++c)
+	{
+		pProperties[c] = deviceExtensions[c];
+	}
+
+	*pPropertyCount = elementsWritten;
+
+	return elementsWritten < numDeviceExtensions ? VK_INCOMPLETE : VK_SUCCESS; //tell the caller when its array was too small
+}
+
+/*
+ * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceQueueFamilyProperties
+ * If pQueueFamilyProperties is NULL, then the number of queue families available is returned in pQueueFamilyPropertyCount.
+ * Otherwise, pQueueFamilyPropertyCount must point to a variable set by the user to the number of elements in the pQueueFamilyProperties array,
+ * and on return the variable is overwritten with the number of structures actually written to pQueueFamilyProperties.
If pQueueFamilyPropertyCount + * is less than the number of queue families available, at most pQueueFamilyPropertyCount structures will be written. + */ +VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties( + VkPhysicalDevice physicalDevice, + uint32_t* pQueueFamilyPropertyCount, + VkQueueFamilyProperties* pQueueFamilyProperties) +{ + assert(physicalDevice); + assert(pQueueFamilyPropertyCount); + + if(!pQueueFamilyProperties) + { + *pQueueFamilyPropertyCount = 1; + return; + } + + int arraySize = *pQueueFamilyPropertyCount; + int elementsWritten = min(numQueueFamilies, arraySize); + + for(int c = 0; c < elementsWritten; ++c) + { + pQueueFamilyProperties[c] = _queueFamilyProperties[c]; + } + + *pQueueFamilyPropertyCount = elementsWritten; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceSupportKHR + * does this queue family support presentation to this surface? + */ +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR( + VkPhysicalDevice physicalDevice, + uint32_t queueFamilyIndex, + VkSurfaceKHR surface, + VkBool32* pSupported) +{ + assert(pSupported); + assert(surface); + assert(physicalDevice); + + assert(queueFamilyIndex < numQueueFamilies); + + //TODO if we plan to support headless rendering, there should be 2 families + //one using /dev/dri/card0 which has modesetting + //other using /dev/dri/renderD128 which does not support modesetting, this would say false here + *pSupported = VK_TRUE; + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateDevice + * vkCreateDevice verifies that extensions and features requested in the ppEnabledExtensionNames and pEnabledFeatures + * members of pCreateInfo, respectively, are supported by the implementation. If any requested extension is not supported, + * vkCreateDevice must return VK_ERROR_EXTENSION_NOT_PRESENT. 
If any requested feature is not supported, vkCreateDevice must return + * VK_ERROR_FEATURE_NOT_PRESENT. Support for extensions can be checked before creating a device by querying vkEnumerateDeviceExtensionProperties + * After verifying and enabling the extensions the VkDevice object is created and returned to the application. + * If a requested extension is only supported by a layer, both the layer and the extension need to be specified at vkCreateInstance + * time for the creation to succeed. Multiple logical devices can be created from the same physical device. Logical device creation may + * fail due to lack of device-specific resources (in addition to the other errors). If that occurs, vkCreateDevice will return VK_ERROR_TOO_MANY_OBJECTS. + */ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + assert(physicalDevice); + assert(pDevice); + assert(pCreateInfo); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + *pDevice = malloc(sizeof(_device)); + if(!pDevice) + { + return VK_ERROR_TOO_MANY_OBJECTS; + } + + (*pDevice)->dev = physicalDevice; + + for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c) + { + int findres = findDeviceExtension(pCreateInfo->ppEnabledExtensionNames[c]); + if(findres > -1) + { + (*pDevice)->enabledExtensions[(*pDevice)->numEnabledExtensions] = findres; + (*pDevice)->numEnabledExtensions++; + } + else + { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + + VkBool32* requestedFeatures = pCreateInfo->pEnabledFeatures; + VkBool32* supportedFeatures = &_features; + + if(requestedFeatures) + { + for(int c = 0; c < numFeatures; ++c) + { + if(requestedFeatures[c] && !supportedFeatures[c]) + { + return VK_ERROR_FEATURE_NOT_PRESENT; + } + } + + (*pDevice)->enabledFeatures = *pCreateInfo->pEnabledFeatures; + } + else + { + memset(&(*pDevice)->enabledFeatures, 0, 
sizeof((*pDevice)->enabledFeatures)); //just disable everything + } + + //layers ignored per spec + //pCreateInfo->enabledLayerCount + + for(int c = 0; c < numQueueFamilies; ++c) + { + (*pDevice)->queues[c] = 0; + } + + if(pCreateInfo->queueCreateInfoCount > 0) + { + for(int c = 0; c < pCreateInfo->queueCreateInfoCount; ++c) + { + (*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = malloc(sizeof(_queue)*pCreateInfo->pQueueCreateInfos[c].queueCount); + + if(!(*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex]) + { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + for(int d = 0; d < pCreateInfo->pQueueCreateInfos[c].queueCount; ++d) + { + (*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex][d].lastEmitSeqno = 0; + } + + (*pDevice)->numQueues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = pCreateInfo->pQueueCreateInfos[c].queueCount; + } + } + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetDeviceQueue + * vkGetDeviceQueue must only be used to get queues that were created with the flags parameter of VkDeviceQueueCreateInfo set to zero. + * To get queues that were created with a non-zero flags parameter use vkGetDeviceQueue2. + */ +VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue( + VkDevice device, + uint32_t queueFamilyIndex, + uint32_t queueIndex, + VkQueue* pQueue) +{ + assert(device); + assert(pQueue); + + assert(queueFamilyIndex < numQueueFamilies); + assert(queueIndex < device->numQueues[queueFamilyIndex]); + + *pQueue = &device->queues[queueFamilyIndex][queueIndex]; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyDevice + * To ensure that no work is active on the device, vkDeviceWaitIdle can be used to gate the destruction of the device. 
+ * Prior to destroying a device, an application is responsible for destroying/freeing any Vulkan objects that were created using that device as the + * first parameter of the corresponding vkCreate* or vkAllocate* command + */ +VKAPI_ATTR void VKAPI_CALL vkDestroyDevice( + VkDevice device, + const VkAllocationCallbacks* pAllocator) +{ + assert(device); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + //TODO +} diff --git a/driver/driver.c b/driver/driver.c deleted file mode 100644 index f9c473d..0000000 --- a/driver/driver.c +++ /dev/null @@ -1,1938 +0,0 @@ -#include -#include "CustomAssert.h" -#include -#include -#include -#include -#include -#include - -#include -#include "vkExt.h" - -#include "modeset.h" -#include "kernelInterface.h" -#include "ControlListUtil.h" - -#include "AlignedAllocator.h" -#include "PoolAllocator.h" -#include "ConsecutivePoolAllocator.h" -#include "LinearAllocator.h" - -#include "kernel/vc4_packet.h" -#include "../brcm/cle/v3d_decoder.h" -#include "../brcm/clif/clif_dump.h" - -#ifndef min -#define min(a, b) (a < b ? a : b) -#endif - -#ifndef max -#define max(a, b) (a > b ? a : b) -#endif - -#include "vkCaps.h" - -typedef struct VkPhysicalDevice_T -{ - //hardware id? 
- int dummy; -} _physicalDevice; - -typedef struct VkQueue_T -{ - uint64_t lastEmitSeqno; -} _queue; - -typedef struct VkCommandPool_T -{ - PoolAllocator pa; - ConsecutivePoolAllocator cpa; - uint32_t queueFamilyIndex; -} _commandPool; - -typedef enum commandBufferState -{ - CMDBUF_STATE_INITIAL = 0, - CMDBUF_STATE_RECORDING, - CMDBUF_STATE_EXECUTABLE, - CMDBUF_STATE_PENDING, - CMDBUF_STATE_INVALID, - CMDBUF_STATE_LAST -} commandBufferState; - -typedef struct VkCommandBuffer_T -{ - //Recorded commands include commands to bind pipelines and descriptor sets to the command buffer, commands to modify dynamic state, commands to draw (for graphics rendering), - //commands to dispatch (for compute), commands to execute secondary command buffers (for primary command buffers only), commands to copy buffers and images, and other commands - - struct drm_vc4_submit_cl submitCl; - - ControlList binCl; - ControlList shaderRecCl; - uint32_t shaderRecCount; - ControlList uniformsCl; - ControlList handlesCl; - commandBufferState state; - VkCommandBufferUsageFlags usageFlags; - _commandPool* cp; -} _commandBuffer; - -typedef struct VkInstance_T -{ - //supposedly this should contain all the enabled layers? 
- int enabledExtensions[numInstanceExtensions]; - int numEnabledExtensions; - _physicalDevice dev; - int chipVersion; - int hasTiling; - int hasControlFlow; - int hasEtc1; - int hasThreadedFs; - int hasMadvise; -} _instance; - -typedef struct VkDevice_T -{ - int enabledExtensions[numDeviceExtensions]; - int numEnabledExtensions; - VkPhysicalDeviceFeatures enabledFeatures; - _physicalDevice* dev; - _queue* queues[numQueueFamilies]; - int numQueues[numQueueFamilies]; -} _device; - -typedef struct VkSwapchain_T -{ - _image* images; - uint32_t numImages; - uint32_t backbufferIdx; - VkSurfaceKHR surface; -} _swapchain; - -void clFit(VkCommandBuffer cb, ControlList* cl, uint32_t commandSize) -{ - if(!clHasEnoughSpace(cl, commandSize)) - { - uint32_t currSize = clSize(cl); - cl->buffer = consecutivePoolReAllocate(&cb->cp->cpa, cl->buffer, cl->numBlocks); assert(cl->buffer); - cl->nextFreeByte = cl->buffer + currSize; - } -} - -void clDump(void* cl, uint32_t size) -{ - struct v3d_device_info devinfo = { - /* While the driver supports V3D 2.1 and 2.6, we haven't split - * off a 2.6 XML yet (there are a couple of fields different - * in render target formatting) - */ - .ver = 21, - }; - struct v3d_spec* spec = v3d_spec_load(&devinfo); - - struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true); - - uint32_t offset = 0, hw_offset = 0; - uint8_t *p = cl; - - while (offset < size) { - struct v3d_group *inst = v3d_spec_find_instruction(spec, p); - uint8_t header = *p; - uint32_t length; - - if (inst == NULL) { - printf("0x%08x 0x%08x: Unknown packet 0x%02x (%d)!\n", - offset, hw_offset, header, header); - return; - } - - length = v3d_group_get_length(inst); - - printf("0x%08x 0x%08x: 0x%02x %s\n", - offset, hw_offset, header, v3d_group_get_name(inst)); - - v3d_print_group(clif, inst, offset, p); - - switch (header) { - case VC4_PACKET_HALT: - case VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF: - return; - default: - break; - } - - offset += length; - if (header != 
VC4_PACKET_GEM_HANDLES) - hw_offset += length; - p += length; - } - - clif_dump_destroy(clif); -} - -//Textures in T format: -//formed out of 4KB tiles, which have 1KB subtiles (see page 105 in VC4 arch guide) -//1KB subtiles have 512b microtiles. -//Width/height of the 512b microtiles is the following: -// 64bpp: 2x4 -// 32bpp: 4x4 -// 16bpp: 8x4 -// 8bpp: 8x8 -// 4bpp: 16x8 -// 1bpp: 32x16 -//Therefore width/height of 1KB subtiles is the following: -// 64bpp: 8x16 -// 32bpp: 16x16 -// 16bpp: 32x16 -// 8bpp: 32x32 -// 4bpp: 64x32 -// 1bpp: 128x64 -//Finally width/height of the 4KB tiles: -// 64bpp: 16x32 -// 32bpp: 32x32 -// 16bpp: 64x32 -// 8bpp: 64x64 -// 4bpp: 128x64 -// 1bpp: 256x128 -void getPaddedTextureDimensionsT(uint32_t width, uint32_t height, uint32_t bpp, uint32_t* paddedWidth, uint32_t* paddedHeight) -{ - assert(paddedWidth); - assert(paddedHeight); - uint32_t tileW = 0; - uint32_t tileH = 0; - - switch(bpp) - { - case 64: - { - tileW = 16; - tileH = 32; - break; - } - case 32: - { - tileW = 32; - tileH = 32; - break; - } - case 16: - { - tileW = 64; - tileH = 32; - break; - } - case 8: - { - tileW = 64; - tileH = 64; - break; - } - case 4: - { - tileW = 128; - tileH = 64; - break; - } - case 1: - { - tileW = 256; - tileH = 128; - break; - } - default: - { - assert(0); //unsupported - } - } - - *paddedWidth = ((tileW - (width % tileW)) % tileW) + width; - *paddedHeight = ((tileH - (height % tileH)) % tileH) + height; -} - -uint32_t getFormatBpp(VkFormat f) -{ - switch(f) - { - case VK_FORMAT_R16G16B16A16_SFLOAT: - return 64; - case VK_FORMAT_R8G8B8_UNORM: //padded to 32 - case VK_FORMAT_R8G8B8A8_UNORM: - return 32; - return 32; - case VK_FORMAT_R5G5B5A1_UNORM_PACK16: - case VK_FORMAT_R4G4B4A4_UNORM_PACK16: - case VK_FORMAT_R5G6B5_UNORM_PACK16: - case VK_FORMAT_R8G8_UNORM: - case VK_FORMAT_R16_SFLOAT: - case VK_FORMAT_R16_SINT: - return 16; - case VK_FORMAT_R8_UNORM: - case VK_FORMAT_R8_SINT: - return 8; - default: - assert(0); - return 0; - } -} - 
-void createImageBO(_image* i) -{ - assert(i); - assert(i->format); - assert(i->width); - assert(i->height); - - uint32_t bpp = getFormatBpp(i->format); - uint32_t pixelSizeBytes = bpp / 8; - uint32_t nonPaddedSize = i->width * i->height * pixelSizeBytes; - i->paddedWidth = i->width; - i->paddedHeight = i->height; - - //need to pad to T format, as HW automatically chooses that - if(nonPaddedSize > 4096) - { - getPaddedTextureDimensionsT(i->width, i->height, bpp, &i->paddedWidth, &i->paddedHeight); - } - - i->size = i->paddedWidth * i->paddedHeight * pixelSizeBytes; - i->stride = i->paddedWidth * pixelSizeBytes; - i->handle = vc4_bo_alloc(controlFd, i->size, "swapchain image"); assert(i->handle); - - //set tiling to T if size > 4KB - if(nonPaddedSize > 4096) - { - int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED); assert(ret); - i->tiling = VC4_TILING_FORMAT_T; - } - else - { - int ret = vc4_bo_set_tiling(controlFd, i->handle, DRM_FORMAT_MOD_LINEAR); assert(ret); - i->tiling = VC4_TILING_FORMAT_LT; - } -} - -/*static inline void util_pack_color(const float rgba[4], enum pipe_format format, union util_color *uc) -{ - ubyte r = 0; - ubyte g = 0; - ubyte b = 0; - ubyte a = 0; - - if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) <= 8) { - r = float_to_ubyte(rgba[0]); - g = float_to_ubyte(rgba[1]); - b = float_to_ubyte(rgba[2]); - a = float_to_ubyte(rgba[3]); - } - - switch (format) { - case PIPE_FORMAT_ABGR8888_UNORM: - { - uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | a; - } - return; - case PIPE_FORMAT_XBGR8888_UNORM: - { - uc->ui[0] = (r << 24) | (g << 16) | (b << 8) | 0xff; - } - return; - case PIPE_FORMAT_BGRA8888_UNORM: - { - uc->ui[0] = (a << 24) | (r << 16) | (g << 8) | b; - } - return; - case PIPE_FORMAT_BGRX8888_UNORM: - { - uc->ui[0] = (0xffu << 24) | (r << 16) | (g << 8) | b; - } - return; - case PIPE_FORMAT_ARGB8888_UNORM: - { - uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | a; - } - return; - 
case PIPE_FORMAT_XRGB8888_UNORM: - { - uc->ui[0] = (b << 24) | (g << 16) | (r << 8) | 0xff; - } - return; - case PIPE_FORMAT_B5G6R5_UNORM: - { - uc->us = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3); - } - return; - case PIPE_FORMAT_B5G5R5X1_UNORM: - { - uc->us = ((0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3); - } - return; - case PIPE_FORMAT_B5G5R5A1_UNORM: - { - uc->us = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3); - } - return; - case PIPE_FORMAT_B4G4R4A4_UNORM: - { - uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); - } - return; - case PIPE_FORMAT_A8_UNORM: - { - uc->ub = a; - } - return; - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - { - uc->ub = r; - } - return; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - { - uc->f[0] = rgba[0]; - uc->f[1] = rgba[1]; - uc->f[2] = rgba[2]; - uc->f[3] = rgba[3]; - } - return; - case PIPE_FORMAT_R32G32B32_FLOAT: - { - uc->f[0] = rgba[0]; - uc->f[1] = rgba[1]; - uc->f[2] = rgba[2]; - } - return; - - default: - util_format_write_4f(format, rgba, 0, uc, 0, 0, 0, 1, 1); - } -}*/ - -uint32_t packVec4IntoABGR8(const float rgba[4]) -{ - uint8_t r, g, b, a; - r = rgba[0] * 255.0; - g = rgba[1] * 255.0; - b = rgba[2] * 255.0; - a = rgba[3] * 255.0; - - uint32_t res = 0 | - (a << 24) | - (b << 16) | - (g << 8) | - (r << 0); - - return res; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateInstanceExtensionProperties - * When pLayerName parameter is NULL, only extensions provided by the Vulkan implementation or by implicitly enabled layers are returned. When pLayerName is the name of a layer, - * the instance extensions provided by that layer are returned. - * If pProperties is NULL, then the number of extensions properties available is returned in pPropertyCount. 
Otherwise, pPropertyCount must point to a variable set by the user - * to the number of elements in the pProperties array, and on return the variable is overwritten with the number of structures actually written to pProperties. - * If pPropertyCount is less than the number of extension properties available, at most pPropertyCount structures will be written. If pPropertyCount is smaller than the number of extensions available, - * VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the available properties were returned. - * Because the list of available layers may change externally between calls to vkEnumerateInstanceExtensionProperties, - * two calls may retrieve different results if a pLayerName is available in one call but not in another. The extensions supported by a layer may also change between two calls, - * e.g. if the layer implementation is replaced by a different version between those calls. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties( - const char* pLayerName, - uint32_t* pPropertyCount, - VkExtensionProperties* pProperties) -{ - assert(!pLayerName); //TODO layers ignored for now - assert(pPropertyCount); - - if(!pProperties) - { - *pPropertyCount = numInstanceExtensions; - return VK_INCOMPLETE; - } - - int arraySize = *pPropertyCount; - int elementsWritten = min(numInstanceExtensions, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - pProperties[c] = instanceExtensions[c]; - } - - *pPropertyCount = elementsWritten; - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateInstance - * There is no global state in Vulkan and all per-application state is stored in a VkInstance object. Creating a VkInstance object initializes the Vulkan library - * vkCreateInstance verifies that the requested layers exist. If not, vkCreateInstance will return VK_ERROR_LAYER_NOT_PRESENT. 
Next vkCreateInstance verifies that - * the requested extensions are supported (e.g. in the implementation or in any enabled instance layer) and if any requested extension is not supported, - * vkCreateInstance must return VK_ERROR_EXTENSION_NOT_PRESENT. After verifying and enabling the instance layers and extensions the VkInstance object is - * created and returned to the application. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance( - const VkInstanceCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkInstance* pInstance) -{ - assert(pInstance); - assert(pCreateInfo); - - *pInstance = malloc(sizeof(_instance)); - - if(!*pInstance) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - (*pInstance)->numEnabledExtensions = 0; - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //TODO: possibly we need to load layers here - //and store them in pInstance - assert(pCreateInfo->enabledLayerCount == 0); - - if(pCreateInfo->enabledExtensionCount) - { - assert(pCreateInfo->ppEnabledExtensionNames); - } - - for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c) - { - int findres = findInstanceExtension(pCreateInfo->ppEnabledExtensionNames[c]); - if(findres > -1) - { - (*pInstance)->enabledExtensions[(*pInstance)->numEnabledExtensions] = findres; - (*pInstance)->numEnabledExtensions++; - } - else - { - return VK_ERROR_EXTENSION_NOT_PRESENT; - } - } - - //TODO ignored for now - //pCreateInfo->pApplicationInfo - - int ret = openIoctl(); assert(!ret); - - (*pInstance)->chipVersion = vc4_get_chip_info(controlFd); - (*pInstance)->hasTiling = vc4_test_tiling(controlFd); - - (*pInstance)->hasControlFlow = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_BRANCHES); - (*pInstance)->hasEtc1 = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_ETC1); - (*pInstance)->hasThreadedFs = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); - (*pInstance)->hasMadvise = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_MADVISE); - - 
return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#devsandqueues-physical-device-enumeration - * If pPhysicalDevices is NULL, then the number of physical devices available is returned in pPhysicalDeviceCount. Otherwise, pPhysicalDeviceCount must point to a - * variable set by the user to the number of elements in the pPhysicalDevices array, and on return the variable is overwritten with the number of handles actually - * written to pPhysicalDevices. If pPhysicalDeviceCount is less than the number of physical devices available, at most pPhysicalDeviceCount structures will be written. - * If pPhysicalDeviceCount is smaller than the number of physical devices available, VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the - * available physical devices were returned. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkEnumeratePhysicalDevices( - VkInstance instance, - uint32_t* pPhysicalDeviceCount, - VkPhysicalDevice* pPhysicalDevices) -{ - assert(instance); - - //TODO is there a way to check if there's a gpu (and it's the rPi)? 
- int gpuExists = access( "/dev/dri/card0", F_OK ) != -1; - - int numGPUs = gpuExists; - - assert(pPhysicalDeviceCount); - - if(!pPhysicalDevices) - { - *pPhysicalDeviceCount = numGPUs; - return VK_SUCCESS; - } - - int arraySize = *pPhysicalDeviceCount; - int elementsWritten = min(numGPUs, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - pPhysicalDevices[c] = &instance->dev; - } - - *pPhysicalDeviceCount = elementsWritten; - - if(elementsWritten < arraySize) - { - return VK_INCOMPLETE; - } - else - { - return VK_SUCCESS; - } -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceProperties - */ -VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceProperties( - VkPhysicalDevice physicalDevice, - VkPhysicalDeviceProperties* pProperties) -{ - assert(physicalDevice); - assert(pProperties); - - VkPhysicalDeviceSparseProperties sparseProps = - { - .residencyStandard2DBlockShape = 1, - .residencyStandard2DMultisampleBlockShape = 1, - .residencyStandard3DBlockShape = 1, - .residencyAlignedMipSize = 1, - .residencyNonResidentStrict = 1 - }; - - pProperties->apiVersion = VK_MAKE_VERSION(1,1,0); - pProperties->driverVersion = 1; //we'll simply call this v1 - pProperties->vendorID = 0x14E4; //Broadcom - pProperties->deviceID = 0; //TODO dunno? 
- pProperties->deviceType = VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; - strcpy(pProperties->deviceName, "VideoCore IV HW"); - //pProperties->pipelineCacheUUID - pProperties->limits = _limits; - pProperties->sparseProperties = sparseProps; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceFeatures - */ -VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceFeatures( - VkPhysicalDevice physicalDevice, - VkPhysicalDeviceFeatures* pFeatures) -{ - assert(physicalDevice); - assert(pFeatures); - - *pFeatures = _features; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateDeviceExtensionProperties - */ -VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateDeviceExtensionProperties( - VkPhysicalDevice physicalDevice, - const char* pLayerName, - uint32_t* pPropertyCount, - VkExtensionProperties* pProperties) -{ - assert(physicalDevice); - assert(!pLayerName); //layers ignored for now - assert(pPropertyCount); - - if(!pProperties) - { - *pPropertyCount = numDeviceExtensions; - return VK_INCOMPLETE; - } - - int arraySize = *pPropertyCount; - int elementsWritten = min(numDeviceExtensions, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - pProperties[c] = deviceExtensions[c]; - } - - *pPropertyCount = elementsWritten; - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceQueueFamilyProperties - * If pQueueFamilyProperties is NULL, then the number of queue families available is returned in pQueueFamilyPropertyCount. - * Otherwise, pQueueFamilyPropertyCount must point to a variable set by the user to the number of elements in the pQueueFamilyProperties array, - * and on return the variable is overwritten with the number of structures actually written to pQueueFamilyProperties. 
If pQueueFamilyPropertyCount - * is less than the number of queue families available, at most pQueueFamilyPropertyCount structures will be written. - */ -VKAPI_ATTR void VKAPI_CALL vkGetPhysicalDeviceQueueFamilyProperties( - VkPhysicalDevice physicalDevice, - uint32_t* pQueueFamilyPropertyCount, - VkQueueFamilyProperties* pQueueFamilyProperties) -{ - assert(physicalDevice); - assert(pQueueFamilyPropertyCount); - - if(!pQueueFamilyProperties) - { - *pQueueFamilyPropertyCount = 1; - return; - } - - int arraySize = *pQueueFamilyPropertyCount; - int elementsWritten = min(numQueueFamilies, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - pQueueFamilyProperties[c] = _queueFamilyProperties[c]; - } - - *pQueueFamilyPropertyCount = elementsWritten; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceSupportKHR - * does this queue family support presentation to this surface? - */ -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceSupportKHR( - VkPhysicalDevice physicalDevice, - uint32_t queueFamilyIndex, - VkSurfaceKHR surface, - VkBool32* pSupported) -{ - assert(pSupported); - assert(surface); - assert(physicalDevice); - - assert(queueFamilyIndex < numQueueFamilies); - - //TODO if we plan to support headless rendering, there should be 2 families - //one using /dev/dri/card0 which has modesetting - //other using /dev/dri/renderD128 which does not support modesetting, this would say false here - *pSupported = VK_TRUE; - return VK_SUCCESS; -} - -/* - * Implementation of our RPI specific "extension" - */ -VkResult vkCreateRpiSurfaceKHR( - VkInstance instance, - const VkRpiSurfaceCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSurfaceKHR* pSurface) -{ - assert(instance); - //assert(pCreateInfo); //ignored for now - assert(pSurface); - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - *pSurface = (VkSurfaceKHR)modeset_create(controlFd); - - return 
VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySurfaceKHR - * Destroying a VkSurfaceKHR merely severs the connection between Vulkan and the native surface, - * and does not imply destroying the native surface, closing a window, or similar behavior - * (but we'll do so anyways...) - */ -VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR( - VkInstance instance, - VkSurfaceKHR surface, - const VkAllocationCallbacks* pAllocator) -{ - assert(instance); - assert(surface); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - modeset_destroy(controlFd, (modeset_dev*)surface); -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateDevice - * vkCreateDevice verifies that extensions and features requested in the ppEnabledExtensionNames and pEnabledFeatures - * members of pCreateInfo, respectively, are supported by the implementation. If any requested extension is not supported, - * vkCreateDevice must return VK_ERROR_EXTENSION_NOT_PRESENT. If any requested feature is not supported, vkCreateDevice must return - * VK_ERROR_FEATURE_NOT_PRESENT. Support for extensions can be checked before creating a device by querying vkEnumerateDeviceExtensionProperties - * After verifying and enabling the extensions the VkDevice object is created and returned to the application. - * If a requested extension is only supported by a layer, both the layer and the extension need to be specified at vkCreateInstance - * time for the creation to succeed. Multiple logical devices can be created from the same physical device. Logical device creation may - * fail due to lack of device-specific resources (in addition to the other errors). If that occurs, vkCreateDevice will return VK_ERROR_TOO_MANY_OBJECTS. 
- */ -VKAPI_ATTR VkResult VKAPI_CALL vkCreateDevice( - VkPhysicalDevice physicalDevice, - const VkDeviceCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkDevice* pDevice) -{ - assert(physicalDevice); - assert(pDevice); - assert(pCreateInfo); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - *pDevice = malloc(sizeof(_device)); - if(!pDevice) - { - return VK_ERROR_TOO_MANY_OBJECTS; - } - - (*pDevice)->dev = physicalDevice; - - for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c) - { - int findres = findDeviceExtension(pCreateInfo->ppEnabledExtensionNames[c]); - if(findres > -1) - { - (*pDevice)->enabledExtensions[(*pDevice)->numEnabledExtensions] = findres; - (*pDevice)->numEnabledExtensions++; - } - else - { - return VK_ERROR_EXTENSION_NOT_PRESENT; - } - } - - VkBool32* requestedFeatures = pCreateInfo->pEnabledFeatures; - VkBool32* supportedFeatures = &_features; - - if(requestedFeatures) - { - for(int c = 0; c < numFeatures; ++c) - { - if(requestedFeatures[c] && !supportedFeatures[c]) - { - return VK_ERROR_FEATURE_NOT_PRESENT; - } - } - - (*pDevice)->enabledFeatures = *pCreateInfo->pEnabledFeatures; - } - else - { - memset(&(*pDevice)->enabledFeatures, 0, sizeof((*pDevice)->enabledFeatures)); //just disable everything - } - - //layers ignored per spec - //pCreateInfo->enabledLayerCount - - for(int c = 0; c < numQueueFamilies; ++c) - { - (*pDevice)->queues[c] = 0; - } - - if(pCreateInfo->queueCreateInfoCount > 0) - { - for(int c = 0; c < pCreateInfo->queueCreateInfoCount; ++c) - { - (*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = malloc(sizeof(_queue)*pCreateInfo->pQueueCreateInfos[c].queueCount); - - if(!(*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex]) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - for(int d = 0; d < pCreateInfo->pQueueCreateInfos[c].queueCount; ++d) - { - (*pDevice)->queues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex][d].lastEmitSeqno = 0; 
- } - - (*pDevice)->numQueues[pCreateInfo->pQueueCreateInfos[c].queueFamilyIndex] = pCreateInfo->pQueueCreateInfos[c].queueCount; - } - } - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetDeviceQueue - * vkGetDeviceQueue must only be used to get queues that were created with the flags parameter of VkDeviceQueueCreateInfo set to zero. - * To get queues that were created with a non-zero flags parameter use vkGetDeviceQueue2. - */ -VKAPI_ATTR void VKAPI_CALL vkGetDeviceQueue( - VkDevice device, - uint32_t queueFamilyIndex, - uint32_t queueIndex, - VkQueue* pQueue) -{ - assert(device); - assert(pQueue); - - assert(queueFamilyIndex < numQueueFamilies); - assert(queueIndex < device->numQueues[queueFamilyIndex]); - - *pQueue = &device->queues[queueFamilyIndex][queueIndex]; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSemaphore - * Semaphores are a synchronization primitive that can be used to insert a dependency between batches submitted to queues. - * Semaphores have two states - signaled and unsignaled. The state of a semaphore can be signaled after execution of a batch of commands is completed. - * A batch can wait for a semaphore to become signaled before it begins execution, and the semaphore is also unsignaled before the batch begins execution. - * As with most objects in Vulkan, semaphores are an interface to internal data which is typically opaque to applications. - * This internal data is referred to as a semaphore’s payload. However, in order to enable communication with agents outside of the current device, - * it is necessary to be able to export that payload to a commonly understood format, and subsequently import from that format as well. - * The internal data of a semaphore may include a reference to any resources and pending work associated with signal or unsignal operations performed on that semaphore object. 
- * Mechanisms to import and export that internal data to and from semaphores are provided below. - * These mechanisms indirectly enable applications to share semaphore state between two or more semaphores and other synchronization primitives across process and API boundaries. - * When created, the semaphore is in the unsignaled state. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore( - VkDevice device, - const VkSemaphoreCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSemaphore* pSemaphore) -{ - assert(device); - assert(pSemaphore); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //we'll probably just use an IOCTL to wait for a GPU sequence number to complete. - sem_t* s = malloc(sizeof(sem_t)); - if(!s) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - sem_init(s, 0, 0); //create semaphore unsignalled, shared between threads - - *pSemaphore = (VkSemaphore)s; - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceCapabilitiesKHR - * The capabilities of a swapchain targetting a surface are the intersection of the capabilities of the WSI platform, - * the native window or display, and the physical device. The resulting capabilities can be obtained with the queries listed - * below in this section. Capabilities that correspond to image creation parameters are not independent of each other: - * combinations of parameters that are not supported as reported by vkGetPhysicalDeviceImageFormatProperties are not supported - * by the surface on that physical device, even if the capabilities taken individually are supported as part of some other parameter combinations. 
- * - * capabilities the specified device supports for a swapchain created for the surface - */ -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) -{ - assert(physicalDevice); - assert(surface); - assert(pSurfaceCapabilities); - - pSurfaceCapabilities->minImageCount = 1; //min 1 - pSurfaceCapabilities->maxImageCount = 2; //TODO max 2 for double buffering for now... - pSurfaceCapabilities->currentExtent.width = ((modeset_dev*)surface)->width; - pSurfaceCapabilities->currentExtent.height = ((modeset_dev*)surface)->height; - pSurfaceCapabilities->minImageExtent.width = ((modeset_dev*)surface)->width; //TODO - pSurfaceCapabilities->minImageExtent.height = ((modeset_dev*)surface)->height; //TODO - pSurfaceCapabilities->maxImageExtent.width = ((modeset_dev*)surface)->width; //TODO - pSurfaceCapabilities->maxImageExtent.height = ((modeset_dev*)surface)->height; //TODO - pSurfaceCapabilities->maxImageArrayLayers = 1; //TODO maybe more layers for cursor etc. - pSurfaceCapabilities->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO no rotation for now - pSurfaceCapabilities->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO get this from dev - pSurfaceCapabilities->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; //TODO no alpha compositing for now - pSurfaceCapabilities->supportedUsageFlags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; //well we want to draw on the screen right - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceFormatsKHR - * If pSurfaceFormats is NULL, then the number of format pairs supported for the given surface is returned in pSurfaceFormatCount. - * The number of format pairs supported will be greater than or equal to 1. 
* Otherwise, pSurfaceFormatCount must point to a variable
 * set by the user to the number of elements in the pSurfaceFormats array, and on return the variable is overwritten with the number
 * of structures actually written to pSurfaceFormats. If the value of pSurfaceFormatCount is less than the number of format pairs supported,
 * at most pSurfaceFormatCount structures will be written. If pSurfaceFormatCount is smaller than the number of format pairs supported for the given surface,
 * VK_INCOMPLETE will be returned instead of VK_SUCCESS to indicate that not all the available values were returned.
 */
VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR(
		VkPhysicalDevice physicalDevice,
		VkSurfaceKHR surface,
		uint32_t* pSurfaceFormatCount,
		VkSurfaceFormatKHR* pSurfaceFormats)
{
	assert(physicalDevice);
	assert(surface);
	assert(pSurfaceFormatCount);

	//number of entries exposed from supportedSurfaceFormats
	const int available = 1;

	//query mode: only report how many formats exist
	if(pSurfaceFormats == 0)
	{
		*pSurfaceFormatCount = available;
		return VK_SUCCESS;
	}

	//fill mode: write as many entries as fit into the caller's array
	int capacity = *pSurfaceFormatCount;
	int written = available < capacity ? available : capacity;

	int idx = 0;
	while(idx < written)
	{
		pSurfaceFormats[idx] = supportedSurfaceFormats[idx];
		++idx;
	}

	*pSurfaceFormatCount = written;

	//tell the caller when its array was too small to hold everything
	return written < available ? VK_INCOMPLETE : VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfacePresentModesKHR
 * If pPresentModes is NULL, then the number of presentation modes supported for the given surface is returned in pPresentModeCount.
 * Otherwise, pPresentModeCount must point to a variable set by the user to the number of elements in the pPresentModes array,
 * and on return the variable is overwritten with the number of values actually written to pPresentModes.
 * If the value of pPresentModeCount is less than the number of presentation modes supported, at most pPresentModeCount values will be written.
- * If pPresentModeCount is smaller than the number of presentation modes supported for the given surface, VK_INCOMPLETE will be returned instead of - * VK_SUCCESS to indicate that not all the available values were returned. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR( - VkPhysicalDevice physicalDevice, - VkSurfaceKHR surface, - uint32_t* pPresentModeCount, - VkPresentModeKHR* pPresentModes) -{ - assert(physicalDevice); - assert(surface); - assert(pPresentModeCount); - - const int numModes = 1; - - if(!pPresentModes) - { - *pPresentModeCount = numModes; - return VK_SUCCESS; - } - - int arraySize = *pPresentModeCount; - int elementsWritten = min(numModes, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - //TODO - pPresentModes[c] = VK_PRESENT_MODE_FIFO_KHR; - } - - *pPresentModeCount = elementsWritten; - - if(elementsWritten < numModes) - { - return VK_INCOMPLETE; - } - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSwapchainKHR - */ -VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR( - VkDevice device, - const VkSwapchainCreateInfoKHR* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkSwapchainKHR* pSwapchain) -{ - assert(device); - assert(pCreateInfo); - assert(pSwapchain); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - *pSwapchain = malloc(sizeof(_swapchain)); - if(!*pSwapchain) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - _swapchain* s = *pSwapchain; - - //TODO flags, layers, queue sharing, pretransform, composite alpha, present mode..., clipped, oldswapchain - //TODO external sync on surface, oldswapchain - - s->images = malloc(sizeof(_image) * pCreateInfo->minImageCount); - if(!s->images) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - s->backbufferIdx = 0; - s->numImages = pCreateInfo->minImageCount; - s->surface = pCreateInfo->surface; - - for(int c = 0; c < pCreateInfo->minImageCount; ++c) - { - 
s->images[c].width = pCreateInfo->imageExtent.width; - s->images[c].height = pCreateInfo->imageExtent.height; - s->images[c].depth = 1; - s->images[c].layers = pCreateInfo->imageArrayLayers; - s->images[c].miplevels = 1; - s->images[c].samples = 1; //TODO - s->images[c].usageBits = pCreateInfo->imageUsage; - s->images[c].format = pCreateInfo->imageFormat; - s->images[c].imageSpace = pCreateInfo->imageColorSpace; - s->images[c].concurrentAccess = pCreateInfo->imageSharingMode; - s->images[c].numQueueFamiliesWithAccess = pCreateInfo->queueFamilyIndexCount; - if(s->images[c].concurrentAccess) - { - s->images[c].queueFamiliesWithAccess = malloc(sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess); - memcpy(s->images[c].queueFamiliesWithAccess, pCreateInfo->pQueueFamilyIndices, sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess); - } - s->images[c].preTransformMode = pCreateInfo->preTransform; - s->images[c].compositeAlpha = pCreateInfo->compositeAlpha; - s->images[c].presentMode = pCreateInfo->presentMode; - s->images[c].clipped = pCreateInfo->clipped; - - createImageBO(&s->images[c]); - int res = modeset_create_fb(controlFd, &s->images[c]); assert(res == 0); - } - - //defer to first swapbuffer (or at least later, getting swapchain != presenting immediately) - //int res = modeset_fb_for_dev(controlFd, s->surface, &s->images[s->backbufferIdx]); assert(res == 0); - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetSwapchainImagesKHR - * If pSwapchainImages is NULL, then the number of presentable images for swapchain is returned in pSwapchainImageCount. - * Otherwise, pSwapchainImageCount must point to a variable set by the user to the number of elements in the pSwapchainImages array, - * and on return the variable is overwritten with the number of structures actually written to pSwapchainImages. 
- * If the value of pSwapchainImageCount is less than the number of presentable images for swapchain, at most pSwapchainImageCount structures will be written. - * If pSwapchainImageCount is smaller than the number of presentable images for swapchain, VK_INCOMPLETE will be returned instead of VK_SUCCESS to - * indicate that not all the available values were returned. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint32_t* pSwapchainImageCount, - VkImage* pSwapchainImages) -{ - assert(device); - assert(swapchain); - assert(pSwapchainImageCount); - - _swapchain* s = swapchain; - - if(!pSwapchainImages) - { - *pSwapchainImageCount = s->numImages; - return VK_SUCCESS; - } - - int arraySize = *pSwapchainImageCount; - int elementsWritten = min(s->numImages, arraySize); - - for(int c = 0; c < elementsWritten; ++c) - { - pSwapchainImages[c] = &s->images[c]; - } - - *pSwapchainImageCount = elementsWritten; - - if(elementsWritten < s->numImages) - { - return VK_INCOMPLETE; - } - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffers-pools - * Command pools are opaque objects that command buffer memory is allocated from, and which allow the implementation to amortize the - * cost of resource creation across multiple command buffers. Command pools are externally synchronized, meaning that a command pool must - * not be used concurrently in multiple threads. That includes use via recording commands on any command buffers allocated from the pool, - * as well as operations that allocate, free, and reset command buffers or the pool itself. 
- */ -VKAPI_ATTR VkResult VKAPI_CALL vkCreateCommandPool( - VkDevice device, - const VkCommandPoolCreateInfo* pCreateInfo, - const VkAllocationCallbacks* pAllocator, - VkCommandPool* pCommandPool) -{ - assert(device); - assert(pCreateInfo); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //VK_COMMAND_POOL_CREATE_TRANSIENT_BIT - //specifies that command buffers allocated from the pool will be short-lived, meaning that they will be reset or freed in a relatively short timeframe. - //This flag may be used by the implementation to control memory allocation behavior within the pool. - //--> definitely use pool allocator - - //VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT - //allows any command buffer allocated from a pool to be individually reset to the initial state; either by calling vkResetCommandBuffer, or via the implicit reset when calling vkBeginCommandBuffer. - //If this flag is not set on a pool, then vkResetCommandBuffer must not be called for any command buffer allocated from that pool. 
- - //TODO pool family ignored for now - - _commandPool* cp = malloc(sizeof(_commandPool)); - - if(!cp) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - - cp->queueFamilyIndex = pCreateInfo->queueFamilyIndex; - - //initial number of command buffers to hold - int numCommandBufs = 100; - int controlListSize = ARM_PAGE_SIZE * 100; - - //if(pCreateInfo->flags & VK_COMMAND_POOL_CREATE_TRANSIENT_BIT) - { - //use pool allocator - void* pamem = malloc(numCommandBufs * sizeof(_commandBuffer)); - if(!pamem) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - cp->pa = createPoolAllocator(pamem, sizeof(_commandBuffer), numCommandBufs * sizeof(_commandBuffer)); - - void* cpamem = malloc(controlListSize); - if(!cpamem) - { - return VK_ERROR_OUT_OF_HOST_MEMORY; - } - cp->cpa = createConsecutivePoolAllocator(cpamem, ARM_PAGE_SIZE, controlListSize); - } - - *pCommandPool = (VkCommandPool)cp; - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#commandbuffer-allocation - * vkAllocateCommandBuffers can be used to create multiple command buffers. If the creation of any of those command buffers fails, - * the implementation must destroy all successfully created command buffer objects from this command, set all entries of the pCommandBuffers array to NULL and return the error. 
- */ -VKAPI_ATTR VkResult VKAPI_CALL vkAllocateCommandBuffers( - VkDevice device, - const VkCommandBufferAllocateInfo* pAllocateInfo, - VkCommandBuffer* pCommandBuffers) -{ - assert(device); - assert(pAllocateInfo); - assert(pCommandBuffers); - - VkResult res = VK_SUCCESS; - - _commandPool* cp = (_commandPool*)pAllocateInfo->commandPool; - - //if(cp->usePoolAllocator) - { - for(int c = 0; c < pAllocateInfo->commandBufferCount; ++c) - { - pCommandBuffers[c] = poolAllocate(&cp->pa); - - if(!pCommandBuffers[c]) - { - res = VK_ERROR_OUT_OF_HOST_MEMORY; - break; - } - - pCommandBuffers[c]->shaderRecCount = 0; - pCommandBuffers[c]->usageFlags = 0; - pCommandBuffers[c]->state = CMDBUF_STATE_INITIAL; - pCommandBuffers[c]->cp = cp; - clInit(&pCommandBuffers[c]->binCl, consecutivePoolAllocate(&cp->cpa, 1)); - clInit(&pCommandBuffers[c]->handlesCl, consecutivePoolAllocate(&cp->cpa, 1)); - clInit(&pCommandBuffers[c]->shaderRecCl, consecutivePoolAllocate(&cp->cpa, 1)); - clInit(&pCommandBuffers[c]->uniformsCl, consecutivePoolAllocate(&cp->cpa, 1)); - - if(!pCommandBuffers[c]->binCl.buffer) - { - res = VK_ERROR_OUT_OF_HOST_MEMORY; - break; - } - - if(!pCommandBuffers[c]->handlesCl.buffer) - { - res = VK_ERROR_OUT_OF_HOST_MEMORY; - break; - } - - if(!pCommandBuffers[c]->shaderRecCl.buffer) - { - res = VK_ERROR_OUT_OF_HOST_MEMORY; - break; - } - - if(!pCommandBuffers[c]->uniformsCl.buffer) - { - res = VK_ERROR_OUT_OF_HOST_MEMORY; - break; - } - } - } - - if(res != VK_SUCCESS) - { - //if(cp->usePoolAllocator) - { - for(int c = 0; c < pAllocateInfo->commandBufferCount; ++c) - { - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->binCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->handlesCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->shaderRecCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->uniformsCl, pCommandBuffers[c]->binCl.numBlocks); - 
poolFree(&cp->pa, pCommandBuffers[c]); - pCommandBuffers[c] = 0; - } - } - } - - return res; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkBeginCommandBuffer - */ -VKAPI_ATTR VkResult VKAPI_CALL vkBeginCommandBuffer( - VkCommandBuffer commandBuffer, - const VkCommandBufferBeginInfo* pBeginInfo) -{ - assert(commandBuffer); - assert(pBeginInfo); - - //TODO - - //VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT - //specifies that each recording of the command buffer will only be submitted once, and the command buffer will be reset and recorded again between each submission. - - //VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT - //specifies that a secondary command buffer is considered to be entirely inside a render pass. If this is a primary command buffer, then this bit is ignored - - //VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT - //specifies that a command buffer can be resubmitted to a queue while it is in the pending state, and recorded into multiple primary command buffers - - //When a command buffer begins recording, all state in that command buffer is undefined - - struct drm_vc4_submit_cl submitCl = - { - .color_read.hindex = ~0, - .zs_read.hindex = ~0, - .color_write.hindex = ~0, - .msaa_color_write.hindex = ~0, - .zs_write.hindex = ~0, - .msaa_zs_write.hindex = ~0, - }; - - commandBuffer->usageFlags = pBeginInfo->flags; - commandBuffer->shaderRecCount = 0; - commandBuffer->state = CMDBUF_STATE_RECORDING; - commandBuffer->submitCl = submitCl; - - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdPipelineBarrier - * vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass. - * When vkCmdPipelineBarrier is submitted to a queue, it defines a memory dependency between commands that were submitted before it, and those submitted after it. 
- * If vkCmdPipelineBarrier was recorded outside a render pass instance, the first synchronization scope includes all commands that occur earlier in submission order. - * If vkCmdPipelineBarrier was recorded inside a render pass instance, the first synchronization scope includes only commands that occur earlier in submission order within the same subpass. - * In either case, the first synchronization scope is limited to operations on the pipeline stages determined by the source stage mask specified by srcStageMask. - * - * If vkCmdPipelineBarrier was recorded outside a render pass instance, the second synchronization scope includes all commands that occur later in submission order. - * If vkCmdPipelineBarrier was recorded inside a render pass instance, the second synchronization scope includes only commands that occur later in submission order within the same subpass. - * In either case, the second synchronization scope is limited to operations on the pipeline stages determined by the destination stage mask specified by dstStageMask. - * - * The first access scope is limited to access in the pipeline stages determined by the source stage mask specified by srcStageMask. - * Within that, the first access scope only includes the first access scopes defined by elements of the pMemoryBarriers, - * pBufferMemoryBarriers and pImageMemoryBarriers arrays, which each define a set of memory barriers. If no memory barriers are specified, - * then the first access scope includes no accesses. - * - * The second access scope is limited to access in the pipeline stages determined by the destination stage mask specified by dstStageMask. - * Within that, the second access scope only includes the second access scopes defined by elements of the pMemoryBarriers, pBufferMemoryBarriers and pImageMemoryBarriers arrays, - * which each define a set of memory barriers. If no memory barriers are specified, then the second access scope includes no accesses. 
- * - * If dependencyFlags includes VK_DEPENDENCY_BY_REGION_BIT, then any dependency between framebuffer-space pipeline stages is framebuffer-local - otherwise it is framebuffer-global. - */ -VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier( - VkCommandBuffer commandBuffer, - VkPipelineStageFlags srcStageMask, - VkPipelineStageFlags dstStageMask, - VkDependencyFlags dependencyFlags, - uint32_t memoryBarrierCount, - const VkMemoryBarrier* pMemoryBarriers, - uint32_t bufferMemoryBarrierCount, - const VkBufferMemoryBarrier* pBufferMemoryBarriers, - uint32_t imageMemoryBarrierCount, - const VkImageMemoryBarrier* pImageMemoryBarriers) -{ - assert(commandBuffer); - - //TODO pipeline stage flags - //VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT - //VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT - //VK_PIPELINE_STAGE_VERTEX_INPUT_BIT - //VK_PIPELINE_STAGE_VERTEX_SHADER_BIT - //VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT - //VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT - //VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT - //VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT - //VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT - //VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT - //VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT - //VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT - //VK_PIPELINE_STAGE_TRANSFER_BIT - //VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT - //VK_PIPELINE_STAGE_HOST_BIT - //VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT - //VK_PIPELINE_STAGE_ALL_COMMANDS_BIT - - //TODO dependency flags - //VK_DEPENDENCY_BY_REGION_BIT, - //VK_DEPENDENCY_DEVICE_GROUP_BIT, - //VK_DEPENDENCY_VIEW_LOCAL_BIT - - //TODO access flags - //VK_ACCESS_INDIRECT_COMMAND_READ_BIT - //VK_ACCESS_INDEX_READ_BIT - //VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT - //VK_ACCESS_UNIFORM_READ_BIT - //VK_ACCESS_INPUT_ATTACHMENT_READ_BIT - //VK_ACCESS_SHADER_READ_BIT - //VK_ACCESS_SHADER_WRITE_BIT - //VK_ACCESS_COLOR_ATTACHMENT_READ_BIT - //VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT - //VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT - //VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT - 
//VK_ACCESS_TRANSFER_READ_BIT - //VK_ACCESS_TRANSFER_WRITE_BIT - //VK_ACCESS_HOST_READ_BIT - //VK_ACCESS_HOST_WRITE_BIT - //VK_ACCESS_MEMORY_READ_BIT - //VK_ACCESS_MEMORY_WRITE_BIT - //VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX - //VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX - - //TODO Layout transition flags - //VK_IMAGE_LAYOUT_UNDEFINED - //VK_IMAGE_LAYOUT_GENERAL - //VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL - //VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL - //VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - //VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL - //VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL - //VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL - //VK_IMAGE_LAYOUT_PREINITIALIZED - //VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL - //VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL - //VK_IMAGE_LAYOUT_PRESENT_SRC_KHR - //VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR - - for(int c = 0; c < memoryBarrierCount; ++c) - { - //TODO - } - - for(int c = 0; c < bufferMemoryBarrierCount; ++c) - { - //TODO - } - - for(int c = 0; c < imageMemoryBarrierCount; ++c) - { - _image* i = pImageMemoryBarriers[c].image; - - assert(i->layout == pImageMemoryBarriers[c].oldLayout || i->layout == VK_IMAGE_LAYOUT_UNDEFINED); - - if(srcStageMask & VK_PIPELINE_STAGE_TRANSFER_BIT && - pImageMemoryBarriers[c].srcAccessMask & VK_ACCESS_TRANSFER_WRITE_BIT && - i->needToClear) - { - //insert CRs to clear the image - - assert(i->layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - - clFit(commandBuffer, &commandBuffer->binCl, V3D21_TILE_BINNING_MODE_CONFIGURATION_length); - clInsertTileBinningModeConfiguration(&commandBuffer->binCl, - 0, 0, 0, 0, - getFormatBpp(i->format) == 64, //64 bit color mode - i->samples > 1, //msaa - i->width, i->height, 0, 0, 0); - - //START_TILE_BINNING resets the statechange counters in the hardware, - //which are what is used when a primitive is binned to a tile to - //figure out what new state packets need to be written to that tile's - //command list. 
- clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length); - clInsertStartTileBinning(&commandBuffer->binCl); - - //Reset the current compressed primitives format. This gets modified - //by VC4_PACKET_GL_INDEXED_PRIMITIVE and - //VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start - //of every tile. - clFit(commandBuffer, &commandBuffer->binCl, V3D21_PRIMITIVE_LIST_FORMAT_length); - clInsertPrimitiveListFormat(&commandBuffer->binCl, - 1, //16 bit - 2); //tris - - clFit(commandBuffer, &commandBuffer->handlesCl, 4); - uint32_t idx = clGetHandleIndex(&commandBuffer->handlesCl, i->handle); - commandBuffer->submitCl.color_write.hindex = idx; - commandBuffer->submitCl.color_write.offset = 0; - commandBuffer->submitCl.color_write.flags = 0; - //TODO format - commandBuffer->submitCl.color_write.bits = - VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_RGBA8888, VC4_RENDER_CONFIG_FORMAT) | - VC4_SET_FIELD(i->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT); - - commandBuffer->submitCl.clear_color[0] = i->clearColor[0]; - commandBuffer->submitCl.clear_color[1] = i->clearColor[1]; - - //TODO ranges - commandBuffer->submitCl.min_x_tile = 0; - commandBuffer->submitCl.min_y_tile = 0; - - uint32_t tileSizeW = 64; - uint32_t tileSizeH = 64; - - if(i->samples > 1) - { - tileSizeW >>= 1; - tileSizeH >>= 1; - } - - if(getFormatBpp(i->format) == 64) - { - tileSizeH >>= 1; - } - - uint32_t widthInTiles = divRoundUp(i->width, tileSizeW); - uint32_t heightInTiles = divRoundUp(i->height, tileSizeH); - - commandBuffer->submitCl.max_x_tile = widthInTiles - 1; - commandBuffer->submitCl.max_y_tile = heightInTiles - 1; - commandBuffer->submitCl.width = i->width; - commandBuffer->submitCl.height = i->height; - commandBuffer->submitCl.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR; - commandBuffer->submitCl.clear_z = 0; //TODO - commandBuffer->submitCl.clear_s = 0; - } - - //transition to new layout - i->layout = pImageMemoryBarriers[c].newLayout; - } -} - -/* - * 
https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdClearColorImage - * Color and depth/stencil images can be cleared outside a render pass instance using vkCmdClearColorImage or vkCmdClearDepthStencilImage, respectively. - * These commands are only allowed outside of a render pass instance. - */ -VKAPI_ATTR void VKAPI_CALL vkCmdClearColorImage( - VkCommandBuffer commandBuffer, - VkImage image, - VkImageLayout imageLayout, - const VkClearColorValue* pColor, - uint32_t rangeCount, - const VkImageSubresourceRange* pRanges) -{ - assert(commandBuffer); - assert(image); - assert(pColor); - - //TODO this should only flag an image for clearing. This can only be called outside a renderpass - //actual clearing would only happen: - // -if image is rendered to (insert clear before first draw call) - // -if the image is bound for sampling (submit a CL with a clear) - // -if a command buffer is submitted without any rendering (insert clear) - // -etc. - //we shouldn't clear an image if noone uses it - - //TODO ranges support - - assert(imageLayout == VK_IMAGE_LAYOUT_GENERAL || - imageLayout == VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR || - imageLayout == VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - - assert(commandBuffer->state == CMDBUF_STATE_RECORDING); - assert(_queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_GRAPHICS_BIT || _queueFamilyProperties[commandBuffer->cp->queueFamilyIndex].queueFlags & VK_QUEUE_COMPUTE_BIT); - - _image* i = image; - - assert(i->usageBits & VK_IMAGE_USAGE_TRANSFER_DST_BIT); - - //TODO externally sync cmdbuf, cmdpool - - i->needToClear = 1; - i->clearColor[0] = i->clearColor[1] = packVec4IntoABGR8(pColor->float32); -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEndCommandBuffer - * If there was an error during recording, the application will be notified by an unsuccessful return code returned by vkEndCommandBuffer. 
- * If the application wishes to further use the command buffer, the command buffer must be reset. The command buffer must have been in the recording state, - * and is moved to the executable state. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkEndCommandBuffer( - VkCommandBuffer commandBuffer) -{ - assert(commandBuffer); - - //Increment the semaphore indicating that binning is done and - //unblocking the render thread. Note that this doesn't act - //until the FLUSH completes. - //The FLUSH caps all of our bin lists with a - //VC4_PACKET_RETURN. - clFit(commandBuffer, &commandBuffer->binCl, V3D21_INCREMENT_SEMAPHORE_length); - clInsertIncrementSemaphore(&commandBuffer->binCl); - clFit(commandBuffer, &commandBuffer->binCl, V3D21_FLUSH_length); - clInsertFlush(&commandBuffer->binCl); - - commandBuffer->state = CMDBUF_STATE_EXECUTABLE; - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkAcquireNextImageKHR - */ -VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR( - VkDevice device, - VkSwapchainKHR swapchain, - uint64_t timeout, - VkSemaphore semaphore, - VkFence fence, - uint32_t* pImageIndex) -{ - assert(device); - assert(swapchain); - - assert(semaphore != VK_NULL_HANDLE || fence != VK_NULL_HANDLE); - - sem_t* s = semaphore; - - //TODO we need to keep track of currently acquired images? - - //TODO wait timeout? - - *pImageIndex = ((_swapchain*)swapchain)->backbufferIdx; //return back buffer index - - //signal semaphore - int semVal; sem_getvalue(s, &semVal); assert(semVal <= 0); //make sure semaphore is unsignalled - sem_post(s); - - //TODO signal fence - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueueSubmit - * vkQueueSubmit is a queue submission command, with each batch defined by an element of pSubmits as an instance of the VkSubmitInfo structure. 
- * Batches begin execution in the order they appear in pSubmits, but may complete out of order. - * Fence and semaphore operations submitted with vkQueueSubmit have additional ordering constraints compared to other submission commands, - * with dependencies involving previous and subsequent queue operations. Information about these additional constraints can be found in the semaphore and - * fence sections of the synchronization chapter. - * Details on the interaction of pWaitDstStageMask with synchronization are described in the semaphore wait operation section of the synchronization chapter. - * The order that batches appear in pSubmits is used to determine submission order, and thus all the implicit ordering guarantees that respect it. - * Other than these implicit ordering guarantees and any explicit synchronization primitives, these batches may overlap or otherwise execute out of order. - * If any command buffer submitted to this queue is in the executable state, it is moved to the pending state. Once execution of all submissions of a command buffer complete, - * it moves from the pending state, back to the executable state. If a command buffer was recorded with the VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT flag, - * it instead moves back to the invalid state. - * If vkQueueSubmit fails, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY. - * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced by the submitted command buffers and any semaphores - * referenced by pSubmits is unaffected by the call or its failure. If vkQueueSubmit fails in such a way that the implementation is unable to make that guarantee, - * the implementation must return VK_ERROR_DEVICE_LOST. See Lost Device. 
- */ -VKAPI_ATTR VkResult VKAPI_CALL vkQueueSubmit( - VkQueue queue, - uint32_t submitCount, - const VkSubmitInfo* pSubmits, - VkFence fence) -{ - assert(queue); - - for(int c = 0; c < pSubmits->waitSemaphoreCount; ++c) - { - sem_wait((sem_t*)pSubmits->pWaitSemaphores[c]); - } - - //TODO: deal with pSubmits->pWaitDstStageMask - - //TODO wait for fence?? - - for(int c = 0; c < pSubmits->commandBufferCount; ++c) - { - if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_EXECUTABLE) - { - pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_PENDING; - } - } - - for(int c = 0; c < pSubmits->commandBufferCount; ++c) - { - VkCommandBuffer cmdbuf = pSubmits->pCommandBuffers[c]; - - cmdbuf->submitCl.bo_handles = cmdbuf->handlesCl.buffer; - cmdbuf->submitCl.bo_handle_count = clSize(&cmdbuf->handlesCl) / 4; - cmdbuf->submitCl.bin_cl = cmdbuf->binCl.buffer; - cmdbuf->submitCl.bin_cl_size = clSize(&cmdbuf->binCl); - cmdbuf->submitCl.shader_rec = cmdbuf->shaderRecCl.buffer; - cmdbuf->submitCl.shader_rec_size = clSize(&cmdbuf->shaderRecCl); - cmdbuf->submitCl.shader_rec_count = cmdbuf->shaderRecCount; - cmdbuf->submitCl.uniforms = cmdbuf->uniformsCl.buffer; - cmdbuf->submitCl.uniforms_size = clSize(&cmdbuf->uniformsCl); - - printf("BCL:\n"); - clDump(cmdbuf->submitCl.bin_cl, cmdbuf->submitCl.bin_cl_size); - printf("BO handles: "); - for(int d = 0; d < cmdbuf->submitCl.bo_handle_count; ++d) - { - printf("%u ", *((uint32_t*)(cmdbuf->submitCl.bo_handles)+d)); - } - printf("\nwidth height: %u, %u\n", cmdbuf->submitCl.width, cmdbuf->submitCl.height); - printf("tile min/max: %u,%u %u,%u\n", cmdbuf->submitCl.min_x_tile, cmdbuf->submitCl.min_y_tile, cmdbuf->submitCl.max_x_tile, cmdbuf->submitCl.max_y_tile); - printf("color read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.color_read.hindex, cmdbuf->submitCl.color_read.offset, cmdbuf->submitCl.color_read.bits, cmdbuf->submitCl.color_read.flags); - printf("color write surf: hindex, offset, bits, flags %u %u %u %u\n", 
cmdbuf->submitCl.color_write.hindex, cmdbuf->submitCl.color_write.offset, cmdbuf->submitCl.color_write.bits, cmdbuf->submitCl.color_write.flags); - printf("zs read surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_read.hindex, cmdbuf->submitCl.zs_read.offset, cmdbuf->submitCl.zs_read.bits, cmdbuf->submitCl.zs_read.flags); - printf("zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.zs_write.hindex, cmdbuf->submitCl.zs_write.offset, cmdbuf->submitCl.zs_write.bits, cmdbuf->submitCl.zs_write.flags); - printf("msaa color write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_color_write.hindex, cmdbuf->submitCl.msaa_color_write.offset, cmdbuf->submitCl.msaa_color_write.bits, cmdbuf->submitCl.msaa_color_write.flags); - printf("msaa zs write surf: hindex, offset, bits, flags %u %u %u %u\n", cmdbuf->submitCl.msaa_zs_write.hindex, cmdbuf->submitCl.msaa_zs_write.offset, cmdbuf->submitCl.msaa_zs_write.bits, cmdbuf->submitCl.msaa_zs_write.flags); - printf("clear color packed rgba %u %u\n", cmdbuf->submitCl.clear_color[0], cmdbuf->submitCl.clear_color[1]); - printf("clear z %u\n", cmdbuf->submitCl.clear_z); - printf("clear s %u\n", cmdbuf->submitCl.clear_s); - printf("flags %u\n", cmdbuf->submitCl.flags); - - - //submit ioctl - static uint64_t lastFinishedSeqno = 0; - vc4_cl_submit(controlFd, &cmdbuf->submitCl, &queue->lastEmitSeqno, &lastFinishedSeqno); - } - - for(int c = 0; c < pSubmits->commandBufferCount; ++c) - { - if(pSubmits->pCommandBuffers[c]->state == CMDBUF_STATE_PENDING) - { - if(pSubmits->pCommandBuffers[c]->usageFlags & VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT) - { - pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_INVALID; - } - else - { - pSubmits->pCommandBuffers[c]->state = CMDBUF_STATE_EXECUTABLE; - } - } - } - - for(int c = 0; c < pSubmits->signalSemaphoreCount; ++c) - { - sem_post((sem_t*)pSubmits->pSignalSemaphores[c]); - } - - return VK_SUCCESS; -} - -/* - * 
https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueuePresentKHR - * Any writes to memory backing the images referenced by the pImageIndices and pSwapchains members of pPresentInfo, - * that are available before vkQueuePresentKHR is executed, are automatically made visible to the read access performed by the presentation engine. - * This automatic visibility operation for an image happens-after the semaphore signal operation, and happens-before the presentation engine accesses the image. - * Queueing an image for presentation defines a set of queue operations, including waiting on the semaphores and submitting a presentation request to the presentation engine. - * However, the scope of this set of queue operations does not include the actual processing of the image by the presentation engine. - * If vkQueuePresentKHR fails to enqueue the corresponding set of queue operations, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY. - * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced is unaffected by the call or its failure. - * If vkQueuePresentKHR fails in such a way that the implementation is unable to make that guarantee, the implementation must return VK_ERROR_DEVICE_LOST. - * However, if the presentation request is rejected by the presentation engine with an error VK_ERROR_OUT_OF_DATE_KHR or VK_ERROR_SURFACE_LOST_KHR, - * the set of queue operations are still considered to be enqueued and thus any semaphore to be waited on gets unsignaled when the corresponding queue operation is complete. 
- */ -VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR( - VkQueue queue, - const VkPresentInfoKHR* pPresentInfo) -{ - assert(queue); - assert(pPresentInfo); - - //wait for semaphore in present info set by submit ioctl to make sure cls are flushed - for(int c = 0; c < pPresentInfo->waitSemaphoreCount; ++c) - { - sem_wait((sem_t*)pPresentInfo->pWaitSemaphores[c]); - } - - for(int c = 0; c < pPresentInfo->swapchainCount; ++c) - { - _swapchain* s = pPresentInfo->pSwapchains[c]; - modeset_present_buffer(controlFd, (modeset_dev*)s->surface, &s->images[s->backbufferIdx]); - s->backbufferIdx = (s->backbufferIdx + 1) % s->numImages; - } - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDeviceWaitIdle - * vkDeviceWaitIdle is equivalent to calling vkQueueWaitIdle for all queues owned by device. - */ -VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle( - VkDevice device) -{ - assert(device); - - for(int c = 0; c < numQueueFamilies; ++c) - { - for(int d = 0; d < device->numQueues[c]; ++d) - { - uint64_t lastFinishedSeqno; - vc4_seqno_wait(controlFd, &lastFinishedSeqno, device->queues[c][d].lastEmitSeqno, WAIT_TIMEOUT_INFINITE); - } - } - - return VK_SUCCESS; -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkFreeCommandBuffers - * Any primary command buffer that is in the recording or executable state and has any element of pCommandBuffers recorded into it, becomes invalid. 
- */ -VKAPI_ATTR void VKAPI_CALL vkFreeCommandBuffers( - VkDevice device, - VkCommandPool commandPool, - uint32_t commandBufferCount, - const VkCommandBuffer* pCommandBuffers) -{ - assert(device); - assert(commandPool); - assert(pCommandBuffers); - - _commandPool* cp = (_commandPool*)commandPool; - - for(int c = 0; c < commandBufferCount; ++c) - { - //if(cp->usePoolAllocator) - { - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->binCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->handlesCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->shaderRecCl, pCommandBuffers[c]->binCl.numBlocks); - consecutivePoolFree(&cp->cpa, &pCommandBuffers[c]->uniformsCl, pCommandBuffers[c]->binCl.numBlocks); - poolFree(&cp->pa, pCommandBuffers[c]); - } - } -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyCommandPool - * When a pool is destroyed, all command buffers allocated from the pool are freed. - * Any primary command buffer allocated from another VkCommandPool that is in the recording or executable state and has a secondary command buffer - * allocated from commandPool recorded into it, becomes invalid. 
- */ -VKAPI_ATTR void VKAPI_CALL vkDestroyCommandPool( - VkDevice device, - VkCommandPool commandPool, - const VkAllocationCallbacks* pAllocator) -{ - assert(device); - assert(commandPool); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - _commandPool* cp = (_commandPool*)commandPool; - - //if(cp->usePoolAllocator) - { - free(cp->pa.buf); - free(cp->cpa.buf); - destroyPoolAllocator(&cp->pa); - destroyConsecutivePoolAllocator(&cp->cpa); - } - - free(cp); -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySemaphore - */ -VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore( - VkDevice device, - VkSemaphore semaphore, - const VkAllocationCallbacks* pAllocator) -{ - assert(device); - assert(semaphore); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - sem_destroy((sem_t*)semaphore); -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySwapchainKHR - */ -VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR( - VkDevice device, - VkSwapchainKHR swapchain, - const VkAllocationCallbacks* pAllocator) -{ - assert(device); - assert(swapchain); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //TODO flush all ops - - _swapchain* s = swapchain; - - for(int c = 0; c < s->numImages; ++c) - { - vc4_bo_free(controlFd, s->images[c].handle, 0, s->images->size); - modeset_destroy_fb(controlFd, &s->images[c]); - } - - free(s->images); - free(s); -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyDevice - * To ensure that no work is active on the device, vkDeviceWaitIdle can be used to gate the destruction of the device. 
- * Prior to destroying a device, an application is responsible for destroying/freeing any Vulkan objects that were created using that device as the - * first parameter of the corresponding vkCreate* or vkAllocate* command - */ -VKAPI_ATTR void VKAPI_CALL vkDestroyDevice( - VkDevice device, - const VkAllocationCallbacks* pAllocator) -{ - assert(device); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //TODO -} - -/* - * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyInstance - * - */ -VKAPI_ATTR void VKAPI_CALL vkDestroyInstance( - VkInstance instance, - const VkAllocationCallbacks* pAllocator) -{ - assert(instance); - - //TODO: allocator is ignored for now - assert(pAllocator == 0); - - //TODO - closeIoctl(); -} - diff --git a/driver/instance.c b/driver/instance.c new file mode 100644 index 0000000..413475f --- /dev/null +++ b/driver/instance.c @@ -0,0 +1,124 @@ +#include "common.h" + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkEnumerateInstanceExtensionProperties + * When pLayerName parameter is NULL, only extensions provided by the Vulkan implementation or by implicitly enabled layers are returned. When pLayerName is the name of a layer, + * the instance extensions provided by that layer are returned. + * If pProperties is NULL, then the number of extensions properties available is returned in pPropertyCount. Otherwise, pPropertyCount must point to a variable set by the user + * to the number of elements in the pProperties array, and on return the variable is overwritten with the number of structures actually written to pProperties. + * If pPropertyCount is less than the number of extension properties available, at most pPropertyCount structures will be written. If pPropertyCount is smaller than the number of extensions available, + * VK_INCOMPLETE will be returned instead of VK_SUCCESS, to indicate that not all the available properties were returned. 
+ * Because the list of available layers may change externally between calls to vkEnumerateInstanceExtensionProperties, + * two calls may retrieve different results if a pLayerName is available in one call but not in another. The extensions supported by a layer may also change between two calls, + * e.g. if the layer implementation is replaced by a different version between those calls. + */ +VKAPI_ATTR VkResult VKAPI_CALL vkEnumerateInstanceExtensionProperties( + const char* pLayerName, + uint32_t* pPropertyCount, + VkExtensionProperties* pProperties) +{ + assert(!pLayerName); //TODO layers ignored for now + assert(pPropertyCount); + + if(!pProperties) + { + *pPropertyCount = numInstanceExtensions; + return VK_INCOMPLETE; + } + + int arraySize = *pPropertyCount; + int elementsWritten = min(numInstanceExtensions, arraySize); + + for(int c = 0; c < elementsWritten; ++c) + { + pProperties[c] = instanceExtensions[c]; + } + + *pPropertyCount = elementsWritten; + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateInstance + * There is no global state in Vulkan and all per-application state is stored in a VkInstance object. Creating a VkInstance object initializes the Vulkan library + * vkCreateInstance verifies that the requested layers exist. If not, vkCreateInstance will return VK_ERROR_LAYER_NOT_PRESENT. Next vkCreateInstance verifies that + * the requested extensions are supported (e.g. in the implementation or in any enabled instance layer) and if any requested extension is not supported, + * vkCreateInstance must return VK_ERROR_EXTENSION_NOT_PRESENT. After verifying and enabling the instance layers and extensions the VkInstance object is + * created and returned to the application. 
+ */ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateInstance( + const VkInstanceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkInstance* pInstance) +{ + assert(pInstance); + assert(pCreateInfo); + + *pInstance = malloc(sizeof(_instance)); + + if(!*pInstance) + { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + (*pInstance)->numEnabledExtensions = 0; + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + //TODO: possibly we need to load layers here + //and store them in pInstance + assert(pCreateInfo->enabledLayerCount == 0); + + if(pCreateInfo->enabledExtensionCount) + { + assert(pCreateInfo->ppEnabledExtensionNames); + } + + for(int c = 0; c < pCreateInfo->enabledExtensionCount; ++c) + { + int findres = findInstanceExtension(pCreateInfo->ppEnabledExtensionNames[c]); + if(findres > -1) + { + (*pInstance)->enabledExtensions[(*pInstance)->numEnabledExtensions] = findres; + (*pInstance)->numEnabledExtensions++; + } + else + { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + + //TODO ignored for now + //pCreateInfo->pApplicationInfo + + int ret = openIoctl(); assert(!ret); + + (*pInstance)->chipVersion = vc4_get_chip_info(controlFd); + (*pInstance)->hasTiling = vc4_test_tiling(controlFd); + + (*pInstance)->hasControlFlow = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_BRANCHES); + (*pInstance)->hasEtc1 = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_ETC1); + (*pInstance)->hasThreadedFs = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_THREADED_FS); + (*pInstance)->hasMadvise = vc4_has_feature(controlFd, DRM_VC4_PARAM_SUPPORTS_MADVISE); + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroyInstance + * + */ +VKAPI_ATTR void VKAPI_CALL vkDestroyInstance( + VkInstance instance, + const VkAllocationCallbacks* pAllocator) +{ + assert(instance); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + //TODO + closeIoctl(); +} diff --git 
a/driver/sync.c b/driver/sync.c new file mode 100644 index 0000000..1c641f2 --- /dev/null +++ b/driver/sync.c @@ -0,0 +1,273 @@ +#include "common.h" + +#include "kernel/vc4_packet.h" + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSemaphore + * Semaphores are a synchronization primitive that can be used to insert a dependency between batches submitted to queues. + * Semaphores have two states - signaled and unsignaled. The state of a semaphore can be signaled after execution of a batch of commands is completed. + * A batch can wait for a semaphore to become signaled before it begins execution, and the semaphore is also unsignaled before the batch begins execution. + * As with most objects in Vulkan, semaphores are an interface to internal data which is typically opaque to applications. + * This internal data is referred to as a semaphore’s payload. However, in order to enable communication with agents outside of the current device, + * it is necessary to be able to export that payload to a commonly understood format, and subsequently import from that format as well. + * The internal data of a semaphore may include a reference to any resources and pending work associated with signal or unsignal operations performed on that semaphore object. + * Mechanisms to import and export that internal data to and from semaphores are provided below. + * These mechanisms indirectly enable applications to share semaphore state between two or more semaphores and other synchronization primitives across process and API boundaries. + * When created, the semaphore is in the unsignaled state. 
+ */ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSemaphore( + VkDevice device, + const VkSemaphoreCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSemaphore* pSemaphore) +{ + assert(device); + assert(pSemaphore); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + //we'll probably just use an IOCTL to wait for a GPU sequence number to complete. + sem_t* s = malloc(sizeof(sem_t)); + if(!s) + { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + sem_init(s, 0, 0); //create semaphore unsignalled, shared between threads + + *pSemaphore = (VkSemaphore)s; + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCmdPipelineBarrier + * vkCmdPipelineBarrier is a synchronization command that inserts a dependency between commands submitted to the same queue, or between commands in the same subpass. + * When vkCmdPipelineBarrier is submitted to a queue, it defines a memory dependency between commands that were submitted before it, and those submitted after it. + * If vkCmdPipelineBarrier was recorded outside a render pass instance, the first synchronization scope includes all commands that occur earlier in submission order. + * If vkCmdPipelineBarrier was recorded inside a render pass instance, the first synchronization scope includes only commands that occur earlier in submission order within the same subpass. + * In either case, the first synchronization scope is limited to operations on the pipeline stages determined by the source stage mask specified by srcStageMask. + * + * If vkCmdPipelineBarrier was recorded outside a render pass instance, the second synchronization scope includes all commands that occur later in submission order. + * If vkCmdPipelineBarrier was recorded inside a render pass instance, the second synchronization scope includes only commands that occur later in submission order within the same subpass. 
+ * In either case, the second synchronization scope is limited to operations on the pipeline stages determined by the destination stage mask specified by dstStageMask. + * + * The first access scope is limited to access in the pipeline stages determined by the source stage mask specified by srcStageMask. + * Within that, the first access scope only includes the first access scopes defined by elements of the pMemoryBarriers, + * pBufferMemoryBarriers and pImageMemoryBarriers arrays, which each define a set of memory barriers. If no memory barriers are specified, + * then the first access scope includes no accesses. + * + * The second access scope is limited to access in the pipeline stages determined by the destination stage mask specified by dstStageMask. + * Within that, the second access scope only includes the second access scopes defined by elements of the pMemoryBarriers, pBufferMemoryBarriers and pImageMemoryBarriers arrays, + * which each define a set of memory barriers. If no memory barriers are specified, then the second access scope includes no accesses. + * + * If dependencyFlags includes VK_DEPENDENCY_BY_REGION_BIT, then any dependency between framebuffer-space pipeline stages is framebuffer-local - otherwise it is framebuffer-global. 
+ */ +VKAPI_ATTR void VKAPI_CALL vkCmdPipelineBarrier( + VkCommandBuffer commandBuffer, + VkPipelineStageFlags srcStageMask, + VkPipelineStageFlags dstStageMask, + VkDependencyFlags dependencyFlags, + uint32_t memoryBarrierCount, + const VkMemoryBarrier* pMemoryBarriers, + uint32_t bufferMemoryBarrierCount, + const VkBufferMemoryBarrier* pBufferMemoryBarriers, + uint32_t imageMemoryBarrierCount, + const VkImageMemoryBarrier* pImageMemoryBarriers) +{ + assert(commandBuffer); + + //TODO pipeline stage flags + //VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT + //VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT + //VK_PIPELINE_STAGE_VERTEX_INPUT_BIT + //VK_PIPELINE_STAGE_VERTEX_SHADER_BIT + //VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT + //VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT + //VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT + //VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT + //VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT + //VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT + //VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT + //VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT + //VK_PIPELINE_STAGE_TRANSFER_BIT + //VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT + //VK_PIPELINE_STAGE_HOST_BIT + //VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT + //VK_PIPELINE_STAGE_ALL_COMMANDS_BIT + + //TODO dependency flags + //VK_DEPENDENCY_BY_REGION_BIT, + //VK_DEPENDENCY_DEVICE_GROUP_BIT, + //VK_DEPENDENCY_VIEW_LOCAL_BIT + + //TODO access flags + //VK_ACCESS_INDIRECT_COMMAND_READ_BIT + //VK_ACCESS_INDEX_READ_BIT + //VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT + //VK_ACCESS_UNIFORM_READ_BIT + //VK_ACCESS_INPUT_ATTACHMENT_READ_BIT + //VK_ACCESS_SHADER_READ_BIT + //VK_ACCESS_SHADER_WRITE_BIT + //VK_ACCESS_COLOR_ATTACHMENT_READ_BIT + //VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT + //VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT + //VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT + //VK_ACCESS_TRANSFER_READ_BIT + //VK_ACCESS_TRANSFER_WRITE_BIT + //VK_ACCESS_HOST_READ_BIT + //VK_ACCESS_HOST_WRITE_BIT + //VK_ACCESS_MEMORY_READ_BIT + //VK_ACCESS_MEMORY_WRITE_BIT + 
//VK_ACCESS_COMMAND_PROCESS_READ_BIT_NVX + //VK_ACCESS_COMMAND_PROCESS_WRITE_BIT_NVX + + //TODO Layout transition flags + //VK_IMAGE_LAYOUT_UNDEFINED + //VK_IMAGE_LAYOUT_GENERAL + //VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL + //VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL + //VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL + //VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL + //VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL + //VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL + //VK_IMAGE_LAYOUT_PREINITIALIZED + //VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL + //VK_IMAGE_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_OPTIMAL + //VK_IMAGE_LAYOUT_PRESENT_SRC_KHR + //VK_IMAGE_LAYOUT_SHARED_PRESENT_KHR + + for(int c = 0; c < memoryBarrierCount; ++c) + { + //TODO + } + + for(int c = 0; c < bufferMemoryBarrierCount; ++c) + { + //TODO + } + + for(int c = 0; c < imageMemoryBarrierCount; ++c) + { + _image* i = pImageMemoryBarriers[c].image; + + assert(i->layout == pImageMemoryBarriers[c].oldLayout || i->layout == VK_IMAGE_LAYOUT_UNDEFINED); + + if(srcStageMask & VK_PIPELINE_STAGE_TRANSFER_BIT && + pImageMemoryBarriers[c].srcAccessMask & VK_ACCESS_TRANSFER_WRITE_BIT && + i->needToClear) + { + //insert CRs to clear the image + + assert(i->layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); + + clFit(commandBuffer, &commandBuffer->binCl, V3D21_TILE_BINNING_MODE_CONFIGURATION_length); + clInsertTileBinningModeConfiguration(&commandBuffer->binCl, + 0, 0, 0, 0, + getFormatBpp(i->format) == 64, //64 bit color mode + i->samples > 1, //msaa + i->width, i->height, 0, 0, 0); + + //START_TILE_BINNING resets the statechange counters in the hardware, + //which are what is used when a primitive is binned to a tile to + //figure out what new state packets need to be written to that tile's + //command list. + clFit(commandBuffer, &commandBuffer->binCl, V3D21_START_TILE_BINNING_length); + clInsertStartTileBinning(&commandBuffer->binCl); + + //Reset the current compressed primitives format. 
This gets modified + //by VC4_PACKET_GL_INDEXED_PRIMITIVE and + //VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start + //of every tile. + clFit(commandBuffer, &commandBuffer->binCl, V3D21_PRIMITIVE_LIST_FORMAT_length); + clInsertPrimitiveListFormat(&commandBuffer->binCl, + 1, //16 bit + 2); //tris + + clFit(commandBuffer, &commandBuffer->handlesCl, 4); + uint32_t idx = clGetHandleIndex(&commandBuffer->handlesCl, i->handle); + commandBuffer->submitCl.color_write.hindex = idx; + commandBuffer->submitCl.color_write.offset = 0; + commandBuffer->submitCl.color_write.flags = 0; + //TODO format + commandBuffer->submitCl.color_write.bits = + VC4_SET_FIELD(VC4_RENDER_CONFIG_FORMAT_RGBA8888, VC4_RENDER_CONFIG_FORMAT) | + VC4_SET_FIELD(i->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT); + + commandBuffer->submitCl.clear_color[0] = i->clearColor[0]; + commandBuffer->submitCl.clear_color[1] = i->clearColor[1]; + + //TODO ranges + commandBuffer->submitCl.min_x_tile = 0; + commandBuffer->submitCl.min_y_tile = 0; + + uint32_t tileSizeW = 64; + uint32_t tileSizeH = 64; + + if(i->samples > 1) + { + tileSizeW >>= 1; + tileSizeH >>= 1; + } + + if(getFormatBpp(i->format) == 64) + { + tileSizeH >>= 1; + } + + uint32_t widthInTiles = divRoundUp(i->width, tileSizeW); + uint32_t heightInTiles = divRoundUp(i->height, tileSizeH); + + commandBuffer->submitCl.max_x_tile = widthInTiles - 1; + commandBuffer->submitCl.max_y_tile = heightInTiles - 1; + commandBuffer->submitCl.width = i->width; + commandBuffer->submitCl.height = i->height; + commandBuffer->submitCl.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR; + commandBuffer->submitCl.clear_z = 0; //TODO + commandBuffer->submitCl.clear_s = 0; + } + + //transition to new layout + i->layout = pImageMemoryBarriers[c].newLayout; + } +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDeviceWaitIdle + * vkDeviceWaitIdle is equivalent to calling vkQueueWaitIdle for all queues owned by device. 
+ */ +VKAPI_ATTR VkResult VKAPI_CALL vkDeviceWaitIdle( + VkDevice device) +{ + assert(device); + + for(int c = 0; c < numQueueFamilies; ++c) + { + for(int d = 0; d < device->numQueues[c]; ++d) + { + uint64_t lastFinishedSeqno; + vc4_seqno_wait(controlFd, &lastFinishedSeqno, device->queues[c][d].lastEmitSeqno, WAIT_TIMEOUT_INFINITE); + } + } + + return VK_SUCCESS; +} + + + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySemaphore + */ +VKAPI_ATTR void VKAPI_CALL vkDestroySemaphore( + VkDevice device, + VkSemaphore semaphore, + const VkAllocationCallbacks* pAllocator) +{ + assert(device); + assert(semaphore); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + sem_destroy((sem_t*)semaphore); +} diff --git a/driver/vkCaps.h b/driver/vkCaps.h index 4901b32..e35d27a 100644 --- a/driver/vkCaps.h +++ b/driver/vkCaps.h @@ -2,7 +2,7 @@ #include -VkPhysicalDeviceLimits _limits = +static VkPhysicalDeviceLimits _limits = { //TODO these values might change .maxImageDimension1D = 16384, @@ -113,7 +113,7 @@ VkPhysicalDeviceLimits _limits = .nonCoherentAtomSize = 0x40 }; -VkPhysicalDeviceFeatures _features = +static VkPhysicalDeviceFeatures _features = { //TODO this might change .robustBufferAccess = 1, @@ -174,7 +174,7 @@ VkPhysicalDeviceFeatures _features = }; #define numFeatures (sizeof(_features)/sizeof(VkBool32)) -VkQueueFamilyProperties _queueFamilyProperties[] = +static VkQueueFamilyProperties _queueFamilyProperties[] = { { .queueFlags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT, @@ -185,7 +185,7 @@ VkQueueFamilyProperties _queueFamilyProperties[] = }; #define numQueueFamilies (sizeof(_queueFamilyProperties)/sizeof(VkQueueFamilyProperties)) -VkSurfaceFormatKHR supportedSurfaceFormats[] = +static VkSurfaceFormatKHR supportedSurfaceFormats[] = { { .format = VK_FORMAT_R8G8B8A8_UNORM, @@ -259,29 +259,3 @@ static VkExtensionProperties deviceExtensions[] = } 
}; #define numDeviceExtensions (sizeof(deviceExtensions) / sizeof(VkExtensionProperties)) - -int findInstanceExtension(char* name) -{ - for(int c = 0; c < numInstanceExtensions; ++c) - { - if(strcmp(instanceExtensions[c].extensionName, name) == 0) - { - return c; - } - } - - return -1; -} - -int findDeviceExtension(char* name) -{ - for(int c = 0; c < numDeviceExtensions; ++c) - { - if(strcmp(deviceExtensions[c].extensionName, name) == 0) - { - return c; - } - } - - return -1; -} diff --git a/driver/wsi.c b/driver/wsi.c new file mode 100644 index 0000000..3aa8d69 --- /dev/null +++ b/driver/wsi.c @@ -0,0 +1,381 @@ +#include "common.h" + +/* + * Implementation of our RPI specific "extension" + */ +VkResult vkCreateRpiSurfaceKHR( + VkInstance instance, + const VkRpiSurfaceCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSurfaceKHR* pSurface) +{ + assert(instance); + //assert(pCreateInfo); //ignored for now + assert(pSurface); + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + *pSurface = (VkSurfaceKHR)modeset_create(controlFd); + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySurfaceKHR + * Destroying a VkSurfaceKHR merely severs the connection between Vulkan and the native surface, + * and does not imply destroying the native surface, closing a window, or similar behavior + * (but we'll do so anyways...) 
+ */ +VKAPI_ATTR void VKAPI_CALL vkDestroySurfaceKHR( + VkInstance instance, + VkSurfaceKHR surface, + const VkAllocationCallbacks* pAllocator) +{ + assert(instance); + assert(surface); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + modeset_destroy(controlFd, (modeset_dev*)surface); +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceCapabilitiesKHR + * The capabilities of a swapchain targetting a surface are the intersection of the capabilities of the WSI platform, + * the native window or display, and the physical device. The resulting capabilities can be obtained with the queries listed + * below in this section. Capabilities that correspond to image creation parameters are not independent of each other: + * combinations of parameters that are not supported as reported by vkGetPhysicalDeviceImageFormatProperties are not supported + * by the surface on that physical device, even if the capabilities taken individually are supported as part of some other parameter combinations. + * + * capabilities the specified device supports for a swapchain created for the surface + */ +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceCapabilitiesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + VkSurfaceCapabilitiesKHR* pSurfaceCapabilities) +{ + assert(physicalDevice); + assert(surface); + assert(pSurfaceCapabilities); + + pSurfaceCapabilities->minImageCount = 1; //min 1 + pSurfaceCapabilities->maxImageCount = 2; //TODO max 2 for double buffering for now... 
+ pSurfaceCapabilities->currentExtent.width = ((modeset_dev*)surface)->width; + pSurfaceCapabilities->currentExtent.height = ((modeset_dev*)surface)->height; + pSurfaceCapabilities->minImageExtent.width = ((modeset_dev*)surface)->width; //TODO + pSurfaceCapabilities->minImageExtent.height = ((modeset_dev*)surface)->height; //TODO + pSurfaceCapabilities->maxImageExtent.width = ((modeset_dev*)surface)->width; //TODO + pSurfaceCapabilities->maxImageExtent.height = ((modeset_dev*)surface)->height; //TODO + pSurfaceCapabilities->maxImageArrayLayers = 1; //TODO maybe more layers for cursor etc. + pSurfaceCapabilities->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO no rotation for now + pSurfaceCapabilities->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR; //TODO get this from dev + pSurfaceCapabilities->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR; //TODO no alpha compositing for now + pSurfaceCapabilities->supportedUsageFlags = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; //well we want to draw on the screen right + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfaceFormatsKHR + * If pSurfaceFormats is NULL, then the number of format pairs supported for the given surface is returned in pSurfaceFormatCount. + * The number of format pairs supported will be greater than or equal to 1. Otherwise, pSurfaceFormatCount must point to a variable + * set by the user to the number of elements in the pSurfaceFormats array, and on return the variable is overwritten with the number + * of structures actually written to pSurfaceFormats. If the value of pSurfaceFormatCount is less than the number of format pairs supported, + * at most pSurfaceFormatCount structures will be written. 
If pSurfaceFormatCount is smaller than the number of format pairs supported for the given surface,
 * VK_INCOMPLETE will be returned instead of VK_SUCCESS to indicate that not all the available values were returned.
 *
 * Implementation notes: exactly one format is reported, copied from supportedSurfaceFormats.
 * All counts are kept in uint32_t so that a very large caller-supplied count cannot turn negative.
 */
VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfaceFormatsKHR(
		VkPhysicalDevice physicalDevice,
		VkSurfaceKHR surface,
		uint32_t* pSurfaceFormatCount,
		VkSurfaceFormatKHR* pSurfaceFormats)
{
	assert(physicalDevice);
	assert(surface);
	assert(pSurfaceFormatCount);

	const uint32_t numFormats = 1;

	//query mode: only report how many formats are available
	if(!pSurfaceFormats)
	{
		*pSurfaceFormatCount = numFormats;
		return VK_SUCCESS;
	}

	//never write more than the caller's array can hold
	const uint32_t arraySize = *pSurfaceFormatCount;
	const uint32_t elementsWritten = arraySize < numFormats ? arraySize : numFormats;

	for(uint32_t c = 0; c < elementsWritten; ++c)
	{
		pSurfaceFormats[c] = supportedSurfaceFormats[c];
	}

	*pSurfaceFormatCount = elementsWritten;

	//the caller's array was too small to receive everything
	if(elementsWritten < numFormats)
	{
		return VK_INCOMPLETE;
	}

	return VK_SUCCESS;
}

/*
 * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetPhysicalDeviceSurfacePresentModesKHR
 * If pPresentModes is NULL, then the number of presentation modes supported for the given surface is returned in pPresentModeCount.
 * Otherwise, pPresentModeCount must point to a variable set by the user to the number of elements in the pPresentModes array,
 * and on return the variable is overwritten with the number of values actually written to pPresentModes.
 * If the value of pPresentModeCount is less than the number of presentation modes supported, at most pPresentModeCount values will be written.
 * If pPresentModeCount is smaller than the number of presentation modes supported for the given surface, VK_INCOMPLETE will be returned instead of
 * VK_SUCCESS to indicate that not all the available values were returned.
+ */ +VKAPI_ATTR VkResult VKAPI_CALL vkGetPhysicalDeviceSurfacePresentModesKHR( + VkPhysicalDevice physicalDevice, + VkSurfaceKHR surface, + uint32_t* pPresentModeCount, + VkPresentModeKHR* pPresentModes) +{ + assert(physicalDevice); + assert(surface); + assert(pPresentModeCount); + + const int numModes = 1; + + if(!pPresentModes) + { + *pPresentModeCount = numModes; + return VK_SUCCESS; + } + + int arraySize = *pPresentModeCount; + int elementsWritten = min(numModes, arraySize); + + for(int c = 0; c < elementsWritten; ++c) + { + //TODO + pPresentModes[c] = VK_PRESENT_MODE_FIFO_KHR; + } + + *pPresentModeCount = elementsWritten; + + if(elementsWritten < numModes) + { + return VK_INCOMPLETE; + } + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkCreateSwapchainKHR + */ +VKAPI_ATTR VkResult VKAPI_CALL vkCreateSwapchainKHR( + VkDevice device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSwapchainKHR* pSwapchain) +{ + assert(device); + assert(pCreateInfo); + assert(pSwapchain); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + *pSwapchain = malloc(sizeof(_swapchain)); + if(!*pSwapchain) + { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + _swapchain* s = *pSwapchain; + + //TODO flags, layers, queue sharing, pretransform, composite alpha, present mode..., clipped, oldswapchain + //TODO external sync on surface, oldswapchain + + s->images = malloc(sizeof(_image) * pCreateInfo->minImageCount); + if(!s->images) + { + return VK_ERROR_OUT_OF_HOST_MEMORY; + } + + s->backbufferIdx = 0; + s->numImages = pCreateInfo->minImageCount; + s->surface = pCreateInfo->surface; + + for(int c = 0; c < pCreateInfo->minImageCount; ++c) + { + s->images[c].width = pCreateInfo->imageExtent.width; + s->images[c].height = pCreateInfo->imageExtent.height; + s->images[c].depth = 1; + s->images[c].layers = pCreateInfo->imageArrayLayers; + s->images[c].miplevels = 1; + 
s->images[c].samples = 1; //TODO + s->images[c].usageBits = pCreateInfo->imageUsage; + s->images[c].format = pCreateInfo->imageFormat; + s->images[c].imageSpace = pCreateInfo->imageColorSpace; + s->images[c].concurrentAccess = pCreateInfo->imageSharingMode; + s->images[c].numQueueFamiliesWithAccess = pCreateInfo->queueFamilyIndexCount; + if(s->images[c].concurrentAccess) + { + s->images[c].queueFamiliesWithAccess = malloc(sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess); + memcpy(s->images[c].queueFamiliesWithAccess, pCreateInfo->pQueueFamilyIndices, sizeof(uint32_t)*s->images[c].numQueueFamiliesWithAccess); + } + s->images[c].preTransformMode = pCreateInfo->preTransform; + s->images[c].compositeAlpha = pCreateInfo->compositeAlpha; + s->images[c].presentMode = pCreateInfo->presentMode; + s->images[c].clipped = pCreateInfo->clipped; + + createImageBO(&s->images[c]); + int res = modeset_create_fb(controlFd, &s->images[c]); assert(res == 0); + } + + //defer to first swapbuffer (or at least later, getting swapchain != presenting immediately) + //int res = modeset_fb_for_dev(controlFd, s->surface, &s->images[s->backbufferIdx]); assert(res == 0); + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkGetSwapchainImagesKHR + * If pSwapchainImages is NULL, then the number of presentable images for swapchain is returned in pSwapchainImageCount. + * Otherwise, pSwapchainImageCount must point to a variable set by the user to the number of elements in the pSwapchainImages array, + * and on return the variable is overwritten with the number of structures actually written to pSwapchainImages. + * If the value of pSwapchainImageCount is less than the number of presentable images for swapchain, at most pSwapchainImageCount structures will be written. 
+ * If pSwapchainImageCount is smaller than the number of presentable images for swapchain, VK_INCOMPLETE will be returned instead of VK_SUCCESS to + * indicate that not all the available values were returned. + */ +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainImagesKHR( + VkDevice device, + VkSwapchainKHR swapchain, + uint32_t* pSwapchainImageCount, + VkImage* pSwapchainImages) +{ + assert(device); + assert(swapchain); + assert(pSwapchainImageCount); + + _swapchain* s = swapchain; + + if(!pSwapchainImages) + { + *pSwapchainImageCount = s->numImages; + return VK_SUCCESS; + } + + int arraySize = *pSwapchainImageCount; + int elementsWritten = min(s->numImages, arraySize); + + for(int c = 0; c < elementsWritten; ++c) + { + pSwapchainImages[c] = &s->images[c]; + } + + *pSwapchainImageCount = elementsWritten; + + if(elementsWritten < s->numImages) + { + return VK_INCOMPLETE; + } + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkAcquireNextImageKHR + */ +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireNextImageKHR( + VkDevice device, + VkSwapchainKHR swapchain, + uint64_t timeout, + VkSemaphore semaphore, + VkFence fence, + uint32_t* pImageIndex) +{ + assert(device); + assert(swapchain); + + assert(semaphore != VK_NULL_HANDLE || fence != VK_NULL_HANDLE); + + sem_t* s = semaphore; + + //TODO we need to keep track of currently acquired images? + + //TODO wait timeout? 
+ + *pImageIndex = ((_swapchain*)swapchain)->backbufferIdx; //return back buffer index + + //signal semaphore + int semVal; sem_getvalue(s, &semVal); assert(semVal <= 0); //make sure semaphore is unsignalled + sem_post(s); + + //TODO signal fence + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkQueuePresentKHR + * Any writes to memory backing the images referenced by the pImageIndices and pSwapchains members of pPresentInfo, + * that are available before vkQueuePresentKHR is executed, are automatically made visible to the read access performed by the presentation engine. + * This automatic visibility operation for an image happens-after the semaphore signal operation, and happens-before the presentation engine accesses the image. + * Queueing an image for presentation defines a set of queue operations, including waiting on the semaphores and submitting a presentation request to the presentation engine. + * However, the scope of this set of queue operations does not include the actual processing of the image by the presentation engine. + * If vkQueuePresentKHR fails to enqueue the corresponding set of queue operations, it may return VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY. + * If it does, the implementation must ensure that the state and contents of any resources or synchronization primitives referenced is unaffected by the call or its failure. + * If vkQueuePresentKHR fails in such a way that the implementation is unable to make that guarantee, the implementation must return VK_ERROR_DEVICE_LOST. + * However, if the presentation request is rejected by the presentation engine with an error VK_ERROR_OUT_OF_DATE_KHR or VK_ERROR_SURFACE_LOST_KHR, + * the set of queue operations are still considered to be enqueued and thus any semaphore to be waited on gets unsignaled when the corresponding queue operation is complete. 
+ */ +VKAPI_ATTR VkResult VKAPI_CALL vkQueuePresentKHR( + VkQueue queue, + const VkPresentInfoKHR* pPresentInfo) +{ + assert(queue); + assert(pPresentInfo); + + //wait for semaphore in present info set by submit ioctl to make sure cls are flushed + for(int c = 0; c < pPresentInfo->waitSemaphoreCount; ++c) + { + sem_wait((sem_t*)pPresentInfo->pWaitSemaphores[c]); + } + + for(int c = 0; c < pPresentInfo->swapchainCount; ++c) + { + _swapchain* s = pPresentInfo->pSwapchains[c]; + modeset_present_buffer(controlFd, (modeset_dev*)s->surface, &s->images[s->backbufferIdx]); + s->backbufferIdx = (s->backbufferIdx + 1) % s->numImages; + } + + return VK_SUCCESS; +} + +/* + * https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/vkspec.html#vkDestroySwapchainKHR + */ +VKAPI_ATTR void VKAPI_CALL vkDestroySwapchainKHR( + VkDevice device, + VkSwapchainKHR swapchain, + const VkAllocationCallbacks* pAllocator) +{ + assert(device); + assert(swapchain); + + //TODO: allocator is ignored for now + assert(pAllocator == 0); + + //TODO flush all ops + + _swapchain* s = swapchain; + + for(int c = 0; c < s->numImages; ++c) + { + vc4_bo_free(controlFd, s->images[c].handle, 0, s->images->size); + modeset_destroy_fb(controlFd, &s->images[c]); + } + + free(s->images); + free(s); +} +