#pragma once #include "common.h" #include "QPUassembler/qpu_assembler.h" #include "modeset.h" #include "vkExtFunctions.h" #ifdef __cplusplus extern "C" { #endif //TODO collect shader performance data //eg number of texture samples etc. //TODO check if shader has flow control and make sure instance also has flow control //TODO make sure instance has threaded fs if shader contains thread switch VkResult rpi_vkCreateShaderModuleFromRpiAssemblyEXT(VkPhysicalDevice physicalDevice, VkRpiShaderModuleAssemblyCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule) { assert(physicalDevice); assert(pCreateInfo); assert(pShaderModule); assert(pCreateInfo->asmStrings); _shaderModule* shader = ALLOCATE(sizeof(_shaderModule), 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if(!shader) { return VK_ERROR_OUT_OF_HOST_MEMORY; } shader->hasThreadSwitch = 0; for(int c = 0; c < RPI_ASSEMBLY_TYPE_MAX; ++c) { if(pCreateInfo->asmStrings[c]) { uint32_t numInstructions = get_num_instructions(pCreateInfo->asmStrings[c]); uint32_t size = sizeof(uint64_t)*numInstructions; //TODO this alloc feels kinda useless, we just copy the data anyway to kernel space //why not map kernel space mem to user space instead? shader->instructions[c] = ALLOCATE(size, 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if(!shader->instructions[c]) { return VK_ERROR_OUT_OF_HOST_MEMORY; } //need to create a temporary copy as the assembly algorithm is destructive uint32_t stringLength = strlen(pCreateInfo->asmStrings[c]); char* tmpShaderStr = ALLOCATE(stringLength+1, 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); memcpy(tmpShaderStr, pCreateInfo->asmStrings[c], stringLength+1); assemble_qpu_asm(tmpShaderStr, shader->instructions[c]); FREE(tmpShaderStr); for(uint64_t d = 0; d < numInstructions; ++d) { uint64_t s = (shader->instructions[c][d] & (0xfll << 60)) >> 60; if(s == 2ll) { shader->hasThreadSwitch = 1; break; } } shader->numVaryings = 0; for(uint64_t d = 0; d < numInstructions; ++d) { unsigned is_sem = ((shader->instructions[c][d] & (0x7fll << 57)) >> 57) == 0x74; unsigned sig_bits = ((shader->instructions[c][d] & (0xfll << 60)) >> 60); //if it's an ALU instruction if(!is_sem && sig_bits != 14 && sig_bits != 15) { unsigned raddr_a = ((shader->instructions[c][d] & (0x3fll << 18)) >> 18); unsigned raddr_b = ((shader->instructions[c][d] & (0x3fll << 12)) >> 12); if(raddr_a == 35) { shader->numVaryings++; } //don't count small immediates if(sig_bits != 13 && raddr_b == 35) { shader->numVaryings++; } } } shader->sizes[c] = size; } else { shader->bos[c] = 0; shader->sizes[c] = 0; } } shader->numMappings = pCreateInfo->numMappings; if(pCreateInfo->numMappings > 0) { shader->mappings = ALLOCATE(sizeof(VkRpiAssemblyMappingEXT)*pCreateInfo->numMappings, 1, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if(!shader->mappings) { return VK_ERROR_OUT_OF_HOST_MEMORY; } memcpy(shader->mappings, pCreateInfo->mappings, sizeof(VkRpiAssemblyMappingEXT)*pCreateInfo->numMappings); } *pShaderModule = shader; return VK_SUCCESS; } /* * Implementation of our RPI specific "extension" */ VkResult rpi_vkCreateRpiSurfaceEXT( VkPhysicalDevice physicalDevice) { assert(physicalDevice); //TODO use allocator! _physicalDevice* ptr = physicalDevice; *(VkSurfaceKHR*)ptr->customData = (VkSurfaceKHR)modeset_create(controlFd); return VK_SUCCESS; } #ifdef __cplusplus } #endif