#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vc4_qpu_defines.h"
/*********************************************************************************************************************
Instruction restrictions
* The last three instructions of any program (Thread End plus the following two delay-slot instructions) must
not do varyings read, uniforms read or any kind of VPM, VDR, or VDW read or write.
* The Program End instruction must not write to either physical regfile A or B.
* The Program End instruction and the following two delay slot instructions must not write or read address 14
in either regfile A or B.
* The final program instruction (the second delay slot instruction) must not do a TLB Z write.
* A scoreboard wait must not occur in the first two instructions of a fragment shader. This is either the
explicit Wait for Scoreboard signal or an implicit wait with the first tile-buffer read or write instruction.
* If TMU_NOSWAP is written, the write must be three instructions before the first TMU write instruction.
For example, if TMU_NOSWAP is written in the first shader instruction, the first TMU write cannot occur
before the 4th shader instruction.
* An instruction must not read from a location in physical regfile A or B that was written to by the previous
instruction.
* After an SFU lookup instruction, accumulator r4 must not be read in the following two instructions. Any
other instruction that results in r4 being written (that is, TMU read, TLB read, SFU lookup) cannot occur in
the two instructions following an SFU lookup.
* An instruction that does a vector rotate by r5 must not immediately follow an instruction that writes to r5.
* An instruction that does a vector rotate must not immediately follow an instruction that writes to the
accumulator that is being rotated.
* After an instruction that does a TLB Z write, the multisample mask must not be read as an instruction
input argument in the following two instructions. The TLB Z write instruction can, however, be followed
immediately by a TLB color write.
* A single instruction can perform at most one of the following closely coupled peripheral
accesses: TMU write, TMU read, TLB write, TLB read, TLB combined color read and
write, SFU write, Mutex read or Semaphore access.
*********************************************************************************************************************/
uint64_t encode_alu(qpu_sig_bits sig_bits,
qpu_unpack unpack_mode,
//If the pm bit is set, the unpack field programs the r4 unpack unit,
//and the pack field is used to program the color
//conversion on the output of the mul unit
uint8_t pack_unpack_select,
uint8_t pack_mode,
qpu_cond add_cond,
qpu_cond mul_cond,
uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER), in which case flags are updated from the mul ALU
uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
qpu_op_mul op_mul,
qpu_op_add op_add,
qpu_raddr raddr_a,
qpu_raddr raddr_b,
qpu_mux add_a,
qpu_mux add_b,
qpu_mux mul_a,
qpu_mux mul_b
)
{
uint64_t res = 0;
uint64_t tmp = 0;
tmp = sig_bits & 0xf; //mask ls 4 bits
res |= tmp << QPU_SIG_SHIFT;
tmp = unpack_mode & 0x7; //mask ls 3 bits
res |= tmp << QPU_UNPACK_SHIFT;
tmp = pack_unpack_select & 1;
res |= tmp << 56;
tmp = pack_mode & 0xf;
res |= tmp << QPU_PACK_SHIFT;
tmp = add_cond & 0x7;
res |= tmp << QPU_COND_ADD_SHIFT;
tmp = mul_cond & 0x7;
res |= tmp << QPU_COND_MUL_SHIFT;
tmp = set_flags & 1;
res |= tmp << 45;
tmp = write_swap_flag & 1;
res |= tmp << 44;
tmp = waddr_add & 0x3f;
res |= tmp << QPU_WADDR_ADD_SHIFT;
tmp = waddr_mul & 0x3f;
res |= tmp << QPU_WADDR_MUL_SHIFT;
tmp = op_mul & 0x7;
res |= tmp << QPU_OP_MUL_SHIFT;
tmp = op_add & 0x1f;
res |= tmp << QPU_OP_ADD_SHIFT;
tmp = raddr_a & 0x3f;
res |= tmp << QPU_RADDR_A_SHIFT;
tmp = raddr_b & 0x3f;
res |= tmp << QPU_RADDR_B_SHIFT;
tmp = add_a & 0x7;
res |= tmp << QPU_ADD_A_SHIFT;
tmp = add_b & 0x7;
res |= tmp << QPU_ADD_B_SHIFT;
tmp = mul_a & 0x7;
res |= tmp << QPU_MUL_A_SHIFT;
tmp = mul_b & 0x7;
res |= tmp << QPU_MUL_B_SHIFT;
return res;
}
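/*
Illustrative sketch (not part of the original assembler): encode_alu can be called
directly to build the all-NOP instruction that is typically used to fill delay slots,
for example the two instructions that must follow the program end signal (see the
restrictions above). Only enum values from vc4_qpu_defines.h that the assembler
below also uses are assumed here.
*/
uint64_t encode_alu_nop_example()
{
//no signal, no unpack/pack, both conditions ALWAYS, no flag update, no write swap,
//both ALUs perform a NOP, write to the NOP address and read the NOP address
return encode_alu(QPU_SIG_NONE, QPU_UNPACK_NOP, 0, QPU_PACK_A_NOP,
QPU_COND_ALWAYS, QPU_COND_ALWAYS, 0, 0,
QPU_W_NOP, QPU_W_NOP, QPU_M_NOP, QPU_A_NOP,
QPU_R_NOP, QPU_R_NOP, 0, 0, 0, 0);
}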
uint64_t encode_alu_small_imm(qpu_unpack unpack_mode,
uint8_t pack_unpack_select,
uint8_t pack_mode,
qpu_cond add_cond,
qpu_cond mul_cond,
uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER), in which case flags are updated from the mul ALU
uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
qpu_op_mul op_mul,
qpu_op_add op_add,
qpu_raddr raddr_a,
uint8_t small_imm,
qpu_mux add_a,
qpu_mux add_b,
qpu_mux mul_a,
qpu_mux mul_b
)
{
return encode_alu(0xd,
unpack_mode,
pack_unpack_select,
pack_mode,
add_cond,
mul_cond,
set_flags,
write_swap_flag,
waddr_add,
waddr_mul,
op_mul,
op_add,
raddr_a,
small_imm,
add_a,
add_b,
mul_a,
mul_b);
}
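/*
Illustrative sketch: with encode_alu_small_imm the signal is forced to QPU_SIG_SMALL_IMM
and the raddr_b field carries an encoded small immediate instead of a register address.
qpu_encode_small_immediate() (from vc4_qpu_defines.h) translates a literal such as 4 into
that encoding. Per the VC4 spec an ALU input only sees the immediate when its mux selects
regfile B; the mux value 7 used below for that is an assumption, not taken from this file.
*/
uint64_t encode_add_small_imm_example(qpu_op_add op_add, qpu_waddr dst, qpu_mux src_a)
{
//dst = op_add(src_a, small immediate 4), mul ALU performs a NOP
return encode_alu_small_imm(QPU_UNPACK_NOP, 0, QPU_PACK_A_NOP,
QPU_COND_ALWAYS, QPU_COND_ALWAYS, 0, 0,
dst, QPU_W_NOP, QPU_M_NOP, op_add,
QPU_R_NOP, qpu_encode_small_immediate(4),
src_a, 7 /*mux: regfile B / small imm (assumed)*/, 0, 0);
}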
uint64_t encode_branch(qpu_branch_cond branch_cond,
uint8_t is_relative, //if set branch target is relative to PC+4
uint8_t use_raddr_a, //if set add value of raddr_a (from simd elem 0) to branch target
qpu_raddr raddr_a,
uint8_t write_swap_bit,
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
uint32_t imm //always added to branch target, set to 0 if unused
)
{
uint64_t res = 0;
uint64_t tmp = 0;
tmp = 0xf;
res |= tmp << 60;
tmp = branch_cond & 0xf;
res |= tmp << QPU_BRANCH_COND_SHIFT;
tmp = is_relative & 1;
res |= tmp << 51;
tmp = use_raddr_a & 1;
res |= tmp << 50;
tmp = raddr_a & 0x1f;
res |= tmp << QPU_BRANCH_RADDR_A_SHIFT;
tmp = write_swap_bit & 1;
res |= tmp << 44;
tmp = waddr_add & 0x3f;
res |= tmp << QPU_WADDR_ADD_SHIFT;
tmp = waddr_mul & 0x3f;
res |= tmp << QPU_WADDR_MUL_SHIFT;
res |= imm;
return res;
}
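/*
Illustrative sketch: a PC-relative, unconditional branch. With is_relative set the 32-bit
immediate is a byte offset relative to PC+4, and the branch only takes effect after its
delay-slot instructions; giving waddr_add/waddr_mul something other than QPU_W_NOP lets
the link address be captured. The values below are just an example.
*/
uint64_t encode_branch_example(uint32_t byte_offset)
{
//branch.rel: target = PC+4 + byte_offset, no raddr_a contribution, no link register writes
return encode_branch(QPU_COND_BRANCH_ALWAYS, 1, 0, 0, 0, QPU_W_NOP, QPU_W_NOP, byte_offset);
}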
uint64_t encode_semaphore(uint8_t pack_unpack_select,
uint8_t pack_mode,
qpu_cond cond_add,
qpu_cond cond_mul,
uint8_t set_flags,
uint8_t write_swap,
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
uint8_t incr_sem, //if 1 increment semaphore
uint8_t sem, //4 bit semaphore selector
uint32_t imm_val //27bit immediate value loaded into all 16 simd elements
)
{
uint64_t res = 0;
uint64_t tmp = 0;
tmp = 0x74;
res |= tmp << 57;
tmp = pack_unpack_select & 1;
res |= tmp << 56;
tmp = pack_mode & 0xf;
res |= tmp << QPU_PACK_SHIFT;
tmp = cond_add & 0x7;
res |= tmp << QPU_COND_ADD_SHIFT;
tmp = cond_mul & 0x7;
res |= tmp << QPU_COND_MUL_SHIFT;
tmp = set_flags & 1;
res |= tmp << 45;
tmp = write_swap & 1;
res |= tmp << 44;
tmp = waddr_add & 0x3f;
res |= tmp << QPU_WADDR_ADD_SHIFT;
tmp = waddr_mul & 0x3f;
res |= tmp << QPU_WADDR_MUL_SHIFT;
tmp = imm_val & 0x7ffffff;
res |= tmp << 5;
tmp = incr_sem & 1;
res |= tmp << 4;
res |= sem & 0xf;
return res;
}
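/*
Illustrative sketch: increment semaphore 7 without writing any register and without
loading an immediate. encode_semaphore packs the 27-bit immediate, the increment flag
and the semaphore number into the low bits of the instruction.
*/
uint64_t encode_semaphore_inc_example()
{
return encode_semaphore(0, QPU_PACK_A_NOP, QPU_COND_ALWAYS, QPU_COND_ALWAYS, 0, 0,
QPU_W_NOP, QPU_W_NOP, 1 /*increment*/, 7 /*semaphore number*/, 0);
}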
//write immediate value across simd array
uint64_t encode_load_imm(uint8_t pack_unpack_select,
uint8_t pack_mode,
qpu_cond cond_add,
qpu_cond cond_mul,
uint8_t set_flags,
uint8_t write_swap,
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
uint32_t imm //2x16bit or 1x32bit uint
)
{
uint64_t res = 0;
uint64_t tmp = 0;
tmp = 0x70;
res |= tmp << 57;
tmp = pack_unpack_select & 1;
res |= tmp << 56;
tmp = pack_mode & 0xf;
res |= tmp << QPU_PACK_SHIFT;
tmp = cond_add & 0x7;
res |= tmp << QPU_COND_ADD_SHIFT;
tmp = cond_mul & 0x7;
res |= tmp << QPU_COND_MUL_SHIFT;
tmp = set_flags & 1;
res |= tmp << 45;
tmp = write_swap & 1;
res |= tmp << 44;
tmp = waddr_add & 0x3f;
res |= tmp << QPU_WADDR_ADD_SHIFT;
tmp = waddr_mul & 0x3f;
res |= tmp << QPU_WADDR_MUL_SHIFT;
res |= imm;
return res;
}
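/*
Illustrative sketch: load the same 32-bit constant into all 16 SIMD elements of the
register addressed by waddr_add (the mul write stays a NOP here).
*/
uint64_t encode_load_imm_example(qpu_waddr dst, uint32_t value)
{
return encode_load_imm(0, QPU_PACK_A_NOP, QPU_COND_ALWAYS, QPU_COND_ALWAYS, 0, 0,
dst, QPU_W_NOP, value);
}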
//write per element MS bit and LS bit across simd array
uint64_t encode_load_imm_per_elem(
uint8_t signed_or_unsigned, //0 for signed, 1 for unsigned
uint8_t pack_unpack_select,
uint8_t pack_mode,
qpu_cond cond_add,
qpu_cond cond_mul,
uint8_t set_flags,
uint8_t write_swap,
qpu_waddr waddr_add,
qpu_waddr waddr_mul,
uint16_t ms_bit, //per element MS (sign) bit
uint16_t ls_bit //per element LS bit
)
{
uint64_t res = 0;
uint64_t tmp = 0;
tmp = 0x71;
tmp |= signed_or_unsigned << 1;
res |= tmp << 57;
tmp = pack_unpack_select & 1;
res |= tmp << 56;
tmp = pack_mode & 0xf;
res |= tmp << QPU_PACK_SHIFT;
tmp = cond_add & 0x7;
res |= tmp << QPU_COND_ADD_SHIFT;
tmp = cond_mul & 0x7;
res |= tmp << QPU_COND_MUL_SHIFT;
tmp = set_flags & 1;
res |= tmp << 45;
tmp = write_swap & 1;
res |= tmp << 44;
tmp = waddr_add & 0x3f;
res |= tmp << QPU_WADDR_ADD_SHIFT;
tmp = waddr_mul & 0x3f;
res |= tmp << QPU_WADDR_MUL_SHIFT;
tmp = ms_bit;
res |= tmp << 16;
res |= ls_bit;
return res;
}
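/*
Illustrative sketch: the per-element form gives each of the 16 SIMD elements a 2-bit
value built from one MS bit and one LS bit per element (signed: -2..1, unsigned: 0..3).
For example, loading 1 into elements 0..7 and 0 into elements 8..15 uses ls_bit = 0x00ff
and ms_bit = 0x0000.
*/
uint64_t encode_load_imm_per_elem_example(qpu_waddr dst)
{
//unsigned per-element load: elements 0..7 get 1, elements 8..15 get 0
return encode_load_imm_per_elem(1, 0, QPU_PACK_A_NOP, QPU_COND_ALWAYS, QPU_COND_ALWAYS,
0, 0, dst, QPU_W_NOP, 0x0000, 0x00ff);
}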
/*
Format:
#comment
sig_bit_optional ; dstAdd.pack_mode_optional = add_opcode.sf_optional.condition.unpack_mode_optional(srcA, srcB, imm_optional) ; dstMul.pack_mode_optional = mul_opcode.condition(srcA, srcB) ;
sig_bit_branch ; dstAdd = branch.rel_optional.reg_optional.condition(address, srcA_optional) ; dstMul = branch() ;
sig_bit_none ; dstAdd.pack_mode_optional = sem_inc.sf_optional.condition(sem_number, 27bit_imm_value_optional) ; dstMul.pack_mode_optional = sem_inc.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load32.sf_optional.condition(immediate_value) ; dstMul.pack_mode_optional = load32.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load16.signed_optional.sf_optional.condition(int16_imm, int16_imm) ; dstMul.pack_mode_optional = load16.condition() ;
Examples:
sig_none ; ra0.nop = add.sf.always(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;
sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;
sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;
sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;
sig_load_imm ; ra0.nop = load16.sf.signed.always(1, 2) ; rb0.nop = load16.always() ;
*/
qpu_sig_bits parse_sig_bit(char* str)
{
unsigned num_sig_bits = sizeof(qpu_sig_bits_str) / sizeof(const char *);
for(unsigned c = 0; c < num_sig_bits && str; ++c)
{
if(qpu_sig_bits_str[c] && strcmp(str, qpu_sig_bits_str[c]) == 0)
{
return c;
}
}
return -1;
}
void parse_dst(char** str, qpu_waddr* waddr, uint8_t* pack_mode, uint8_t* ws, unsigned is_add)
{
char* dst = strtok(*str, ".");
char* pack = strtok(0, ".");
//advance token past dst strings so we can tokenize further
if(dst)
{
if(pack)
{
*str = pack;
}
else
{
*str = dst;
}
while(**str)
{
(*str)++;
}
*str += 1;
}
uint8_t waddr_res = 0;
uint8_t pack_mode_res = 0;
for(unsigned c = 0; c < 2 && dst && !waddr_res; ++c)
{
for(unsigned d = 0; d < 64; ++d)
{
if(qpu_waddr_str[c][d] && strcmp(dst, qpu_waddr_str[c][d]) == 0)
{
waddr_res = d;
break;
}
}
}
if(!waddr_res && dst && dst[0] == 'r')
{
unsigned is_a = dst[1] == 'a' ? 1 : 0;
//add normally writes to regfile A
*ws = !is_add && is_a;
waddr_res = atoi(dst+2);
}
unsigned num_pack_a_str = sizeof(qpu_pack_a_str) / sizeof(const char *);
for(unsigned c = 0; c < num_pack_a_str && pack && !pack_mode_res; ++c)
{
if(qpu_pack_a_str[c] && strcmp(pack, qpu_pack_a_str[c]) == 0)
{
pack_mode_res = c;
break;
}
}
unsigned num_pack_mul_str = sizeof(qpu_pack_mul_str) / sizeof(const char *);
for(unsigned c = 0; c < num_pack_mul_str && pack && !pack_mode_res; ++c)
{
if(qpu_pack_mul_str[c] && strcmp(pack, qpu_pack_mul_str[c]) == 0)
{
pack_mode_res = c;
break;
}
}
*waddr = waddr_res;
*pack_mode = pack_mode_res;
}
void parse_op_modifiers(char** str, uint8_t* sf, qpu_cond* condition, qpu_unpack* unpack_mode, uint8_t* rel, uint8_t* reg)
{
char* modifier = strtok(*str, ".");
//at most 3 modifiers supported
for(int c = 0; c < 3; ++c)
{
if(modifier)
{
*str = modifier;
if(strcmp(modifier, "rel") == 0)
{
*rel = 1;
modifier = strtok(0, ".");
continue;
}
if(strcmp(modifier, "reg") == 0)
{
*reg = 1;
modifier = strtok(0, ".");
continue;
}
if(strcmp(modifier, "sf") == 0)
{
*sf = 1;
modifier = strtok(0, ".");
continue;
}
unsigned found = 0;
unsigned num_conds = sizeof(qpu_cond_str) / sizeof(const char *);
for(unsigned d = 0; d < num_conds; ++d)
{
if(qpu_cond_str[d] && strcmp(modifier, qpu_cond_str[d]) == 0)
{
*condition = d;
found = 1;
break;
}
}
if(found)
{
modifier = strtok(0, ".");
continue;
}
unsigned num_unpack_modes = sizeof(qpu_unpack_str) / sizeof(const char *);
for(unsigned d = 0; d < num_unpack_modes; ++d)
{
if(qpu_unpack_str[d] && strcmp(modifier, qpu_unpack_str[d]) == 0)
{
*unpack_mode = d;
break;
}
}
modifier = strtok(0, ".");
}
}
//advance token past op strings so we can tokenize further
while(**str)
{
(*str)++;
}
*str += 1;
}
void parse_op(char** str, qpu_alu_type* type, qpu_op_add* op_add, qpu_op_mul* op_mul, uint8_t* is_sem_inc)
{
char* op = strtok(*str, ".");
if(op && strcmp(op, "sem_inc") == 0)
{
*type = QPU_SEM;
*is_sem_inc = 1;
}
else if(op && strcmp(op, "sem_dec") == 0)
{
*type = QPU_SEM;
*is_sem_inc = 0;
}
else
{
*type = QPU_ALU;
unsigned num_add_ops = sizeof(qpu_op_add_str) / sizeof(const char *);
unsigned num_mul_ops = sizeof(qpu_op_mul_str) / sizeof(const char *);
for(unsigned c = 0; c < num_add_ops && op; ++c)
{
if(qpu_op_add_str[c] && strcmp(op, qpu_op_add_str[c]) == 0)
{
*op_add = c;
break;
}
}
for(unsigned c = 0; c < num_mul_ops && op; ++c)
{
if(qpu_op_mul_str[c] && strcmp(op, qpu_op_mul_str[c]) == 0)
{
*op_mul = c;
break;
}
}
}
if(op)
{
*str = op;
}
//advance token past op strings so we can tokenize further
while(**str)
{
(*str)++;
}
*str += 1;
}
void parse_args_alu(char** str, qpu_mux* in_a, qpu_mux* in_b, uint8_t* small_imm)
{
char* arg = strtok(*str, " \n\v\f\r\t,");
unsigned num_muxes = sizeof(qpu_mux_str) / sizeof(const char *);
for(unsigned c = 0; c < num_muxes && arg; ++c)
{
if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
{
*str = arg;
*in_a = c;
break;
}
}
arg = strtok(0, " \n\v\f\r\t,");
for(unsigned c = 0; c < num_muxes && arg; ++c)
{
if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
{
*str = arg;
*in_b = c;
break;
}
}
arg = strtok(0, " \n\v\f\r\t,");
if(arg)
{
uint32_t si = atoi(arg);
*small_imm = qpu_encode_small_immediate(si);
*str = arg;
}
//advance token past arg strings so we can tokenize further
while(**str)
{
(*str)++;
}
*str += 1;
}
uint64_t* assemble_qpu_asm(char* str)
{
unsigned num_instructions = 0;
char* ptr = str;
while(ptr && *ptr != '\0')
{
//each instruction consists of three ';' terminated parts: signal, add pipe, mul pipe
ptr = strstr(ptr, ";");
if(ptr) ptr = strstr(ptr+1, ";");
if(ptr) ptr = strstr(ptr+1, ";");
if(ptr)
{
ptr += 1;
num_instructions += 1;
}
}
printf("Num instructions: %u\n", num_instructions);
if(!num_instructions)
{
return 0;
}
uint64_t* instructions = malloc(sizeof(uint64_t)*num_instructions);
unsigned instruction_counter = 0;
char* token = strtok(str, " \n\v\f\r\t;");
while(token)
{
qpu_sig_bits sig_bit = QPU_SIG_NONE;
qpu_alu_type type = QPU_ALU;
qpu_op_add op_add = QPU_A_NOP;
qpu_op_mul op_mul = QPU_M_NOP;
qpu_mux mul_a = 0;
qpu_mux mul_b = 0;
qpu_mux add_a = 0;
qpu_mux add_b = 0;
qpu_cond cond_mul = QPU_COND_ALWAYS;
qpu_cond cond_add = QPU_COND_ALWAYS;
qpu_waddr waddr_add = QPU_W_NOP;
qpu_waddr waddr_mul = QPU_W_NOP;
qpu_raddr raddr_add = QPU_R_NOP;
qpu_raddr raddr_mul = QPU_R_NOP;
uint8_t pack_unpack_select = 0;
uint8_t pack_mode = QPU_PACK_A_NOP;
qpu_unpack unpack_mode = QPU_UNPACK_NOP;
uint8_t is_sem_inc = 0;
uint8_t rel = 0;
uint8_t reg = 0;
uint8_t ws = 0;
uint8_t sf = 0;
uint32_t imm32 = 0;
uint8_t small_imm = 0;
uint16_t ms_imm16 = 0;
uint16_t ls_imm16 = 0;
uint8_t semaphore = 0;
qpu_load_type load_type = QPU_LOAD32;
uint8_t signed_or_unsigned = 0;
qpu_branch_cond branch_cond = QPU_COND_BRANCH_ALWAYS;
sig_bit = parse_sig_bit(token);
if(sig_bit < 0)
{
break;
}
//get dst for add
token = strtok(0, " \n\v\f\r\t=;");
parse_dst(&token, &waddr_add, &pack_mode, &ws, 1);
//check op
token = strtok(token, " \n\v\f\r\t.=");
parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);
//get modifiers
token = strtok(token, " \n\v\f\r\t(");
parse_op_modifiers(&token, &sf, &cond_add, &unpack_mode, &rel, &reg);
if(type == QPU_ALU)
{
//get arguments for add
token = strtok(token, ")");
parse_args_alu(&token, &add_a, &add_b, &small_imm);
}
else if(type == QPU_SEM)
{
//semaphore number and 27-bit immediate are not parsed yet
}
else if(type == QPU_BRANCH)
{
//branch target and optional raddr_a are not parsed yet
}
else if(type == QPU_LOAD_IMM)
{
//immediate value(s) are not parsed yet
}
//get dst for mul
token = strtok(token, " \n\v\f\r\t=;");
parse_dst(&token, &waddr_mul, &pack_mode, &ws, 0);
//check op
token = strtok(token, " \n\v\f\r\t.=");
parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);
if(type == QPU_ALU)
{
//get arguments for mul
token = strtok(token, ")");
parse_args_alu(&token, &mul_a, &mul_b, &small_imm);
}
//get modifiers
token = strtok(token, " \n\v\f\r\t(");
parse_op_modifiers(&token, &sf, &cond_mul, &unpack_mode, &rel, &reg);
//EMIT INSTRUCTION HERE
if(type == QPU_ALU)
{
if(sig_bit == QPU_SIG_SMALL_IMM)
{
instructions[instruction_counter] = encode_alu_small_imm(unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_add, small_imm, add_a, add_b, mul_a, mul_b);
}
else
{
instructions[instruction_counter] = encode_alu(sig_bit, unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_add, raddr_mul, add_a, add_b, mul_a, mul_b);
}
}
else if(type == QPU_SEM)
{
instructions[instruction_counter] = encode_semaphore(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, is_sem_inc, semaphore, imm32);
}
else if(type == QPU_BRANCH)
{
instructions[instruction_counter] = encode_branch(branch_cond, rel, reg, raddr_add, ws, waddr_add, waddr_mul, imm32);
}
else if(type == QPU_LOAD_IMM)
{
if(load_type == QPU_LOAD32)
{
instructions[instruction_counter] = encode_load_imm(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, imm32);
}
else
{
instructions[instruction_counter] = encode_load_imm_per_elem(signed_or_unsigned, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, ms_imm16, ls_imm16);
}
}
instruction_counter++;
token = strtok(0, " \n\v\f\r\t;");
}
return instructions;
}
int main()
{
char asm_code[] =
"sig_none ; ra0.nop = add.sf.always.nop(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;"
"sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;"
"sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;"
"sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;";
uint64_t* assembly = assemble_qpu_asm(asm_code);
free(assembly);
return 0;
}