#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#include "vc4_qpu_defines.h"

/*********************************************************************************************************************
 Instruction restrictions
 * The last three instructions of any program (Thread End plus the following two delay-slot instructions) must not do
   varyings read, uniforms read or any kind of VPM, VDR, or VDW read or write.
 * The Program End instruction must not write to either physical regfile A or B.
 * The Program End instruction and the following two delay slot instructions must not write or read address 14 in
   either regfile A or B.
 * The final program instruction (the second delay slot instruction) must not do a TLB Z write.
 * A scoreboard wait must not occur in the first two instructions of a fragment shader. This is either the explicit
   Wait for Scoreboard signal or an implicit wait with the first tile-buffer read or write instruction.
 * If TMU_NOSWAP is written, the write must be three instructions before the first TMU write instruction. For example,
   if TMU_NOSWAP is written in the first shader instruction, the first TMU write cannot occur before the 4th shader
   instruction.
 * An instruction must not read from a location in physical regfile A or B that was written to by the previous
   instruction.
 * After an SFU lookup instruction, accumulator r4 must not be read in the following two instructions. Any other
   instruction that results in r4 being written (that is, TMU read, TLB read, SFU lookup) cannot occur in the two
   instructions following an SFU lookup.
 * An instruction that does a vector rotate by r5 must not immediately follow an instruction that writes to r5.
 * An instruction that does a vector rotate must not immediately follow an instruction that writes to the accumulator
   that is being rotated.
 * After an instruction that does a TLB Z write, the multisample mask must not be read as an instruction input
   argument in the following two instructions. The TLB Z write instruction can, however, be followed immediately by a
   TLB color write.
 * A single instruction can perform at most one of the following closely coupled peripheral accesses: TMU write,
   TMU read, TLB write, TLB read, TLB combined color read and write, SFU write, Mutex read or Semaphore access.
 *********************************************************************************************************************/
uint64_t encode_alu(qpu_sig_bits sig_bits,
		qpu_unpack unpack_mode,
		//If the pm bit is set, the unpack field programs the r4 unpack unit,
		//and the pack field is used to program the color conversion on the output of the mul unit
		uint8_t pack_unpack_select,
		uint8_t pack_mode,
		qpu_cond add_cond,
		qpu_cond mul_cond,
		uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER), in which case flags are updated from the mul ALU
		uint8_t write_swap_flag, //0: add writes to A, mul to B; 1: add writes to B, mul to A
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		qpu_op_mul op_mul,
		qpu_op_add op_add,
		qpu_raddr raddr_a,
		qpu_raddr raddr_b,
		qpu_mux add_a,
		qpu_mux add_b,
		qpu_mux mul_a,
		qpu_mux mul_b)
{
	uint64_t res = 0;
	uint64_t tmp = 0;

	tmp = sig_bits & 0xf; res |= tmp << QPU_SIG_SHIFT; //mask ls 4 bits
	tmp = unpack_mode & 0x7; res |= tmp << QPU_UNPACK_SHIFT; //mask ls 3 bits
	tmp = pack_unpack_select & 1; res |= tmp << 56;
	tmp = pack_mode & 0xf; res |= tmp << QPU_PACK_SHIFT;
	tmp = add_cond & 0x7; res |= tmp << QPU_COND_ADD_SHIFT;
	tmp = mul_cond & 0x7; res |= tmp << QPU_COND_MUL_SHIFT;
	tmp = set_flags & 1; res |= tmp << 45;
	tmp = write_swap_flag & 1; res |= tmp << 44;
	tmp = waddr_add & 0x3f; res |= tmp << QPU_WADDR_ADD_SHIFT;
	tmp = waddr_mul & 0x3f; res |= tmp << QPU_WADDR_MUL_SHIFT;
	tmp = op_mul & 0x7; res |= tmp << QPU_OP_MUL_SHIFT;
	tmp = op_add & 0x1f; res |= tmp << QPU_OP_ADD_SHIFT;
	tmp = raddr_a & 0x3f; res |= tmp << QPU_RADDR_A_SHIFT;
	tmp = raddr_b & 0x3f; res |= tmp << QPU_RADDR_B_SHIFT;
	tmp = add_a & 0x7; res |= tmp << QPU_ADD_A_SHIFT;
	tmp = add_b & 0x7; res |= tmp << QPU_ADD_B_SHIFT;
	tmp = mul_a & 0x7; res |= tmp << QPU_MUL_A_SHIFT;
	tmp = mul_b & 0x7; res |= tmp << QPU_MUL_B_SHIFT;

	return res;
}

uint64_t encode_alu_small_imm(qpu_unpack unpack_mode,
		uint8_t pack_unpack_select,
		uint8_t pack_mode,
		qpu_cond add_cond,
		qpu_cond mul_cond,
		uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER), in which case flags are updated from the mul ALU
		uint8_t write_swap_flag, //0: add writes to A, mul to B; 1: add writes to B, mul to A
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		qpu_op_mul op_mul,
		qpu_op_add op_add,
		qpu_raddr raddr_a,
		uint8_t small_imm,
		qpu_mux add_a,
		qpu_mux add_b,
		qpu_mux mul_a,
		qpu_mux mul_b)
{
	return encode_alu(0xd, unpack_mode, pack_unpack_select, pack_mode, add_cond, mul_cond, set_flags,
			write_swap_flag, waddr_add, waddr_mul, op_mul, op_add, raddr_a, small_imm,
			add_a, add_b, mul_a, mul_b);
}

uint64_t encode_branch(qpu_branch_cond branch_cond,
		uint8_t is_relative, //if set, branch target is relative to PC+4
		uint8_t use_raddr_a, //if set, add value of raddr_a (from simd elem 0) to branch target
		qpu_raddr raddr_a,
		uint8_t write_swap_bit,
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		uint32_t imm) //always added to branch target, set to 0 if unused
{
	uint64_t res = 0;
	uint64_t tmp = 0;

	tmp = 0xf; res |= tmp << 60;
	tmp = branch_cond & 0xf; res |= tmp << QPU_BRANCH_COND_SHIFT;
	tmp = is_relative & 1; res |= tmp << 51;
	tmp = use_raddr_a & 1; res |= tmp << 50;
	tmp = raddr_a & 0x1f; res |= tmp << QPU_BRANCH_RADDR_A_SHIFT;
	tmp = write_swap_bit & 1; res |= tmp << 44;
	tmp = waddr_add & 0x3f; res |= tmp << QPU_WADDR_ADD_SHIFT;
	tmp = waddr_mul & 0x3f; res |= tmp << QPU_WADDR_MUL_SHIFT;
	res |= imm;

	return res;
}
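/*
 * Illustrative only: a minimal sketch of how the encoders above compose a 64-bit instruction word.
 * It assumes the QPU_* enum values referenced elsewhere in this file (QPU_SIG_NONE, QPU_A_NOP, ...)
 * come from vc4_qpu_defines.h; example_encode_alu_nop is a hypothetical helper, not part of the assembler.
 */
static uint64_t example_encode_alu_nop(void)
{
	//both ALUs execute NOP, nothing is written back, no signal is raised
	return encode_alu(QPU_SIG_NONE, QPU_UNPACK_NOP,
			0, QPU_PACK_A_NOP,              //pm bit clear, no pack
			QPU_COND_ALWAYS, QPU_COND_ALWAYS,
			0, 0,                           //no flag update, no write swap
			QPU_W_NOP, QPU_W_NOP,           //both write addresses point at the NOP register
			QPU_M_NOP, QPU_A_NOP,           //mul and add opcodes
			QPU_R_NOP, QPU_R_NOP,           //regfile read addresses unused
			0, 0, 0, 0);                    //input mux selects (don't care for a NOP)
}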
uint64_t encode_semaphore(uint8_t pack_unpack_select,
		uint8_t pack_mode,
		qpu_cond cond_add,
		qpu_cond cond_mul,
		uint8_t set_flags,
		uint8_t write_swap,
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		uint8_t incr_sem, //if 1 increment semaphore
		uint8_t sem, //4 bit semaphore selector
		uint32_t imm_val) //27bit immediate value loaded into all 16 simd elements
{
	uint64_t res = 0;
	uint64_t tmp = 0;

	tmp = 0x74; res |= tmp << 57;
	tmp = pack_unpack_select & 1; res |= tmp << 56;
	tmp = pack_mode & 0xf; res |= tmp << QPU_PACK_SHIFT;
	tmp = cond_add & 0x7; res |= tmp << QPU_COND_ADD_SHIFT;
	tmp = cond_mul & 0x7; res |= tmp << QPU_COND_MUL_SHIFT;
	tmp = set_flags & 1; res |= tmp << 45;
	tmp = write_swap & 1; res |= tmp << 44;
	tmp = waddr_add & 0x3f; res |= tmp << QPU_WADDR_ADD_SHIFT;
	tmp = waddr_mul & 0x3f; res |= tmp << QPU_WADDR_MUL_SHIFT;
	tmp = imm_val & 0x7ffffff; res |= tmp << 5;
	tmp = incr_sem & 1; res |= tmp << 4;
	res |= sem & 0xf;

	return res;
}

//write immediate value across simd array
uint64_t encode_load_imm(uint8_t pack_unpack_select,
		uint8_t pack_mode,
		qpu_cond cond_add,
		qpu_cond cond_mul,
		uint8_t set_flags,
		uint8_t write_swap,
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		uint32_t imm) //2x16bit or 1x32bit uint
{
	uint64_t res = 0;
	uint64_t tmp = 0;

	tmp = 0x70; res |= tmp << 57;
	tmp = pack_unpack_select & 1; res |= tmp << 56;
	tmp = pack_mode & 0xf; res |= tmp << QPU_PACK_SHIFT;
	tmp = cond_add & 0x7; res |= tmp << QPU_COND_ADD_SHIFT;
	tmp = cond_mul & 0x7; res |= tmp << QPU_COND_MUL_SHIFT;
	tmp = set_flags & 1; res |= tmp << 45;
	tmp = write_swap & 1; res |= tmp << 44;
	tmp = waddr_add & 0x3f; res |= tmp << QPU_WADDR_ADD_SHIFT;
	tmp = waddr_mul & 0x3f; res |= tmp << QPU_WADDR_MUL_SHIFT;
	res |= imm;

	return res;
}

//write per element MS bit and LS bit across simd array
uint64_t encode_load_imm_per_elem(uint8_t signed_or_unsigned, //0 for signed, 1 for unsigned
		uint8_t pack_unpack_select,
		uint8_t pack_mode,
		qpu_cond cond_add,
		qpu_cond cond_mul,
		uint8_t set_flags,
		uint8_t write_swap,
		qpu_waddr waddr_add,
		qpu_waddr waddr_mul,
		uint16_t ms_bit, //per element MS (sign) bit
		uint16_t ls_bit) //per element LS bit
{
	uint64_t res = 0;
	uint64_t tmp = 0;

	tmp = 0x71; tmp |= signed_or_unsigned << 1; res |= tmp << 57;
	tmp = pack_unpack_select & 1; res |= tmp << 56;
	tmp = pack_mode & 0xf; res |= tmp << QPU_PACK_SHIFT;
	tmp = cond_add & 0x7; res |= tmp << QPU_COND_ADD_SHIFT;
	tmp = cond_mul & 0x7; res |= tmp << QPU_COND_MUL_SHIFT;
	tmp = set_flags & 1; res |= tmp << 45;
	tmp = write_swap & 1; res |= tmp << 44;
	tmp = waddr_add & 0x3f; res |= tmp << QPU_WADDR_ADD_SHIFT;
	tmp = waddr_mul & 0x3f; res |= tmp << QPU_WADDR_MUL_SHIFT;
	tmp = ms_bit; res |= tmp << 16;
	res |= ls_bit;

	return res;
}
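/*
 * Illustrative only: a sketch (under the same vc4_qpu_defines.h assumptions as above) of loading a
 * 32-bit constant through the add ALU's write port. With write_swap = 0 the value lands in regfile A;
 * write address 0 corresponds to "ra0" in the textual syntax documented below. Not called anywhere
 * in this file.
 */
static uint64_t example_encode_load_const(uint32_t value)
{
	return encode_load_imm(0, QPU_PACK_A_NOP,        //pm bit clear, no pack
			QPU_COND_ALWAYS, QPU_COND_ALWAYS,
			0, 0,                            //no flag update, no write swap
			0, QPU_W_NOP,                    //add port writes waddr 0, mul port writes nothing
			value);                          //replicated across all 16 SIMD elements
}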
/*
Format:
#comment
sig_bit_optional ; dstAdd.pack_mode_optional = add_opcode.sf_optional.condition.unpack_mode_optional(srcA, srcB, imm_optional) ; dstMul.pack_mode_optional = mul_opcode.condition(srcA, srcB) ;
sig_bit_branch ; dstAdd = branch.rel_optional.reg_optional.condition(address, srcA_optional) ; dstMul = branch() ;
sig_bit_none ; dstAdd.pack_mode_optional = sem_inc.sf_optional.condition(sem_number, 27bit_imm_value_optional) ; dstMul.pack_mode_optional = sem_inc.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load32.sf_optional.condition(immediate_value) ; dstMul.pack_mode_optional = load32.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load16.signed_optional.sf_optional.condition(int16_imm, int16_imm) ; dstMul.pack_mode_optional = load16.condition() ;

Examples:
sig_none ; ra0.nop = add.sf.always(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;
sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;
sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;
sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;
sig_load_imm ; ra0.nop = load16.sf.signed.always(1, 2) ; rb0.nop = load16.always() ;
*/

qpu_sig_bits parse_sig_bit(char* str)
{
	unsigned num_sig_bits = sizeof(qpu_sig_bits_str) / sizeof(const char *);
	for(unsigned c = 0; c < num_sig_bits && str; ++c)
	{
		if(qpu_sig_bits_str[c] && strcmp(str, qpu_sig_bits_str[c]) == 0)
		{
			return c;
		}
	}

	return -1;
}

void parse_dst(char** str, qpu_waddr* waddr, uint8_t* pack_mode, uint8_t* ws, unsigned is_add)
{
	char* dst = strtok(*str, ".");
	char* pack = strtok(0, ".");

	//advance token past dst strings so we can tokenize further
	if(dst)
	{
		*str = pack ? pack : dst;
		while(**str) (*str)++;
		*str += 1;
	}

	uint8_t waddr_res = 0;
	uint8_t pack_mode_res = 0;

	for(unsigned c = 0; c < 2 && dst && !waddr_res; ++c)
	{
		for(unsigned d = 0; d < 64; ++d)
		{
			if(qpu_waddr_str[c][d] && strcmp(dst, qpu_waddr_str[c][d]) == 0)
			{
				waddr_res = d;
				break;
			}
		}
	}

	if(!waddr_res && dst && dst[0] == 'r')
	{
		unsigned is_a = dst[1] == 'a' ? 1 : 0;
		//add normally writes to regfile A and mul to B; request a write swap when the mul unit targets regfile A
		*ws = !is_add && is_a;
		waddr_res = atoi(dst + 2);
	}

	unsigned num_pack_a_str = sizeof(qpu_pack_a_str) / sizeof(const char *);
	for(unsigned c = 0; c < num_pack_a_str && pack && !pack_mode_res; ++c)
	{
		if(qpu_pack_a_str[c] && strcmp(pack, qpu_pack_a_str[c]) == 0)
		{
			pack_mode_res = c;
			break;
		}
	}

	unsigned num_pack_mul_str = sizeof(qpu_pack_mul_str) / sizeof(const char *);
	for(unsigned c = 0; c < num_pack_mul_str && pack && !pack_mode_res; ++c)
	{
		if(qpu_pack_mul_str[c] && strcmp(pack, qpu_pack_mul_str[c]) == 0)
		{
			pack_mode_res = c;
			break;
		}
	}

	*waddr = waddr_res;
	*pack_mode = pack_mode_res;
}

void parse_op_modifiers(char** str, uint8_t* sf, qpu_cond* condition, qpu_unpack* unpack_mode, uint8_t* rel, uint8_t* reg)
{
	char* modifier = strtok(*str, ".");

	//at most 3 modifiers supported
	for(int c = 0; c < 3; ++c)
	{
		if(modifier)
		{
			*str = modifier;

			if(strcmp(modifier, "rel") == 0) { *rel = 1; modifier = strtok(0, "."); continue; }
			if(strcmp(modifier, "reg") == 0) { *reg = 1; modifier = strtok(0, "."); continue; }
			if(strcmp(modifier, "sf") == 0) { *sf = 1; modifier = strtok(0, "."); continue; }

			unsigned found = 0;
			unsigned num_conds = sizeof(qpu_cond_str) / sizeof(const char *);
			for(unsigned d = 0; d < num_conds; ++d)
			{
				if(qpu_cond_str[d] && strcmp(modifier, qpu_cond_str[d]) == 0) { *condition = d; found = 1; break; }
			}

			if(found) { modifier = strtok(0, "."); continue; }

			unsigned num_unpack_modes = sizeof(qpu_unpack_str) / sizeof(const char *);
			for(unsigned d = 0; d < num_unpack_modes; ++d)
			{
				if(qpu_unpack_str[d] && strcmp(modifier, qpu_unpack_str[d]) == 0) { *unpack_mode = d; break; }
			}

			modifier = strtok(0, ".");
		}
	}

	//advance token past op strings so we can tokenize further
	while(**str) (*str)++;
	*str += 1;
}
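/*
 * Illustrative only: how the parse_* helpers above expect to be driven. They tokenize in place with
 * strtok and then advance the caller's cursor past the consumed text, so the input must be a writable
 * buffer. example_parse_dst is a hypothetical snippet and is not used by the assembler itself.
 */
static void example_parse_dst(void)
{
	char buf[] = "ra0.nop"; //destination "ra0" with pack mode "nop"
	char* cursor = buf;
	qpu_waddr waddr = 0;
	uint8_t pack_mode = 0;
	uint8_t ws = 0;

	parse_dst(&cursor, &waddr, &pack_mode, &ws, 1); //1: this is the add ALU's destination
	//afterwards waddr holds register address 0, ws stays 0 (the add ALU writes regfile A by default),
	//and cursor has been advanced past the consumed token
}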
void parse_op(char** str, qpu_alu_type* type, qpu_op_add* op_add, qpu_op_mul* op_mul, uint8_t* is_sem_inc)
{
	char* op = strtok(*str, ".");

	if(op && strcmp(op, "sem_inc") == 0)
	{
		*type = QPU_SEM;
		*is_sem_inc = 1;
	}
	else if(op && strcmp(op, "sem_dec") == 0)
	{
		*type = QPU_SEM;
		*is_sem_inc = 0;
	}
	else
	{
		*type = QPU_ALU;

		unsigned num_add_ops = sizeof(qpu_op_add_str) / sizeof(const char *);
		unsigned num_mul_ops = sizeof(qpu_op_mul_str) / sizeof(const char *);

		for(unsigned c = 0; c < num_add_ops && op; ++c)
		{
			if(qpu_op_add_str[c] && strcmp(op, qpu_op_add_str[c]) == 0) { *op_add = c; break; }
		}

		for(unsigned c = 0; c < num_mul_ops && op; ++c)
		{
			if(qpu_op_mul_str[c] && strcmp(op, qpu_op_mul_str[c]) == 0) { *op_mul = c; break; }
		}
	}

	if(op) { *str = op; }

	//advance token past op strings so we can tokenize further
	while(**str) (*str)++;
	*str += 1;
}

void parse_args_alu(char** str, qpu_mux* in_a, qpu_mux* in_b, uint8_t* small_imm)
{
	char* arg = strtok(*str, " \n\v\f\r\t,");
	unsigned num_muxes = sizeof(qpu_mux_str) / sizeof(const char *);

	for(unsigned c = 0; c < num_muxes && arg; ++c)
	{
		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0) { *str = arg; *in_a = c; break; }
	}

	arg = strtok(0, " \n\v\f\r\t,");
	for(unsigned c = 0; c < num_muxes && arg; ++c)
	{
		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0) { *str = arg; *in_b = c; break; }
	}

	arg = strtok(0, " \n\v\f\r\t,");
	if(arg)
	{
		uint32_t si = atoi(arg);
		*small_imm = qpu_encode_small_immediate(si);
		*str = arg;
	}

	//advance token past arg strings so we can tokenize further
	while(**str) (*str)++;
	*str += 1;
}
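/*
 * Illustrative only: the argument parser above maps textual sources onto ALU input muxes, and an
 * optional third argument is translated with qpu_encode_small_immediate() (declared elsewhere,
 * presumably in vc4_qpu_defines.h) into the 6-bit small-immediate field that takes the place of
 * raddr_b in encode_alu_small_imm(). example_parse_args is a hypothetical call, not used below.
 */
static void example_parse_args(void)
{
	char buf[] = "r0, r1, 4"; //two accumulator sources plus a small immediate
	char* cursor = buf;
	qpu_mux in_a = 0;
	qpu_mux in_b = 0;
	uint8_t small_imm = 0;

	parse_args_alu(&cursor, &in_a, &in_b, &small_imm);
	//in_a/in_b now select r0 and r1; small_imm holds the encoded form of the literal 4
}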
uint64_t* assemble_qpu_asm(char* str)
{
	unsigned num_instructions = 0;

	//count instructions: each one is terminated by three ';' characters (signal, add part, mul part)
	char* ptr = str;
	while(ptr && *ptr != '\0')
	{
		ptr = strstr(ptr, ";");
		ptr = strstr(ptr + (ptr != 0), ";");
		ptr = strstr(ptr + (ptr != 0), ";");
		if(ptr)
		{
			ptr += 1;
			num_instructions += 1;
		}
	}

	printf("Num instructions: %u\n", num_instructions);

	if(!num_instructions) { return 0; }

	uint64_t* instructions = malloc(sizeof(uint64_t) * num_instructions);
	unsigned instruction_counter = 0;

	char* token = strtok(str, " \n\v\f\r\t;");

	while(token)
	{
		qpu_sig_bits sig_bit = QPU_SIG_NONE;
		qpu_alu_type type = QPU_ALU;
		qpu_op_add op_add = QPU_A_NOP;
		qpu_op_mul op_mul = QPU_M_NOP;
		qpu_mux mul_a = 0;
		qpu_mux mul_b = 0;
		qpu_mux add_a = 0;
		qpu_mux add_b = 0;
		qpu_cond cond_mul = QPU_COND_ALWAYS;
		qpu_cond cond_add = QPU_COND_ALWAYS;
		qpu_waddr waddr_add = QPU_W_NOP;
		qpu_waddr waddr_mul = QPU_W_NOP;
		qpu_waddr raddr_add = QPU_R_NOP;
		qpu_waddr raddr_mul = QPU_R_NOP;
		uint8_t pack_unpack_select = 0;
		uint8_t pack_mode = QPU_PACK_A_NOP;
		qpu_unpack unpack_mode = QPU_UNPACK_NOP;
		uint8_t is_sem_inc = 0;
		uint8_t rel = 0;
		uint8_t reg = 0;
		uint8_t ws = 0;
		uint8_t sf = 0;
		uint32_t imm32 = 0;
		uint8_t small_imm = 0;
		uint16_t ms_imm16 = 0;
		uint16_t ls_imm16 = 0;
		uint8_t semaphore = 0;
		qpu_load_type load_type = QPU_LOAD32;
		uint8_t signed_or_unsigned = 0;
		qpu_branch_cond branch_cond = QPU_COND_BRANCH_ALWAYS;

		sig_bit = parse_sig_bit(token);
		if(sig_bit < 0) { break; }

		//get dst for add
		token = strtok(0, " \n\v\f\r\t=;");
		parse_dst(&token, &waddr_add, &pack_mode, &ws, 1);

		//check op
		token = strtok(token, " \n\v\f\r\t.=");
		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);

		//get modifiers
		token = strtok(token, " \n\v\f\r\t(");
		parse_op_modifiers(&token, &sf, &cond_add, &unpack_mode, &rel, &reg);

		if(type == QPU_ALU)
		{
			//get arguments for add
			token = strtok(token, ")");
			parse_args_alu(&token, &add_a, &add_b, &small_imm);
		}
		else if(type == QPU_SEM)
		{
			//semaphore arguments are not parsed yet
		}
		else if(type == QPU_BRANCH)
		{
			//branch arguments are not parsed yet
		}
		else if(type == QPU_LOAD_IMM)
		{
			//load immediate arguments are not parsed yet
		}

		//get dst for mul
		token = strtok(token, " \n\v\f\r\t=;");
		parse_dst(&token, &waddr_mul, &pack_mode, &ws, 0);

		//check op
		token = strtok(token, " \n\v\f\r\t.=");
		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);

		if(type == QPU_ALU)
		{
			//get arguments for mul
			token = strtok(token, ")");
			parse_args_alu(&token, &mul_a, &mul_b, &small_imm);
		}

		//get modifiers
		token = strtok(token, " \n\v\f\r\t(");
		parse_op_modifiers(&token, &sf, &cond_mul, &unpack_mode, &rel, &reg);

		//EMIT INSTRUCTION HERE
		if(type == QPU_ALU)
		{
			if(sig_bit == QPU_SIG_SMALL_IMM)
			{
				instructions[instruction_counter] = encode_alu_small_imm(unpack_mode, pack_unpack_select, pack_mode,
						cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add,
						raddr_add, small_imm, add_a, add_b, mul_a, mul_b);
			}
			else
			{
				instructions[instruction_counter] = encode_alu(sig_bit, unpack_mode, pack_unpack_select, pack_mode,
						cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add,
						raddr_add, raddr_mul, add_a, add_b, mul_a, mul_b);
			}
		}
		else if(type == QPU_SEM)
		{
			instructions[instruction_counter] = encode_semaphore(pack_unpack_select, pack_mode, cond_add, cond_mul,
					sf, ws, waddr_add, waddr_mul, is_sem_inc, semaphore, imm32);
		}
		else if(type == QPU_BRANCH)
		{
			instructions[instruction_counter] = encode_branch(branch_cond, rel, reg, raddr_add, ws,
					waddr_add, waddr_mul, imm32);
		}
		else if(type == QPU_LOAD_IMM)
		{
			if(load_type == QPU_LOAD32)
			{
				instructions[instruction_counter] = encode_load_imm(pack_unpack_select, pack_mode, cond_add, cond_mul,
						sf, ws, waddr_add, waddr_mul, imm32);
			}
			else
			{
				instructions[instruction_counter] = encode_load_imm_per_elem(signed_or_unsigned, pack_unpack_select,
						pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, ms_imm16, ls_imm16);
			}
		}

		instruction_counter++;

		token = strtok(0, " \n\v\f\r\t;");
	}

	return instructions;
}

int main()
{
	char asm_code[] =
		"sig_none ; ra0.nop = add.sf.always.nop(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;"
		"sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;"
		"sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;"
		"sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;";

	uint64_t* assembly = assemble_qpu_asm(asm_code);

	free(assembly);

	return 0;
}