1
0
mirror of https://github.com/Yours3lf/rpi-vk-driver.git synced 2025-02-19 16:54:18 +01:00

added qpu assembler/disassembler

This commit is contained in:
Unknown 2019-04-14 14:43:27 +01:00
parent 52ea2ca086
commit 6c4f4707e9
4 changed files with 1455 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,2 +1,3 @@
CMakeLists.txt.user
build-debug
build-debug
QPUassembler/build

View File

@ -0,0 +1,11 @@
# Oldest CMake release this script accepts.
cmake_minimum_required(VERSION 2.8)
project(QPUassembler)
# Collect every header and C source file found in this directory.
file(GLOB asmSrc
"*.h"
"*.c"
)
# Build one executable, named after the project, from the collected files.
add_executable(${PROJECT_NAME} ${asmSrc})
# Enable the common warning set, turn implicit function declarations into errors, and compile as C11.
target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Werror=implicit-function-declaration -std=c11)

803
QPUassembler/main.c Normal file
View File

@ -0,0 +1,803 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "vc4_qpu_defines.h"
/*********************************************************************************************************************
Instruction restrictions
* The last three instructions of any program (Thread End plus the following two delay-slot instructions) must
not do varyings read, uniforms read or any kind of VPM, VDR, or VDW read or write.
* The Program End instruction must not write to either physical regfile A or B.
* The Program End instruction and the following two delay slot instructions must not write or read address 14
in either regfile A or B.
* The final program instruction (the second delay slot instruction) must not do a TLB Z write.
* A scoreboard wait must not occur in the first two instructions of a fragment shader. This is either the
explicit Wait for Scoreboard signal or an implicit wait with the first tile-buffer read or write instruction.
* If TMU_NOSWAP is written, the write must be three instructions before the first TMU write instruction.
For example, if TMU_NOSWAP is written in the first shader instruction, the first TMU write cannot occur
before the 4th shader instruction.
* An instruction must not read from a location in physical regfile A or B that was written to by the previous
instruction.
* After an SFU lookup instruction, accumulator r4 must not be read in the following two instructions. Any
other instruction that results in r4 being written (that is, TMU read, TLB read, SFU lookup) cannot occur in
the two instructions following an SFU lookup.
* An instruction that does a vector rotate by r5 must not immediately follow an instruction that writes to r5.
* An instruction that does a vector rotate must not immediately follow an instruction that writes to the
accumulator that is being rotated.
* After an instruction that does a TLB Z write, the multisample mask must not be read as an instruction
input argument in the following two instructions. The TLB Z write instruction can, however, be followed
immediately by a TLB color write.
* A single instruction can only perform a maximum of one of the following closely coupled peripheral
accesses in a single instruction: TMU write, TMU read, TLB write, TLB read, TLB combined color read and
write, SFU write, Mutex read or Semaphore access.
*********************************************************************************************************************/
/*
 * Builds the 64-bit encoding of a regular ALU instruction.
 *
 * Every argument is masked to the width of its field and OR-ed into the
 * result at that field's bit offset, so bits outside a field's width are
 * silently dropped.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_alu(qpu_sig_bits sig_bits,
                    qpu_unpack unpack_mode,
                    //If the pm bit is set, the unpack field programs the r4 unpack unit,
                    //and the pack field is used to program the color
                    //conversion on the output of the mul unit
                    uint8_t pack_unpack_select,
                    uint8_t pack_mode,
                    qpu_cond add_cond,
                    qpu_cond mul_cond,
                    uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
                    uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
                    qpu_waddr waddr_add,
                    qpu_waddr waddr_mul,
                    qpu_op_mul op_mul,
                    qpu_op_add op_add,
                    qpu_raddr raddr_a,
                    qpu_raddr raddr_b,
                    qpu_mux add_a,
                    qpu_mux add_b,
                    qpu_mux mul_a,
                    qpu_mux mul_b
                    )
{
    uint64_t word = 0;

    //signal, unpack and pack control
    word |= (uint64_t)(sig_bits & 0xf) << QPU_SIG_SHIFT;            //4 bits
    word |= (uint64_t)(unpack_mode & 0x7) << QPU_UNPACK_SHIFT;      //3 bits
    word |= (uint64_t)(pack_unpack_select & 1) << 56;               //pm bit
    word |= (uint64_t)(pack_mode & 0xf) << QPU_PACK_SHIFT;          //4 bits

    //condition codes and single-bit flags
    word |= (uint64_t)(add_cond & 0x7) << QPU_COND_ADD_SHIFT;
    word |= (uint64_t)(mul_cond & 0x7) << QPU_COND_MUL_SHIFT;
    word |= (uint64_t)(set_flags & 1) << 45;                        //sf bit
    word |= (uint64_t)(write_swap_flag & 1) << 44;                  //ws bit

    //write addresses
    word |= (uint64_t)(waddr_add & 0x3f) << QPU_WADDR_ADD_SHIFT;    //6 bits
    word |= (uint64_t)(waddr_mul & 0x3f) << QPU_WADDR_MUL_SHIFT;    //6 bits

    //opcodes
    word |= (uint64_t)(op_mul & 0x7) << QPU_OP_MUL_SHIFT;           //3 bits
    word |= (uint64_t)(op_add & 0x1f) << QPU_OP_ADD_SHIFT;          //5 bits

    //read addresses
    word |= (uint64_t)(raddr_a & 0x3f) << QPU_RADDR_A_SHIFT;        //6 bits
    word |= (uint64_t)(raddr_b & 0x3f) << QPU_RADDR_B_SHIFT;        //6 bits

    //input muxes, 3 bits each
    word |= (uint64_t)(add_a & 0x7) << QPU_ADD_A_SHIFT;
    word |= (uint64_t)(add_b & 0x7) << QPU_ADD_B_SHIFT;
    word |= (uint64_t)(mul_a & 0x7) << QPU_MUL_A_SHIFT;
    word |= (uint64_t)(mul_b & 0x7) << QPU_MUL_B_SHIFT;

    return word;
}
/*
 * Builds an ALU instruction whose raddr_b field carries a small immediate.
 *
 * This forwards to encode_alu with the signal field fixed to 0xd and with
 * small_imm passed in the raddr_b position; all other arguments go through
 * unchanged.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_alu_small_imm(qpu_unpack unpack_mode,
                              uint8_t pack_unpack_select,
                              uint8_t pack_mode,
                              qpu_cond add_cond,
                              qpu_cond mul_cond,
                              uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
                              uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
                              qpu_waddr waddr_add,
                              qpu_waddr waddr_mul,
                              qpu_op_mul op_mul,
                              qpu_op_add op_add,
                              qpu_raddr raddr_a,
                              uint8_t small_imm,
                              qpu_mux add_a,
                              qpu_mux add_b,
                              qpu_mux mul_a,
                              qpu_mux mul_b
                              )
{
    //0xd is the value QPU_SIG_SMALL_IMM has in the signal enum
    uint64_t word = encode_alu(0xd,
                               unpack_mode, pack_unpack_select, pack_mode,
                               add_cond, mul_cond,
                               set_flags, write_swap_flag,
                               waddr_add, waddr_mul,
                               op_mul, op_add,
                               raddr_a, small_imm,
                               add_a, add_b, mul_a, mul_b);
    return word;
}
/*
 * Builds the 64-bit encoding of a branch instruction.
 *
 * The top four bits are set to 0xf; every other argument is masked to its
 * field width and OR-ed in at its bit offset. imm occupies the low 32 bits
 * unmodified.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_branch(qpu_branch_cond branch_cond,
                       uint8_t is_relative, //if set branch target is relative to PC+4
                       uint8_t use_raddr_a, //if set add value of raddr_a (from simd elem 0) to branch target
                       qpu_raddr raddr_a,
                       uint8_t write_swap_bit,
                       qpu_waddr waddr_add,
                       qpu_waddr waddr_mul,
                       uint32_t imm //always added to branch target, set to 0 if unused
                       )
{
    uint64_t word = (uint64_t)0xf << 60;                                //bits 63..60

    word |= (uint64_t)(branch_cond & 0xf) << QPU_BRANCH_COND_SHIFT;     //4 bits
    word |= (uint64_t)(is_relative & 1) << 51;
    word |= (uint64_t)(use_raddr_a & 1) << 50;
    word |= (uint64_t)(raddr_a & 0x1f) << QPU_BRANCH_RADDR_A_SHIFT;     //only 5 bits here
    word |= (uint64_t)(write_swap_bit & 1) << 44;
    word |= (uint64_t)(waddr_add & 0x3f) << QPU_WADDR_ADD_SHIFT;
    word |= (uint64_t)(waddr_mul & 0x3f) << QPU_WADDR_MUL_SHIFT;
    word |= imm;

    return word;
}
/*
 * Builds the 64-bit encoding of a semaphore instruction.
 *
 * Bits 63..57 are set to 0x74. The remaining arguments are masked to their
 * field widths and OR-ed in: imm_val (27 bits) at bit 5, incr_sem at bit 4
 * and the semaphore selector in bits 3..0.
 *
 * NOTE(review): bit 4 is written as 1 when incr_sem is 1 - confirm against
 * the hardware reference that a set bit means increment rather than decrement.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_semaphore(uint8_t pack_unpack_select,
                          uint8_t pack_mode,
                          qpu_cond cond_add,
                          qpu_cond cond_mul,
                          uint8_t set_flags,
                          uint8_t write_swap,
                          qpu_waddr waddr_add,
                          qpu_waddr waddr_mul,
                          uint8_t incr_sem, //if 1 increment semaphore
                          uint8_t sem, //4 bit semaphore selector
                          uint32_t imm_val //27bit immediate value loaded into all 16 simd elements
                          )
{
    uint64_t word = (uint64_t)0x74 << 57;

    word |= (uint64_t)(pack_unpack_select & 1) << 56;
    word |= (uint64_t)(pack_mode & 0xf) << QPU_PACK_SHIFT;
    word |= (uint64_t)(cond_add & 0x7) << QPU_COND_ADD_SHIFT;
    word |= (uint64_t)(cond_mul & 0x7) << QPU_COND_MUL_SHIFT;
    word |= (uint64_t)(set_flags & 1) << 45;
    word |= (uint64_t)(write_swap & 1) << 44;
    word |= (uint64_t)(waddr_add & 0x3f) << QPU_WADDR_ADD_SHIFT;
    word |= (uint64_t)(waddr_mul & 0x3f) << QPU_WADDR_MUL_SHIFT;
    word |= (uint64_t)(imm_val & 0x7ffffff) << 5;
    word |= (uint64_t)(incr_sem & 1) << 4;
    word |= (uint64_t)(sem & 0xf);

    return word;
}
//write immediate value across simd array
/*
 * Builds the 64-bit encoding of a 32-bit load-immediate instruction.
 *
 * Bits 63..57 are set to 0x70; the control arguments are masked to their
 * field widths and OR-ed in, and imm fills the low 32 bits unmodified.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_load_imm(uint8_t pack_unpack_select,
                         uint8_t pack_mode,
                         qpu_cond cond_add,
                         qpu_cond cond_mul,
                         uint8_t set_flags,
                         uint8_t write_swap,
                         qpu_waddr waddr_add,
                         qpu_waddr waddr_mul,
                         uint32_t imm //2x16bit or 1x32bit uint
                         )
{
    uint64_t word = (uint64_t)0x70 << 57;

    word |= (uint64_t)(pack_unpack_select & 1) << 56;
    word |= (uint64_t)(pack_mode & 0xf) << QPU_PACK_SHIFT;
    word |= (uint64_t)(cond_add & 0x7) << QPU_COND_ADD_SHIFT;
    word |= (uint64_t)(cond_mul & 0x7) << QPU_COND_MUL_SHIFT;
    word |= (uint64_t)(set_flags & 1) << 45;
    word |= (uint64_t)(write_swap & 1) << 44;
    word |= (uint64_t)(waddr_add & 0x3f) << QPU_WADDR_ADD_SHIFT;
    word |= (uint64_t)(waddr_mul & 0x3f) << QPU_WADDR_MUL_SHIFT;
    word |= imm;

    return word;
}
//write per element MS bit and LS bit across simd array
/*
 * Builds the 64-bit encoding of a per-element load-immediate instruction.
 *
 * Bits 63..57 hold 0x71 for the signed form and 0x73 for the unsigned form
 * (signed_or_unsigned selects bit 1 of that 7-bit value). ms_bit is placed
 * in bits 31..16 and ls_bit in bits 15..0.
 *
 * signed_or_unsigned is masked to one bit like every other flag here, so a
 * value other than 0/1 cannot spill into the neighbouring opcode bits.
 *
 * Returns the assembled instruction word.
 */
uint64_t encode_load_imm_per_elem(
        uint8_t signed_or_unsigned, //0 for signed, 1 for unsigned
        uint8_t pack_unpack_select,
        uint8_t pack_mode,
        qpu_cond cond_add,
        qpu_cond cond_mul,
        uint8_t set_flags,
        uint8_t write_swap,
        qpu_waddr waddr_add,
        qpu_waddr waddr_mul,
        uint16_t ms_bit, //per element MS (sign) bit
        uint16_t ls_bit //per element LS bit
        )
{
    uint64_t res = 0;
    uint64_t tmp = 0;

    tmp = 0x71;
    tmp |= (uint64_t)(signed_or_unsigned & 1) << 1;
    res |= tmp << 57;

    tmp = pack_unpack_select & 1;
    res |= tmp << 56;
    tmp = pack_mode & 0xf;
    res |= tmp << QPU_PACK_SHIFT;
    tmp = cond_add & 0x7;
    res |= tmp << QPU_COND_ADD_SHIFT;
    tmp = cond_mul & 0x7;
    res |= tmp << QPU_COND_MUL_SHIFT;
    tmp = set_flags & 1;
    res |= tmp << 45;
    tmp = write_swap & 1;
    res |= tmp << 44;
    tmp = waddr_add & 0x3f;
    res |= tmp << QPU_WADDR_ADD_SHIFT;
    tmp = waddr_mul & 0x3f;
    res |= tmp << QPU_WADDR_MUL_SHIFT;

    //the two 16-bit per-element masks share the low 32 bits
    tmp = ms_bit;
    res |= tmp << 16;
    res |= ls_bit;

    return res;
}
/*
Format:
#comment
sig_bit_optional ; dstAdd.pack_mode_optional = add_opcode.sf_optional.condition.unpack_mode_optional(srcA, srcB, imm_optional) ; dstMul.pack_mode_optional = mul_opcode.condition(srcA, srcB) ;
sig_bit_branch ; dstAdd = branch.rel_optional.reg_optional.condition(address, srcA_optional) ; dstMul = branch() ;
sig_bit_none ; dstAdd.pack_mode_optional = sem_inc.sf_optional.condition(sem_number, 27bit_imm_value_optional) ; dstMul.pack_mode_optional = sem_inc.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load32.sf_optional.condition(immediate_value) ; dstMul.pack_mode_optional = load32.condition() ;
sig_load_imm ; dstAdd.pack_mode_optional = load16.signed_optional.sf_optional.condition(int16_imm, int16_imm) ; dstMul.pack_mode_optional = load16.condition() ;
Examples:
sig_none ; ra0.nop = add.sf.always(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;
sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;
sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;
sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;
sig_load_imm ; ra0.nop = load16.sf.signed.always(1, 2) ; rb0.nop = load16.always() ;
*/
/*
 * Looks up a signal name in qpu_sig_bits_str.
 *
 * Returns the index of the matching entry, or -1 (converted to
 * qpu_sig_bits) when str is null or matches no entry.
 */
qpu_sig_bits parse_sig_bit(char* str)
{
    const unsigned count = sizeof(qpu_sig_bits_str) / sizeof(const char *);
    unsigned idx = 0;

    if(!str)
    {
        return -1;
    }

    while(idx < count)
    {
        const char* name = qpu_sig_bits_str[idx];
        if(name && strcmp(str, name) == 0)
        {
            return idx;
        }
        ++idx;
    }

    return -1;
}
/*
 * Parses a destination of the form "dst" or "dst.pack".
 *
 * str       in/out: points at the destination text on entry; on return it
 *           points one character past the terminator of the last sub-token
 *           so the caller can keep tokenizing from there
 * waddr     out: write address; 0 when nothing was recognised
 * pack_mode out: index of the pack name in qpu_pack_a_str or
 *           qpu_pack_mul_str; 0 when absent or unrecognised
 * ws        out: write-swap bit, only written for "ra<N>" / "rb<N>"
 *           destinations
 * is_add    non-zero when the destination belongs to the add ALU
 *
 * This uses strtok, so it overwrites '.' separators in the input and
 * disturbs any strtok sequence the caller has in progress.
 */
void parse_dst(char** str, qpu_waddr* waddr, uint8_t* pack_mode, uint8_t* ws, unsigned is_add)
{
    char* dst = strtok(*str, ".");
    char* pack = strtok(0, ".");

    //advance past the destination strings so the caller can tokenize further
    if(dst)
    {
        *str = pack ? pack : dst;
        while(**str)
        {
            (*str)++;
        }
        *str += 1;
    }

    uint8_t waddr_res = 0;
    uint8_t pack_mode_res = 0;

    //named destinations first; 0 doubles as "not found yet"
    for(unsigned c = 0; c < 2 && dst && !waddr_res; ++c)
    {
        for(unsigned d = 0; d < 64; ++d)
        {
            if(qpu_waddr_str[c][d] && strcmp(dst, qpu_waddr_str[c][d]) == 0)
            {
                waddr_res = d;
                break;
            }
        }
    }

    //plain register file destinations: "ra<N>" or "rb<N>"
    //(dst[1] is checked so that a bare "r" never reads past its own terminator)
    if(!waddr_res && dst && dst[0] == 'r' && dst[1] != '\0')
    {
        unsigned is_a = dst[1] == 'a' ? 1 : 0;

        //ws == 0: add writes to A, mul writes to B
        //ws == 1: add writes to B, mul writes to A
        //so the swap is needed when add targets B or when mul targets A
        *ws = is_add ? !is_a : is_a;

        waddr_res = (uint8_t)strtol(dst + 2, NULL, 10);
    }

    unsigned num_pack_a_str = sizeof(qpu_pack_a_str) / sizeof(const char *);
    for(unsigned c = 0; c < num_pack_a_str && pack && !pack_mode_res; ++c)
    {
        if(qpu_pack_a_str[c] && strcmp(pack, qpu_pack_a_str[c]) == 0)
        {
            pack_mode_res = c;
            break;
        }
    }

    //only consulted when the regfile A names produced 0 (no match, or "nop")
    unsigned num_pack_mul_str = sizeof(qpu_pack_mul_str) / sizeof(const char *);
    for(unsigned c = 0; c < num_pack_mul_str && pack && !pack_mode_res; ++c)
    {
        if(qpu_pack_mul_str[c] && strcmp(pack, qpu_pack_mul_str[c]) == 0)
        {
            pack_mode_res = c;
            break;
        }
    }

    *waddr = waddr_res;
    *pack_mode = pack_mode_res;
}
//Parses up to three '.'-separated modifiers that follow an opcode.
//Recognised modifiers: "rel" sets *rel, "reg" sets *reg, "sf" sets *sf, a name from
//qpu_cond_str sets *condition, a name from qpu_unpack_str sets *unpack_mode.
//Anything else is skipped without an error. Outputs are only written when their
//modifier is present, so the caller must initialise them beforehand.
//On return *str points one character past the terminator of the last modifier seen.
//Uses strtok, so '.' separators in the input are overwritten.
void parse_op_modifiers(char** str, uint8_t* sf, qpu_cond* condition, qpu_unpack* unpack_mode, uint8_t* rel, uint8_t* reg)
{
char* modifier = strtok(*str, ".");
//at most 3 modifiers supported
for(int c = 0; c < 3; ++c)
{
if(modifier)
{
//remember the most recent modifier so the final advance starts from it
*str = modifier;
if(strcmp(modifier, "rel") == 0)
{
*rel = 1;
modifier = strtok(0, ".");
continue;
}
if(strcmp(modifier, "reg") == 0)
{
*reg = 1;
modifier = strtok(0, ".");
continue;
}
if(strcmp(modifier, "sf") == 0)
{
*sf = 1;
modifier = strtok(0, ".");
continue;
}
//condition names are tried before unpack names
unsigned found = 0;
unsigned num_conds = sizeof(qpu_cond_str) / sizeof(const char *);
for(unsigned d = 0; d < num_conds; ++d)
{
if(qpu_cond_str[d] && strcmp(modifier, qpu_cond_str[d]) == 0)
{
*condition = d;
found = 1;
break;
}
}
if(found)
{
modifier = strtok(0, ".");
continue;
}
unsigned num_unpack_modes = sizeof(qpu_unpack_str) / sizeof(const char *);
for(unsigned d = 0; d < num_unpack_modes; ++d)
{
if(qpu_unpack_str[d] && strcmp(modifier, qpu_unpack_str[d]) == 0)
{
*unpack_mode = d;
break;
}
}
//fetch the next modifier whether or not this one was recognised
modifier = strtok(0, ".");
}
}
//advance token past op strings so we can tokenize further
while(**str)
{
(*str)++;
}
//step over the terminator itself
*str += 1;
}
/*
 * Parses an opcode name.
 *
 * "sem_inc" and "sem_dec" set *type to QPU_SEM and *is_sem_inc to 1 or 0.
 * Any other name sets *type to QPU_ALU and is looked up in both
 * qpu_op_add_str and qpu_op_mul_str; *op_add and *op_mul are written only
 * for the table(s) that contain the name.
 *
 * On return *str points one character past the terminator of the opcode
 * token. Uses strtok, so a '.' following the name is overwritten.
 */
void parse_op(char** str, qpu_alu_type* type, qpu_op_add* op_add, qpu_op_mul* op_mul, uint8_t* is_sem_inc)
{
    char* name = strtok(*str, ".");
    const int is_inc = name != 0 && strcmp(name, "sem_inc") == 0;
    const int is_dec = !is_inc && name != 0 && strcmp(name, "sem_dec") == 0;

    if(is_inc || is_dec)
    {
        *type = QPU_SEM;
        *is_sem_inc = is_inc ? 1 : 0;
    }
    else
    {
        *type = QPU_ALU;

        if(name)
        {
            const unsigned add_count = sizeof(qpu_op_add_str) / sizeof(const char *);
            const unsigned mul_count = sizeof(qpu_op_mul_str) / sizeof(const char *);
            unsigned idx;

            //the same name may exist in both tables, so both are always searched
            idx = 0;
            while(idx < add_count)
            {
                if(qpu_op_add_str[idx] && strcmp(name, qpu_op_add_str[idx]) == 0)
                {
                    *op_add = idx;
                    break;
                }
                idx++;
            }

            idx = 0;
            while(idx < mul_count)
            {
                if(qpu_op_mul_str[idx] && strcmp(name, qpu_op_mul_str[idx]) == 0)
                {
                    *op_mul = idx;
                    break;
                }
                idx++;
            }
        }
    }

    if(name)
    {
        *str = name;
    }

    //move to the character after the opcode's terminator
    char* cursor = *str;
    while(*cursor)
    {
        cursor++;
    }
    *str = cursor + 1;
}
/*
 * Parses the argument list of an ALU operation: "srcA, srcB" optionally
 * followed by an immediate.
 *
 * str       in/out: points at the argument text on entry; on return it
 *           points one character past the terminator of the last argument
 *           that was recognised (or of the text passed in, when none was)
 * in_a      out: index of the first argument in qpu_mux_str, if it matches
 * in_b      out: index of the second argument in qpu_mux_str, if it matches
 * small_imm out: encoding returned by qpu_encode_small_immediate for the
 *           third argument, if one is present
 *
 * The immediate is read with strtol using base 0, so decimal, "0x" hex and
 * leading-0 octal spellings are accepted. Hex is what makes the float bit
 * patterns known to qpu_encode_small_immediate writable, and strtol has
 * defined behaviour for out-of-range text where atoi does not.
 *
 * Uses strtok, so separators in the input are overwritten.
 */
void parse_args_alu(char** str, qpu_mux* in_a, qpu_mux* in_b, uint8_t* small_imm)
{
    char* arg = strtok(*str, " \n\v\f\r\t,");
    unsigned num_muxes = sizeof(qpu_mux_str) / sizeof(const char *);

    //first source operand
    for(unsigned c = 0; c < num_muxes && arg; ++c)
    {
        if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
        {
            *str = arg;
            *in_a = c;
            break;
        }
    }

    //second source operand
    arg = strtok(0, " \n\v\f\r\t,");
    for(unsigned c = 0; c < num_muxes && arg; ++c)
    {
        if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
        {
            *str = arg;
            *in_b = c;
            break;
        }
    }

    //optional immediate
    arg = strtok(0, " \n\v\f\r\t,");
    if(arg)
    {
        //negative values wrap to their two's complement bit pattern
        uint32_t si = (uint32_t)strtol(arg, NULL, 0);
        *small_imm = qpu_encode_small_immediate(si);
        *str = arg;
    }

    //advance token past arg strings so we can tokenize further
    while(**str)
    {
        (*str)++;
    }
    *str += 1;
}
uint64_t* assemble_qpu_asm(char* str)
{
unsigned num_instructions = 0;
char* ptr = str;
while(ptr && *ptr != '\0')
{
ptr = strstr(ptr, ";");
ptr = strstr(ptr+(ptr!=0), ";");
ptr = strstr(ptr+(ptr!=0), ";");
if(ptr)
{
ptr += 1;
num_instructions += 1;
}
}
printf("Num instructions: %i\n", num_instructions);
if(!num_instructions)
{
return 0;
}
uint64_t* instructions = malloc(sizeof(uint64_t)*num_instructions);
unsigned instruction_counter = 0;
char* token = strtok(str, " \n\v\f\r\t;");
while(token)
{
qpu_sig_bits sig_bit = QPU_SIG_NONE;
qpu_alu_type type = QPU_ALU;
qpu_op_add op_add = QPU_A_NOP;
qpu_op_mul op_mul = QPU_M_NOP;
qpu_mux mul_a = 0;
qpu_mux mul_b = 0;
qpu_mux add_a = 0;
qpu_mux add_b = 0;
qpu_cond cond_mul = QPU_COND_ALWAYS;
qpu_cond cond_add = QPU_COND_ALWAYS;
qpu_waddr waddr_add = QPU_W_NOP;
qpu_waddr waddr_mul = QPU_W_NOP;
qpu_waddr raddr_add = QPU_R_NOP;
qpu_waddr raddr_mul = QPU_R_NOP;
uint8_t pack_unpack_select = 0;
uint8_t pack_mode = QPU_PACK_A_NOP;
qpu_unpack unpack_mode = QPU_UNPACK_NOP;
uint8_t is_sem_inc = 0;
uint8_t rel = 0;
uint8_t reg = 0;
uint8_t ws = 0;
uint8_t sf = 0;
uint32_t imm32 = 0;
uint16_t ms_imm16 = 0;
uint16_t ls_imm16 = 0;
uint8_t semaphore = 0;
qpu_load_type load_type = QPU_LOAD32;
uint8_t signed_or_unsigned = 0;
qpu_branch_cond branch_cond = QPU_COND_BRANCH_ALWAYS;
sig_bit = parse_sig_bit(token);
if(sig_bit < 0)
{
break;
}
//get dst for add
token = strtok(0, " \n\v\f\r\t=;");
parse_dst(&token, &waddr_add, &pack_mode, &ws, 1);
//check op
token = strtok(token, " \n\v\f\r\t.=");
parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);
//get modifiers
token = strtok(token, " \n\v\f\r\t(");
parse_op_modifiers(&token, &sf, &cond_add, &unpack_mode, &rel, &reg);
if(type == QPU_ALU)
{
//get arguments for add
token = strtok(token, ")");
parse_args_alu(&token, &add_a, &add_b, &imm32);
}
else if(type == QPU_SEM)
{
}
else if(type == QPU_BRANCH)
{
}
else if(type == QPU_LOAD_IMM)
{
}
//get dst for mul
token = strtok(token, " \n\v\f\r\t=;");
parse_dst(&token, &waddr_mul, &pack_mode, &ws, 0);
//check op
token = strtok(token, " \n\v\f\r\t.=");
parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc);
if(type == QPU_ALU)
{
//get arguments for add
token = strtok(token, ")");
parse_args_alu(&token, &mul_a, &mul_b, &imm32);
}
//get modifiers
token = strtok(token, " \n\v\f\r\t(");
parse_op_modifiers(&token, &sf, &cond_mul, &unpack_mode, &rel, &reg);
//EMIT INSTRUCTION HERE
if(type == QPU_ALU)
{
if(sig_bit == QPU_SIG_SMALL_IMM)
{
instructions[instruction_counter] = encode_alu_small_imm(unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_add, imm32, add_a, add_b, mul_a, mul_b);
}
else
{
instructions[instruction_counter] = encode_alu(sig_bit, unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_add, raddr_mul, add_a, add_b, mul_a, mul_b);
}
}
else if(type == QPU_SEM)
{
instructions[instruction_counter] = encode_semaphore(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, is_sem_inc, semaphore, imm32);
}
else if(type == QPU_BRANCH)
{
instructions[instruction_counter] = encode_branch(branch_cond, rel, reg, raddr_add, ws, waddr_add, waddr_mul, imm32);
}
else if(type == QPU_LOAD_IMM)
{
if(load_type == QPU_LOAD32)
{
instructions[instruction_counter] = encode_load_imm(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, imm32);
}
else
{
instructions[instruction_counter] = encode_load_imm_per_elem(signed_or_unsigned, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, ms_imm16, ls_imm16);
}
}
instruction_counter++;
token = strtok(0, " \n\v\f\r\t;");
}
return instructions;
}
/*
 * Runs the assembler over a built-in sample program and releases the result.
 */
int main(void)
{
    //a writable array, because the assembler tokenizes its input in place
    char asm_code[] =
            "sig_none ; ra0.nop = add.sf.always.nop(r0, r1, 0) ; rb0.nop = fmul.sf.always(r2, r3) ;"
            "sig_branch ; ra0 = branch.rel.reg.always(0xdeadbeef, ra1) ; rb0 = branch() ;"
            "sig_none ; ra0.nop = sem_inc.sf.always(1, 0x7ffffff) ; rb0.nop = sem_inc.always() ;"
            "sig_load_imm ; ra0.nop = load32.sf.always(0xdeadbeef) ; rb0.nop = load32.always() ;";

    //assemble_qpu_asm returns a heap pointer (possibly null), not an integer
    uint64_t* assembly = assemble_qpu_asm(asm_code);

    free(assembly);

    return 0;
}

View File

@ -0,0 +1,639 @@
/*
* Copyright © 2014 Broadcom
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef VC4_QPU_DEFINES_H
#define VC4_QPU_DEFINES_H
#include <assert.h>
#include <stdint.h>
//Instruction families the assembler distinguishes; each one has its own encode_* routine in main.c
typedef enum{
QPU_ALU, //regular ALU instruction
QPU_SEM, //semaphore instruction
QPU_BRANCH, //branch instruction
QPU_LOAD_IMM //load immediate instruction
} qpu_alu_type;
//Variants of the load immediate instruction
typedef enum{
QPU_LOAD32, //a single 32bit immediate
QPU_LOAD16 //per element MS bit / LS bit immediates (two 16bit masks)
} qpu_load_type;
//Condition Codes
//The QPU keeps a set of N, Z and C flag bits per 16 SIMD element. These flags are updated based on the result
//of the ADD ALU if the sf bit is set. If the sf bit is set and the ADD ALU executes a NOP or its condition code was
//NEVER, flags are set based upon the result of the MUL ALU result.
//Values run from 0 to 7 in declaration order; the encoders mask them to 3 bits.
typedef enum {
QPU_COND_NEVER,
QPU_COND_ALWAYS,
QPU_COND_ZS, //Z flag set
QPU_COND_ZC, //Z flag clear
QPU_COND_NS, //N flag set
QPU_COND_NC, //N flag clear
QPU_COND_CS, //C flag set
QPU_COND_CC, //C flag clear
} qpu_cond;
//Assembly text names of the condition codes, indexed by qpu_cond value
static const char *qpu_cond_str[] = {
[QPU_COND_NEVER] = "never",
[QPU_COND_ALWAYS] = "always",
[QPU_COND_ZS] = "zs",
[QPU_COND_ZC] = "zc",
[QPU_COND_NS] = "ns",
[QPU_COND_NC] = "nc",
[QPU_COND_CS] = "cs",
[QPU_COND_CC] = "cc",
};
//ALU Input muxes
//selects one register for input
//The add_a, add_b, mul_a, and mul_b fields specify the input data for the A and B ports of the ADD and MUL
//pipelines, respectively
//The first eight values (0..7) are the hardware encodings.
typedef enum {
/* hardware mux values */
QPU_MUX_R0,
QPU_MUX_R1,
QPU_MUX_R2,
QPU_MUX_R3,
QPU_MUX_R4, //special purpose, read only
QPU_MUX_R5, //special purpose
QPU_MUX_A,
QPU_MUX_B,
/**
* Non-hardware mux value, stores a small immediate field to be
* programmed into raddr_b in the qpu_reg.index.
* Its value is 8, which does not fit the 3 bit mux fields.
*/
QPU_MUX_SMALL_IMM,
} qpu_mux;
//Assembly text names of the input muxes, indexed by qpu_mux value
static const char *qpu_mux_str[] = {
[QPU_MUX_R0] = "r0",
[QPU_MUX_R1] = "r1",
[QPU_MUX_R2] = "r2",
[QPU_MUX_R3] = "r3",
[QPU_MUX_R4] = "r4",
[QPU_MUX_R5] = "r5",
[QPU_MUX_A] = "a",
[QPU_MUX_B] = "b",
[QPU_MUX_SMALL_IMM] = "imm",
};
//Signaling Bits
//The 4-bit signaling field signal is connected to the 3d pipeline and is set to indicate one of a number of
//conditions to the 3d hardware. Values from this field are also used to encode a BKPT instruction, and to
//encode Branches and Load Immediate instructions.
//Values run from 0 to 15 in declaration order.
typedef enum {
QPU_SIG_SW_BREAKPOINT,
QPU_SIG_NONE,
QPU_SIG_THREAD_SWITCH,
QPU_SIG_PROG_END,
QPU_SIG_WAIT_FOR_SCOREBOARD, //stall until this QPU can safely access tile buffer
QPU_SIG_SCOREBOARD_UNLOCK,
QPU_SIG_LAST_THREAD_SWITCH,
QPU_SIG_COVERAGE_LOAD, //from tile buffer to r4
QPU_SIG_COLOR_LOAD, //from tile buffer to r4
QPU_SIG_COLOR_LOAD_END, //color load and program end
QPU_SIG_LOAD_TMU0, //read data from TMU0 to r4
QPU_SIG_LOAD_TMU1, //read data from TMU1 to r4
QPU_SIG_ALPHA_MASK_LOAD, //from tile buffer to r4
QPU_SIG_SMALL_IMM, //ALU instruction with raddr_b specifying small immediate or vector rotate
QPU_SIG_LOAD_IMM, //load immediate instruction
QPU_SIG_BRANCH
} qpu_sig_bits;
//Assembly text names of the signals, indexed by qpu_sig_bits value
//Note the two thread switch names: "sig_switch" is QPU_SIG_THREAD_SWITCH and
//"sig_thread_switch" is QPU_SIG_LAST_THREAD_SWITCH.
static const char *qpu_sig_bits_str[] = {
[QPU_SIG_SW_BREAKPOINT] = "sig_brk",
[QPU_SIG_NONE] = "sig_none",
[QPU_SIG_THREAD_SWITCH] = "sig_switch",
[QPU_SIG_PROG_END] = "sig_end",
[QPU_SIG_WAIT_FOR_SCOREBOARD] = "sig_wait_score",
[QPU_SIG_SCOREBOARD_UNLOCK] = "sig_unlock_score",
[QPU_SIG_LAST_THREAD_SWITCH] = "sig_thread_switch",
[QPU_SIG_COVERAGE_LOAD] = "sig_coverage_load",
[QPU_SIG_COLOR_LOAD] = "sig_color_load",
[QPU_SIG_COLOR_LOAD_END] = "sig_color_load_end",
[QPU_SIG_LOAD_TMU0] = "sig_load_tmu0",
[QPU_SIG_LOAD_TMU1] = "sig_load_tmu1",
[QPU_SIG_ALPHA_MASK_LOAD] = "sig_alpha_mask_load",
[QPU_SIG_SMALL_IMM] = "sig_small_imm",
[QPU_SIG_LOAD_IMM] = "sig_load_imm",
[QPU_SIG_BRANCH] = "sig_branch",
};
//Small immediate encoding
//Returns the small immediate value to be encoded into the raddr b field if
//the argument can be represented as one, or ~0 (0xff) otherwise.
//  0..15          -> 0..15
//  -16..-1        -> 16..31 (argument given as its two's complement bit pattern)
//  float bits for 1.0, 2.0, 4.0 ... 128.0        -> 32..39
//  float bits for 1/256, 1/128, 1/64 ... 1/2     -> 40..47
//48: Small immediate value for rotate-by-r5, and 49-63 are "rotate by n channels";
//those encodings are never produced by this function.
//Declared static inline because this is a definition inside a header: with
//external linkage every additional source file including it would define the
//same symbol again.
static inline uint8_t qpu_encode_small_immediate(uint32_t i)
{
    if (i <= 15)
        return (uint8_t)i;

    //-16..-1 as unsigned bit patterns; adding 32 wraps them onto 16..31
    if (i >= (uint32_t)-16)
        return (uint8_t)(i + 32);

    switch (i) {
    case 0x3f800000: //1.0
        return 32;
    case 0x40000000: //2.0
        return 33;
    case 0x40800000: //4.0
        return 34;
    case 0x41000000: //8.0
        return 35;
    case 0x41800000: //16.0
        return 36;
    case 0x42000000: //32.0
        return 37;
    case 0x42800000: //64.0
        return 38;
    case 0x43000000: //128.0
        return 39;
    case 0x3b800000: //1/256
        return 40;
    case 0x3c000000: //1/128
        return 41;
    case 0x3c800000: //1/64
        return 42;
    case 0x3d000000: //1/32
        return 43;
    case 0x3d800000: //1/16
        return 44;
    case 0x3e000000: //1/8
        return 45;
    case 0x3e800000: //1/4
        return 46;
    case 0x3f000000: //1/2
        return 47;
    default:
        break;
    }

    return (uint8_t)~0;
}
//QPU unpack values
//(can be used to unpack from r4 too)
//Values run from 0 to 7 in declaration order; the ALU encoder masks them to 3 bits.
typedef enum {
QPU_UNPACK_NOP,
QPU_UNPACK_16A, //from A reg: convert 16bit float to 32bit float, or 16bit int to 32bit int, depending on the instruction
QPU_UNPACK_16B,
QPU_UNPACK_8D_REP, //replicate most significant byte (alpha) across word: {a, a, a, a}
QPU_UNPACK_8A, //convert 8bit color in range [0...1] to 32bit float or 32bit int, depending on the instruction
QPU_UNPACK_8B,
QPU_UNPACK_8C,
QPU_UNPACK_8D,
} qpu_unpack;
//Assembly text names of the unpack modes, indexed by qpu_unpack value
static const char *qpu_unpack_str[] = {
[QPU_UNPACK_NOP] = "nop",
[QPU_UNPACK_16A] = "16a",
[QPU_UNPACK_16B] = "16b",
[QPU_UNPACK_8D_REP] = "8d_rep",
[QPU_UNPACK_8A] = "8a",
[QPU_UNPACK_8B] = "8b",
[QPU_UNPACK_8C] = "8c",
[QPU_UNPACK_8D] = "8d",
};
//QPU pack regfile A
//Values run from 0 to 15 in declaration order; the encoders mask the pack field to 4 bits.
typedef enum {
QPU_PACK_A_NOP,
QPU_PACK_A_16A, //convert to 16 bit float if float input, or to int16 (just takes least significant 16bits)
QPU_PACK_A_16B,
QPU_PACK_A_8888, //convert to 8bit uint (just takes least significant 8bits) and replicate across all bytes of 32bit word
QPU_PACK_A_8A, // Convert to 8-bit unsigned int. (just takes least significant 8bits)
QPU_PACK_A_8B,
QPU_PACK_A_8C,
QPU_PACK_A_8D,
// Saturating variants of the previous instructions.
QPU_PACK_A_32_SAT, //saturate signed 32bit number (takes into account overflow/carry flags)
QPU_PACK_A_16A_SAT, //convert to 16bit float if float input, or int16, depending on input (with saturation)
QPU_PACK_A_16B_SAT,
QPU_PACK_A_8888_SAT, //convert to uint8 with saturation and replicate across all bytes of 32bit word
QPU_PACK_A_8A_SAT, //convert to uint8 with saturation
QPU_PACK_A_8B_SAT,
QPU_PACK_A_8C_SAT,
QPU_PACK_A_8D_SAT,
} qpu_pack_a;
//Assembly text names of the regfile A pack modes, indexed by qpu_pack_a value
//NOTE(review): parse_dst in main.c splits the destination on '.', so the dotted
//"*.sat" names look unreachable through that path - confirm.
static const char *qpu_pack_a_str[] = {
[QPU_PACK_A_NOP] = "nop",
[QPU_PACK_A_16A] = "16a",
[QPU_PACK_A_16B] = "16b",
[QPU_PACK_A_8888] = "8888",
[QPU_PACK_A_8A] = "8a",
[QPU_PACK_A_8B] = "8b",
[QPU_PACK_A_8C] = "8c",
[QPU_PACK_A_8D] = "8d",
[QPU_PACK_A_32_SAT] = "sat",
[QPU_PACK_A_16A_SAT] = "16a.sat",
[QPU_PACK_A_16B_SAT] = "16b.sat",
[QPU_PACK_A_8888_SAT] = "8888.sat",
[QPU_PACK_A_8A_SAT] = "8a.sat",
[QPU_PACK_A_8B_SAT] = "8b.sat",
[QPU_PACK_A_8C_SAT] = "8c.sat",
[QPU_PACK_A_8D_SAT] = "8d.sat",
};
//QPU pack MUL ALU values
//Values 1 and 2 are not defined; 8888 starts at 3 and the rest follow consecutively up to 7.
typedef enum {
QPU_PACK_MUL_NOP,
QPU_PACK_MUL_8888 = 3, // converts mul float result to 8bit color in range [0...1] and replicate across all bytes of 32bit word
QPU_PACK_MUL_8A, // converts mul float result to 8bit color in range [0...1]
QPU_PACK_MUL_8B,
QPU_PACK_MUL_8C,
QPU_PACK_MUL_8D,
} qpu_pack_mul;
//Assembly text names of the MUL ALU pack modes, indexed by qpu_pack_mul value
//Indices 1 and 2 have no initializer and are therefore null pointers.
static const char *qpu_pack_mul_str[] = {
[QPU_PACK_MUL_NOP] = "nop",
[QPU_PACK_MUL_8888] = "8888",
[QPU_PACK_MUL_8A] = "8a",
[QPU_PACK_MUL_8B] = "8b",
[QPU_PACK_MUL_8C] = "8c",
[QPU_PACK_MUL_8D] = "8d",
};
//Branch condition codes
//"ALL" variants test that every flag of the given kind matches, "ANY" that at least one does.
//Values run from 0 to 11 in declaration order; 12..14 are not defined and ALWAYS is 15.
typedef enum {
QPU_COND_BRANCH_ALL_ZS, //all z flags set
QPU_COND_BRANCH_ALL_ZC, //all z flags clear
QPU_COND_BRANCH_ANY_ZS,
QPU_COND_BRANCH_ANY_ZC,
QPU_COND_BRANCH_ALL_NS,
QPU_COND_BRANCH_ALL_NC,
QPU_COND_BRANCH_ANY_NS,
QPU_COND_BRANCH_ANY_NC,
QPU_COND_BRANCH_ALL_CS,
QPU_COND_BRANCH_ALL_CC,
QPU_COND_BRANCH_ANY_CS,
QPU_COND_BRANCH_ANY_CC,
QPU_COND_BRANCH_ALWAYS = 15 //always execute
} qpu_branch_cond;
//Assembly text names of the branch conditions, indexed by qpu_branch_cond value
//Indices 12..14 have no initializer and are therefore null pointers.
static const char *qpu_branch_cond_str[] = {
[QPU_COND_BRANCH_ALL_ZS] = "all_zs",
[QPU_COND_BRANCH_ALL_ZC] = "all_zc",
[QPU_COND_BRANCH_ANY_ZS] = "any_zs",
[QPU_COND_BRANCH_ANY_ZC] = "any_zc",
[QPU_COND_BRANCH_ALL_NS] = "all_ns",
[QPU_COND_BRANCH_ALL_NC] = "all_nc",
[QPU_COND_BRANCH_ANY_NS] = "any_ns",
[QPU_COND_BRANCH_ANY_NC] = "any_nc",
[QPU_COND_BRANCH_ALL_CS] = "all_cs",
[QPU_COND_BRANCH_ALL_CC] = "all_cc",
[QPU_COND_BRANCH_ANY_CS] = "any_cs",
[QPU_COND_BRANCH_ANY_CC] = "any_cc",
[QPU_COND_BRANCH_ALWAYS] = "always",
};
//QPU ADD instruction set
//The float operations occupy 0..8 and the integer operations 12..24; values 9..11 and 25..29 are not defined.
//The ALU encoder masks the opcode to 5 bits.
typedef enum {
QPU_A_NOP,
QPU_A_FADD, //float add
QPU_A_FSUB,
QPU_A_FMIN,
QPU_A_FMAX,
QPU_A_FMINABS, //float min(abs(x))
QPU_A_FMAXABS,
QPU_A_FTOI, //convert float to int
QPU_A_ITOF, //convert int to float
QPU_A_ADD = 12, //int add
QPU_A_SUB,
QPU_A_SHR, //int shift right
QPU_A_ASR, //int arithmetic shift right
QPU_A_ROR, //int rotate right
QPU_A_SHL, //int shift left
QPU_A_MIN,
QPU_A_MAX,
QPU_A_AND,
QPU_A_OR,
QPU_A_XOR,
QPU_A_NOT,
QPU_A_CLZ, //int count leading zeroes
QPU_A_V8ADDS = 30, //add with saturation per 8bit element
QPU_A_V8SUBS = 31,
} qpu_op_add;
//Assembly text names of the ADD ALU operations, indexed by qpu_op_add value
//Indices without an initializer (9..11 and 25..29) are null pointers.
//"nop", "v8adds" and "v8subs" also appear in qpu_op_mul_str.
static const char *qpu_op_add_str[] = {
[QPU_A_NOP] = "nop",
[QPU_A_FADD] = "fadd",
[QPU_A_FSUB] = "fsub",
[QPU_A_FMIN] = "fmin",
[QPU_A_FMAX] = "fmax",
[QPU_A_FMINABS] = "fminabs",
[QPU_A_FMAXABS] = "fmaxabs",
[QPU_A_FTOI] = "ftoi",
[QPU_A_ITOF] = "itof",
[QPU_A_ADD] = "add",
[QPU_A_SUB] = "sub",
[QPU_A_SHR] = "shr",
[QPU_A_ASR] = "asr",
[QPU_A_ROR] = "ror",
[QPU_A_SHL] = "shl",
[QPU_A_MIN] = "min",
[QPU_A_MAX] = "max",
[QPU_A_AND] = "and",
[QPU_A_OR] = "or",
[QPU_A_XOR] = "xor",
[QPU_A_NOT] = "not",
[QPU_A_CLZ] = "clz",
[QPU_A_V8ADDS] = "v8adds",
[QPU_A_V8SUBS] = "v8subs",
};
//QPU MUL instruction set
//Values run from 0 to 7 in declaration order; the ALU encoder masks the opcode to 3 bits.
typedef enum {
QPU_M_NOP,
QPU_M_FMUL, //float mul
QPU_M_MUL24, //24bit int mul?
QPU_M_V8MULD, //mul two vectors of 8bit ints in range [0...1]
QPU_M_V8MIN,
QPU_M_V8MAX,
QPU_M_V8ADDS, //add two vectors of 8bit ints in range [0...1] with saturation
QPU_M_V8SUBS,
} qpu_op_mul;
//Assembly text names of the MUL ALU operations, indexed by qpu_op_mul value
//"nop", "v8adds" and "v8subs" also appear in qpu_op_add_str.
static const char *qpu_op_mul_str[] = {
[QPU_M_NOP] = "nop",
[QPU_M_FMUL] = "fmul",
[QPU_M_MUL24] = "mul24",
[QPU_M_V8MULD] = "v8muld",
[QPU_M_V8MIN] = "v8min",
[QPU_M_V8MAX] = "v8max",
[QPU_M_V8ADDS] = "v8adds",
[QPU_M_V8SUBS] = "v8subs",
};
//read and write ops may mean different things...
//hence two maps
//QPU register address read map
//Several addresses mean different things in regfile A and regfile B, which is why
//some enumerators below share a value.
typedef enum {
QPU_R_FRAG_PAYLOAD_ZW = 15, /* W for A file, Z for B file */
/* 0-31 are the plain regfile a or b fields */
QPU_R_UNIF = 32, //uniform read
QPU_R_VARY = 35, //varying read
QPU_R_ELEM_QPU = 38, //element number
QPU_R_NOP, //39
QPU_R_XY_PIXEL_COORD = 41, // X for regfile a, Y for regfile b
QPU_R_MS_FLAGS = 42, //A reg
QPU_R_REV_FLAG = 42, //B reg
QPU_R_VPM = 48,
QPU_R_VPM_LD_BUSY = 49, //load busy for reg A
QPU_R_VPM_ST_BUSY = 49, //store busy for reg B
QPU_R_VPM_LD_WAIT = 50, //load wait for reg A
QPU_R_VPM_ST_WAIT = 50, //store wait for reg B
QPU_R_MUTEX_ACQUIRE, //51
} qpu_raddr;
//Assembly text names of the special read addresses.
//First index: 0 = regfile A names, 1 = regfile B names. Second index: qpu_raddr value.
//Entries without an initializer (including the plain registers 0-31, apart from 15) are null pointers.
static const char *qpu_raddr_str[][52] = {
{ //A
[QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw",
[QPU_R_UNIF] = "uni",
[QPU_R_VARY] = "vary",
[QPU_R_ELEM_QPU] = "elem",
[QPU_R_NOP] = "nop",
[QPU_R_XY_PIXEL_COORD] = "x_pix",
[QPU_R_MS_FLAGS] = "ms_flags",
[QPU_R_VPM] = "vpm_read",
[QPU_R_VPM_LD_BUSY] = "vpm_ld_busy",
[QPU_R_VPM_LD_WAIT] = "vpm_ld_wait",
[QPU_R_MUTEX_ACQUIRE] = "mutex_acq"
},
{ //B
[QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw",
[QPU_R_UNIF] = "uni",
[QPU_R_VARY] = "vary",
[QPU_R_ELEM_QPU] = "elem",
[QPU_R_NOP] = "nop",
[QPU_R_XY_PIXEL_COORD] = "y_pix",
[QPU_R_REV_FLAG] = "rev_flag",
[QPU_R_VPM] = "vpm_read",
[QPU_R_VPM_ST_BUSY] = "vpm_st_busy",
[QPU_R_VPM_ST_WAIT] = "vpm_st_wait",
[QPU_R_MUTEX_ACQUIRE] = "mutex_acq"
}
};
//QPU register address write map
//Values run consecutively from QPU_W_ACC0 = 32 to QPU_W_TMU1_B = 63.
//Address 42 means different things in regfile A and regfile B, which is why
//QPU_W_MS_FLAGS and QPU_W_REV_FLAG share a value.
typedef enum {
    /* 0-31 are the plain regfile a or b fields */
    QPU_W_ACC0 = 32, //accumulation 0, aka r0
    QPU_W_ACC1,
    QPU_W_ACC2,
    QPU_W_ACC3,
    QPU_W_TMU_NOSWAP,
    QPU_W_ACC5, //replicate pixel0 per quad for reg A, replicate SIMD element0 for reg B
    QPU_W_HOST_INT, //host interrupt
    QPU_W_NOP,
    QPU_W_UNIFORMS_ADDRESS,
    QPU_W_QUAD_XY, // X for regfile a, Y for regfile b
    QPU_W_MS_FLAGS = 42, //A reg
    QPU_W_REV_FLAG = 42, //B reg
    QPU_W_TLB_STENCIL_SETUP = 43,
    QPU_W_TLB_Z,
    QPU_W_TLB_COLOR_MS,
    QPU_W_TLB_COLOR_ALL,
    QPU_W_TLB_ALPHA_MASK,
    QPU_W_VPM,
    QPU_W_VPMVCD_SETUP, /* LD for regfile a, ST for regfile b */
    QPU_W_VPM_ADDR, /* LD for regfile a, ST for regfile b */
    QPU_W_MUTEX_RELEASE,
    QPU_W_SFU_RECIP, //special function unit 1/x
    QPU_W_SFU_RECIPSQRT, //1/sqrt(x)
    QPU_W_SFU_EXP,
    QPU_W_SFU_LOG,
    QPU_W_TMU0_S,
    QPU_W_TMU0_T,
    QPU_W_TMU0_R,
    QPU_W_TMU0_B,
    QPU_W_TMU1_S,
    QPU_W_TMU1_T,
    QPU_W_TMU1_R,
    QPU_W_TMU1_B,
} qpu_waddr;

//Assembly text names of the special write addresses.
//First index: 0 = regfile A names, 1 = regfile B names. Second index: qpu_waddr value.
//Indices 0-31 (the plain registers) have no initializer and are null pointers.
//Every enumerator from 32 to 63 has a name in both tables, including
//QPU_W_TLB_ALPHA_MASK, so that each special destination can be looked up by name.
static const char *qpu_waddr_str[][64] = {
    { //A
        [QPU_W_ACC0] = "r0",
        [QPU_W_ACC1] = "r1",
        [QPU_W_ACC2] = "r2",
        [QPU_W_ACC3] = "r3",
        [QPU_W_TMU_NOSWAP] = "tmu_noswap",
        [QPU_W_ACC5] = "r5",
        [QPU_W_HOST_INT] = "host_int",
        [QPU_W_NOP] = "nop",
        [QPU_W_UNIFORMS_ADDRESS] = "uniforms_addr",
        [QPU_W_QUAD_XY] = "quad_x",
        [QPU_W_MS_FLAGS] = "ms_flags",
        [QPU_W_TLB_STENCIL_SETUP] = "tlb_stencil_setup",
        [QPU_W_TLB_Z] = "tlb_z",
        [QPU_W_TLB_COLOR_MS] = "tlb_color_ms",
        [QPU_W_TLB_COLOR_ALL] = "tlb_color_all",
        [QPU_W_TLB_ALPHA_MASK] = "tlb_alpha_mask",
        [QPU_W_VPM] = "vpm",
        [QPU_W_VPMVCD_SETUP] = "vr_setup",
        [QPU_W_VPM_ADDR] = "vr_addr",
        [QPU_W_MUTEX_RELEASE] = "mutex_release",
        [QPU_W_SFU_RECIP] = "sfu_recip",
        [QPU_W_SFU_RECIPSQRT] = "sfu_recipsqrt",
        [QPU_W_SFU_EXP] = "sfu_exp",
        [QPU_W_SFU_LOG] = "sfu_log",
        [QPU_W_TMU0_S] = "tmu0_s",
        [QPU_W_TMU0_T] = "tmu0_t",
        [QPU_W_TMU0_R] = "tmu0_r",
        [QPU_W_TMU0_B] = "tmu0_b",
        [QPU_W_TMU1_S] = "tmu1_s",
        [QPU_W_TMU1_T] = "tmu1_t",
        [QPU_W_TMU1_R] = "tmu1_r",
        [QPU_W_TMU1_B] = "tmu1_b",
    },
    { //B
        [QPU_W_ACC0] = "r0",
        [QPU_W_ACC1] = "r1",
        [QPU_W_ACC2] = "r2",
        [QPU_W_ACC3] = "r3",
        [QPU_W_TMU_NOSWAP] = "tmu_noswap",
        [QPU_W_ACC5] = "r5",
        [QPU_W_HOST_INT] = "host_int",
        [QPU_W_NOP] = "nop",
        [QPU_W_UNIFORMS_ADDRESS] = "uniforms_addr",
        [QPU_W_QUAD_XY] = "quad_y",
        [QPU_W_REV_FLAG] = "rev_flags",
        [QPU_W_TLB_STENCIL_SETUP] = "tlb_stencil_setup",
        [QPU_W_TLB_Z] = "tlb_z",
        [QPU_W_TLB_COLOR_MS] = "tlb_color_ms",
        [QPU_W_TLB_COLOR_ALL] = "tlb_color_all",
        [QPU_W_TLB_ALPHA_MASK] = "tlb_alpha_mask",
        [QPU_W_VPM] = "vpm",
        [QPU_W_VPMVCD_SETUP] = "vw_setup",
        [QPU_W_VPM_ADDR] = "vw_addr",
        [QPU_W_MUTEX_RELEASE] = "mutex_release",
        [QPU_W_SFU_RECIP] = "sfu_recip",
        [QPU_W_SFU_RECIPSQRT] = "sfu_recipsqrt",
        [QPU_W_SFU_EXP] = "sfu_exp",
        [QPU_W_SFU_LOG] = "sfu_log",
        [QPU_W_TMU0_S] = "tmu0_s",
        [QPU_W_TMU0_T] = "tmu0_t",
        [QPU_W_TMU0_R] = "tmu0_r",
        [QPU_W_TMU0_B] = "tmu0_b",
        [QPU_W_TMU1_S] = "tmu1_s",
        [QPU_W_TMU1_T] = "tmu1_t",
        [QPU_W_TMU1_R] = "tmu1_r",
        [QPU_W_TMU1_B] = "tmu1_b",
    }
};
/*
 * QPU_MASK(high, low): 64-bit mask with bits low..high (inclusive) set.
 *
 * Requires 0 <= low <= high <= 63. The mask is produced by right-shifting an
 * all-ones word down to the field width and then moving it into place; this
 * keeps every shift count in 0..63, so a full-width field (63, 0) is well
 * defined instead of shifting a 64-bit value by 64.
 */
#define QPU_MASK(high, low) \
    ((~(uint64_t)0 >> (63 - ((high) - (low)))) << (low))
/*
 * Helper behind QPU_SET_FIELD: place `value` into the field described by
 * `shift`/`mask` and return the resulting bits (all other bits zero).
 *
 * The range check is done against the right-aligned mask *before* shifting.
 * Checking after the shift would miss bits that fall off the top of the
 * 64-bit word, so an oversized value for a field ending at bit 63 would be
 * silently truncated instead of being caught.
 */
static inline uint64_t qpu_set_field_checked(uint64_t value, unsigned shift, uint64_t mask)
{
    assert((value & ~(mask >> shift)) == 0);
    return (value << shift) & mask;
}

/*
 * QPU_SET_FIELD(value, field): encode `value` into instruction field `field`,
 * where `field` is a prefix for which field_SHIFT and field_MASK exist
 * (e.g. QPU_SIG). Asserts that the value fits in the field.
 *
 * Implemented as a call to an inline function rather than a statement
 * expression: it is plain C, evaluates `value` exactly once, and has no
 * macro-local variable that a caller's identifier could collide with.
 */
#define QPU_SET_FIELD(value, field) \
    qpu_set_field_checked((uint64_t)(value), field ## _SHIFT, field ## _MASK)
/*
 * QPU_GET_FIELD(word, field): read instruction field `field` out of `word`.
 * `field` is a prefix for which field_MASK and field_SHIFT exist. The word is
 * masked, shifted down so the field starts at bit 0, and the result is
 * narrowed to uint32_t.
 */
#define QPU_GET_FIELD(word, field) \
    ((uint32_t)(((word) & (field ## _MASK)) >> (field ## _SHIFT)))
/*
 * QPU_UPDATE_FIELD(inst, value, field): yield a copy of `inst` in which the
 * bits of `field` are replaced by `value`. The old field contents are cleared
 * with the inverted field mask, then the new value (encoded by QPU_SET_FIELD)
 * is OR-ed in. `inst` itself is not assigned to; use the result.
 */
#define QPU_UPDATE_FIELD(inst, value, field)    \
    (                                           \
        ((inst) & ~(field ## _MASK))            \
        | QPU_SET_FIELD(value, field)           \
    )
/*
 * Bit positions of the fields inside the 64-bit QPU instruction word.
 *
 * Multi-bit fields come as a FOO_SHIFT / FOO_MASK pair (the form expected by
 * QPU_SET_FIELD / QPU_GET_FIELD / QPU_UPDATE_FIELD); single-bit flags are
 * ready-made uint64_t constants. Some fields deliberately occupy the same
 * bits as others (noted below) -- presumably they belong to different
 * instruction encodings, so only one interpretation applies at a time.
 */

/* ---- bits 63:44 ------------------------------------------------------ */
#define QPU_SIG_SHIFT               60
#define QPU_SIG_MASK                QPU_MASK(63, 60)

#define QPU_UNPACK_SHIFT            57
#define QPU_UNPACK_MASK             QPU_MASK(59, 57)

/**
 * When this bit is set the pack field selects PACK_MUL or R4 packing rather
 * than the normal regfile a packing.
 */
#define QPU_PM                      ((uint64_t)1 << 56)

#define QPU_PACK_SHIFT              52
#define QPU_PACK_MASK               QPU_MASK(55, 52)

#define QPU_COND_ADD_SHIFT          49
#define QPU_COND_ADD_MASK           QPU_MASK(51, 49)

#define QPU_COND_MUL_SHIFT          46
#define QPU_COND_MUL_MASK           QPU_MASK(48, 46)

#define QPU_SF                      ((uint64_t)1 << 45)
#define QPU_WS                      ((uint64_t)1 << 44)

/* ---- bits 43:32: write addresses ------------------------------------- */
#define QPU_WADDR_ADD_SHIFT         38
#define QPU_WADDR_ADD_MASK          QPU_MASK(43, 38)

#define QPU_WADDR_MUL_SHIFT         32
#define QPU_WADDR_MUL_MASK          QPU_MASK(37, 32)

/* ---- bits 31:24: opcodes --------------------------------------------- */
#define QPU_OP_MUL_SHIFT            29
#define QPU_OP_MUL_MASK             QPU_MASK(31, 29)

#define QPU_OP_ADD_SHIFT            24
#define QPU_OP_ADD_MASK             QPU_MASK(28, 24)

/* ---- bits 23:12: read addresses -------------------------------------- */
#define QPU_RADDR_A_SHIFT           18
#define QPU_RADDR_A_MASK            QPU_MASK(23, 18)

#define QPU_RADDR_B_SHIFT           12
#define QPU_RADDR_B_MASK            QPU_MASK(17, 12)

/* SMALL_IMM occupies the same bits as RADDR_B. */
#define QPU_SMALL_IMM_SHIFT         12
#define QPU_SMALL_IMM_MASK          QPU_MASK(17, 12)

/*
 * Small immediate value meaning rotate-by-r5; values 49-63 mean
 * "rotate by n channels".
 */
#define QPU_SMALL_IMM_MUL_ROT       48

/* ---- bits 11:0: operand selectors ------------------------------------ */
#define QPU_ADD_A_SHIFT             9
#define QPU_ADD_A_MASK              QPU_MASK(11, 9)

#define QPU_ADD_B_SHIFT             6
#define QPU_ADD_B_MASK              QPU_MASK(8, 6)

#define QPU_MUL_A_SHIFT             3
#define QPU_MUL_A_MASK              QPU_MASK(5, 3)

#define QPU_MUL_B_SHIFT             0
#define QPU_MUL_B_MASK              QPU_MASK(2, 0)

/* ---- QPU_LOAD_IMM_*: mode shares bits 59:57 with UNPACK, value is 31:0 -- */
#define QPU_LOAD_IMM_MODE_SHIFT     57
#define QPU_LOAD_IMM_MODE_MASK      QPU_MASK(59, 57)
#define QPU_LOAD_IMM_MODE_U32       0
#define QPU_LOAD_IMM_MODE_I2        1
#define QPU_LOAD_IMM_MODE_U2        3

#define QPU_LOAD_IMM_SHIFT          0
#define QPU_LOAD_IMM_MASK           QPU_MASK(31, 0)

/* ---- QPU_BRANCH_*: cond shares bits 55:52 with PACK, target is 31:0 ----- */
#define QPU_BRANCH_COND_SHIFT       52
#define QPU_BRANCH_COND_MASK        QPU_MASK(55, 52)

#define QPU_BRANCH_REL              ((uint64_t)1 << 51)
#define QPU_BRANCH_REG              ((uint64_t)1 << 50)

#define QPU_BRANCH_RADDR_A_SHIFT    45
#define QPU_BRANCH_RADDR_A_MASK     QPU_MASK(49, 45)

#define QPU_BRANCH_TARGET_SHIFT     0
#define QPU_BRANCH_TARGET_MASK      QPU_MASK(31, 0)
#endif /* VC4_QPU_DEFINES_H */