diff --git a/QPUassembler/main.c b/QPUassembler/main.c index 80e0f0c..3443bdb 100644 --- a/QPUassembler/main.c +++ b/QPUassembler/main.c @@ -34,6 +34,109 @@ write, SFU write, Mutex read or Semaphore access. *********************************************************************************************************************/ +/* +Format: +#comment +sig_bit_opt ; dstAdd.pack_opt = add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt) ; dstMul.pack_opt = mul_op.cond(srcA, srcB) ; +sig_small_imm ; dstAdd.pack_opt = add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm) ; dstMul.pack_opt = mul_op.cond(srcA, srcB) ; +sig_branch ; dstAdd = branch.rel_opt.reg_opt.ws_opt(address, condition, raddr_a_opt) ; dstMul = branch() ; +sig_load_imm ; dstAdd.pack_opt = sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt) ; dstMul.pack_opt = sem_inc.cond() ; +sig_load_imm ; dstAdd.pack_opt = load32.pm_opt.sf_opt.cond.ws_opt(immediate32bit_value) ; dstMul.pack_opt = load32.cond() ; +sig_load_imm ; dstAdd.pack_opt = load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, int16_imm) ; dstMul.pack_opt = load16.cond() ; + +================================================================== +================How to formulate instructions:==================== +================================================================== +1) +You must specify the signal bits at the beginning of each instruction: +sig_brk, sig_none, sig_switch, sig_end, sig_wait_score, sig_unlock_score, sig_thread_switch, sig_coverage_load, +sig_color_load, sig_color_load_end, sig_load_tmu0, sig_load_tmu1, sig_alpha_mask_load, sig_small_imm, sig_load_imm, sig_branch + +2) +Then you must specify the output register for the ADD pipeline. 
+rx0-31, r0-3, r5, tmu_noswap, host_int, nop, uniforms_addr, quad_x, quad_y, ms_flags, rev_flags, tlb_stencil_setup, +tlb_z, tlb_color_ms, tlb_color_all, vpm, vr_setup, vr_addr, mutex_release, sfu_recip, sfu_recipsqrt, sfu_exp, +sfu_log, tmu0_s, tmu0_t, tmu0_r, tmu0_b, tmu1_s, tmu1_t, tmu1_r, tmu1_b + +3) +If the ADD instruction writes to regfile A (i.e. you don't specify the WS flag later) and the PM flag is not specified, +then you can specify the pack mode for regfile A here (omitting means nop) +nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat + +4) +Then you must specify your operation for the ADD pipeline. If you are writing a non-ALU instruction, you can specify either +branch, sem_inc, sem_dec, load32 or load16 here instead. +Operations available: +nop, fadd, fsub, fmin, fmax, fminabs, fmaxabs, ftoi, itof, add, sub, shr, asr, ror, shl, min, max, and, or, xor, not, clz, v8adds, v8subs + +5) +Then you can specify a range of modifiers (order is not important): +PM bit: pm +SF bit: sf +WS bit: ws +REL bit: rel +REG bit: reg +SIGNED bit: signed +Conditional execution for the ADD pipeline: never, always, zs, zc, ns, nc, cs, cc +Unpack modes (from regfile A, or if PM is set from R4): nop, 16a, 16b, 8d_rep, 8a, 8b, 8c, 8d + +6) +Then you must specify the arguments for the ALU operation. +srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value. +raddr_a and raddr_b can be specified afterwards as optional extra arguments (omitting means nop). 
+raddr_a: ra0-31, pay_zw, uni, vary, elem, nop, x_pix, ms_flags, vpm_read, vpm_ld_busy, vpm_ld_wait, mutex_acq +raddr_b: rb0-31, pay_zw, uni, vary, elem, nop, y_pix, rev_flag, vpm_read, vpm_st_busy, vpm_st_wait, mutex_acq + +For a branch operation, you must specify: +the jump address as a 32bit value (can be relative if REL is set) +the branch condition: all_zs, all_zc, any_zs, any_zc, all_ns, all_nc, any_ns, any_nc, all_cs, all_cc, any_cs, any_cc, always +and an optional raddr_a (if REG flag is set), see above + +For a semaphore instruction, you need to specify which semaphore (0-15) you want to modify, then an optional 27bit immediate value (ms 16bits might be usable...). + +7) +Then you must specify the output register for the MUL pipeline. +See above for options. + +8) +If the MUL instruction writes to regfile A (i.e. you specify the WS flag) then you can set the pack operation for regfile A here: +nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat +OR +If you specify the PM flag, then you can set the pack operation for the MUL output here: +nop, 8888, 8a, 8b, 8c, 8d + +9) +Then you must specify your operation for the MUL pipeline. If you are writing a non-ALU instruction, you can specify either +branch, sem_inc, sem_dec, load32 or load16 here instead. +Operations available: +nop, fmul, mul24, v8muld, v8min, v8max, v8adds, v8subs + +10) +Then you can specify a range of modifiers (order is not important): +Conditional execution for the MUL pipeline: never, always, zs, zc, ns, nc, cs, cc + +11) +Then you must specify the arguments for the ALU operation. +srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value. 
+ +================================================================== +================================================================== + + +dstAdd: rx0-31, r0-5, special regs +raddr_a_opt: ra0-31, r0-5, special regs +raddr_b_opt: rb0-31, r0-5, special regs + +Examples: +sig_none ; rx0.nop = add.pm.sf.always(r0, r1, 0) ; rx0.nop = fmul.always(r2, r3) ; +sig_branch ; rx0 = branch.pm.rel.reg.always(0xdeadbeef, ra1) ; rx0 = branch() ; +sig_none ; rx0.nop = sem_inc.pm.sf.always(1, 0x7ffffff) ; rx0.nop = sem_inc.always() ; +sig_load_imm ; rx0.nop = load32.pm.sf.always(0xdeadbeef) ; rx0.nop = load32.always() ; +sig_load_imm ; rx0.nop = load16.pm.sf.signed.always(1, 2) ; rx0.nop = load16.always() ; +#mov +sig_none ; rx0.nop = or(r0, r0) ; rx0 = v8min(r1, r1) ; + */ + uint64_t encode_alu(qpu_sig_bits sig_bits, qpu_unpack unpack_mode, //If the pm bit is set, the unpack field programs the r4 unpack unit, @@ -1338,33 +1441,6 @@ void disassemble_qpu_asm(uint64_t instruction) printf("\n"); } -/* -Format: -#comment -sig_bit_opt ; dstAdd.pack_opt = add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt) ; dstMul.pack_opt = mul_op.cond(srcA, srcB) ; -sig_small_imm ; dstAdd.pack_opt = add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm) ; dstMul.pack_opt = mul_op.cond(srcA, srcB) ; -sig_branch ; dstAdd = branch.rel_opt.reg_opt.ws_opt(address, condition, srcA_opt) ; dstMul = branch() ; -sig_load_imm ; dstAdd.pack_opt = sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt) ; dstMul.pack_opt = sem_inc.cond() ; -sig_load_imm ; dstAdd.pack_opt = load32.pm_opt.sf_opt.cond.ws_opt(immediate_value) ; dstMul.pack_opt = load32.cond() ; -sig_load_imm ; dstAdd.pack_opt = load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, in16_imm) ; dstMul.pack_opt = load16.cond() ; - -You can specify the signal bits at the beginning of each instruction. 
- - -dstAdd: rx0-31, r0-5, special regs -raddr_a_opt: ra0-31, r0-5, special regs -raddr_b_opt: rb0-31, r0-5, special regs - -Examples: -sig_none ; rx0.nop = add.pm.sf.always(r0, r1, 0) ; rx0.nop = fmul.always(r2, r3) ; -sig_branch ; rx0 = branch.pm.rel.reg.always(0xdeadbeef, ra1) ; rx0 = branch() ; -sig_none ; rx0.nop = sem_inc.pm.sf.always(1, 0x7ffffff) ; rx0.nop = sem_inc.always() ; -sig_load_imm ; rx0.nop = load32.pm.sf.always(0xdeadbeef) ; rx0.nop = load32.always() ; -sig_load_imm ; rx0.nop = load16.pm.sf.signed.always(1, 2) ; rx0.nop = load16.always() ; -#mov -sig_none ; rx0.nop = or(r0, r0) ; rx0 = v8min(r1, r1) ; - */ - int main() { char asm_code[] = diff --git a/QPUassembler/vc4_qpu_defines.h b/QPUassembler/vc4_qpu_defines.h index fd9359d..7b1d27e 100644 --- a/QPUassembler/vc4_qpu_defines.h +++ b/QPUassembler/vc4_qpu_defines.h @@ -411,6 +411,7 @@ typedef enum { static const char *qpu_raddr_str[][52] = { { //A + //ra0-31 [QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw", [QPU_R_UNIF] = "uni", [QPU_R_VARY] = "vary", @@ -424,6 +425,7 @@ static const char *qpu_raddr_str[][52] = { [QPU_R_MUTEX_ACQUIRE] = "mutex_acq" }, { //B + //rb0-31 [QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw", [QPU_R_UNIF] = "uni", [QPU_R_VARY] = "vary",