added howto section for the qpu assembly

2025-02-22 19:54:18 +01:00 · 2019-04-20 13:55:40 +01:00 · 2019-04-20 13:55:40 +01:00 · 724f63f12a
commit 724f63f12a
parent 9cb1f24cf2
2 changed files with 105 additions and 27 deletions
--- a/QPUassembler/main.c
+++ b/QPUassembler/main.c
@ -34,6 +34,109 @@
 		write, SFU write, Mutex read or Semaphore access.
 *********************************************************************************************************************/
 /*
 Format:
 #comment
 sig_bit_opt		; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
 sig_small_imm	; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
 sig_branch		; dstAdd			= branch.rel_opt.reg_opt.ws_opt(address, condition, raddr_a_opt)					; dstMul			= branch()					;
 sig_load_imm	; dstAdd.pack_opt	= sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt)						; dstMul.pack_opt	= sem_inc.cond()			;
 sig_load_imm	; dstAdd.pack_opt	= load32.pm_opt.sf_opt.cond.ws_opt(immediate32bit_value)							; dstMul.pack_opt	= load32.cond()				;
 sig_load_imm	; dstAdd.pack_opt	= load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, int16_imm)					; dstMul.pack_opt	= load16.cond()				;
 ==================================================================
 ================How to formulate instructions:====================
 ==================================================================
 1)
 You must specify the signal bits at the beginning of each instruction:
 sig_brk, sig_none, sig_switch, sig_end, sig_wait_score, sig_unlock_score, sig_thread_switch, sig_coverage_load,
 sig_color_load, sig_color_load_end, sig_load_tmu0, sig_load_tmu1, sig_alpha_mask_load, sig_small_imm, sig_load_imm, sig_branch
 2)
 Then you must specify the output register for the ADD pipeline.
 rx0-31, r0-3, r5, tmu_noswap, host_int, nop, uniforms_addr, quad_x, quad_y, ms_flags, rev_flags, tlb_stencil_setup,
 tlb_z, tlb_color_ms, tlb_color_all, vpm, vr_setup, vr_addr, mutex_release, sfu_recip, sfu_recipsqrt, sfu_exp,
 sfu_log, tmu0_s, tmu0_t, tmu0_r, tmu0_b, tmu1_s, tmu1_t, tmu1_r, tmu1_b
 3)
 If the ADD instruction writes to regfile A (i.e. you don't specify the WS flag later) and the PM flag is not specified,
 then you can specify the pack mode for regfile A here (omitting means nop)
 nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
 4)
 Then you must specify your operation for the ADD pipeline. If you are writing a non-ALU instruction, you can specify either
 branch, sem_inc, sem_dec, load32 or load16 here instead.
 Operations available:
 nop, fadd, fsub, fmin, fmax, fminabs, fmaxabs, ftoi, itof, add, sub, shr, asr, ror, shl, min, max, and, or, xor, not, clz, v8adds, v8subs
 5)
 Then you can specify a range of modifiers (order is not important):
 PM bit: pm
 SF bit: sf
 WS bit: ws
 REL bit: rel
 REG bit: reg
 SIGNED bit: signed
 Conditional execution for the ADD pipeline: never, always, zs, zc, ns, nc, cs, cc
 Unpack modes (from regfile A, or from R4 if PM is set): nop, 16a, 16b, 8d_rep, 8a, 8b, 8c, 8d
 6)
 Then you must specify the arguments for the ALU operation.
 srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
 raddr_a and raddr_b can be specified afterwards as optional extra arguments (omitting means nop).
 raddr_a: ra0-31, pay_zw, uni, vary, elem, nop, x_pix, ms_flags, vpm_read, vpm_ld_busy, vpm_ld_wait, mutex_acq
 raddr_b: rb0-31, pay_zw, uni, vary, elem, nop, y_pix, rev_flag, vpm_read, vpm_st_busy, vpm_st_wait, mutex_acq
 For branch operation, you must specify:
 the jump address as a 32bit value (can be relative if REL is set)
 the branch condition: all_zs, all_zc, any_zs, any_zc, all_ns, all_nc, any_ns, any_nc, all_cs, all_cc, any_cs, any_cc, always
 and an optional raddr_a (if REG flag is set), see above
 For a semaphore instruction, you need to specify which semaphore (0-15) you want to modify, then an optional 27bit immediate value (the most significant 16 bits might be usable...).
 7)
 Then you must specify the output register for the MUL pipeline.
 See above for options.
 8)
 If the MUL instruction writes to regfile A (i.e. you specify the WS flag) then you can set the pack operation for regfile A here:
 nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
 OR
 If you specify the PM flag, then you can set the pack operation for the MUL output here:
 nop, 8888, 8a, 8b, 8c, 8d
 9)
 Then you must specify your operation for the MUL pipeline. If you are writing a non-ALU instruction, you can specify either
 branch, sem_inc, sem_dec, load32 or load16 here instead.
 Operations available:
 nop, fmul, mul24, v8muld, v8min, v8max, v8adds, v8subs
 10)
 Then you can specify a range of modifiers (order is not important):
 Conditional execution for the MUL pipeline: never, always, zs, zc, ns, nc, cs, cc
 11)
 Then you must specify the arguments for the ALU operation.
 srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
 ==================================================================
 ==================================================================
 dstAdd: rx0-31, r0-5, special regs
 raddr_a_opt: ra0-31, r0-5, special regs
 raddr_b_opt: rb0-31, r0-5, special regs
 Examples:
 sig_none		; rx0.nop			= add.pm.sf.always(r0, r1, 0)														; rx0.nop					= fmul.always(r2, r3)	;
 sig_branch		; rx0				= branch.pm.rel.reg.always(0xdeadbeef, ra1)											; rx0						= branch()				;
 sig_none		; rx0.nop			= sem_inc.pm.sf.always(1, 0x7ffffff)												; rx0.nop					= sem_inc.always()		;
 sig_load_imm	; rx0.nop			= load32.pm.sf.always(0xdeadbeef)													; rx0.nop					= load32.always()		;
 sig_load_imm	; rx0.nop			= load16.pm.sf.signed.always(1, 2)													; rx0.nop					= load16.always()		;
 #mov
 sig_none		; rx0.nop			= or(r0, r0)																		; rx0						= v8min(r1, r1)			;
 */
 uint64_t encode_alu(qpu_sig_bits sig_bits,
 					qpu_unpack unpack_mode,
 					//If the pm bit is set, the unpack field programs the r4 unpack unit,
@ -1338,33 +1441,6 @@ void disassemble_qpu_asm(uint64_t instruction)
 	printf("\n");
 }
 /*
 Format:
 #comment
 sig_bit_opt			; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
 sig_small_imm	; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
 sig_branch		; dstAdd			= branch.rel_opt.reg_opt.ws_opt(address, condition, srcA_opt)						; dstMul			= branch()					;
 sig_load_imm		; dstAdd.pack_opt	= sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt)						; dstMul.pack_opt	= sem_inc.cond()			;
 sig_load_imm		; dstAdd.pack_opt	= load32.pm_opt.sf_opt.cond.ws_opt(immediate_value)									; dstMul.pack_opt	= load32.cond()				;
 sig_load_imm		; dstAdd.pack_opt	= load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, in16_imm)					; dstMul.pack_opt	= load16.cond()				;
 You can specify the signal bits at the beginning of each instruction.
 dstAdd: rx0-31, r0-5, special regs
 raddr_a_opt: ra0-31, r0-5, special regs
 raddr_b_opt: rb0-31, r0-5, special regs
 Examples:
 sig_none			; rx0.nop			= add.pm.sf.always(r0, r1, 0)														; rx0.nop					= fmul.always(r2, r3)	;
 sig_branch			; rx0				= branch.pm.rel.reg.always(0xdeadbeef, ra1)											; rx0						= branch()				;
 sig_none			; rx0.nop			= sem_inc.pm.sf.always(1, 0x7ffffff)												; rx0.nop					= sem_inc.always()		;
 sig_load_imm		; rx0.nop			= load32.pm.sf.always(0xdeadbeef)													; rx0.nop					= load32.always()		;
 sig_load_imm		; rx0.nop			= load16.pm.sf.signed.always(1, 2)													; rx0.nop					= load16.always()		;
 #mov
 sig_none			; rx0.nop			= or(r0, r0)																		; rx0						= v8min(r1, r1)			;
 */
 int main()
 {
 	char asm_code[] =
--- a/QPUassembler/vc4_qpu_defines.h
+++ b/QPUassembler/vc4_qpu_defines.h
@ -411,6 +411,7 @@ typedef enum {
 static const char *qpu_raddr_str[][52] = {
 	{ //A
 		//ra0-31
 		[QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw",
 		[QPU_R_UNIF] = "uni",
 		[QPU_R_VARY] = "vary",
@ -424,6 +425,7 @@ static const char *qpu_raddr_str[][52] = {
 		[QPU_R_MUTEX_ACQUIRE] = "mutex_acq"
 	},
 	{ //B
 	  //rb0-31
 	  [QPU_R_FRAG_PAYLOAD_ZW] = "pay_zw",
 	  [QPU_R_UNIF] = "uni",
 	  [QPU_R_VARY] = "vary",