From 6206204164e22dfe963320ef0dca02e88a8630e2 Mon Sep 17 00:00:00 2001
From: Unknown <0.tamas.marton@gmail.com>
Date: Sat, 20 Apr 2019 14:30:42 +0100
Subject: [PATCH] moved assembler to standalone file

---
 QPUassembler/main.c          | 1441 +--------------------------------
 QPUassembler/qpu_assembler.c | 1442 ++++++++++++++++++++++++++++++++++
 QPUassembler/qpu_assembler.h |    6 +
 3 files changed, 1450 insertions(+), 1439 deletions(-)
 create mode 100644 QPUassembler/qpu_assembler.c
 create mode 100644 QPUassembler/qpu_assembler.h

diff --git a/QPUassembler/main.c b/QPUassembler/main.c
index 3443bdb..95a5bf4 100644
--- a/QPUassembler/main.c
+++ b/QPUassembler/main.c
@@ -1,1445 +1,8 @@
 ﻿#include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
+#include <stdlib.h>
 
-#include "vc4_qpu_defines.h"
-
-/*********************************************************************************************************************
-	Instruction restrictions
-
-	* The last three instructions of any program (Thread End plus the following two delay-slot instructions) must
-		not do varyings read, uniforms read or any kind of VPM, VDR, or VDW read or write.
-	* The Program End instruction must not write to either physical regfile A or B.
-	* The Program End instruction and the following two delay slot instructions must not write or read address 14
-		in either regfile A or B.
-	* The final program instruction (the second delay slot instruction) must not do a TLB Z write.
-	* A scoreboard wait must not occur in the first two instructions of a fragment shader. This is either the
-		explicit Wait for Scoreboard signal or an implicit wait with the first tile-buffer read or write instruction.
-	* If TMU_NOSWAP is written, the write must be three instructions before the first TMU write instruction.
-		For example, if TMU_NOSWAP is written in the first shader instruction, the first TMU write cannot occur
-		before the 4th shader instruction.
-	* An instruction must not read from a location in physical regfile A or B that was written to by the previous
-		instruction.
-	* After an SFU lookup instruction, accumulator r4 must not be read in the following two instructions. Any
-		other instruction that results in r4 being written (that is, TMU read, TLB read, SFU lookup) cannot occur in
-		the two instructions following an SFU lookup.
-	* An instruction that does a vector rotate by r5 must not immediately follow an instruction that writes to r5.
-	* An instruction that does a vector rotate must not immediately follow an instruction that writes to the
-		accumulator that is being rotated.
-	* After an instruction that does a TLB Z write, the multisample mask must not be read as an instruction
-		input argument in the following two instruction. The TLB Z write instruction can, however, be followed
-		immediately by a TLB color write.
-	* A single instruction can only perform a maximum of one of the following closely coupled peripheral
-		accesses in a single instruction: TMU write, TMU read, TLB write, TLB read, TLB combined color read and
-		write, SFU write, Mutex read or Semaphore access.
- *********************************************************************************************************************/
-
-/*
-Format:
-#comment
-sig_bit_opt		; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
-sig_small_imm	; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
-sig_branch		; dstAdd			= branch.rel_opt.reg_opt.ws_opt(address, condition, raddr_a_opt)					; dstMul			= branch()					;
-sig_load_imm	; dstAdd.pack_opt	= sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt)						; dstMul.pack_opt	= sem_inc.cond()			;
-sig_load_imm	; dstAdd.pack_opt	= load32.pm_opt.sf_opt.cond.ws_opt(immediate32bit_value)							; dstMul.pack_opt	= load32.cond()				;
-sig_load_imm	; dstAdd.pack_opt	= load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, int16_imm)					; dstMul.pack_opt	= load16.cond()				;
-
-==================================================================
-================How to formulate instructions:====================
-==================================================================
-1)
-You must specify the signal bits at the beginning of each instruction:
-sig_brk, sig_none, sig_switch, sig_end, sig_wait_score, sig_unlock_score, sig_thread_switch, sig_coverage_load,
-sig_color_load, sig_color_load_end, sig_load_tmu0, sig_load_tmu1, sig_alpha_mask_load, sig_small_imm, sig_load_imm, sig_branch
-
-2)
-Then you must specify the output register for the ADD pipeline.
-rx0-31, r0-3, r5, tmu_noswap, host_int, nop, uniforms_addr, quad_x, quad_y, ms_flags, rev_flags, tlb_stencil_setup
-tlb_z, tlb_color_ms, tlb_color_all, vpm, vr_setup, vr_addr, mutex_release, sfu_recip, sfu_recipsqrt, sfu_exp,
-sfu_log, tmu0_s, tmu0_t, tmu0_r, tmu0_b, tmu1_s, tmu1_t, tmu1_r, tmu1_b
-
-3)
-If the ADD instruction writes to regfile A (ie. you don't specify the WS flag later) and PM flag won't be specified,
-then you can specify the pack mode for regfile A here (omitting means nop)
-nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
-
-4)
-Then you must specify your operation for the ADD pipeline. If you are writing a non-ALU instruction, you can specify either
-branch, sem_inc, sem_dec, load32 or load16 here instead.
-Operations available:
-nop, fadd, fsub, fmin, fmax, fminabs, fmaxabs, ftoi, itof, add, sub, shr, asr, ror, shl, min, max, and, or, xor, not, clz, v8adds, v8subs
-
-5)
-Then you can specify a range of modifiers (order is not important):
-PM bit: pm
-SF bit: sf
-WS bit: ws
-REL bit: rel
-REG bit: reg
-SIGNED bit: signed
-Conditional execution for the ADD pipeline: never, always, zs, zc, ns, nc, cs, cc
-Unpack modes (from regfile A, or if PM is set from R4): nop, 16a, 16b, 8d_rep, 8a, 8b, 8c, 8d
-
-6)
-Then you must specify the arguments for the ALU operation.
-srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
-raddr_a and raddr_b can be specified afterwards as optional extra arguments (omitting means nop).
-raddr_a: ra0-31, pay_zw, uni, vary, elem, nop, x_pix, ms_flags, vpm_read, vpm_ld_busy, vpm_ld_wait, mutex_acq
-raddr_b: rb0-31, pay_zw, uni, vary, elem, nop, y_pix, rev_flag, vpm_read, vpm_st_busy, vpm_st_wait, mutex_acq
-
-For branch operation, you must specify:
-the jump address as a 32bit value (can be relative if REL is set)
-the branch condition: all_zs, all_zc, any_zs, any_zc, all_ns, all_nc, any_ns, any_nc, all_cs, all_cc, any_cs, any_cc, always
-and an optional raddr_a (if REG flag is set), see above
-
-For a semaphore instruction, you need to specify which semaphore (0-15) you want to modify, then an optional 27bit immediate value (ms 16bits might be usable...).
-
-7)
-Then you must specify the output register for the MUL pipeline.
-See above for options.
-
-8)
-If the MUL instruction writes to regfile A (ie. you specify the WS flag) then you can set the pack operation for regfile A here:
-nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
-OR
-You if specify the PM flag, then you can set the pack operation for the MUL output here:
-nop, 8888, 8a, 8b, 8c, 8d
-
-9)
-Then you must specify your operation for the MUL pipeline. If you are writing a non-ALU instruction, you can specify either
-branch, sem_inc, sem_dec, load32 or load16 here instead.
-Operations available:
-nop, fmul, mul24, v8muld, v8min, v8max, v8adds, v8subs
-
-10)
-Then you can specify a range of modifiers (order is not important):
-Conditional execution for the MUL pipeline: never, always, zs, zc, ns, nc, cs, cc
-
-11)
-Then you must specify the arguments for the ALU operation.
-srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
-
-==================================================================
-==================================================================
-
-
-dstAdd: rx0-31, r0-5, special regs
-raddr_a_opt: ra0-31, r0-5, special regs
-raddr_b_opt: rb0-31, r0-5, special regs
-
-Examples:
-sig_none		; rx0.nop			= add.pm.sf.always(r0, r1, 0)														; rx0.nop					= fmul.always(r2, r3)	;
-sig_branch		; rx0				= branch.pm.rel.reg.always(0xdeadbeef, ra1)											; rx0						= branch()				;
-sig_none		; rx0.nop			= sem_inc.pm.sf.always(1, 0x7ffffff)												; rx0.nop					= sem_inc.always()		;
-sig_load_imm	; rx0.nop			= load32.pm.sf.always(0xdeadbeef)													; rx0.nop					= load32.always()		;
-sig_load_imm	; rx0.nop			= load16.pm.sf.signed.always(1, 2)													; rx0.nop					= load16.always()		;
-#mov
-sig_none		; rx0.nop			= or(r0, r0)																		; rx0						= v8min(r1, r1)			;
- */
-
-uint64_t encode_alu(qpu_sig_bits sig_bits,
-					qpu_unpack unpack_mode,
-					//If the pm bit is set, the unpack field programs the r4 unpack unit,
-					//and the pack field is used to program the color
-					//conversion on the output of the mul unit
-					uint8_t pack_unpack_select,
-					uint8_t pack_mode,
-					qpu_cond add_cond,
-					qpu_cond mul_cond,
-					uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
-					uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
-					qpu_waddr waddr_add,
-					qpu_waddr waddr_mul,
-					qpu_op_mul op_mul,
-					qpu_op_add op_add,
-					qpu_raddr raddr_a,
-					qpu_raddr raddr_b,
-					qpu_mux add_a,
-					qpu_mux add_b,
-					qpu_mux mul_a,
-					qpu_mux mul_b
-					)
-{
-	uint64_t res = 0;
-	uint64_t tmp = 0;
-
-	tmp = sig_bits & 0xf; //mask ls 4 bits
-	res |= tmp << QPU_SIG_SHIFT;
-
-	tmp = unpack_mode & 0x7; //mask ls 3 bits
-	res |= tmp << QPU_UNPACK_SHIFT;
-
-	tmp = pack_unpack_select & 1;
-	res |= tmp << 56;
-
-	tmp = pack_mode & 0xf;
-	res |= tmp << QPU_PACK_SHIFT;
-
-	tmp = add_cond & 0x7;
-	res |= tmp << QPU_COND_ADD_SHIFT;
-
-	tmp = mul_cond & 0x7;
-	res |= tmp << QPU_COND_MUL_SHIFT;
-
-	tmp = set_flags & 1;
-	res |= tmp << 45;
-
-	tmp = write_swap_flag & 1;
-	res |= tmp << 44;
-
-	tmp = waddr_add & 0x3f;
-	res |= tmp << QPU_WADDR_ADD_SHIFT;
-
-	tmp = waddr_mul & 0x3f;
-	res |= tmp << QPU_WADDR_MUL_SHIFT;
-
-	tmp = op_mul & 0x7;
-	res |= tmp << QPU_OP_MUL_SHIFT;
-
-	tmp = op_add & 0x1f;
-	res |= tmp << QPU_OP_ADD_SHIFT;
-
-	tmp = raddr_a & 0x3f;
-	res |= tmp << QPU_RADDR_A_SHIFT;
-
-	tmp = raddr_b & 0x3f;
-	res |= tmp << QPU_RADDR_B_SHIFT;
-
-	tmp = add_a & 0x7;
-	res |= tmp << QPU_ADD_A_SHIFT;
-
-	tmp = add_b & 0x7;
-	res |= tmp << QPU_ADD_B_SHIFT;
-
-	tmp = mul_a & 0x7;
-	res |= tmp << QPU_MUL_A_SHIFT;
-
-	tmp = mul_b & 0x7;
-	res |= tmp << QPU_MUL_B_SHIFT;
-
-	return res;
-}
-
-uint64_t encode_alu_small_imm(qpu_unpack unpack_mode,
-							  uint8_t pack_unpack_select,
-							  uint8_t pack_mode,
-							  qpu_cond add_cond,
-							  qpu_cond mul_cond,
-							  uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
-							  uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
-							  qpu_waddr waddr_add,
-							  qpu_waddr waddr_mul,
-							  qpu_op_mul op_mul,
-							  qpu_op_add op_add,
-							  qpu_raddr raddr_a,
-							  uint8_t small_imm,
-							  qpu_mux add_a,
-							  qpu_mux add_b,
-							  qpu_mux mul_a,
-							  qpu_mux mul_b
-		)
-{
-	return encode_alu(0xd,
-					  unpack_mode,
-					  pack_unpack_select,
-					  pack_mode,
-					  add_cond,
-					  mul_cond,
-					  set_flags,
-					  write_swap_flag,
-					  waddr_add,
-					  waddr_mul,
-					  op_mul,
-					  op_add,
-					  raddr_a,
-					  small_imm,
-					  add_a,
-					  add_b,
-					  mul_a,
-					  mul_b);
-}
-
-uint64_t encode_branch(qpu_branch_cond branch_cond,
-					   uint8_t is_relative, //if set branch target is relative to PC+4
-					   uint8_t use_raddr_a, //if set add value of raddr_a (from simd elem 0) to branch target
-					   qpu_raddr raddr_a,
-					   uint8_t write_swap_bit,
-					   qpu_waddr waddr_add,
-					   qpu_waddr waddr_mul,
-					   uint32_t imm //always added to branch target, set to 0 if unused
-					   )
-{
-	uint64_t res = 0;
-	uint64_t tmp = 0;
-
-	tmp = 0xf;
-	res |= tmp << 60;
-
-	tmp = branch_cond & 0xf;
-	res |= tmp << QPU_BRANCH_COND_SHIFT;
-
-	tmp = is_relative & 1;
-	res |= tmp << 51;
-
-	tmp = use_raddr_a & 1;
-	res |= tmp << 50;
-
-	tmp = raddr_a & 0x1f;
-	res |= tmp << QPU_BRANCH_RADDR_A_SHIFT;
-
-	tmp = write_swap_bit & 1;
-	res |= tmp << 44;
-
-	tmp = waddr_add & 0x3f;
-	res |= tmp << QPU_WADDR_ADD_SHIFT;
-
-	tmp = waddr_mul & 0x3f;
-	res |= tmp << QPU_WADDR_MUL_SHIFT;
-
-	res |= imm;
-
-	return res;
-}
-
-uint64_t encode_semaphore(uint8_t pack_unpack_select,
-						  uint8_t pack_mode,
-						  qpu_cond cond_add,
-						  qpu_cond cond_mul,
-						  uint8_t set_flags,
-						  uint8_t write_swap,
-						  qpu_waddr waddr_add,
-						  qpu_waddr waddr_mul,
-						  uint8_t incr_sem, //if 1 increment semaphore
-						  uint8_t sem, //4 bit semaphore selector
-						  uint32_t imm_val //27bit immediate value loaded into all 16 simd elements
-						  )
-{
-	uint64_t res = 0;
-	uint64_t tmp = 0;
-
-	tmp = 0x74;
-	res |= tmp << 57;
-
-	tmp = pack_unpack_select & 1;
-	res |= tmp << 56;
-
-	tmp = pack_mode & 0xf;
-	res |= tmp << QPU_PACK_SHIFT;
-
-	tmp = cond_add & 0x7;
-	res |= tmp << QPU_COND_ADD_SHIFT;
-
-	tmp = cond_mul & 0x7;
-	res |= tmp << QPU_COND_MUL_SHIFT;
-
-	tmp = set_flags & 1;
-	res |= tmp << 45;
-
-	tmp = write_swap & 1;
-	res |= tmp << 44;
-
-	tmp = waddr_add & 0x3f;
-	res |= tmp << QPU_WADDR_ADD_SHIFT;
-
-	tmp = waddr_mul & 0x3f;
-	res |= tmp << QPU_WADDR_MUL_SHIFT;
-
-	tmp = imm_val & 0x7ffffff;
-	res |= tmp << 5;
-
-	tmp = incr_sem & 1;
-	res |= tmp << 4;
-
-	res |= sem & 0xf;
-
-	return res;
-}
-
-//write immediate value across simd array
-uint64_t encode_load_imm(uint8_t pack_unpack_select,
-						 uint8_t pack_mode,
-						 qpu_cond cond_add,
-						 qpu_cond cond_mul,
-						 uint8_t set_flags,
-						 uint8_t write_swap,
-						 qpu_waddr waddr_add,
-						 qpu_waddr waddr_mul,
-						 uint32_t imm //2x16bit or 1x32bit uint
-		)
-{
-	uint64_t res = 0;
-	uint64_t tmp = 0;
-
-	tmp = 0x70;
-	res |= tmp << 57;
-
-	tmp = pack_unpack_select & 1;
-	res |= tmp << 56;
-
-	tmp = pack_mode & 0xf;
-	res |= tmp << QPU_PACK_SHIFT;
-
-	tmp = cond_add & 0x7;
-	res |= tmp << QPU_COND_ADD_SHIFT;
-
-	tmp = cond_mul & 0x7;
-	res |= tmp << QPU_COND_MUL_SHIFT;
-
-	tmp = set_flags & 1;
-	res |= tmp << 45;
-
-	tmp = write_swap & 1;
-	res |= tmp << 44;
-
-	tmp = waddr_add & 0x3f;
-	res |= tmp << QPU_WADDR_ADD_SHIFT;
-
-	tmp = waddr_mul & 0x3f;
-	res |= tmp << QPU_WADDR_MUL_SHIFT;
-
-	res |= imm;
-
-	return res;
-}
-
-//write per element MS bit and LS bit across simd array
-uint64_t encode_load_imm_per_elem(
-						 uint8_t signed_or_unsigned, //1 for signed, 0 for unsigned
-						 uint8_t pack_unpack_select,
-						 uint8_t pack_mode,
-						 qpu_cond cond_add,
-						 qpu_cond cond_mul,
-						 uint8_t set_flags,
-						 uint8_t write_swap,
-						 qpu_waddr waddr_add,
-						 qpu_waddr waddr_mul,
-						 uint16_t ms_bit, //per element MS (sign) bit
-						 uint16_t ls_bit //per element LS bit
-		)
-{
-	uint64_t res = 0;
-	uint64_t tmp = 0;
-
-	tmp = 0x71;
-	tmp |= !signed_or_unsigned << 1;
-	res |= tmp << 57;
-
-	tmp = pack_unpack_select & 1;
-	res |= tmp << 56;
-
-	tmp = pack_mode & 0xf;
-	res |= tmp << QPU_PACK_SHIFT;
-
-	tmp = cond_add & 0x7;
-	res |= tmp << QPU_COND_ADD_SHIFT;
-
-	tmp = cond_mul & 0x7;
-	res |= tmp << QPU_COND_MUL_SHIFT;
-
-	tmp = set_flags & 1;
-	res |= tmp << 45;
-
-	tmp = write_swap & 1;
-	res |= tmp << 44;
-
-	tmp = waddr_add & 0x3f;
-	res |= tmp << QPU_WADDR_ADD_SHIFT;
-
-	tmp = waddr_mul & 0x3f;
-	res |= tmp << QPU_WADDR_MUL_SHIFT;
-
-	tmp = ms_bit;
-	res |= tmp << 16;
-
-	res |= ls_bit;
-
-	return res;
-}
-
-qpu_sig_bits parse_sig_bit(char* str)
-{
-	unsigned num_sig_bits = sizeof(qpu_sig_bits_str) / sizeof(const char *);
-
-	for(unsigned c = 0; c < num_sig_bits && str; ++c)
-	{
-		if(qpu_sig_bits_str[c] && strcmp(str, qpu_sig_bits_str[c]) == 0)
-		{
-			return c;
-		}
-	}
-
-	return -1;
-}
-
-void parse_dst(char** str, qpu_waddr* waddr, uint8_t* pack_mode, unsigned is_add, unsigned pm_set)
-{
-	char* dst = strtok(*str, ".");
-	char* pack = strtok(0, ".");
-
-	//advance token past dst strings so we can tokenize further
-	if(dst)
-	{
-		if(pack)
-		{
-			*str = pack;
-		}
-		else
-		{
-			*str = dst;
-		}
-
-		while(**str)
-		{
-			(*str)++;
-		}
-
-		*str += 1;
-	}
-
-	uint8_t waddr_res = 0;
-	uint8_t pack_mode_res = 0;
-
-	for(unsigned c = 0; c < 2 && dst && !waddr_res; ++c)
-	{
-		for(unsigned d = 0; d < 64; ++d)
-		{
-			if(qpu_waddr_str[c][d] && strcmp(dst, qpu_waddr_str[c][d]) == 0)
-			{
-				waddr_res = d;
-				break;
-			}
-		}
-	}
-
-	if(dst && dst[0] == 'r' && dst[1] == 'x')
-	{
-		waddr_res = strtol(dst+2, 0, 0);
-	}
-
-	unsigned num_pack_a_str = sizeof(qpu_pack_a_str) / sizeof(const char *);
-	for(unsigned c = 0; c < num_pack_a_str && pack && !pack_mode_res; ++c)
-	{
-		if(qpu_pack_a_str[c] && strcmp(pack, qpu_pack_a_str[c]) == 0)
-		{
-			pack_mode_res = c;
-			break;
-		}
-	}
-
-	unsigned num_pack_mul_str = sizeof(qpu_pack_mul_str) / sizeof(const char *);
-	for(unsigned c = 0; c < num_pack_mul_str && pack && !pack_mode_res; ++c)
-	{
-		if(qpu_pack_mul_str[c] && strcmp(pack, qpu_pack_mul_str[c]) == 0)
-		{
-			pack_mode_res = c;
-			break;
-		}
-	}
-
-	*waddr = waddr_res;
-	if(is_add || pm_set)
-	{
-		*pack_mode = pack_mode_res;
-	}
-}
-
-void parse_op_modifiers(char** str, uint8_t* signed_or_unsigned, uint8_t* ws, uint8_t* pm, uint8_t* sf, qpu_cond* condition, qpu_unpack* unpack_mode, uint8_t* rel, uint8_t* reg, unsigned is_add)
-{
-	char* modifier = strtok(*str, ".");
-
-	//at most 5 modifiers supported
-	for(int c = 0; c < 5; ++c)
-	{
-		if(modifier)
-		{
-			*str = modifier;
-
-			if(strcmp(modifier, "pm") == 0 && is_add)
-			{
-				*pm = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(strcmp(modifier, "ws") == 0 && is_add)
-			{
-				*ws = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(strcmp(modifier, "rel") == 0 && is_add)
-			{
-				*rel = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(strcmp(modifier, "reg") == 0 && is_add)
-			{
-				*reg = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(strcmp(modifier, "sf") == 0 && is_add)
-			{
-				*sf = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(strcmp(modifier, "signed") == 0 && is_add)
-			{
-				*signed_or_unsigned = 1;
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			unsigned found = 0;
-			unsigned num_conds = sizeof(qpu_cond_str) / sizeof(const char *);
-
-			for(unsigned d = 0; d < num_conds; ++d)
-			{
-				if(qpu_cond_str[d] && strcmp(modifier, qpu_cond_str[d]) == 0)
-				{
-					*condition = d;
-					found = 1;
-					break;
-				}
-			}
-
-			if(found)
-			{
-				modifier = strtok(0, ".");
-				continue;
-			}
-
-			if(is_add)
-			{
-				unsigned num_unpack_modes = sizeof(qpu_unpack_str) / sizeof(const char *);
-
-				for(unsigned d = 0; d < num_unpack_modes; ++d)
-				{
-					if(qpu_unpack_str[d] && strcmp(modifier, qpu_unpack_str[d]) == 0)
-					{
-						*unpack_mode = d;
-						break;
-					}
-				}
-			}
-
-			modifier = strtok(0, ".");
-		}
-	}
-
-	//advance token past op strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-void parse_op(char** str, qpu_alu_type* type, qpu_op_add* op_add, qpu_op_mul* op_mul, uint8_t* is_sem_inc, qpu_load_type* load_type)
-{
-	char* op = strtok(*str, ".");
-
-	if(op && strcmp(op, "sem_inc") == 0)
-	{
-		*type = QPU_SEM;
-		*is_sem_inc = 1;
-	}
-	else if(op && strcmp(op, "sem_dec") == 0)
-	{
-		*type = QPU_SEM;
-		*is_sem_inc = 0;
-	}
-	else if(op && strcmp(op, "branch") == 0)
-	{
-		*type = QPU_BRANCH;
-	}
-	else if(op && strcmp(op, "load32") == 0)
-	{
-		*type = QPU_LOAD_IMM;
-		*load_type = QPU_LOAD32;
-	}
-	else if(op && strcmp(op, "load16") == 0)
-	{
-		*type =	QPU_LOAD_IMM;
-		*load_type = QPU_LOAD16;
-	}
-	else
-	{
-		*type = QPU_ALU;
-
-		unsigned num_add_ops = sizeof(qpu_op_add_str) / sizeof(const char *);
-		unsigned num_mul_ops = sizeof(qpu_op_mul_str) / sizeof(const char *);
-
-		for(unsigned c = 0; c < num_add_ops && op; ++c)
-		{
-			if(qpu_op_add_str[c] && strcmp(op, qpu_op_add_str[c]) == 0)
-			{
-				*op_add = c;
-				break;
-			}
-		}
-
-		for(unsigned c = 0; c < num_mul_ops && op; ++c)
-		{
-			if(qpu_op_mul_str[c] && strcmp(op, qpu_op_mul_str[c]) == 0)
-			{
-				*op_mul = c;
-				break;
-			}
-		}
-	}
-
-	if(op)
-	{
-		*str = op;
-	}
-
-	//advance token past op strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-void parse_args_alu(char** str, qpu_mux* in_a, qpu_mux* in_b, uint8_t* raddr_a, uint8_t* raddr_b, uint8_t is_si)
-{
-	char* arg = strtok(*str, " \n\v\f\r\t,");
-
-	unsigned num_muxes = sizeof(qpu_mux_str) / sizeof(const char *);
-	unsigned found = 0;
-
-	for(unsigned c = 0; c < num_muxes && arg; ++c)
-	{
-		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
-		{
-			*str = arg;
-			*in_a = c;
-			found = 1;
-			break;
-		}
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	for(unsigned c = 0; c < num_muxes && arg; ++c)
-	{
-		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
-		{
-			*str = arg;
-			*in_b = c;
-			break;
-		}
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		uint8_t raddr_a_res = 0;
-
-		for(unsigned d = 0; d < 52; ++d)
-		{
-			if(qpu_raddr_str[0][d] && strcmp(arg, qpu_raddr_str[0][d]) == 0)
-			{
-				raddr_a_res = d;
-				break;
-			}
-		}
-
-		if(!raddr_a_res && arg && arg[0] == 'r' && arg[1] == 'a')
-		{
-			raddr_a_res = strtol(arg+2, 0, 0);
-		}
-
-
-		*raddr_a = raddr_a_res;
-		*str = arg;
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		uint8_t raddr_b_res = 0;
-
-		for(unsigned c = 0; c < 2 && arg && !raddr_b_res; ++c)
-		{
-			for(unsigned d = 0; d < 52; ++d)
-			{
-				if(qpu_raddr_str[c][d] && strcmp(arg, qpu_raddr_str[c][d]) == 0)
-				{
-					raddr_b_res = d;
-					break;
-				}
-			}
-		}
-
-		if(!raddr_b_res && arg && arg[0] == 'r' && arg[1] == 'b')
-		{
-			raddr_b_res = strtol(arg+2, 0, 0);
-		}
-
-		if(is_si)
-		{
-			uint32_t si = strtol(arg, 0, 0);
-			raddr_b_res = qpu_encode_small_immediate(si);
-		}
-
-		*raddr_b = raddr_b_res;
-		*str = arg;
-	}
-
-	//advance token past arg strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-void parse_args_sem(char** str, uint8_t* sem, uint32_t* imm32)
-{
-	char* arg = strtok(*str, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		*sem = strtol(arg, 0, 0);
-		*str = arg;
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		*imm32 = strtol(arg, 0, 0);
-		*str = arg;
-	}
-
-	//advance token past arg strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-void parse_args_branch(char** str, uint32_t* imm32, qpu_branch_cond* branch_cond, uint8_t* raddr_a)
-{
-	char* arg = strtok(*str, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		*imm32 = strtol(arg, 0, 0);
-		*str = arg;
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		unsigned num_branch_conds = sizeof(qpu_branch_cond_str) / sizeof(const char *);
-
-		for(unsigned c = 0; c < num_branch_conds && arg; ++c)
-		{
-			if(qpu_branch_cond_str[c] && strcmp(arg, qpu_branch_cond_str[c]) == 0)
-			{
-				*branch_cond = c;
-				*str = arg;
-				break;
-			}
-		}
-	}
-
-	arg = strtok(0, " \n\v\f\r\t,");
-
-	if(arg)
-	{
-		uint8_t raddr_a_res = 0;
-
-		for(unsigned c = 0; c < 2 && arg && !raddr_a_res; ++c)
-		{
-			for(unsigned d = 0; d < 52; ++d)
-			{
-				if(qpu_raddr_str[c][d] && strcmp(arg, qpu_raddr_str[c][d]) == 0)
-				{
-					raddr_a_res = d;
-					break;
-				}
-			}
-		}
-
-		if(!raddr_a_res && arg && arg[0] == 'r' && arg[1] == 'a')
-		{
-			raddr_a_res = strtol(arg+2, 0, 0);
-		}
-
-		*raddr_a = raddr_a_res;
-		*str = arg;
-	}
-
-	//advance token past arg strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-void parse_args_load(char** str, qpu_load_type load_type, uint32_t* imm32, uint16_t* ms_imm16, uint16_t* ls_imm16)
-{
-	char* arg = strtok(*str, " \n\v\f\r\t,");
-
-	if(load_type == QPU_LOAD32)
-	{
-		if(arg)
-		{
-			*imm32 = strtol(arg, 0, 0);
-			*str = arg;
-		}
-	}
-	else
-	{
-		if(arg)
-		{
-			*ms_imm16 = strtol(arg, 0, 0);
-			*str = arg;
-		}
-
-		arg = strtok(0, " \n\v\f\r\t,");
-
-		if(arg)
-		{
-			*ls_imm16 = strtol(arg, 0, 0);
-			*str = arg;
-		}
-	}
-
-	//advance token past arg strings so we can tokenize further
-	while(**str)
-	{
-		(*str)++;
-	}
-
-	*str += 1;
-}
-
-
-void assemble_qpu_asm(char* str, uint64_t* instructions)
-{
-	unsigned instruction_counter = 0;
-
-	//delete lines that have comments in them
-	char* comment_token = strstr(str, "#");
-
-	while(comment_token)
-	{
-		while(*comment_token != '\n')
-		{
-			*comment_token = ' ';
-			comment_token++;
-		}
-		*comment_token = ' ';
-		comment_token = strstr(comment_token, "#");
-	}
-
-
-	//parse string token by token
-	char* token = strtok(str, " \n\v\f\r\t;");
-
-	while(token)
-	{
-		qpu_sig_bits sig_bit = QPU_SIG_NONE;
-		qpu_alu_type type = QPU_ALU;
-		qpu_op_add op_add =	QPU_A_NOP;
-		qpu_op_mul op_mul =	QPU_M_NOP;
-		qpu_mux mul_a = 0;
-		qpu_mux mul_b = 0;
-		qpu_mux add_a = 0;
-		qpu_mux add_b = 0;
-		qpu_cond cond_mul = QPU_COND_ALWAYS;
-		qpu_cond cond_add = QPU_COND_ALWAYS;
-		qpu_waddr waddr_add = QPU_W_NOP;
-		qpu_waddr waddr_mul = QPU_W_NOP;
-		qpu_waddr raddr_a = QPU_R_NOP;
-		qpu_waddr raddr_b = QPU_R_NOP;
-		uint8_t pack_unpack_select = 0;
-		uint8_t pack_mode = QPU_PACK_A_NOP;
-		qpu_unpack unpack_mode = QPU_UNPACK_NOP;
-		uint8_t is_sem_inc = 0;
-		uint8_t rel = 0;
-		uint8_t reg = 0;
-		uint8_t ws = 0;
-		uint8_t sf = 0;
-		uint32_t imm32 = 0;
-		uint16_t ms_imm16 = 0;
-		uint16_t ls_imm16 = 0;
-		uint8_t semaphore = 0;
-		qpu_load_type load_type = QPU_LOAD32;
-		uint8_t is_signed = 0;
-		qpu_branch_cond branch_cond = QPU_COND_BRANCH_ALWAYS;
-
-		sig_bit = parse_sig_bit(token);
-		if(sig_bit < 0)
-		{
-			break;
-		}
-
-		//get dst for add
-		token = strtok(0, " \n\v\f\r\t=;");
-		parse_dst(&token, &waddr_add, &pack_mode, 1, 0);
-
-		//check op
-		token = strtok(token, " \n\v\f\r\t=(");
-		unsigned has_modifiers = strstr(token, ".") != 0;
-		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc, &load_type);
-
-		//get modifiers
-		if(has_modifiers)
-		{
-			//token = strtok(token, " \n\v\f\r\t");
-			parse_op_modifiers(&token, &is_signed, &ws, &pack_unpack_select, &sf, &cond_add, &unpack_mode, &rel, &reg, 1);
-		}
-
-		if(type == QPU_ALU)
-		{
-			//get arguments for add
-			token = strtok(token, ")");
-			parse_args_alu(&token, &add_a, &add_b, &raddr_a, &raddr_b, sig_bit == QPU_SIG_SMALL_IMM);
-		}
-		else if(type == QPU_SEM)
-		{
-			//get arguments for sem
-			token = strtok(token, ")");
-			parse_args_sem(&token, &semaphore, &imm32);
-		}
-		else if(type == QPU_BRANCH)
-		{
-			//get arguments for branch
-			token = strtok(token, ")");
-			parse_args_branch(&token, &imm32, &branch_cond, &raddr_a);
-		}
-		else if(type == QPU_LOAD_IMM)
-		{
-			//get arguments for load imm
-			token = strtok(token, ")");
-			parse_args_load(&token, load_type, &imm32, &ms_imm16, &ls_imm16);
-		}
-
-		//get dst for mul
-		token = strtok(token, " \n\v\f\r\t=;");
-		parse_dst(&token, &waddr_mul, &pack_mode, 0, pack_unpack_select);
-
-		//check op
-		token = strtok(token, " \n\v\f\r\t=(");
-		has_modifiers = strstr(token, ".") != 0;
-		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc, &load_type);
-
-		//get modifiers
-		if(has_modifiers)
-		{
-			//token = strtok(token, " \n\v\f\r\t(");
-			parse_op_modifiers(&token, &is_signed, &ws, &pack_unpack_select, &sf, &cond_mul, &unpack_mode, &rel, &reg, 0);
-		}
-
-		token = strtok(token, ")");
-
-		if(type == QPU_ALU)
-		{
-			//get arguments for mul
-			parse_args_alu(&token, &mul_a, &mul_b, &raddr_a, &raddr_b, sig_bit == QPU_SIG_SMALL_IMM);
-		}
-
-		//EMIT INSTRUCTION HERE
-		if(type == QPU_ALU)
-		{
-			if(sig_bit == QPU_SIG_SMALL_IMM)
-			{
-				instructions[instruction_counter] = encode_alu_small_imm(unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_a, raddr_b, add_a, add_b, mul_a, mul_b);
-			}
-			else
-			{
-				instructions[instruction_counter] = encode_alu(sig_bit, unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_a, raddr_b, add_a, add_b, mul_a, mul_b);
-			}
-		}
-		else if(type == QPU_SEM)
-		{
-			instructions[instruction_counter] = encode_semaphore(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, is_sem_inc, semaphore, imm32);
-		}
-		else if(type ==	QPU_BRANCH)
-		{
-			instructions[instruction_counter] = encode_branch(branch_cond, rel, reg, raddr_a, ws, waddr_add, waddr_mul, imm32);
-		}
-		else if(type == QPU_LOAD_IMM)
-		{
-			if(load_type ==	QPU_LOAD32)
-			{
-				instructions[instruction_counter] = encode_load_imm(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, imm32);
-			}
-			else
-			{
-				instructions[instruction_counter] = encode_load_imm_per_elem(is_signed, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, ms_imm16, ls_imm16);
-			}
-		}
-
-		instruction_counter++;
-		token = strtok(token, " \n\v\f\r\t;");
-	}
-}
-
-void disassemble_qpu_asm(uint64_t instruction)
-{
-#define GET_BITFIELD(num_bits, place) (((instruction) & ((uint64_t)num_bits << place)) >> place)
-
-	qpu_sig_bits sig_bits = GET_BITFIELD(0xf, 60);
-	printf("%s ; ", qpu_sig_bits_str[sig_bits]);
-
-	unsigned is_sem = GET_BITFIELD(0x7f, 57) == 0x74;
-
-	qpu_waddr waddr_add = GET_BITFIELD(0x3f, QPU_WADDR_ADD_SHIFT);
-	qpu_waddr waddr_mul = GET_BITFIELD(0x3f, QPU_WADDR_MUL_SHIFT);
-	uint8_t ws = GET_BITFIELD(1, 44);
-	uint8_t pm = GET_BITFIELD(1, 56);
-
-	if(waddr_add <= 31)
-	{
-		printf("rx%d", waddr_add);
-	}
-	else
-	{
-		printf("%s", qpu_waddr_str[ws][waddr_add]);
-	}
-
-	if(is_sem)
-	{
-		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
-
-		if(!ws && !pm)
-		{
-			printf(".%s", qpu_pack_a_str[pack_mode]);
-		}
-
-		uint8_t is_sem_inc = GET_BITFIELD(1, 4);
-
-		printf(" = %s", is_sem_inc ? "sem_inc" : "sem_dec");
-
-		if(ws)
-		{
-			printf(".ws");
-		}
-
-		if(pm)
-		{
-			printf(".pm");
-		}
-
-		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
-
-		printf(".%s", qpu_cond_str[cond_add]);
-
-		uint8_t sf = GET_BITFIELD(1, 45);
-
-		if(sf)
-		{
-			printf(".sf");
-		}
-
-		uint8_t sem = GET_BITFIELD(0xf, 0);
-
-		uint32_t imm_val = GET_BITFIELD(0x7ffffff, 5);
-
-		printf("(%d, %#x) ; ", sem, imm_val);
-
-		if(waddr_mul <= 31)
-		{
-			printf("rx%d", waddr_mul);
-		}
-		else
-		{
-			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
-		}
-
-		if(pm)
-		{
-			printf(".%s", qpu_pack_mul_str[pack_mode]);
-		}
-
-		printf(" = %s", is_sem_inc ? "sem_inc" : "sem_dec");
-
-		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
-
-		printf(".%s() ;", qpu_cond_str[cond_mul]);
-	}
-	else if(!is_sem && sig_bits == QPU_SIG_LOAD_IMM)
-	{
-		qpu_load_type load_type = GET_BITFIELD(0x7f, 57) != 0x70;
-
-		uint8_t is_signed = !GET_BITFIELD(1, 58);
-
-		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
-
-		if(!ws && !pm)
-		{
-			printf(".%s", qpu_pack_a_str[pack_mode]);
-		}
-
-		if(load_type == QPU_LOAD32)
-		{
-			printf(" = load32");
-		}
-		else
-		{
-			printf(" = load16");
-		}
-
-		if(ws)
-		{
-			printf(".ws");
-		}
-
-		if(pm)
-		{
-			printf(".pm");
-		}
-
-		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
-
-		printf(".%s", qpu_cond_str[cond_add]);
-
-		uint8_t sf = GET_BITFIELD(1, 45);
-
-		if(sf)
-		{
-			printf(".sf");
-		}
-
-		if(load_type == QPU_LOAD32)
-		{
-			uint32_t imm = GET_BITFIELD(0xffffffff, 0);
-
-			printf("(%#x) ; ", imm);
-		}
-		else
-		{
-			if(is_signed)
-			{
-				printf(".signed");
-			}
-
-			uint16_t ms_imm = GET_BITFIELD(0xffff, 16);
-			uint16_t ls_imm = GET_BITFIELD(0xffff, 0);
-			printf(is_signed ? "(%#x, %#x) ; " : "(%#x, %#x) ; ", ms_imm, ls_imm);
-		}
-
-		if(waddr_mul <= 31)
-		{
-			printf("rx%d", waddr_mul);
-		}
-		else
-		{
-			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
-		}
-
-		if(load_type == QPU_LOAD32)
-		{
-			printf(" = load32");
-		}
-		else
-		{
-			printf(" = load16");
-		}
-
-		if(pm)
-		{
-			printf(".%s", qpu_pack_mul_str[pack_mode]);
-		}
-
-		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
-
-		printf(".%s() ;", qpu_cond_str[cond_mul]);
-	}
-	else if(!is_sem && sig_bits == QPU_SIG_BRANCH)
-	{
-		printf(" = branch");
-
-		if(ws)
-		{
-			printf(".ws");
-		}
-
-		uint8_t is_relative = GET_BITFIELD(1, 51);
-
-		if(is_relative)
-		{
-			printf(".rel");
-		}
-
-		uint8_t use_addr_a = GET_BITFIELD(1, 50);
-
-		if(use_addr_a)
-		{
-			printf(".reg");
-		}
-
-		uint32_t imm = GET_BITFIELD(0xffffffff, 0);
-		qpu_branch_cond branch_cond = GET_BITFIELD(0xf, QPU_BRANCH_COND_SHIFT);
-		qpu_raddr raddr_a = GET_BITFIELD(0x1f, QPU_BRANCH_RADDR_A_SHIFT);
-
-		printf("(%#x, %s, ", imm, qpu_branch_cond_str[branch_cond]);
-
-		if(raddr_a <= 31)
-		{
-			printf("ra%d", raddr_a);
-		}
-		else
-		{
-			printf("%s", qpu_raddr_str[0][raddr_a]);
-		}
-
-		printf(") ; ");
-
-		if(waddr_mul <= 31)
-		{
-			printf("rx%d", waddr_mul);
-		}
-		else
-		{
-			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
-		}
-
-		printf(" = branch() ;");
-	}
-	else
-	{
-		//ALU
-		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
-
-		if(!pm)
-		{
-			printf(".%s", qpu_pack_a_str[pack_mode]);
-		}
-
-		qpu_op_add op_add = GET_BITFIELD(0x1f, QPU_OP_ADD_SHIFT);
-
-		printf(" = %s", qpu_op_add_str[op_add]);
-
-		if(ws)
-		{
-			printf(".ws");
-		}
-
-		if(pm)
-		{
-			printf(".pm");
-		}
-
-		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
-
-		printf(".%s", qpu_cond_str[cond_add]);
-
-		uint8_t sf = GET_BITFIELD(1, 45);
-
-		if(sf)
-		{
-			printf(".sf");
-		}
-
-		qpu_unpack unpack_mode = GET_BITFIELD(0X7, QPU_UNPACK_SHIFT);
-
-		printf(".%s", qpu_unpack_str[unpack_mode]);
-
-		qpu_raddr raddr_a = GET_BITFIELD(0x3f, QPU_RADDR_A_SHIFT);
-		qpu_raddr raddr_b = GET_BITFIELD(0x3f, QPU_RADDR_B_SHIFT);
-
-		qpu_mux add_a = GET_BITFIELD(0x7, QPU_ADD_A_SHIFT);
-		qpu_mux add_b = GET_BITFIELD(0x7, QPU_ADD_B_SHIFT);
-
-		printf("(");
-
-		printf("%s, %s, ", qpu_mux_str[add_a], qpu_mux_str[add_b]);
-
-		if(raddr_a <= 31)
-		{
-			printf("ra%i", raddr_a);
-		}
-		else
-		{
-			printf("%s", qpu_raddr_str[0][raddr_a]);
-		}
-
-		printf(", ");
-
-		if(sig_bits == QPU_SIG_SMALL_IMM)
-		{
-			if(raddr_b < 16)
-			{
-				printf("%i", raddr_b);
-			}
-			else if(raddr_b < 32)
-			{
-				printf("%i", raddr_b - 32);
-			}
-			else
-			{
-				float val = raddr_b < 40 ? 1 << (raddr_b - 32) : 1.0f / (float)(1 << (48 - raddr_b));
-				printf("%#x", *(uint32_t*)&val);
-			}
-		}
-		else
-		{
-			if(raddr_b <= 31)
-			{
-				printf("rb%i", raddr_b);
-			}
-			else
-			{
-				printf("%s", qpu_raddr_str[1][raddr_b]);
-			}
-		}
-
-		printf(") ; ");
-
-		if(waddr_mul <= 31)
-		{
-			printf("rx%d", waddr_mul);
-		}
-		else
-		{
-			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
-		}
-
-		if(pm)
-		{
-			printf(".%s", qpu_pack_mul_str[pack_mode]);
-		}
-
-		qpu_op_mul op_mul = GET_BITFIELD(0x7, QPU_OP_MUL_SHIFT);
-
-		printf(" = %s", qpu_op_mul_str[op_mul]);
-
-		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
-
-		printf(".%s", qpu_cond_str[cond_mul]);
-
-		qpu_mux mul_a = GET_BITFIELD(0x7, QPU_MUL_A_SHIFT);
-		qpu_mux mul_b = GET_BITFIELD(0x7, QPU_MUL_B_SHIFT);
-
-		printf("(%s, %s) ; ", qpu_mux_str[mul_a], qpu_mux_str[mul_b]);
-	}
-
-	printf("\n");
-}
+#include "qpu_assembler.h"
 
 int main()
 {
diff --git a/QPUassembler/qpu_assembler.c b/QPUassembler/qpu_assembler.c
new file mode 100644
index 0000000..0446fee
--- /dev/null
+++ b/QPUassembler/qpu_assembler.c
@@ -0,0 +1,1442 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vc4_qpu_defines.h"
+
+/*********************************************************************************************************************
+	Instruction restrictions
+
+	* The last three instructions of any program (Thread End plus the following two delay-slot instructions) must
+		not do varyings read, uniforms read or any kind of VPM, VDR, or VDW read or write.
+	* The Program End instruction must not write to either physical regfile A or B.
+	* The Program End instruction and the following two delay slot instructions must not write or read address 14
+		in either regfile A or B.
+	* The final program instruction (the second delay slot instruction) must not do a TLB Z write.
+	* A scoreboard wait must not occur in the first two instructions of a fragment shader. This is either the
+		explicit Wait for Scoreboard signal or an implicit wait with the first tile-buffer read or write instruction.
+	* If TMU_NOSWAP is written, the write must be three instructions before the first TMU write instruction.
+		For example, if TMU_NOSWAP is written in the first shader instruction, the first TMU write cannot occur
+		before the 4th shader instruction.
+	* An instruction must not read from a location in physical regfile A or B that was written to by the previous
+		instruction.
+	* After an SFU lookup instruction, accumulator r4 must not be read in the following two instructions. Any
+		other instruction that results in r4 being written (that is, TMU read, TLB read, SFU lookup) cannot occur in
+		the two instructions following an SFU lookup.
+	* An instruction that does a vector rotate by r5 must not immediately follow an instruction that writes to r5.
+	* An instruction that does a vector rotate must not immediately follow an instruction that writes to the
+		accumulator that is being rotated.
+	* After an instruction that does a TLB Z write, the multisample mask must not be read as an instruction
+		input argument in the following two instructions. The TLB Z write instruction can, however, be followed
+		immediately by a TLB color write.
+	* A single instruction can only perform a maximum of one of the following closely coupled peripheral
+		accesses in a single instruction: TMU write, TMU read, TLB write, TLB read, TLB combined color read and
+		write, SFU write, Mutex read or Semaphore access.
+ *********************************************************************************************************************/
+
+/*
+Format:
+#comment
+sig_bit_opt		; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, raddr_b_opt)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
+sig_small_imm	; dstAdd.pack_opt	= add_op.pm_opt.sf_opt.cond.unpack_opt.ws_opt(srcA, srcB, raddr_a_opt, small_imm)	; dstMul.pack_opt	= mul_op.cond(srcA, srcB)	;
+sig_branch		; dstAdd			= branch.rel_opt.reg_opt.ws_opt(address, condition, raddr_a_opt)					; dstMul			= branch()					;
+sig_load_imm	; dstAdd.pack_opt	= sem_inc.pm_opt.sf_opt.cond.ws_opt(sem_number, 27bit_imm_opt)						; dstMul.pack_opt	= sem_inc.cond()			;
+sig_load_imm	; dstAdd.pack_opt	= load32.pm_opt.sf_opt.cond.ws_opt(immediate32bit_value)							; dstMul.pack_opt	= load32.cond()				;
+sig_load_imm	; dstAdd.pack_opt	= load16.pm_opt.signed_opt.sf_opt.cond.ws_opt(int16_imm, int16_imm)					; dstMul.pack_opt	= load16.cond()				;
+
+==================================================================
+================How to formulate instructions:====================
+==================================================================
+1)
+You must specify the signal bits at the beginning of each instruction:
+sig_brk, sig_none, sig_switch, sig_end, sig_wait_score, sig_unlock_score, sig_thread_switch, sig_coverage_load,
+sig_color_load, sig_color_load_end, sig_load_tmu0, sig_load_tmu1, sig_alpha_mask_load, sig_small_imm, sig_load_imm, sig_branch
+
+2)
+Then you must specify the output register for the ADD pipeline.
+rx0-31, r0-3, r5, tmu_noswap, host_int, nop, uniforms_addr, quad_x, quad_y, ms_flags, rev_flags, tlb_stencil_setup,
+tlb_z, tlb_color_ms, tlb_color_all, vpm, vr_setup, vr_addr, mutex_release, sfu_recip, sfu_recipsqrt, sfu_exp,
+sfu_log, tmu0_s, tmu0_t, tmu0_r, tmu0_b, tmu1_s, tmu1_t, tmu1_r, tmu1_b
+
+3)
+If the ADD instruction writes to regfile A (i.e. you don't specify the WS flag later) and the PM flag is not specified,
+then you can specify the pack mode for regfile A here (omitting it means nop):
+nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
+
+4)
+Then you must specify your operation for the ADD pipeline. If you are writing a non-ALU instruction, you can specify either
+branch, sem_inc, sem_dec, load32 or load16 here instead.
+Operations available:
+nop, fadd, fsub, fmin, fmax, fminabs, fmaxabs, ftoi, itof, add, sub, shr, asr, ror, shl, min, max, and, or, xor, not, clz, v8adds, v8subs
+
+5)
+Then you can specify a range of modifiers (order is not important):
+PM bit: pm
+SF bit: sf
+WS bit: ws
+REL bit: rel
+REG bit: reg
+SIGNED bit: signed
+Conditional execution for the ADD pipeline: never, always, zs, zc, ns, nc, cs, cc
+Unpack modes (from regfile A, or if PM is set from R4): nop, 16a, 16b, 8d_rep, 8a, 8b, 8c, 8d
+
+6)
+Then you must specify the arguments for the ALU operation.
+srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
+raddr_a and raddr_b can be specified afterwards as optional extra arguments (omitting means nop).
+raddr_a: ra0-31, pay_zw, uni, vary, elem, nop, x_pix, ms_flags, vpm_read, vpm_ld_busy, vpm_ld_wait, mutex_acq
+raddr_b: rb0-31, pay_zw, uni, vary, elem, nop, y_pix, rev_flag, vpm_read, vpm_st_busy, vpm_st_wait, mutex_acq
+
+For branch operation, you must specify:
+the jump address as a 32bit value (can be relative if REL is set)
+the branch condition: all_zs, all_zc, any_zs, any_zc, all_ns, all_nc, any_ns, any_nc, all_cs, all_cc, any_cs, any_cc, always
+and an optional raddr_a (if REG flag is set), see above
+
+For a semaphore instruction, you need to specify which semaphore (0-15) you want to modify, then an optional 27bit immediate value (ms 16bits might be usable...).
+
+7)
+Then you must specify the output register for the MUL pipeline.
+See above for options.
+
+8)
+If the MUL instruction writes to regfile A (ie. you specify the WS flag) then you can set the pack operation for regfile A here:
+nop, 16a, 16b, 8888, 8a, 8b, 8c, 8d, sta, 16a.sat, 16b.sat, 8888.sat, 8a.sat, 8b.sat, 8c.sat, 8d.sat
+OR
+If you specify the PM flag, then you can set the pack operation for the MUL output here:
+nop, 8888, 8a, 8b, 8c, 8d
+
+9)
+Then you must specify your operation for the MUL pipeline. If you are writing a non-ALU instruction, you can specify either
+branch, sem_inc, sem_dec, load32 or load16 here instead.
+Operations available:
+nop, fmul, mul24, v8muld, v8min, v8max, v8adds, v8subs
+
+10)
+Then you can specify a range of modifiers (order is not important):
+Conditional execution for the MUL pipeline: never, always, zs, zc, ns, nc, cs, cc
+
+11)
+Then you must specify the arguments for the ALU operation.
+srcA, srcB can be: r0-r5 or a, b for regfiles A and B, or imm for the small immediate value.
+
+==================================================================
+==================================================================
+
+
+dstAdd: rx0-31, r0-5, special regs
+raddr_a_opt: ra0-31, r0-5, special regs
+raddr_b_opt: rb0-31, r0-5, special regs
+
+Examples:
+sig_none		; rx0.nop			= add.pm.sf.always(r0, r1, 0)														; rx0.nop					= fmul.always(r2, r3)	;
+sig_branch		; rx0				= branch.rel.reg(0xdeadbeef, always, ra1)											; rx0						= branch()				;
+sig_load_imm	; rx0.nop			= sem_inc.pm.sf.always(1, 0x7ffffff)												; rx0.nop					= sem_inc.always()		;
+sig_load_imm	; rx0.nop			= load32.pm.sf.always(0xdeadbeef)													; rx0.nop					= load32.always()		;
+sig_load_imm	; rx0.nop			= load16.pm.sf.signed.always(1, 2)													; rx0.nop					= load16.always()		;
+#mov
+sig_none		; rx0.nop			= or(r0, r0)																		; rx0						= v8min(r1, r1)			;
+ */
+
+uint64_t encode_alu(qpu_sig_bits sig_bits,
+					qpu_unpack unpack_mode,
+					//If the pm bit is set, the unpack field programs the r4 unpack unit,
+					//and the pack field is used to program the color
+					//conversion on the output of the mul unit
+					uint8_t pack_unpack_select,
+					uint8_t pack_mode,
+					qpu_cond add_cond,
+					qpu_cond mul_cond,
+					uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
+					uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
+					qpu_waddr waddr_add,
+					qpu_waddr waddr_mul,
+					qpu_op_mul op_mul,
+					qpu_op_add op_add,
+					qpu_raddr raddr_a,
+					qpu_raddr raddr_b,
+					qpu_mux add_a,
+					qpu_mux add_b,
+					qpu_mux mul_a,
+					qpu_mux mul_b
+					)
+{
+	uint64_t res = 0;
+	uint64_t tmp = 0;
+
+	tmp = sig_bits & 0xf; //mask ls 4 bits
+	res |= tmp << QPU_SIG_SHIFT;
+
+	tmp = unpack_mode & 0x7; //mask ls 3 bits
+	res |= tmp << QPU_UNPACK_SHIFT;
+
+	tmp = pack_unpack_select & 1;
+	res |= tmp << 56;
+
+	tmp = pack_mode & 0xf;
+	res |= tmp << QPU_PACK_SHIFT;
+
+	tmp = add_cond & 0x7;
+	res |= tmp << QPU_COND_ADD_SHIFT;
+
+	tmp = mul_cond & 0x7;
+	res |= tmp << QPU_COND_MUL_SHIFT;
+
+	tmp = set_flags & 1;
+	res |= tmp << 45;
+
+	tmp = write_swap_flag & 1;
+	res |= tmp << 44;
+
+	tmp = waddr_add & 0x3f;
+	res |= tmp << QPU_WADDR_ADD_SHIFT;
+
+	tmp = waddr_mul & 0x3f;
+	res |= tmp << QPU_WADDR_MUL_SHIFT;
+
+	tmp = op_mul & 0x7;
+	res |= tmp << QPU_OP_MUL_SHIFT;
+
+	tmp = op_add & 0x1f;
+	res |= tmp << QPU_OP_ADD_SHIFT;
+
+	tmp = raddr_a & 0x3f;
+	res |= tmp << QPU_RADDR_A_SHIFT;
+
+	tmp = raddr_b & 0x3f;
+	res |= tmp << QPU_RADDR_B_SHIFT;
+
+	tmp = add_a & 0x7;
+	res |= tmp << QPU_ADD_A_SHIFT;
+
+	tmp = add_b & 0x7;
+	res |= tmp << QPU_ADD_B_SHIFT;
+
+	tmp = mul_a & 0x7;
+	res |= tmp << QPU_MUL_A_SHIFT;
+
+	tmp = mul_b & 0x7;
+	res |= tmp << QPU_MUL_B_SHIFT;
+
+	return res;
+}
+
+uint64_t encode_alu_small_imm(qpu_unpack unpack_mode,
+							  uint8_t pack_unpack_select,
+							  uint8_t pack_mode,
+							  qpu_cond add_cond,
+							  qpu_cond mul_cond,
+							  uint8_t set_flags, //Flags are updated from the add ALU unless the add ALU performed a NOP (or its condition code was NEVER) in which case flags are updated from the mul ALU
+							  uint8_t write_swap_flag, //0: add writes to A, mul to B, 1: add writes to B, mul to A
+							  qpu_waddr waddr_add,
+							  qpu_waddr waddr_mul,
+							  qpu_op_mul op_mul,
+							  qpu_op_add op_add,
+							  qpu_raddr raddr_a,
+							  uint8_t small_imm,
+							  qpu_mux add_a,
+							  qpu_mux add_b,
+							  qpu_mux mul_a,
+							  qpu_mux mul_b
+		)
+{
+	return encode_alu(0xd,
+					  unpack_mode,
+					  pack_unpack_select,
+					  pack_mode,
+					  add_cond,
+					  mul_cond,
+					  set_flags,
+					  write_swap_flag,
+					  waddr_add,
+					  waddr_mul,
+					  op_mul,
+					  op_add,
+					  raddr_a,
+					  small_imm,
+					  add_a,
+					  add_b,
+					  mul_a,
+					  mul_b);
+}
+
+uint64_t encode_branch(qpu_branch_cond branch_cond,
+					   uint8_t is_relative, //if set branch target is relative to PC+4
+					   uint8_t use_raddr_a, //if set add value of raddr_a (from simd elem 0) to branch target
+					   qpu_raddr raddr_a,
+					   uint8_t write_swap_bit,
+					   qpu_waddr waddr_add,
+					   qpu_waddr waddr_mul,
+					   uint32_t imm //always added to branch target, set to 0 if unused
+					   )
+{
+	uint64_t res = 0;
+	uint64_t tmp = 0;
+
+	tmp = 0xf;
+	res |= tmp << 60;
+
+	tmp = branch_cond & 0xf;
+	res |= tmp << QPU_BRANCH_COND_SHIFT;
+
+	tmp = is_relative & 1;
+	res |= tmp << 51;
+
+	tmp = use_raddr_a & 1;
+	res |= tmp << 50;
+
+	tmp = raddr_a & 0x1f;
+	res |= tmp << QPU_BRANCH_RADDR_A_SHIFT;
+
+	tmp = write_swap_bit & 1;
+	res |= tmp << 44;
+
+	tmp = waddr_add & 0x3f;
+	res |= tmp << QPU_WADDR_ADD_SHIFT;
+
+	tmp = waddr_mul & 0x3f;
+	res |= tmp << QPU_WADDR_MUL_SHIFT;
+
+	res |= imm;
+
+	return res;
+}
+
+uint64_t encode_semaphore(uint8_t pack_unpack_select,
+						  uint8_t pack_mode,
+						  qpu_cond cond_add,
+						  qpu_cond cond_mul,
+						  uint8_t set_flags,
+						  uint8_t write_swap,
+						  qpu_waddr waddr_add,
+						  qpu_waddr waddr_mul,
+						  uint8_t incr_sem, //if 1 increment semaphore
+						  uint8_t sem, //4 bit semaphore selector
+						  uint32_t imm_val //27bit immediate value loaded into all 16 simd elements
+						  )
+{
+	uint64_t res = 0;
+	uint64_t tmp = 0;
+
+	tmp = 0x74;
+	res |= tmp << 57;
+
+	tmp = pack_unpack_select & 1;
+	res |= tmp << 56;
+
+	tmp = pack_mode & 0xf;
+	res |= tmp << QPU_PACK_SHIFT;
+
+	tmp = cond_add & 0x7;
+	res |= tmp << QPU_COND_ADD_SHIFT;
+
+	tmp = cond_mul & 0x7;
+	res |= tmp << QPU_COND_MUL_SHIFT;
+
+	tmp = set_flags & 1;
+	res |= tmp << 45;
+
+	tmp = write_swap & 1;
+	res |= tmp << 44;
+
+	tmp = waddr_add & 0x3f;
+	res |= tmp << QPU_WADDR_ADD_SHIFT;
+
+	tmp = waddr_mul & 0x3f;
+	res |= tmp << QPU_WADDR_MUL_SHIFT;
+
+	tmp = imm_val & 0x7ffffff;
+	res |= tmp << 5;
+
+	tmp = incr_sem & 1;
+	res |= tmp << 4;
+
+	res |= sem & 0xf;
+
+	return res;
+}
+
+//write immediate value across simd array
+uint64_t encode_load_imm(uint8_t pack_unpack_select,
+						 uint8_t pack_mode,
+						 qpu_cond cond_add,
+						 qpu_cond cond_mul,
+						 uint8_t set_flags,
+						 uint8_t write_swap,
+						 qpu_waddr waddr_add,
+						 qpu_waddr waddr_mul,
+						 uint32_t imm //2x16bit or 1x32bit uint
+		)
+{
+	uint64_t res = 0;
+	uint64_t tmp = 0;
+
+	tmp = 0x70;
+	res |= tmp << 57;
+
+	tmp = pack_unpack_select & 1;
+	res |= tmp << 56;
+
+	tmp = pack_mode & 0xf;
+	res |= tmp << QPU_PACK_SHIFT;
+
+	tmp = cond_add & 0x7;
+	res |= tmp << QPU_COND_ADD_SHIFT;
+
+	tmp = cond_mul & 0x7;
+	res |= tmp << QPU_COND_MUL_SHIFT;
+
+	tmp = set_flags & 1;
+	res |= tmp << 45;
+
+	tmp = write_swap & 1;
+	res |= tmp << 44;
+
+	tmp = waddr_add & 0x3f;
+	res |= tmp << QPU_WADDR_ADD_SHIFT;
+
+	tmp = waddr_mul & 0x3f;
+	res |= tmp << QPU_WADDR_MUL_SHIFT;
+
+	res |= imm;
+
+	return res;
+}
+
+//write per element MS bit and LS bit across simd array
+uint64_t encode_load_imm_per_elem(
+						 uint8_t signed_or_unsigned, //1 for signed, 0 for unsigned
+						 uint8_t pack_unpack_select,
+						 uint8_t pack_mode,
+						 qpu_cond cond_add,
+						 qpu_cond cond_mul,
+						 uint8_t set_flags,
+						 uint8_t write_swap,
+						 qpu_waddr waddr_add,
+						 qpu_waddr waddr_mul,
+						 uint16_t ms_bit, //per element MS (sign) bit
+						 uint16_t ls_bit //per element LS bit
+		)
+{
+	uint64_t res = 0;
+	uint64_t tmp = 0;
+
+	tmp = 0x71;
+	tmp |= !signed_or_unsigned << 1;
+	res |= tmp << 57;
+
+	tmp = pack_unpack_select & 1;
+	res |= tmp << 56;
+
+	tmp = pack_mode & 0xf;
+	res |= tmp << QPU_PACK_SHIFT;
+
+	tmp = cond_add & 0x7;
+	res |= tmp << QPU_COND_ADD_SHIFT;
+
+	tmp = cond_mul & 0x7;
+	res |= tmp << QPU_COND_MUL_SHIFT;
+
+	tmp = set_flags & 1;
+	res |= tmp << 45;
+
+	tmp = write_swap & 1;
+	res |= tmp << 44;
+
+	tmp = waddr_add & 0x3f;
+	res |= tmp << QPU_WADDR_ADD_SHIFT;
+
+	tmp = waddr_mul & 0x3f;
+	res |= tmp << QPU_WADDR_MUL_SHIFT;
+
+	tmp = ms_bit;
+	res |= tmp << 16;
+
+	res |= ls_bit;
+
+	return res;
+}
+
+qpu_sig_bits parse_sig_bit(char* str)
+{
+	unsigned num_sig_bits = sizeof(qpu_sig_bits_str) / sizeof(const char *);
+
+	for(unsigned c = 0; c < num_sig_bits && str; ++c)
+	{
+		if(qpu_sig_bits_str[c] && strcmp(str, qpu_sig_bits_str[c]) == 0)
+		{
+			return c;
+		}
+	}
+
+	return -1;
+}
+
+void parse_dst(char** str, qpu_waddr* waddr, uint8_t* pack_mode, unsigned is_add, unsigned pm_set)
+{
+	char* dst = strtok(*str, ".");
+	char* pack = strtok(0, ".");
+
+	//advance token past dst strings so we can tokenize further
+	if(dst)
+	{
+		if(pack)
+		{
+			*str = pack;
+		}
+		else
+		{
+			*str = dst;
+		}
+
+		while(**str)
+		{
+			(*str)++;
+		}
+
+		*str += 1;
+	}
+
+	uint8_t waddr_res = 0;
+	uint8_t pack_mode_res = 0;
+
+	for(unsigned c = 0; c < 2 && dst && !waddr_res; ++c)
+	{
+		for(unsigned d = 0; d < 64; ++d)
+		{
+			if(qpu_waddr_str[c][d] && strcmp(dst, qpu_waddr_str[c][d]) == 0)
+			{
+				waddr_res = d;
+				break;
+			}
+		}
+	}
+
+	if(dst && dst[0] == 'r' && dst[1] == 'x')
+	{
+		waddr_res = strtol(dst+2, 0, 0);
+	}
+
+	unsigned num_pack_a_str = sizeof(qpu_pack_a_str) / sizeof(const char *);
+	for(unsigned c = 0; c < num_pack_a_str && pack && !pack_mode_res; ++c)
+	{
+		if(qpu_pack_a_str[c] && strcmp(pack, qpu_pack_a_str[c]) == 0)
+		{
+			pack_mode_res = c;
+			break;
+		}
+	}
+
+	unsigned num_pack_mul_str = sizeof(qpu_pack_mul_str) / sizeof(const char *);
+	for(unsigned c = 0; c < num_pack_mul_str && pack && !pack_mode_res; ++c)
+	{
+		if(qpu_pack_mul_str[c] && strcmp(pack, qpu_pack_mul_str[c]) == 0)
+		{
+			pack_mode_res = c;
+			break;
+		}
+	}
+
+	*waddr = waddr_res;
+	if(is_add || pm_set)
+	{
+		*pack_mode = pack_mode_res;
+	}
+}
+
+void parse_op_modifiers(char** str, uint8_t* signed_or_unsigned, uint8_t* ws, uint8_t* pm, uint8_t* sf, qpu_cond* condition, qpu_unpack* unpack_mode, uint8_t* rel, uint8_t* reg, unsigned is_add)
+{
+	char* modifier = strtok(*str, ".");
+
+	//at most 5 modifiers supported
+	for(int c = 0; c < 5; ++c)
+	{
+		if(modifier)
+		{
+			*str = modifier;
+
+			if(strcmp(modifier, "pm") == 0 && is_add)
+			{
+				*pm = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(strcmp(modifier, "ws") == 0 && is_add)
+			{
+				*ws = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(strcmp(modifier, "rel") == 0 && is_add)
+			{
+				*rel = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(strcmp(modifier, "reg") == 0 && is_add)
+			{
+				*reg = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(strcmp(modifier, "sf") == 0 && is_add)
+			{
+				*sf = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(strcmp(modifier, "signed") == 0 && is_add)
+			{
+				*signed_or_unsigned = 1;
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			unsigned found = 0;
+			unsigned num_conds = sizeof(qpu_cond_str) / sizeof(const char *);
+
+			for(unsigned d = 0; d < num_conds; ++d)
+			{
+				if(qpu_cond_str[d] && strcmp(modifier, qpu_cond_str[d]) == 0)
+				{
+					*condition = d;
+					found = 1;
+					break;
+				}
+			}
+
+			if(found)
+			{
+				modifier = strtok(0, ".");
+				continue;
+			}
+
+			if(is_add)
+			{
+				unsigned num_unpack_modes = sizeof(qpu_unpack_str) / sizeof(const char *);
+
+				for(unsigned d = 0; d < num_unpack_modes; ++d)
+				{
+					if(qpu_unpack_str[d] && strcmp(modifier, qpu_unpack_str[d]) == 0)
+					{
+						*unpack_mode = d;
+						break;
+					}
+				}
+			}
+
+			modifier = strtok(0, ".");
+		}
+	}
+
+	//advance token past op strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+void parse_op(char** str, qpu_alu_type* type, qpu_op_add* op_add, qpu_op_mul* op_mul, uint8_t* is_sem_inc, qpu_load_type* load_type)
+{
+	char* op = strtok(*str, ".");
+
+	if(op && strcmp(op, "sem_inc") == 0)
+	{
+		*type = QPU_SEM;
+		*is_sem_inc = 1;
+	}
+	else if(op && strcmp(op, "sem_dec") == 0)
+	{
+		*type = QPU_SEM;
+		*is_sem_inc = 0;
+	}
+	else if(op && strcmp(op, "branch") == 0)
+	{
+		*type = QPU_BRANCH;
+	}
+	else if(op && strcmp(op, "load32") == 0)
+	{
+		*type = QPU_LOAD_IMM;
+		*load_type = QPU_LOAD32;
+	}
+	else if(op && strcmp(op, "load16") == 0)
+	{
+		*type =	QPU_LOAD_IMM;
+		*load_type = QPU_LOAD16;
+	}
+	else
+	{
+		*type = QPU_ALU;
+
+		unsigned num_add_ops = sizeof(qpu_op_add_str) / sizeof(const char *);
+		unsigned num_mul_ops = sizeof(qpu_op_mul_str) / sizeof(const char *);
+
+		for(unsigned c = 0; c < num_add_ops && op; ++c)
+		{
+			if(qpu_op_add_str[c] && strcmp(op, qpu_op_add_str[c]) == 0)
+			{
+				*op_add = c;
+				break;
+			}
+		}
+
+		for(unsigned c = 0; c < num_mul_ops && op; ++c)
+		{
+			if(qpu_op_mul_str[c] && strcmp(op, qpu_op_mul_str[c]) == 0)
+			{
+				*op_mul = c;
+				break;
+			}
+		}
+	}
+
+	if(op)
+	{
+		*str = op;
+	}
+
+	//advance token past op strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+void parse_args_alu(char** str, qpu_mux* in_a, qpu_mux* in_b, uint8_t* raddr_a, uint8_t* raddr_b, uint8_t is_si)
+{
+	char* arg = strtok(*str, " \n\v\f\r\t,");
+
+	unsigned num_muxes = sizeof(qpu_mux_str) / sizeof(const char *);
+	unsigned found = 0;
+
+	for(unsigned c = 0; c < num_muxes && arg; ++c)
+	{
+		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
+		{
+			*str = arg;
+			*in_a = c;
+			found = 1;
+			break;
+		}
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	for(unsigned c = 0; c < num_muxes && arg; ++c)
+	{
+		if(qpu_mux_str[c] && strcmp(arg, qpu_mux_str[c]) == 0)
+		{
+			*str = arg;
+			*in_b = c;
+			break;
+		}
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		uint8_t raddr_a_res = 0;
+
+		for(unsigned d = 0; d < 52; ++d)
+		{
+			if(qpu_raddr_str[0][d] && strcmp(arg, qpu_raddr_str[0][d]) == 0)
+			{
+				raddr_a_res = d;
+				break;
+			}
+		}
+
+		if(!raddr_a_res && arg && arg[0] == 'r' && arg[1] == 'a')
+		{
+			raddr_a_res = strtol(arg+2, 0, 0);
+		}
+
+
+		*raddr_a = raddr_a_res;
+		*str = arg;
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		uint8_t raddr_b_res = 0;
+
+		for(unsigned c = 0; c < 2 && arg && !raddr_b_res; ++c)
+		{
+			for(unsigned d = 0; d < 52; ++d)
+			{
+				if(qpu_raddr_str[c][d] && strcmp(arg, qpu_raddr_str[c][d]) == 0)
+				{
+					raddr_b_res = d;
+					break;
+				}
+			}
+		}
+
+		if(!raddr_b_res && arg && arg[0] == 'r' && arg[1] == 'b')
+		{
+			raddr_b_res = strtol(arg+2, 0, 0);
+		}
+
+		if(is_si)
+		{
+			uint32_t si = strtol(arg, 0, 0);
+			raddr_b_res = qpu_encode_small_immediate(si);
+		}
+
+		*raddr_b = raddr_b_res;
+		*str = arg;
+	}
+
+	//advance token past arg strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+void parse_args_sem(char** str, uint8_t* sem, uint32_t* imm32)
+{
+	char* arg = strtok(*str, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		*sem = strtol(arg, 0, 0);
+		*str = arg;
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		*imm32 = strtol(arg, 0, 0);
+		*str = arg;
+	}
+
+	//advance token past arg strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+void parse_args_branch(char** str, uint32_t* imm32, qpu_branch_cond* branch_cond, uint8_t* raddr_a)
+{
+	char* arg = strtok(*str, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		*imm32 = strtol(arg, 0, 0);
+		*str = arg;
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		unsigned num_branch_conds = sizeof(qpu_branch_cond_str) / sizeof(const char *);
+
+		for(unsigned c = 0; c < num_branch_conds && arg; ++c)
+		{
+			if(qpu_branch_cond_str[c] && strcmp(arg, qpu_branch_cond_str[c]) == 0)
+			{
+				*branch_cond = c;
+				*str = arg;
+				break;
+			}
+		}
+	}
+
+	arg = strtok(0, " \n\v\f\r\t,");
+
+	if(arg)
+	{
+		uint8_t raddr_a_res = 0;
+
+		for(unsigned c = 0; c < 2 && arg && !raddr_a_res; ++c)
+		{
+			for(unsigned d = 0; d < 52; ++d)
+			{
+				if(qpu_raddr_str[c][d] && strcmp(arg, qpu_raddr_str[c][d]) == 0)
+				{
+					raddr_a_res = d;
+					break;
+				}
+			}
+		}
+
+		if(!raddr_a_res && arg && arg[0] == 'r' && arg[1] == 'a')
+		{
+			raddr_a_res = strtol(arg+2, 0, 0);
+		}
+
+		*raddr_a = raddr_a_res;
+		*str = arg;
+	}
+
+	//advance token past arg strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+void parse_args_load(char** str, qpu_load_type load_type, uint32_t* imm32, uint16_t* ms_imm16, uint16_t* ls_imm16)
+{
+	char* arg = strtok(*str, " \n\v\f\r\t,");
+
+	if(load_type == QPU_LOAD32)
+	{
+		if(arg)
+		{
+			*imm32 = strtol(arg, 0, 0);
+			*str = arg;
+		}
+	}
+	else
+	{
+		if(arg)
+		{
+			*ms_imm16 = strtol(arg, 0, 0);
+			*str = arg;
+		}
+
+		arg = strtok(0, " \n\v\f\r\t,");
+
+		if(arg)
+		{
+			*ls_imm16 = strtol(arg, 0, 0);
+			*str = arg;
+		}
+	}
+
+	//advance token past arg strings so we can tokenize further
+	while(**str)
+	{
+		(*str)++;
+	}
+
+	*str += 1;
+}
+
+
+void assemble_qpu_asm(char* str, uint64_t* instructions)
+{
+	unsigned instruction_counter = 0;
+
+	//delete lines that have comments in them
+	char* comment_token = strstr(str, "#");
+
+	while(comment_token)
+	{
+		while(*comment_token != '\n')
+		{
+			*comment_token = ' ';
+			comment_token++;
+		}
+		*comment_token = ' ';
+		comment_token = strstr(comment_token, "#");
+	}
+
+
+	//parse string token by token
+	char* token = strtok(str, " \n\v\f\r\t;");
+
+	while(token)
+	{
+		qpu_sig_bits sig_bit = QPU_SIG_NONE;
+		qpu_alu_type type = QPU_ALU;
+		qpu_op_add op_add =	QPU_A_NOP;
+		qpu_op_mul op_mul =	QPU_M_NOP;
+		qpu_mux mul_a = 0;
+		qpu_mux mul_b = 0;
+		qpu_mux add_a = 0;
+		qpu_mux add_b = 0;
+		qpu_cond cond_mul = QPU_COND_ALWAYS;
+		qpu_cond cond_add = QPU_COND_ALWAYS;
+		qpu_waddr waddr_add = QPU_W_NOP;
+		qpu_waddr waddr_mul = QPU_W_NOP;
+		qpu_waddr raddr_a = QPU_R_NOP;
+		qpu_waddr raddr_b = QPU_R_NOP;
+		uint8_t pack_unpack_select = 0;
+		uint8_t pack_mode = QPU_PACK_A_NOP;
+		qpu_unpack unpack_mode = QPU_UNPACK_NOP;
+		uint8_t is_sem_inc = 0;
+		uint8_t rel = 0;
+		uint8_t reg = 0;
+		uint8_t ws = 0;
+		uint8_t sf = 0;
+		uint32_t imm32 = 0;
+		uint16_t ms_imm16 = 0;
+		uint16_t ls_imm16 = 0;
+		uint8_t semaphore = 0;
+		qpu_load_type load_type = QPU_LOAD32;
+		uint8_t is_signed = 0;
+		qpu_branch_cond branch_cond = QPU_COND_BRANCH_ALWAYS;
+
+		sig_bit = parse_sig_bit(token);
+		if(sig_bit < 0)
+		{
+			break;
+		}
+
+		//get dst for add
+		token = strtok(0, " \n\v\f\r\t=;");
+		parse_dst(&token, &waddr_add, &pack_mode, 1, 0);
+
+		//check op
+		token = strtok(token, " \n\v\f\r\t=(");
+		unsigned has_modifiers = strstr(token, ".") != 0;
+		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc, &load_type);
+
+		//get modifiers
+		if(has_modifiers)
+		{
+			//token = strtok(token, " \n\v\f\r\t");
+			parse_op_modifiers(&token, &is_signed, &ws, &pack_unpack_select, &sf, &cond_add, &unpack_mode, &rel, &reg, 1);
+		}
+
+		if(type == QPU_ALU)
+		{
+			//get arguments for add
+			token = strtok(token, ")");
+			parse_args_alu(&token, &add_a, &add_b, &raddr_a, &raddr_b, sig_bit == QPU_SIG_SMALL_IMM);
+		}
+		else if(type == QPU_SEM)
+		{
+			//get arguments for sem
+			token = strtok(token, ")");
+			parse_args_sem(&token, &semaphore, &imm32);
+		}
+		else if(type == QPU_BRANCH)
+		{
+			//get arguments for branch
+			token = strtok(token, ")");
+			parse_args_branch(&token, &imm32, &branch_cond, &raddr_a);
+		}
+		else if(type == QPU_LOAD_IMM)
+		{
+			//get arguments for load imm
+			token = strtok(token, ")");
+			parse_args_load(&token, load_type, &imm32, &ms_imm16, &ls_imm16);
+		}
+
+		//get dst for mul
+		token = strtok(token, " \n\v\f\r\t=;");
+		parse_dst(&token, &waddr_mul, &pack_mode, 0, pack_unpack_select);
+
+		//check op
+		token = strtok(token, " \n\v\f\r\t=(");
+		has_modifiers = strstr(token, ".") != 0;
+		parse_op(&token, &type, &op_add, &op_mul, &is_sem_inc, &load_type);
+
+		//get modifiers
+		if(has_modifiers)
+		{
+			//token = strtok(token, " \n\v\f\r\t(");
+			parse_op_modifiers(&token, &is_signed, &ws, &pack_unpack_select, &sf, &cond_mul, &unpack_mode, &rel, &reg, 0);
+		}
+
+		token = strtok(token, ")");
+
+		if(type == QPU_ALU)
+		{
+			//get arguments for mul
+			parse_args_alu(&token, &mul_a, &mul_b, &raddr_a, &raddr_b, sig_bit == QPU_SIG_SMALL_IMM);
+		}
+
+		//EMIT INSTRUCTION HERE
+		if(type == QPU_ALU)
+		{
+			if(sig_bit == QPU_SIG_SMALL_IMM)
+			{
+				instructions[instruction_counter] = encode_alu_small_imm(unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_a, raddr_b, add_a, add_b, mul_a, mul_b);
+			}
+			else
+			{
+				instructions[instruction_counter] = encode_alu(sig_bit, unpack_mode, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, op_mul, op_add, raddr_a, raddr_b, add_a, add_b, mul_a, mul_b);
+			}
+		}
+		else if(type == QPU_SEM)
+		{
+			instructions[instruction_counter] = encode_semaphore(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, is_sem_inc, semaphore, imm32);
+		}
+		else if(type ==	QPU_BRANCH)
+		{
+			instructions[instruction_counter] = encode_branch(branch_cond, rel, reg, raddr_a, ws, waddr_add, waddr_mul, imm32);
+		}
+		else if(type == QPU_LOAD_IMM)
+		{
+			if(load_type ==	QPU_LOAD32)
+			{
+				instructions[instruction_counter] = encode_load_imm(pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, imm32);
+			}
+			else
+			{
+				instructions[instruction_counter] = encode_load_imm_per_elem(is_signed, pack_unpack_select, pack_mode, cond_add, cond_mul, sf, ws, waddr_add, waddr_mul, ms_imm16, ls_imm16);
+			}
+		}
+
+		instruction_counter++;
+		token = strtok(token, " \n\v\f\r\t;");
+	}
+}
+
+//Prints a textual disassembly of one 64 bit QPU instruction to stdout, terminated by a newline.
+//Layout: "<signal> ; <add dst> = <add op>(<args>) ; <mul dst> = <mul op>(<args>) ;".
+//0x74 in the top 7 bits selects the semaphore form; otherwise the signal field selects the load
+//immediate or branch form, and every other signal value is decoded as an ALU instruction.
+void disassemble_qpu_asm(uint64_t instruction)
+{
+//extracts the field selected by mask (given unshifted) starting at bit position place
+#define GET_BITFIELD(mask, place) (((instruction) & ((uint64_t)(mask) << (place))) >> (place))
+
+	qpu_sig_bits sig_bits = GET_BITFIELD(0xf, 60);
+	printf("%s ; ", qpu_sig_bits_str[sig_bits]);
+
+	unsigned is_sem = GET_BITFIELD(0x7f, 57) == 0x74;
+
+	qpu_waddr waddr_add = GET_BITFIELD(0x3f, QPU_WADDR_ADD_SHIFT);
+	qpu_waddr waddr_mul = GET_BITFIELD(0x3f, QPU_WADDR_MUL_SHIFT);
+	uint8_t ws = GET_BITFIELD(1, 44);
+	uint8_t pm = GET_BITFIELD(1, 56);
+
+	//write addresses 0..31 are printed as rx<n>, higher ones are looked up by name in the table picked by ws
+	if(waddr_add <= 31)
+	{
+		printf("rx%d", waddr_add);
+	}
+	else
+	{
+		printf("%s", qpu_waddr_str[ws][waddr_add]);
+	}
+
+	if(is_sem)
+	{
+		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
+
+		//with ws and pm both clear the pack mode is shown as a suffix of the add destination
+		if(!ws && !pm)
+		{
+			printf(".%s", qpu_pack_a_str[pack_mode]);
+		}
+
+		uint8_t is_sem_inc = GET_BITFIELD(1, 4);
+
+		printf(" = %s", is_sem_inc ? "sem_inc" : "sem_dec");
+
+		if(ws)
+		{
+			printf(".ws");
+		}
+
+		if(pm)
+		{
+			printf(".pm");
+		}
+
+		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
+
+		printf(".%s", qpu_cond_str[cond_add]);
+
+		uint8_t sf = GET_BITFIELD(1, 45);
+
+		if(sf)
+		{
+			printf(".sf");
+		}
+
+		uint8_t sem = GET_BITFIELD(0xf, 0);
+
+		uint32_t imm_val = GET_BITFIELD(0x7ffffff, 5);
+
+		printf("(%d, %#x) ; ", sem, imm_val);
+
+		if(waddr_mul <= 31)
+		{
+			printf("rx%d", waddr_mul);
+		}
+		else
+		{
+			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
+		}
+
+		if(pm)
+		{
+			printf(".%s", qpu_pack_mul_str[pack_mode]);
+		}
+
+		printf(" = %s", is_sem_inc ? "sem_inc" : "sem_dec");
+
+		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
+
+		printf(".%s() ;", qpu_cond_str[cond_mul]);
+	}
+	else if(sig_bits == QPU_SIG_LOAD_IMM)
+	{
+		//0 when the top 7 bits are 0x70, 1 otherwise (assumes QPU_LOAD32 == 0, TODO confirm)
+		qpu_load_type load_type = GET_BITFIELD(0x7f, 57) != 0x70;
+
+		//bit 58 clear means the per-element load is printed as signed
+		uint8_t is_signed = !GET_BITFIELD(1, 58);
+
+		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
+
+		if(!ws && !pm)
+		{
+			printf(".%s", qpu_pack_a_str[pack_mode]);
+		}
+
+		printf(" = %s", load_type == QPU_LOAD32 ? "load32" : "load16");
+
+		if(ws)
+		{
+			printf(".ws");
+		}
+
+		if(pm)
+		{
+			printf(".pm");
+		}
+
+		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
+
+		printf(".%s", qpu_cond_str[cond_add]);
+
+		uint8_t sf = GET_BITFIELD(1, 45);
+
+		if(sf)
+		{
+			printf(".sf");
+		}
+
+		if(load_type == QPU_LOAD32)
+		{
+			uint32_t imm = GET_BITFIELD(0xffffffff, 0);
+
+			printf("(%#x) ; ", imm);
+		}
+		else
+		{
+			if(is_signed)
+			{
+				printf(".signed");
+			}
+
+			uint16_t ms_imm = GET_BITFIELD(0xffff, 16);
+			uint16_t ls_imm = GET_BITFIELD(0xffff, 0);
+			printf("(%#x, %#x) ; ", ms_imm, ls_imm);
+		}
+
+		if(waddr_mul <= 31)
+		{
+			printf("rx%d", waddr_mul);
+		}
+		else
+		{
+			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
+		}
+
+		//the pack suffix belongs to the destination (as in the semaphore and ALU forms), so it goes before the op
+		if(pm)
+		{
+			printf(".%s", qpu_pack_mul_str[pack_mode]);
+		}
+
+		printf(" = %s", load_type == QPU_LOAD32 ? "load32" : "load16");
+
+		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
+
+		printf(".%s() ;", qpu_cond_str[cond_mul]);
+	}
+	else if(sig_bits == QPU_SIG_BRANCH)
+	{
+		printf(" = branch");
+
+		if(ws)
+		{
+			printf(".ws");
+		}
+
+		uint8_t is_relative = GET_BITFIELD(1, 51);
+
+		if(is_relative)
+		{
+			printf(".rel");
+		}
+
+		uint8_t use_addr_a = GET_BITFIELD(1, 50);
+
+		if(use_addr_a)
+		{
+			printf(".reg");
+		}
+
+		uint32_t imm = GET_BITFIELD(0xffffffff, 0);
+		qpu_branch_cond branch_cond = GET_BITFIELD(0xf, QPU_BRANCH_COND_SHIFT);
+		qpu_raddr raddr_a = GET_BITFIELD(0x1f, QPU_BRANCH_RADDR_A_SHIFT);
+
+		//raddr_a is masked to 5 bits above, so it always names one of ra0..ra31
+		printf("(%#x, %s, ra%d) ; ", imm, qpu_branch_cond_str[branch_cond], raddr_a);
+
+		if(waddr_mul <= 31)
+		{
+			printf("rx%d", waddr_mul);
+		}
+		else
+		{
+			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
+		}
+
+		printf(" = branch() ;");
+	}
+	else
+	{
+		//ALU
+		uint8_t pack_mode = GET_BITFIELD(0xf, QPU_PACK_SHIFT);
+
+		//NOTE(review): the semaphore and load immediate forms also require ws to be clear here, confirm which is intended
+		if(!pm)
+		{
+			printf(".%s", qpu_pack_a_str[pack_mode]);
+		}
+
+		qpu_op_add op_add = GET_BITFIELD(0x1f, QPU_OP_ADD_SHIFT);
+
+		printf(" = %s", qpu_op_add_str[op_add]);
+
+		if(ws)
+		{
+			printf(".ws");
+		}
+
+		if(pm)
+		{
+			printf(".pm");
+		}
+
+		qpu_cond cond_add = GET_BITFIELD(0x7, QPU_COND_ADD_SHIFT);
+
+		printf(".%s", qpu_cond_str[cond_add]);
+
+		uint8_t sf = GET_BITFIELD(1, 45);
+
+		if(sf)
+		{
+			printf(".sf");
+		}
+
+		qpu_unpack unpack_mode = GET_BITFIELD(0X7, QPU_UNPACK_SHIFT);
+
+		printf(".%s", qpu_unpack_str[unpack_mode]);
+
+		qpu_raddr raddr_a = GET_BITFIELD(0x3f, QPU_RADDR_A_SHIFT);
+		qpu_raddr raddr_b = GET_BITFIELD(0x3f, QPU_RADDR_B_SHIFT);
+
+		qpu_mux add_a = GET_BITFIELD(0x7, QPU_ADD_A_SHIFT);
+		qpu_mux add_b = GET_BITFIELD(0x7, QPU_ADD_B_SHIFT);
+
+		printf("(");
+
+		printf("%s, %s, ", qpu_mux_str[add_a], qpu_mux_str[add_b]);
+
+		if(raddr_a <= 31)
+		{
+			printf("ra%i", raddr_a);
+		}
+		else
+		{
+			printf("%s", qpu_raddr_str[0][raddr_a]);
+		}
+
+		printf(", ");
+
+		if(sig_bits == QPU_SIG_SMALL_IMM)
+		{
+			//small immediate codes 0..15 stand for themselves and 16..31 for -16..-1
+			if(raddr_b < 16)
+			{
+				printf("%i", raddr_b);
+			}
+			else if(raddr_b < 32)
+			{
+				printf("%i", raddr_b - 32);
+			}
+			else if(raddr_b < 48)
+			{
+				//32..39 are the floats 1.0 .. 128.0 and 40..47 are 1/256 .. 1/2, ie. 2^e with e in -8..7;
+				//build the IEEE-754 bit pattern of 2^e directly instead of type punning a float
+				int exponent = (int)raddr_b - (raddr_b < 40 ? 32 : 48);
+				printf("%#x", (uint32_t)(127 + exponent) << 23);
+			}
+			else
+			{
+				//NOTE(review): codes 48..63 are not numeric immediates (the VideoCore IV reference guide
+				//lists them as mul output vector rotations) and the old shift count went negative for
+				//49..63 (undefined behaviour); print the raw code until a rotation syntax is agreed on
+				printf("%#x", (unsigned)raddr_b);
+			}
+		}
+		else
+		{
+			if(raddr_b <= 31)
+			{
+				printf("rb%i", raddr_b);
+			}
+			else
+			{
+				printf("%s", qpu_raddr_str[1][raddr_b]);
+			}
+		}
+
+		printf(") ; ");
+
+		if(waddr_mul <= 31)
+		{
+			printf("rx%d", waddr_mul);
+		}
+		else
+		{
+			printf("%s", qpu_waddr_str[!ws][waddr_mul]);
+		}
+
+		if(pm)
+		{
+			printf(".%s", qpu_pack_mul_str[pack_mode]);
+		}
+
+		qpu_op_mul op_mul = GET_BITFIELD(0x7, QPU_OP_MUL_SHIFT);
+
+		printf(" = %s", qpu_op_mul_str[op_mul]);
+
+		qpu_cond cond_mul = GET_BITFIELD(0x7, QPU_COND_MUL_SHIFT);
+
+		printf(".%s", qpu_cond_str[cond_mul]);
+
+		qpu_mux mul_a = GET_BITFIELD(0x7, QPU_MUL_A_SHIFT);
+		qpu_mux mul_b = GET_BITFIELD(0x7, QPU_MUL_B_SHIFT);
+
+		printf("(%s, %s) ; ", qpu_mux_str[mul_a], qpu_mux_str[mul_b]);
+	}
+
+	printf("\n");
+
+//keep the helper macro local to this function
+#undef GET_BITFIELD
+}
\ No newline at end of file
diff --git a/QPUassembler/qpu_assembler.h b/QPUassembler/qpu_assembler.h
new file mode 100644
index 0000000..c5e923c
--- /dev/null
+++ b/QPUassembler/qpu_assembler.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <stdint.h>
+
+void disassemble_qpu_asm(uint64_t instruction); //prints the textual form of one 64 bit QPU instruction to stdout, followed by a newline
+void assemble_qpu_asm(char* str, uint64_t* instructions); //stores one encoded instruction per slot of instructions; NOTE(review): tokenizes with strtok (presumably modifying str) and no bounds check on instructions is visible - confirm