diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index 70c2f24ed2..0d643feba3 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian? If you need to do this manually, first examine how the `FINN PYNQ Python drivers `_ do this – notice how the input data is first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be - fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. + fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input `_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation. Why does FIFO sizing take so long for my network? Is something wrong? The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since diff --git a/fetch-repos.sh b/fetch-repos.sh index 2033973f2a..a4fc124fa4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,9 +27,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" +QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9" FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" -BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" +BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v index d1ce33c41f..0b01973163 100644 --- a/finn-rtllib/fifo/hdl/Q_srl.v +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -184,58 +184,58 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); end // always @ (posedge clock or negedge reset) always @* begin // - combi always - srlo_ <= 'bx; - shift_en_o_ <= 1'bx; - shift_en_ <= 1'bx; - addr_ <= 'bx; - state_ <= 2'bx; + srlo_ = 'bx; + shift_en_o_ = 1'bx; + shift_en_ = 1'bx; + addr_ = 'bx; + state_ = 2'bx; case (state) state_empty: begin // - (empty, will not produce) if (i_v) begin // - empty & i_v => consume - srlo_ <= i_d; - shift_en_o_ <= 1; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else begin // - empty & !i_v => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end state_one: begin // - (contains one) if (i_v && o_b) begin // - one & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= 0; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = 0; + state_ = state_more; end else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod - srlo_ <= i_d; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= 0; - state_ <= state_one; + srlo_ = i_d; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = 0; + state_ 
= state_one; end else if (!i_v && o_b) begin // - one & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_one; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_one; end else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1'bx; - addr_ <= 0; - state_ <= state_empty; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1'bx; + addr_ = 0; + state_ = state_empty; end end // case: state_one @@ -244,60 +244,60 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); // - (full, will not consume) // - (full here if depth==2) if (o_b) begin // - full & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else begin // - full & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; -// addr_ <= addr-1; -// state_ <= state_more; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; +// addr_ = addr-1; +// state_ = state_more; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? state_one : state_more; end end else begin // - (mid: neither empty nor full) if (i_v && o_b) begin // - mid & i_v & o_b => consume - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 1; - addr_ <= addr+1; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 1; + addr_ = addr+1; + state_ = state_more; end else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 1; - addr_ <= addr; - state_ <= state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 1; + addr_ = addr; + state_ = state_more; end else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle - srlo_ <= 'bx; - shift_en_o_ <= 0; - shift_en_ <= 0; - addr_ <= addr; - state_ <= state_more; + srlo_ = 'bx; + shift_en_o_ = 0; + shift_en_ = 0; + addr_ = addr; + state_ = state_more; end else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce - srlo_ <= srl[addr]; - shift_en_o_ <= 1; - shift_en_ <= 0; - addr_ <= addr_zero_ ? 0 : addr-1; - state_ <= addr_zero_ ? state_one : state_more; + srlo_ = srl[addr]; + shift_en_o_ = 1; + shift_en_ = 0; + addr_ = addr_zero_ ? 0 : addr-1; + state_ = addr_zero_ ? state_one : state_more; end end // else: !if(addr_full) end // case: state_more default: begin - srlo_ <= 'bx; - shift_en_o_ <= 1'bx; - shift_en_ <= 1'bx; - addr_ <= 'bx; - state_ <= 2'bx; + srlo_ = 'bx; + shift_en_o_ = 1'bx; + shift_en_ = 1'bx; + addr_ = 'bx; + state_ = 2'bx; end // case: default endcase // case(state) diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv index 107a00918e..dabb36647e 100644 --- a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #( return res; endfunction : init_leave_loads + function int unsigned sum_width(input int unsigned n, input int unsigned w); + return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n); + endfunction : sum_width + // Pipeline for last indicator flag logic [1:5] L = '0; always_ff @(posedge clk) begin @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #( // Stage #4: Cross-SIMD Reduction // Count leaves reachable from each node - localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? 
init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop // Range of Cross-lane Contribution Tracked in Hi4 /* @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #( * signed value is determined by its lower bound to be at least: * 1 + $clog2(2^(w-1)+SIMD) */ - localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD); + localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD)); uwire signed [ACCU_WIDTH -1:0] up4; uwire signed [HI_WIDTH -1:0] hi4; @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #( // Conclusive low part accumulation if(i >= PE_REM) begin : blkLo // Adder Tree across all SIMD low contributions (all unsigned arithmetic) - localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH); uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; for(genvar n = 0; n < SIMD-1; n++) begin // Sum truncated to actual maximum bit width at this node - localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH); uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2]; assign tree[n] = s; end diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.dat b/finn-rtllib/mvu/tb/mvu_accu_tb.dat new file mode 100644 index 0000000000..7e102ab6ab --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.dat @@ -0,0 +1,192 @@ +9 +4 +d +9 +2 +a +d +7 +9 +7 +b +4 +4 +7 +0 +0 +c +9 +9 +1 +9 +0 +a +0 +5 +5 +7 +7 +2 +6 +7 +9 +0 +0 +9 +7 +7 +c +7 +9 +7 +1 +2 +0 +f +7 +1 +7 +f +7 +1 +7 +1 +6 +6 +9 +e +f +e +a +6 +1 +7 +9 +d +a +7 +7 +f +4 +7 +f +9 +f +9 +1 +9 +f +7 +3 +4 +1 +1 +0 +d +c +d +b +9 +9 +f +7 +0 +5 +e +6 +7 +e +7 +1 +7 +0 +e +3 +c +4 +9 +7 +9 +9 +d +e +c +1 +f +7 +0 +7 +1 +7 +d +0 +7 +e +a +1 +9 +4 +b +7 +9 +0 +a +e +6 +7 +2 +9 +0 +9 +0 +9 +1 +9 +0 +0 +7 +2 +7 +1 +5 +9 +1 +9 +6 +7 +c +1 +9 +d +9 +f +c +9 +9 +9 +b +b +9 +f +9 +5 +1 +3 +0 +9 +0 +9 +2 +a +9 +0 +f +0 +7 +0 +a +7 +3 +e +5 +7 diff --git a/finn-rtllib/mvu/tb/mvu_accu_tb.sv b/finn-rtllib/mvu/tb/mvu_accu_tb.sv new file mode 100644 index 0000000000..ceeb31194c --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_accu_tb.sv @@ -0,0 +1,162 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. + *****************************************************************************/ + +module mvu_accu_tb; + + localparam IS_MVU = 1; + localparam COMPUTE_CORE = "mvu_8sx8u_dsp48"; + localparam PUMPED_COMPUTE = 0; + localparam MW = 6; + localparam MH = 32; + localparam PE = 1; + localparam SIMD = 1; + localparam ACTIVATION_WIDTH = 8; + localparam WEIGHT_WIDTH = 4; + localparam NARROW_WEIGHTS = 1; + localparam SIGNED_ACTIVATIONS = 1; + localparam SEGMENTLEN = 1; + localparam FORCE_BEHAVIORAL = 0; + + // Safely deducible parameters + localparam WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8; + localparam INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(16) @(posedge clk); + rst <= 0; + end + + logic [WEIGHT_WIDTH-1:0] WeightMem[MH*MW]; + initial $readmemh("mvu_accu_tb.dat", WeightMem); + + // Shared Input Feed + logic [INPUT_STREAM_WIDTH_BA-1:0] in_TDATA; + logic in_TVALID[2]; + uwire in_TREADY[2]; + initial begin + in_TDATA = 'x; + in_TVALID = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(2161*MW) begin + automatic logic [ACTIVATION_WIDTH-1:0] a = $urandom(); + in_TDATA <= a; + in_TVALID <= '{ default: 1 }; + fork + begin + @(posedge clk iff in_TREADY[0]); + in_TVALID[0] <= 0; + end + begin + @(posedge clk iff in_TREADY[1]); + in_TVALID[1] <= 0; + end + join + end + + repeat(MH*MW) @(posedge clk); + $display("Test completed."); + $finish; + end + + // DUTs + localparam int unsigned ACCU_WIDTHS[2] = '{ 16, 32 }; + int OutQ[2][$]; + for(genvar i = 0; i < $size(ACCU_WIDTHS); i++) begin : genDUTs + localparam int unsigned ACCU_WIDTH = ACCU_WIDTHS[i]; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Private Weight Feed + logic [WEIGHT_STREAM_WIDTH_BA-1:0] weights_TDATA; + logic weights_TVALID; + uwire weights_TREADY; + initial begin + weights_TDATA = 'x; + weights_TVALID = 0; + @(posedge clk iff !rst); + + weights_TVALID <= 1; + forever begin + for(int unsigned i = 0; i < MH*MW; i++) begin + weights_TDATA <= WeightMem[i]; + @(posedge clk iff weights_TREADY); + end + end + end + + // Private Output Capture into Queue + uwire signed [OUTPUT_STREAM_WIDTH_BA-1:0] out_TDATA; + uwire out_TVALID; + uwire out_TREADY = !rst; + always_ff @(posedge clk iff !rst) begin + if(out_TVALID) OutQ[i].push_back(out_TDATA); + end + + // Actual DUT Instance + mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), 
.ACCU_WIDTH(ACCU_WIDTH), .NARROW_WEIGHTS(NARROW_WEIGHTS), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) dut ( + .ap_clk(clk), + .ap_clk2x(1'b0), + .ap_rst_n(!rst), + .s_axis_weights_tdata(weights_TDATA), + .s_axis_weights_tvalid(weights_TVALID), + .s_axis_weights_tready(weights_TREADY), + .s_axis_input_tdata(in_TDATA), + .s_axis_input_tvalid(in_TVALID[i]), + .s_axis_input_tready(in_TREADY[i]), + .m_axis_output_tdata(out_TDATA), + .m_axis_output_tvalid(out_TVALID), + .m_axis_output_tready(out_TREADY) + ); + end : genDUTs + + // Output Equivalence Checker + always_ff @(posedge clk) begin + if(OutQ[0].size && OutQ[1].size) begin + automatic int unsigned y0 = OutQ[0].pop_front(); + automatic int unsigned y1 = OutQ[1].pop_front(); + assert(y0 == y1) else begin + $error("Output Mismatch: %0d vs. %0d", y0, y1); + $stop; + end + end + end + +endmodule : mvu_accu_tb diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index bdd2976412..4c80c0263b 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -649,7 +649,7 @@ "metadata": {}, "outputs": [], "source": [ - "# run with FINN's execute_onnx, custom node will use c++ execution\n", + "# run with FINN's execute_onnx, custom node will use C++ execution\n", "new_op_inst.set_nodeattr(\"exec_mode\", \"c++\")\n", "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", "ret" diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb index 5139377342..4a0f2bc695 100644 --- a/notebooks/advanced/4_advanced_builder_settings.ipynb +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -199,7 +199,7 @@ "id": "d746eff3", "metadata": {}, "source": [ - "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." + "After each FINN builder step, the graph is saved as an .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). 
Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." ] }, { @@ -218,7 +218,7 @@ "id": "bccebd0d", "metadata": {}, "source": [ - "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." + "The analysis of these .onnx files can help us identify points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." ] }, { @@ -361,7 +361,7 @@ "id": "2809f6a7", "metadata": {}, "source": [ - "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned." + "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end, the modified model is returned." ] }, { @@ -993,7 +993,7 @@ "id": "fd1519fe", "metadata": {}, "source": [ - "In the following part of the tutorial, we will use the auto generated json file as starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", + "In the following part of the tutorial, we will use the auto generated json file as a starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell." ] }, @@ -1254,7 +1254,7 @@ "id": "97f87780", "metadata": {}, "source": [ - "The initial implementation already had a high utilization of BRAM, but the estimations went now up to ~500 BRAMs while the LUT count went down to ~99k." + "The initial implementation already had a high utilization of BRAM, but the estimations now went up to ~500 BRAMs while the LUT count went down to ~99k." ] }, { @@ -1278,7 +1278,7 @@ "id": "f7012b9a", "metadata": {}, "source": [ - "In this section, we will have a peak into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." 
+ "In this section, we will have a peek into additional builder arguments the FINN compiler exposes. We will not be able to cover all but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." ] }, { @@ -1302,7 +1302,7 @@ "id": "308d52ba", "metadata": {}, "source": [ - "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." + "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned, a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`." ] }, { @@ -1536,7 +1536,7 @@ "source": [ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", "\n", - "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documents, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documented, you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." ] }, { @@ -1602,7 +1602,7 @@ "id": "c249f141", "metadata": {}, "source": [ - "This concludes the advanced builder settings tutorial. Below you can find code that can help you investigating more of the builder arguments and invoking the whole flow to generate a bitfile." + "This concludes the advanced builder settings tutorial. Below you can find code that can help you in investigating more of the builder arguments and invoking the whole flow to generate a bitfile." ] }, { diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index da037050bb..3f8d65497b 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -53,7 +53,7 @@ " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", " * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", "* [Network Surgery Before Export](#network_surgery)\n", - "* [Export to QONNX and Conversion to FINN-ONNX](#export_qonnx)" + "* [Export to QONNX](#export_qonnx)" ] }, { @@ -194,7 +194,7 @@ "source": [ "# Define a PyTorch Device \n", "\n", - "GPUs can significantly speed-up training of deep neural networks. 
We check for availability of a GPU and if so define it as target device." + "GPUs can significantly speed-up training of deep neural networks. We check for availability of a GPU and if so define it as the target device." ] }, { @@ -667,12 +667,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Export to QONNX and Conversion to FINN-ONNX \n", + "# Export to QONNX \n", "\n", "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. This will be done in the next notebook. For now, we simply export and save the QONNX model." ] }, { @@ -707,13 +707,6 @@ "# clean-up\n", "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n", "\n", - "# ModelWrapper\n", - "model = ModelWrapper(ready_model_filename)\n", - "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", - "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n", - "model = model.transform(ConvertQONNXtoFINN())\n", - "model.save(ready_model_filename)\n", - "\n", "print(\"Model saved to %s\" % ready_model_filename)" ] }, @@ -721,16 +714,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## View the Exported ONNX in Netron\n", + "## View the Exported QONNX in Netron\n", "\n", - "Let's examine the exported ONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties. Particular things of note:\n", + "Let's examine the exported QONNX model with [Netron](https://github.com/lutzroeder/netron), which is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties. 
Particular things of note:\n", "\n", - "* The input tensor \"0\" is annotated with `quantization: finn_datatype: BIPOLAR`\n", "* The input preprocessing (x + 1) / 2 is exported as part of the network (initial `Add` and `Div` layers)\n", - "* Brevitas `QuantLinear` layers are exported to ONNX as `MatMul`. We've exported the padded version; shape of the first MatMul node's weight parameter is 600x64\n", - "* The weight parameters (second inputs) for MatMul nodes are annotated with `quantization: finn_datatype: INT2`\n", - "* The quantized activations are exported as `MultiThreshold` nodes with `domain=qonnx.custom_op.general`\n", - "* There's a final `MultiThreshold` node with threshold=0 to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`" + "* Brevitas `QuantLinear` layers are exported to QONNX as `Gemm`. We've exported the padded version; shape of the first `Gemm` node's weight parameter is 600x64\n", + "* The quantized activations are exported as `Quant` nodes with `domain=qonnx.custom_op.general`\n", + "* The weight parameters (second inputs) for the `Gemm` node can also be viewed by opening up the producer `Quant` node, scrolling down to the `Inputs` section and pressing the plus sign to the right of the first input parameter. For the first `Quant` node, this would be the parameter named `Quant_0_param0`\n", + "* The bitwidth of the weights is also shown as the 4th value in the `Quant` node (3=2), meaning that we quantize to 2 bits total.\n", + "* There's a final `BipolarQuant` node with a single input and output value to produce the final bipolar output (this is the `qnt_output` from `CybSecMLPForExport`)" ] }, { diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index 33b64e11c0..70f1acae0a 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -12,7 +12,7 @@ "\n", "**Also remember to 'close and halt' any other FINN notebooks, since Netron visualizations use the same port.**\n", "\n", - "In this notebook we will show how to import the network we trained in Brevitas and verify it in the FINN compiler. \n", + "In this notebook we will show how to import the network we trained in Brevitas, convert it from the QONNX format to FINN-ONNX while going over the differences and, lastly, verify it in the FINN compiler. \n", "This verification process can actually be done at various stages in the compiler [as explained in this notebook](../bnn-pynq/tfc_end2end_verification.ipynb) but for this example we'll only consider the first step: verifying the exported high-level FINN-ONNX model.\n", "Another goal of this notebook is to introduce you to the concept of *graph transformations* -- we'll be applying some transformations to the graph to make it executable for verification. \n", "Once this model is sucessfully verified, we'll generate an FPGA accelerator from it in the next notebook." @@ -41,7 +41,7 @@ "source": [ "## Outline\n", "-------------\n", - "1. [Import model into FINN with ModelWrapper](#brevitas_import_visualization)\n", + "1. [Convert model from QONNX to FINN-ONNX](#brevitas_import_visualization)\n", "2. [Network preparations: Tidy-up transformations](#network_preparations)\n", "3. [Load the dataset and Brevitas model](#load_dataset) \n", "4. 
[Compare FINN and Brevitas execution](#compare_brevitas)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# 1. Import model into FINN with ModelWrapper \n", + "# 1. Convert model from QONNX to FINN-ONNX \n", "\n", - "Now that we have the model in .onnx format, we can work with it using FINN. To import it into FINN, we'll use the [`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." + "\n", + "Even though the input to finn is the QONNX format, an IR called FINN-ONNX is used inside the compiler. In this part of the notebook, we show how to convert QONNX to FINN-ONNX and explain the key differences between the three representations: ONNX, QONNX and FINN-ONNX.\n", + "\n", + "QONNX and FINN-ONNX are extensions to the standard ONNX format. Currently, ONNX provides only limited support for expressing quantization, while QONNX and FINN-ONNX provide fully flexible quantization support. However, the way in which they do this differs: QONNX provides special node types called `Quant` which ingest weights or previous node output streams as inputs to produce quantized output streams. Meanwhile, FINN-ONNX uses tensor annotation to express quantization and has a special node type called `MultiThreshold`, which implements quantization on the activation data path.\n", + "\n", + "Beyond this, there are other node types which differ in FINN-ONNX as opposed to QONNX. Thus we need a conversion function, which we will explore in more detail shortly.\n", + "\n", + "Lastly, we want to emphasize that we use the uppercase naming (ONNX, QONNX, FINN-ONNX) for the intermediate representations (IR), while the lower case naming (onnx, qonnx, finn) is usually used to refer to the compiler toolkits themselves.\n", + "\n", + "\n", + "## 1.1 Using ModelWrapper to load and observe a model\n", + "We first load the model which we prepared in the last notebook by using the\n", + "[`ModelWrapper`](https://finn.readthedocs.io/en/latest/source_code/finn.core.html#qonnx.core.modelwrapper.ModelWrapper). It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model." ] }, { @@ -64,17 +76,79 @@ "source": [ "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.datatype import DataType\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "\n", "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", - "model_for_sim = ModelWrapper(ready_model_filename)" + "\n", + "# ModelWrapper\n", + "model = ModelWrapper(ready_model_filename)\n", + "\n", + "print(\"Model loaded from %s\" % ready_model_filename)" ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To import the model into FINN, we will need to use the `ConvertQONNXtoFINN` transformation. But before that, let us use some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it and have a baseline to compare to when we do call the `ConvertQONNXtoFINN` transformation."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node. We will do this now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.core.datatype import DataType\n", + "\n", + "in_tensor_name = model.graph.input[0].name\n", + "out_tensor_name = model.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % in_tensor_name)\n", + "print(\"Output tensor name: %s\" % out_tensor_name)\n", + "model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + "model_out_shape = model.get_tensor_shape(out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(model_out_shape))\n", + "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n", + "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n", + "print(\"List of node operator types in the graph: \")\n", + "print([x.op_type for x in model.graph.node])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's have a look at some of the member functions exposed by `ModelWrapper` to see what kind of information we can extract from it." + "Note that the input and output tensors are (as of yet) marked as float32 values, even though we know they are binary. The output datatype will get inferred when we call the `ConvertQONNXtoFINN` transformation, which internally features an `InferDataTypes` transformation, while the input we will adjust manually with the `set_tensor_datatype` function." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Converting from QONNX to FINN-ONNX using ConvertQONNXtoFINN\n", + "\n", + "At this point, we would like to move from the QONNX IR onto the FINN-ONNX IR. We can do this by using the `ConvertQONNXtoFINN()` function on a QONNX model." ] }, { @@ -83,14 +157,18 @@ "metadata": {}, "outputs": [], "source": [ - "dir(model_for_sim)" + "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", + "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n", + "\n", + "# Calling the actual QONNX -> FINN transformation\n", + "model = model.transform(ConvertQONNXtoFINN())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Many of these helper functions relate to extracting information about the structure and properties of the ONNX model. You can find out more about examining and manipulating ONNX models programmatically in [this tutorial](../../basics/0_how_to_work_with_onnx.ipynb), but we'll show a few basic functions here. For instance, we can extract the shape and datatype annotation for various tensors in the graph, as well as information related to the operation types associated with each node." 
+ "We can look at the tensor datatypes and operator types again to see how they have changed." ] }, { @@ -101,27 +179,51 @@ "source": [ "from qonnx.core.datatype import DataType\n", "\n", - "finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - "finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", - "print(\"Input tensor name: %s\" % finnonnx_in_tensor_name)\n", - "print(\"Output tensor name: %s\" % finnonnx_out_tensor_name)\n", - "finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_shape = model_for_sim.get_tensor_shape(finnonnx_out_tensor_name)\n", - "print(\"Input tensor shape: %s\" % str(finnonnx_model_in_shape))\n", - "print(\"Output tensor shape: %s\" % str(finnonnx_model_out_shape))\n", - "finnonnx_model_in_dt = model_for_sim.get_tensor_datatype(finnonnx_in_tensor_name)\n", - "finnonnx_model_out_dt = model_for_sim.get_tensor_datatype(finnonnx_out_tensor_name)\n", - "print(\"Input tensor datatype: %s\" % str(finnonnx_model_in_dt.name))\n", - "print(\"Output tensor datatype: %s\" % str(finnonnx_model_out_dt.name))\n", + "in_tensor_name = model.graph.input[0].name\n", + "out_tensor_name = model.graph.output[0].name\n", + "print(\"Input tensor name: %s\" % in_tensor_name)\n", + "print(\"Output tensor name: %s\" % out_tensor_name)\n", + "model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + "model_out_shape = model.get_tensor_shape(out_tensor_name)\n", + "print(\"Input tensor shape: %s\" % str(model_in_shape))\n", + "print(\"Output tensor shape: %s\" % str(model_out_shape))\n", + "model_in_dt = model.get_tensor_datatype(in_tensor_name)\n", + "model_out_dt = model.get_tensor_datatype(out_tensor_name)\n", + "print(\"Input tensor datatype: %s\" % str(model_in_dt.name))\n", + "print(\"Output tensor datatype: %s\" % str(model_out_dt.name))\n", "print(\"List of node operator types in the graph: \")\n", - "print([x.op_type for x in model_for_sim.graph.node])" + "print([x.op_type for x in model.graph.node])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice that the input and output tensor datatypes now correctly show `BIPOLAR` while the operator types have also heavily changed compared to the QONNX version. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`ConvertQONNXtoFINN` internally called many transformations which change the operators in such a manner and we can actually peek at the source code to see them using the `showSrc` function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.visualization import showSrc\n", + "showSrc(ConvertQONNXtoFINN.apply)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note that the output tensor is (as of yet) marked as a float32 value, even though we know the output is binary. This will be automatically inferred by the compiler in the next step when we run the `InferDataTypes` transformation." + "As we can see, `ConvertQONNXtoFINN` turned `Gemm` operation into `MatMuls` using the `GemmToMatMul()` transform and turned `Quant` nodes into `MultiThreshold` nodes using the `ConvertQuantActToMultiThreshold()` transform to name a few. However, these nodes do need further transformations before they can be turned into FPGA operators." 
] }, { @@ -146,15 +248,15 @@ "from qonnx.transformation.infer_datatypes import InferDataTypes\n", "from qonnx.transformation.fold_constants import FoldConstants\n", "\n", - "model_for_sim = model_for_sim.transform(InferShapes())\n", - "model_for_sim = model_for_sim.transform(FoldConstants())\n", - "model_for_sim = model_for_sim.transform(GiveUniqueNodeNames())\n", - "model_for_sim = model_for_sim.transform(GiveReadableTensorNames())\n", - "model_for_sim = model_for_sim.transform(InferDataTypes())\n", - "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n", + "model = model.transform(InferShapes())\n", + "model = model.transform(FoldConstants())\n", + "model = model.transform(GiveUniqueNodeNames())\n", + "model = model.transform(GiveReadableTensorNames())\n", + "model = model.transform(InferDataTypes())\n", + "model = model.transform(RemoveStaticGraphInputs())\n", "\n", "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n", - "model_for_sim.save(verif_model_filename)" + "model.save(verif_model_filename)" ] }, { @@ -309,22 +411,22 @@ "import finn.core.onnx_exec as oxe\n", "\n", "def inference_with_finn_onnx(current_inp):\n", - " finnonnx_in_tensor_name = model_for_sim.graph.input[0].name\n", - " finnonnx_model_in_shape = model_for_sim.get_tensor_shape(finnonnx_in_tensor_name)\n", - " finnonnx_out_tensor_name = model_for_sim.graph.output[0].name\n", + " in_tensor_name = model.graph.input[0].name\n", + " model_in_shape = model.get_tensor_shape(in_tensor_name)\n", + " out_tensor_name = model.graph.output[0].name\n", " # convert input to numpy for FINN\n", " current_inp = current_inp.detach().numpy()\n", " # add padding and re-scale to bipolar\n", " current_inp = np.pad(current_inp, [(0, 0), (0, 7)])\n", " current_inp = 2*current_inp-1\n", " # reshape to expected input (add 1 for batch dimension)\n", - " current_inp = current_inp.reshape(finnonnx_model_in_shape)\n", + " current_inp = current_inp.reshape(model_in_shape)\n", " # create the input dictionary\n", - " input_dict = {finnonnx_in_tensor_name : current_inp} \n", + " input_dict = {in_tensor_name : current_inp} \n", " # run with FINN's execute_onnx\n", - " output_dict = oxe.execute_onnx(model_for_sim, input_dict)\n", + " output_dict = oxe.execute_onnx(model, input_dict)\n", " #get the output tensor\n", - " finn_output = output_dict[finnonnx_out_tensor_name] \n", + " finn_output = output_dict[out_tensor_name] \n", " return finn_output" ] }, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index 73cd25cf20..28702d0286 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -78,7 +78,7 @@ "### Configuring the Board and FPGA Part \n", "\n", "* `fpga_part`: Xilinx FPGA part to be used for synthesis, can be left unspecified to be inferred from `board` below, or specified explicitly for e.g. out-of-context synthesis.\n", - "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/basic.py#L41) for a list of possible boards.\n", + "* `board`: target Xilinx Zynq or Alveo board for generating accelerators integrated into a shell. 
See the `pynq_part_map` and `alveo_part_map` dicts in [this file](https://github.com/Xilinx/finn/blob/dev/src/finn/util/basic.py#L39) for a list of possible boards.\n", "* `shell_flow_type`: the target [shell flow type](https://finn-dev.readthedocs.io/en/latest/source_code/finn.builder.html#finn.builder.build_dataflow_config.ShellFlowType), only needed for generating full bitfiles where the FINN design is integrated into a shell (so only needed if `BITFILE` is selected) \n", "\n", "### Configuring the Performance \n", diff --git a/requirements.txt b/requirements.txt index f4b3199ab5..1683695576 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ gspread==3.6.0 importlib-resources==6.1.0 ipython==8.12.2 numpy==1.24.1 -onnx==1.16.0 +onnx==1.17.0 onnxoptimizer onnxruntime==1.18.1 pre-commit==3.3.2 diff --git a/run-docker.sh b/run-docker.sh index b1fe44eb0c..ec55299f6c 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -102,6 +102,7 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${FINN_SINGULARITY=""} : ${FINN_SKIP_XRT_DOWNLOAD=""} : ${FINN_XRT_PATH=""} +: ${FINN_DOCKER_NO_CACHE="0"} DOCKER_INTERACTIVE="" @@ -142,7 +143,7 @@ elif [ "$1" = "build_custom" ]; then DOCKER_INTERACTIVE="-it" #FINN_HOST_BUILD_DIR=$BUILD_DATAFLOW_DIR/build gecho "Running build_custom: $BUILD_CUSTOM_DIR/$FLOW_NAME.py" - DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py" + DOCKER_CMD="python -mpdb -cc -cq $FLOW_NAME.py ${@:4}" elif [ -z "$1" ]; then gecho "Running container only" DOCKER_CMD="bash" @@ -190,6 +191,10 @@ if [ -d "$FINN_XRT_PATH" ];then export LOCAL_XRT=1 fi +if [ "$FINN_DOCKER_NO_CACHE" = "1" ]; then + FINN_DOCKER_BUILD_EXTRA+="--no-cache " +fi + # Build the FINN Docker image if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: @@ -226,6 +231,9 @@ DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " # Workaround for FlexLM issue, see: # https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " +# Workaround for running multiple Vivado instances simultaneously, see: +# https://adaptivesupport.amd.com/s/article/63253?language=en_US +DOCKER_EXEC+="-e XILINX_LOCAL_USER_DATA=no " if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro " diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index 5d69802337..d6437a2e5c 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -35,7 +35,7 @@ from typing import Any, List, Optional from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy -from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map +from finn.util.basic import alveo_default_platform, part_map class AutoFIFOSizingMethod(str, Enum): @@ -370,11 +370,10 @@ def _resolve_driver_platform(self): def _resolve_fpga_part(self): if self.fpga_part is None: # lookup from part map if not specified - if self.shell_flow_type == ShellFlowType.VIVADO_ZYNQ: - return pynq_part_map[self.board] - elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO: - return alveo_part_map[self.board] - else: + try: + fpga_part = part_map[self.board] + return fpga_part + except KeyError: raise Exception("Couldn't resolve fpga_part for " + self.board) else: # return as-is 
when explicitly specified diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index b8ed8daec7..ab2280554c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -121,6 +121,7 @@ ) from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import ( get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, @@ -503,6 +504,7 @@ def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.minimize_bit_width: model = model.transform(MinimizeWeightBitWidth()) model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(RoundAndClipThresholds()) # make sure the changed datatypes are propagated through the network model = model.transform(InferDataTypes()) return model @@ -666,7 +668,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): estimate_network_performance = verify_model.analysis(dataflow_performance) prev_liveness = pyverilate_get_liveness_threshold_cycles() os.environ["LIVENESS_THRESHOLD"] = str( - int(estimate_network_performance["critical_path_cycles"]) + int(estimate_network_performance["critical_path_cycles"] * 1.1) ) if cfg.verify_save_rtlsim_waveforms: report_dir = cfg.output_dir + "/report" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 96f49069c7..1fb4940fb4 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -27,6 +27,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np +import warnings from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -141,6 +142,27 @@ def infer_node_datatype(self, model): node = self.onnx_node # data type stays the same dtype = model.get_tensor_datatype(node.input[0]) + + # Test for changing input datatype + if dtype != self.get_nodeattr("inputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: inputDataType changing from" + f" {self.get_nodeattr('inputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("inputDataType", dtype.name) + + # Test for changing output datatype + if dtype != self.get_nodeattr("outputDataType"): + # Issue a warning message + warnings.warn( + f"{node.name}: outputDataType changing from" + f" {self.get_nodeattr('outputDataType')} to {dtype}" + ) + # Set the new datatype attribute + self.set_nodeattr("outputDataType", dtype.name) + # Propagate the datatype through the model graph model.set_tensor_datatype(node.output[0], dtype) def verify_node(self): diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..4677960ea8 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -54,6 +54,8 @@ def get_nodeattr_types(self): "code_gen_dir_cppsim": ("s", False, ""), "executable_path": ("s", False, ""), "res_hls": ("s", False, ""), + # temporary node attribute to keep track of interface style of hls ops + "cpp_interface": ("s", False, "packed", {"packed", "hls_vector"}), } def get_all_verilog_paths(self): @@ -206,7 +208,13 @@ def code_generation_cppsim(self, model): self.dataoutstrm() self.save_as_npy() - template = templates.docompute_template + if self.get_nodeattr("cpp_interface") == "hls_vector": + self.timeout_value() + self.timeout_condition() + self.timeout_read_stream() + template = templates.docompute_template_timeout + else: + template = templates.docompute_template for key in self.code_gen_dict: # transform list into long string separated by '\n' @@ -371,24 +379,40 @@ def read_npy_data(self): if dtype == DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - npy_in, - self.hls_sname(), + + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + else: + folded_shape = self.get_folded_input_shape() + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2vectorstream<%s, %s, %d>("%s", in0_%s, false);' + % ( + elem_hls_type, + npy_type, + folded_shape[-1], + npy_in, + self.hls_sname(), + ) ) - ) def strm_decl(self): """Function to generate the commands for the stream declaration in c++, @@ -422,27 +446,43 @@ def dataoutstrm(self): if dtype == 
DataType["BIPOLAR"]: # use binary for bipolar storage dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits elem_hls_type = dtype.get_hls_datatype_str() npy_type = "float" npy_out = "%s/output.npy" % code_gen_dir oshape = self.get_folded_output_shape() oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - self.hls_sname(), - oshape_cpp_str, - npy_out, - ) - ] + cpp_interface = self.get_nodeattr("cpp_interface") + + if cpp_interface == "packed": + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + else: + folded_shape = self.get_folded_output_shape() + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'vectorstream2npy<%s, %s, %d>(strm, %s, "%s");' + % ( + elem_hls_type, + npy_type, + folded_shape[-1], + oshape_cpp_str, + npy_out, + ) + ] def save_as_npy(self): """Function to generate the commands for saving data in .npy file in c++""" @@ -474,3 +514,17 @@ def get_ap_int_max_w(self): ret = max([instream, outstream]) assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret + + def timeout_value(self): + """Set timeout value for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["1000"] + + def timeout_condition(self): + """Set timeout condition for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())] + + def timeout_read_stream(self): + """Set reading output stream procedure for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ + "strm << out_{}.read();".format(self.hls_sname()) + ] diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 1c86ae7b7a..8f0a987bce 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -130,6 +130,8 @@ def get_nodeattr_types(self): def execute_node(self, context, graph): node = self.onnx_node in_act = context[node.input[0]] + # ensure that shape is compatible + in_act = in_act.reshape(self.get_normal_input_shape()) mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] mvau_w = np_helper.to_array(mvau_w_init) # Matrix multiplication diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index 3e81aa93e0..d9ab501117 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -208,7 +208,10 @@ def _resolve_impl_style(self, dsp_block): weight_width = self.get_input_datatype(1).bitwidth() if dsp_block == "DSP58": - return "mvu_vvu_8sx9_dsp58" + if act_width <= 4 and weight_width <= 4: + return "mvu_4sx4u_dsp48e2" + else: + return "mvu_vvu_8sx9_dsp58" else: if act_width <= 4 and weight_width <= 4: if dsp_block == "DSP48E1": diff --git 
a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..d2100a7516 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -32,6 +32,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" @@ -58,6 +59,51 @@ """ +# template for single node execution with timeout (for single clock hls operations) +docompute_template_timeout = """ +#define AP_INT_MAX_W $AP_INT_MAX_W$ +#include "cnpy.h" +#include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" +#include +#include "bnn-library.h" + +// includes for network parameters +$GLOBALS$ + +// defines for network parameters +$DEFINES$ + +int main(){ +$PRAGMAS$ + +$STREAMDECLARATIONS$ + +$READNPYDATA$ + +unsigned timeout = 0; +while(timeout < $TIMEOUT_VALUE$){ + +$DOCOMPUTE$ + +if($TIMEOUT_CONDITION$){ +timeout++; +} + +else{ +$TIMEOUT_READ_STREAM$ +timeout = 0; +} +} + +$DATAOUTSTREAM$ + +$SAVEASCNPY$ + +} + +""" + # templates for single node ip generation # cpp file diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index e14181b140..b02bc89db8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -529,65 +529,119 @@ def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False + # check first if global input is split + successors = model.find_consumers(graph.input[0].name) + dt = model.get_tensor_datatype(graph.input[0].name) + if successors is not None and len(successors) >= 2 and dt.is_integer(): + output_tensor = graph.input[0].name + n_outputs = len(successors) + dt = model.get_tensor_datatype(output_tensor) + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] + + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] + + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + output_tensor, + ) + + graph.node.insert(0, dup_node) + + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break + graph_modified = True + for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + for output_tensor in node.output: + successors = model.find_consumers(output_tensor) + if successors is not None and len(successors) >= 2: + n_outputs = len(successors) - dt = 
model.get_tensor_datatype(output_tensor) + dt = model.get_tensor_datatype(output_tensor) - # skip conversion for layers with float input - if not dt.is_integer(): - continue + # skip conversion for layers with float input + if not dt.is_integer(): + continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - num_ch = int(out_shape[-1]) - vecs = out_shape[:-1] + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # create node with no parallelization first - pe = 1 + # create node with no parallelization first + pe = 1 - dup_node = helper.make_node( - "DuplicateStreams", - [output_tensor], - out_tensor_clones, - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_" + node.name, - ) + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, + ) - graph.node.insert(node_ind, dup_node) + graph.node.insert(node_ind, dup_node) - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + # connect successors to out tensor clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - graph_modified = True + graph_modified = True if graph_modified: model = model.transform(SortGraph()) @@ -1197,8 +1251,8 @@ def apply(self, model): class InferStreamingEltwise(Transformation): - """Convert eltwise Sub or Sub -> Abs to StreamingEltwise layer - with SubEltwise or AbsDiffEltwise op.""" + """Convert eltwise Add, Sub or Sub -> Abs to StreamingEltwise layer + with AddEltwise, SubEltwise or AbsDiffEltwise op.""" def apply(self, model): graph = model.graph @@ -1206,7 +1260,7 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Sub": + if node.op_type in ["Sub", "Add"]: in0 = node.input[0] in1 = node.input[1] result = node.output[0] @@ -1230,14 +1284,15 @@ def apply(self, model): if not (idt0.is_integer() and idt1.is_integer()): 
continue - eltwiseOp = "Sub" + eltwiseOp = node.op_type nodes_to_remove = [node] - # look for a downstream Abs node - res_consumer = model.find_consumer(result) - if (res_consumer is not None) and (res_consumer.op_type == "Abs"): - eltwiseOp = "AbsDiff" - result = res_consumer.output[0] - nodes_to_remove.append(res_consumer) + if node.op_type == "Sub": + # look for a downstream Abs node + res_consumer = model.find_consumer(result) + if (res_consumer is not None) and (res_consumer.op_type == "Abs"): + eltwiseOp = "AbsDiff" + result = res_consumer.output[0] + nodes_to_remove.append(res_consumer) # check layout and convert if necessary in0_layout = model.get_tensor_layout(in0) @@ -1438,6 +1493,9 @@ def apply(self, model): if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: mm_input = n.input[0] mm_weight = n.input[1] + # if mm_weight is not constant, skip node + if model.get_initializer(n.input[1]) is None: + continue mm_output = n.output[0] mm_in_shape = model.get_tensor_shape(mm_input) mm_out_shape = model.get_tensor_shape(mm_output) diff --git a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py index 8dbf7071fc..e1dcf1dde5 100644 --- a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py +++ b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py @@ -2,8 +2,7 @@ import warnings from onnx import TensorProto, helper from qonnx.transformation.base import Transformation -from qonnx.transformation.lower_convs_to_matmul import _auto_pad_to_explicit_padding -from qonnx.util.basic import get_by_name +from qonnx.util.basic import auto_pad_to_explicit_padding, get_by_name class InferPixelPaddingDeconv(Transformation): @@ -61,7 +60,7 @@ def apply(self, model): # use specified padding pad = get_by_name(n.attribute, "pads").ints else: - pad = _auto_pad_to_explicit_padding( + pad = auto_pad_to_explicit_padding( auto_pad, ifm_dim_h, ifm_dim_w, diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 33cc3e86d3..b56c8b74ea 100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -26,7 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
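For readers unfamiliar with the `auto_pad_to_explicit_padding` helper that `InferPixelPaddingDeconv` (and the tests further down in this patch) now import from `qonnx.util.basic`: it maps an ONNX auto-pad mode plus the spatial input dimensions, kernel sizes, strides and number of spatial dimensions to an explicit padding list. A minimal sketch of the call pattern, mirroring how this patch uses it; the concrete values below are made up for illustration only:

```python
from qonnx.util.basic import auto_pad_to_explicit_padding

# Illustrative values, not taken from this patch
auto_pad = "SAME_UPPER"     # ONNX auto_pad mode
ifm_dim_h = ifm_dim_w = 28  # input feature map height/width
k = 3                       # square kernel size
stride = 1

# Explicit padding list; indices 0/2 are the height pads and 1/3 the width pads,
# which is how the tests in this patch sum up the total padding per dimension.
pad = auto_pad_to_explicit_padding(auto_pad, ifm_dim_h, ifm_dim_w, k, k, stride, stride, 2)
total_pad_h = pad[0] + pad[2]
total_pad_w = pad[1] + pad[3]
print(pad, total_pad_h, total_pad_w)
```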
-from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -110,12 +109,15 @@ def apply(self, model): # determine shape for dwc dwc_shape = n0.get_normal_output_shape() - # determine dtype for dwc + # determine FINN dtype for dwc dtype = n0.get_output_datatype() + # determine onnx tensor dtype for dwc + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type dwc_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, dwc_shape, ) graph.value_info.append(dwc_output_tensor) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index 9df193efcf..9ed0f51cd4 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -29,7 +29,6 @@ import numpy as np import warnings -from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation @@ -114,6 +113,8 @@ def apply(self, model): # determine fifo node attributes fld_shape = n0.get_folded_output_shape() dtype = n0.get_output_datatype() + n0_otensor = model.get_tensor_valueinfo(output_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type # check if folded_shape of output of first node and # input of the second node is equal @@ -145,7 +146,7 @@ def apply(self, model): # or unless create_shallow_fifos is specified fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_output_shape(), ) graph.value_info.append(fifo_output_tensor) @@ -196,13 +197,15 @@ def apply(self, model): fld_shape = n0.get_folded_input_shape(inp_ind) n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) + n0_itensor = model.get_tensor_valueinfo(graph_in_name) + n0_tensor_dtype = n0_itensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, + n0_tensor_dtype, n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) @@ -256,14 +259,16 @@ def apply(self, model): fld_shape = n0.get_folded_output_shape(out_ind) n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) + n0_otensor = model.get_tensor_valueinfo(graph_out_name) + n0_tensor_dtype = n0_otensor.type.tensor_type.elem_type fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] if fifo_depth > 2 or self.create_shallow_fifos: # create fifo node fifo_input_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), - TensorProto.FLOAT, - n0.get_normal_output_shape(), + n0_tensor_dtype, + n0.get_normal_output_shape(out_ind), ) graph.value_info.append(fifo_input_tensor) model.set_tensor_datatype(fifo_input_tensor.name, dtype) @@ -289,7 +294,7 @@ def apply(self, model): graph.node.append(fifo_node) # set fifo output tensor as new input tensor of second node - final_node.output[0] = fifo_input_tensor.name + final_node.output[out_ind] = fifo_input_tensor.name else: warnings.warn( """Output FIFO for %s has depth %d and won't diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py 
b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 1158253aea..36181e7a48 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -423,6 +423,12 @@ def _calculate_thresholds(self): # ToDo: The index 1 needs to be changed to -1 for the channels last format num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] + + assert ( + thresholds.shape[0] == 1 or thresholds.shape[0] == num_output_channels + ), """Quant node cannot be converted to MultiThreshold because only + per tensor or per channel quantization is supported.""" + + final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) @@ -585,6 +591,11 @@ def _calculate_thresholds(self): # ToDo: The index 1 needs to be changed to -1 for the channels last format num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[cdim] + assert ( + thresholds.shape[0] == 1 or thresholds.shape[0] == num_output_channels + ), """Quant node cannot be converted to MultiThreshold because only + per tensor or per channel quantization is supported.""" + final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 8ac2d7dad6..9a7e9d0723 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -29,6 +29,7 @@ import numpy as np import qonnx.core.data_layout as DataLayout import warnings +from copy import deepcopy from onnx import TensorProto from onnx import helper as oh from qonnx.core.datatype import DataType @@ -641,6 +642,10 @@ def apply(self, model): # if initializer is not scalar, skip if np.prod(init0.shape) != 1: continue + if model.is_fork_node(prod0): + model = model.transform(MoveOpPastFork(prod0.op_type)) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) # Flatten input if required if len(init0.shape) > 0: init0 = init0.flatten()[0] @@ -713,6 +718,12 @@ def apply(self, model): elif producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) ceil_mode = get_by_name(n.attribute, "ceil_mode") if ceil_mode is not None: ceil_mode = ceil_mode.i @@ -764,6 +775,12 @@ def apply(self, model): if producer is not None and producer.op_type == "Transpose": perms = list(get_by_name(producer.attribute, "perm").ints) if perms == [0, 3, 1, 2]: + # check if the producer is a fork node + # (need to move it past the fork before this transform) + if model.is_fork_node(producer): + model = model.transform(MoveTransposePastFork()) + # topology modified, "ask" ModelWrapper to apply this transform again + return (model, True) old_value = model.get_initializer(n.input[scales_ind]) new_value = np.array( [old_value[idx] for idx in (0, 2, 3, 1)], @@ -813,10 +830,9 @@ class MoveOpPastFork(Transformation): can be merged with nodes in the branches """ - def __init__(self, op_name_list, get_attrs_fxn=lambda x: {}): + def
__init__(self, op_name_list): super().__init__() self.ops_to_move = op_name_list - self.get_attrs_fxn = get_attrs_fxn def apply(self, model): graph = model.graph @@ -859,11 +875,9 @@ def apply(self, model): new_param_name = model.make_new_valueinfo_name() new_inp_list = [n.input[0], new_param_name] model.set_initializer(new_param_name, op_init_param) - attrs = self.get_attrs_fxn(n) - # TODO use copy of original node instead to get attrs? - new_node = oh.make_node( - n.op_type, new_inp_list, [new_output_tensor_name], **attrs - ) + new_node = deepcopy(n) + new_node.input[:] = new_inp_list + new_node.output[:] = [new_output_tensor_name] graph.node.insert(node_ind, new_node) node_ind += 1 @@ -901,7 +915,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) + super().__init__(["Transpose"]) class MoveMaxPoolPastMultiThreshold(Transformation): diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 5ba5ee0ff5..312db404ac 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,42 +28,67 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes class RoundAndClipThresholds(Transformation): """For MultiThreshold nodes operating on integer inputs, round up thresholds values to the nearest integer. Additionally, if the input - is unsigned, sets negative thresholds to zero.""" + is unsigned, sets negative thresholds to zero. Type-casts thresholds (back) + to the float32 container type (this is separate from the quantization + annotation). 
Runs InferDataTypes() afterward to propagate any changes to the + quantization data types.""" - def apply(self, model): + def apply(self, model: ModelWrapper): # noqa graph = model.graph graph_modified = False - for n in graph.node: - if n.op_type == "MultiThreshold": - idtype = model.get_tensor_datatype(n.input[0]) - T = model.get_initializer(n.input[1]) - Tnew = np.ceil(T) - if idtype.is_integer() and (T != Tnew).any(): - # round up the thresholds to nearest integer - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) + for index, node in enumerate(graph.node): + op_type = node.op_type + if op_type == "MultiThreshold" or op_type.startswith("Thresholding"): + thresholds = model.get_initializer(node.input[1]) + if thresholds is None: + continue + dtype = model.get_tensor_datatype(node.input[0]) + # This transformation only applies to thresholding operations + # operating on integer inputs + if not dtype.is_integer(): + continue + # Round thresholds up to nearest integer and clip thresholds + # outside the input range + # Note: This might promote the thresholds to float64 and + # introduce extra inaccuracies due to large integers not being + # exactly representable in floating-point representation. + # See for example: np.ceil(np.float32(16777217)) == 16777216 + new_thresholds = np.clip(np.ceil(thresholds), dtype.min(), dtype.max() + 1) + # Convert back to the preferred float32 container type + new_thresholds = new_thresholds.astype(np.float32) + # Insert the rounded and clipped thresholds back into the model + model.set_initializer(node.input[1], new_thresholds) + # The rounded and clipped thresholds now fit into a data type + # that is one bit bigger than the input datatype + # Determine new max_value + max_val = dtype.max() + 1 + if not dtype.signed(): + tdt = DataType.get_smallest_possible(max_val) + else: + tdt = DataType.get_smallest_possible(-(max_val) - 1) + model.set_tensor_datatype(node.input[1], tdt) + # If hw op we need to set the weight data type attribute as well + if op_type.startswith("Thresholding"): + inst = getCustomOp(node) + inst.set_nodeattr("weightDataType", tdt.name) + # ones + if np.any(new_thresholds != thresholds): + # Track the graph has been modified to inform the transform + # container to exhaustively repeat this transformation until + # no changes are possible graph_modified = True - if idtype.is_integer() and not idtype.signed() and (Tnew < 0).any(): - # clip any negative thresholds if input is unsigned - Tnew = np.clip(Tnew, 0, None) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() - ): - # clip any large thresholds to input range + 1 - Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) - model.set_initializer(n.input[1], Tnew) - # use same datatype as inputs for thresholds - model.set_tensor_datatype(n.input[1], idtype) - graph_modified = True - return (model, graph_modified) + # Immediately exit here to propagate the data type changes + # before considering the next node + break + model = model.transform(InferDataTypes()) + return model, graph_modified diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 91c191962f..5eb72194ea 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -81,6 +81,7 @@ part_map = {**pynq_part_map, 
**alveo_part_map} part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" +part_map["V80"] = "xcv80-lsva4737-2MHP-e-s" def get_rtlsim_trace_depth(): @@ -292,10 +293,10 @@ def memutil(req_mem_spec, primitive_spec): def is_versal(fpgapart): """Returns whether board is part of the Versal family""" - return ( - fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] - or fpgapart[0:5] == "xqrvc" - ) + return fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] or fpgapart[0:5] in [ + "xqrvc", + "xcv80", + ] def get_dsp_block(fpgapart): diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index d6879a727b..3d059a6856 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -35,6 +35,7 @@ import os import torch from brevitas.export import export_qonnx +from brevitas.quant_tensor import _unpack_quant_tensor from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -90,7 +91,7 @@ def test_brevitas_debug(QONNX_FINN_conversion): else: assert len(names_common) == 8 for dbg_name in names_common: - tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() + tensor_pytorch = _unpack_quant_tensor(dbg_hook.values[dbg_name]).detach().numpy() tensor_finn = output_dict[dbg_name] assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all() os.remove(finn_onnx) diff --git a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py index 842d099f57..a7a73a5ed4 100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -45,8 +45,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_brevitas_fc_") - @pytest.mark.brevitas_export # act bits @@ -61,6 +59,7 @@ def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_brevitas_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 81c6316ec1..0d3418624a 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -94,6 +94,7 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import get_finn_root, make_build_dir, test_board_map from finn.util.pytorch import ToTensor from finn.util.test import ( @@ -672,6 +673,7 @@ def test_minimize_bit_width(self, topology, wbits, abits, board): model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model.save(curr_chkpt_name) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 01d995c147..4c52277970 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -353,6 +353,7 @@ def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + 
"/end2end_mobilenet_folded.onnx") model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(RoundAndClipThresholds()) model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index 338204c0c7..e5f9659665 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -70,7 +70,6 @@ def test_fifosizing_linear(method, topology): synth_clk_period_ns=10.0, board="Pynq-Z1", rtlsim_batch_size=100 if topology == "tfc" else 2, - shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 9c45b06f4a..26ce8f5f0e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -41,11 +41,13 @@ from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes -from qonnx.transformation.lower_convs_to_matmul import ( - LowerConvsToMatMul, - _auto_pad_to_explicit_padding, +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.basic import ( + auto_pad_to_explicit_padding, + gen_finn_dt_tensor, + get_by_name, + qonnx_make_model, ) -from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw @@ -69,11 +71,11 @@ def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, d group = ifm if depthwise else 1 group_str = str(group) ishp = (1, ifm, idim_h, idim_w) - pad_0 = _auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) + pad_0 = auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) int_dim_h = compute_conv_output_dim(idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]) int_dim_w = compute_conv_output_dim(idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]) - pad_1 = _auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) + pad_1 = auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) odim_h = compute_conv_output_dim(int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]) odim_w = compute_conv_output_dim(int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]) oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w) diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index fe7ba3d9fb..2079fe7fc5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -49,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -133,10 +134,8 @@ def 
make_single_multithresholding_modelwrapper( @pytest.mark.parametrize( "idt_tdt_cfg", [ - (DataType["INT8"], DataType["INT8"]), - (DataType["INT8"], DataType["INT9"]), - (DataType["UINT5"], DataType["UINT5"]), - (DataType["UINT5"], DataType["UINT6"]), + (DataType["INT8"], DataType["INT25"]), + (DataType["UINT5"], DataType["UINT8"]), ], ) @pytest.mark.parametrize("fold", [-1, 1, 2]) @@ -145,6 +144,7 @@ def make_single_multithresholding_modelwrapper( @pytest.mark.parametrize("impl_style", ["hls", "rtl"]) @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) +@pytest.mark.parametrize("round_thresh", [True, False]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -159,6 +159,7 @@ def test_fpgadataflow_thresholding( impl_style, exec_mode, mem_mode, + round_thresh, ): # the mem_mode parameter can only be used for the hls thresholding # so the test will only be executed once for impl_style=rtl and once skipped @@ -234,6 +235,8 @@ def test_fpgadataflow_thresholding( node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) inst.set_nodeattr("PE", pe) + if round_thresh is True: + model = model.transform(RoundAndClipThresholds()) model = model.transform(GiveUniqueNodeNames()) if impl_style == "hls": diff --git a/tests/transformation/streamline/test_round_thresholds.py b/tests/transformation/streamline/test_round_thresholds.py index 85c60b37d5..6de82e6750 100644 --- a/tests/transformation/streamline/test_round_thresholds.py +++ b/tests/transformation/streamline/test_round_thresholds.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,39 +33,238 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper -from qonnx.util.basic import qonnx_make_model +from qonnx.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe from finn.transformation.streamline import RoundAndClipThresholds +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. Without proper rounding, +# this tests only the clipping, range and type-casting behavior of the +# transformation. +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. 
However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) @pytest.mark.streamline -def test_round_thresholds(): - v = helper.make_tensor_value_info("v", TensorProto.FLOAT, [1, 4]) - thresholds = helper.make_tensor_value_info("thresholds", TensorProto.FLOAT, [4, 1]) - out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, 4]) - node_def = helper.make_node( - "MultiThreshold", ["v", "thresholds"], ["out"], domain="qonnx.custom_op.general" +def test_round_and_clip_thresholds_ints(i_dtype, o_dtype, n_elems): + i_dtype = DataType[i_dtype] + t_dtype = DataType["INT25"] # Note: Matches configuration above + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + node = helper.make_node( + "MultiThreshold", + domain="qonnx.custom_op.general", + inputs=["inp", "thresholds"], + outputs=["out"], + out_dtype=str(o_dtype), + out_bias=float(o_dtype.min()), ) - graph_def = helper.make_graph([node_def], "test_model", [v, thresholds], [out]) - model_def = qonnx_make_model(graph_def) - model = ModelWrapper(model_def) - threshold_val = np.asarray([[-1.1], [0.7], [2.3], [5.1]], dtype=np.float32) - model.set_initializer("thresholds", threshold_val) - model.set_tensor_datatype("v", DataType["INT8"]) - inp_dict_f = {"v": np.floor(threshold_val).T} - inp_dict_n = {"v": np.round(threshold_val).T} - inp_dict_c = {"v": np.ceil(threshold_val).T} - orig_f = oxe.execute_onnx(model, inp_dict_f)["out"] - orig_n = oxe.execute_onnx(model, inp_dict_n)["out"] - orig_c = oxe.execute_onnx(model, inp_dict_c)["out"] - assert model.get_tensor_datatype("thresholds") == DataType["FLOAT32"] - new_model = model.transform(RoundAndClipThresholds()) - # rounded up thresholds should have same dtype as input - assert new_model.get_tensor_datatype("thresholds") == DataType["INT8"] - new_f = oxe.execute_onnx(new_model, inp_dict_f)["out"] - new_n = oxe.execute_onnx(new_model, inp_dict_n)["out"] - new_c = oxe.execute_onnx(new_model, inp_dict_c)["out"] - assert np.isclose(orig_f, new_f, atol=1e-3).all() - assert np.isclose(orig_n, new_n, atol=1e-3).all() - assert np.isclose(orig_c, new_c, atol=1e-3).all() + n_thresholds = o_dtype.get_num_possible_values() - 1 + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + model = ModelWrapper(helper.make_model(graph)) + + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + inp[0][0] = i_dtype.max() + thresholds = np.sort(gen_finn_dt_tensor(t_dtype, [n_elems, n_thresholds])) + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + 
model.set_tensor_datatype("out", o_dtype) + model.set_initializer("thresholds", thresholds) + + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + assert model.get_tensor_datatype("thresholds") == t_dtype + + model = model.transform(RoundAndClipThresholds()) + + # After this transformation, the thresholds and output data type should be + # inferred correctly + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt + assert model.get_tensor_datatype("out") == o_dtype + + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. + assert model.get_initializer("thresholds").dtype == np.float32 + + # After rounding, all thresholds must be integers represented as float32 + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + + # Execute the model after running the RoundAndClipThresholds transformation + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] + + assert np.all(out_produced == out_expected) + + +# Tests the RoundAndClipThresholds transformation under various input, output +# data type combinations with purely integer inputs. This test case tests actual +# rounding of floating-point thresholds. +@pytest.mark.parametrize( + "i_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Below 24-bit thresholds we will not observe any interesting rounding + # behavior, as all integers < 2^24 can be exactly represented in 32-bit + # floating-point. Thus, we test thresholds at 25-bit signed integers and + # generate test inputs slightly above and below this. + # 2. We want to test out-of-range clipping of thresholds, in particular + # clipping of the negative portion of signed thresholds. Thus, we only + # generate signed thresholds, but test with signed and unsigned + # inputs of smaller, larger and equal range. + # 3. Testing proper floating-point thresholds requires a separate test-case + "INT23", + "UINT23", + "INT24", + "UINT24", + "INT25", + "UINT25", + "INT26", + "UINT26", + ], +) +@pytest.mark.parametrize( + "o_dtype", + [ + # Explanation for selecting these test configurations: + # 1. Outputs of MultiThreshold are typically much smaller bit-width than the + # inputs and thresholds. + # 2. However, with randomly samples thresholds from a rather large range due + # to the selected input bit-widths (see above), we risk not adequately + # covering the input range if we sample too few thresholds. The number of + # thresholds sampled depends on the bit-width of the output, thus we use + # rather high bit-width for testing. + # 3. For a "real" model, the quantization procedure *should* take care of + # adequately covering the true input range. + "INT8", + "UINT8", + ], +) +@pytest.mark.parametrize( + "n_elems", + [ + # Explanation for selecting these test configurations: + # 1. Small edge cases and quickly running through tests: 1, 2, 3, 4 + # 2. 
Large test case 256, hopefully amplifying any rarely occurring errors + 1, + 2, + 3, + 4, + 256, + ], +) +@pytest.mark.streamline +def test_round_and_clip_thresholds_floats(i_dtype, o_dtype, n_elems): + i_dtype = DataType[i_dtype] + t_dtype = DataType["FLOAT32"] + o_dtype = DataType[o_dtype] # noqa: Duplicate model setup code + node = helper.make_node( + "MultiThreshold", + domain="qonnx.custom_op.general", + inputs=["inp", "thresholds"], + outputs=["out"], + out_dtype=str(o_dtype), + ) + n_thresholds = o_dtype.get_num_possible_values() - 1 + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, n_elems]) + out = helper.make_tensor_value_info("out", TensorProto.FLOAT, [1, n_elems]) + thresholds = helper.make_tensor_value_info( + "thresholds", TensorProto.FLOAT, [n_elems, n_thresholds] + ) + graph = helper.make_graph([node], "thresholds", [inp, thresholds], [out]) + model = ModelWrapper(helper.make_model(graph)) + + inp = gen_finn_dt_tensor(i_dtype, [1, n_elems]) + # Draw uniformly random prototype thresholds in [0,+1] range + thresholds = np.random.rand(n_elems, n_thresholds) + # Type alias to 25-bit signed integer type used to set the range of the + # thresholds + INT25 = DataType["INT25"] # noqa: Variable name not lowercase + # Map the prototype thresholds into the test integer range and sort + thresholds = np.sort((INT25.max() - INT25.min()) * thresholds + INT25.min()) + # Set data type annotations for the input and thresholds tensor + model.set_tensor_datatype("inp", i_dtype) # noqa: Duplicate model execution + model.set_tensor_datatype("thresholds", t_dtype) + model.set_tensor_datatype("out", o_dtype) + model.set_initializer("thresholds", thresholds) + + # Execute the model before running the RoundAndClipThresholds transformation + out_expected = oxe.execute_onnx(model, {"inp": inp})["out"] + # Before rounding the threshold data type must be as annotated + assert model.get_tensor_datatype("thresholds") == t_dtype + + model = model.transform(RoundAndClipThresholds()) + + if not i_dtype.signed(): + new_tdt = DataType.get_smallest_possible(i_dtype.max() + 1) + else: + new_tdt = DataType.get_smallest_possible(-(i_dtype.max() + 1) - 1) + assert model.get_tensor_datatype("thresholds") == new_tdt + assert model.get_tensor_datatype("out") == o_dtype + + # After this transformation, the container type used to store the thresholds + # values must be float32. No other type-cast or type promotion may happen. 
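The datatype assertions in these tests encode a simple widening rule: because `RoundAndClipThresholds` clips thresholds into `[dtype.min(), dtype.max() + 1]` of the input range, the resulting thresholds need one more bit of range than the input datatype. A standalone sketch of that rule, using the same qonnx `DataType` calls as the transformation (illustration only, not part of this patch):

```python
from qonnx.core.datatype import DataType


def expected_threshold_dtype(idt):
    # Thresholds are clipped to [idt.min(), idt.max() + 1], i.e. they need one
    # extra bit of range compared to the input datatype itself.
    max_val = idt.max() + 1
    if not idt.signed():
        return DataType.get_smallest_possible(max_val)
    return DataType.get_smallest_possible(-max_val - 1)


# Should widen by one bit, e.g. UINT8 -> UINT9 and INT8 -> INT9, matching the
# new_tdt computed in the tests above.
print(expected_threshold_dtype(DataType["UINT8"]))
print(expected_threshold_dtype(DataType["INT8"]))
```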
+ assert model.get_initializer("thresholds").dtype == np.float32 + # After rounding, all thresholds must be integers represented as float32 + assert all(x.is_integer() for x in model.get_initializer("thresholds").flatten()) + + out_produced = oxe.execute_onnx(model, {"inp": inp})["out"] + + assert np.allclose(out_produced, out_expected, atol=1.0e-3) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 8a91a49278..9e206c843a 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -50,8 +50,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_cnv_") - @pytest.mark.streamline # act bits @@ -64,6 +62,7 @@ def test_streamline_cnv(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_cnv_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index edc4a96fe2..9ce2f2ab65 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -52,8 +52,6 @@ from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained -export_onnx_path = make_build_dir("test_streamline_fc_") - @pytest.mark.streamline # act bits @@ -68,6 +66,7 @@ def test_streamline_fc(size, wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") nname = "%s_%dW%dA" % (size, wbits, abits) + export_onnx_path = make_build_dir("test_streamline_fc_") finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx) diff --git a/tutorials/fpga_flow/README.md b/tutorials/fpga_flow/README.md index 2aaad0423b..71f2a2a625 100644 --- a/tutorials/fpga_flow/README.md +++ b/tutorials/fpga_flow/README.md @@ -25,20 +25,29 @@ This demo was created using Vivado 2022.1. Prior to running, insure the following prerequisites have been met: - Install FINN and prerequisites. The [Getting Started](https://finn.readthedocs.io/en/latest/getting_started.html#quickstart) section of the FINN documentation might be helpful for this. - Ensure you have the `FINN_XILINX_PATH` and `FINN_XILINX_VERSION` env variables set appropriately for your install. 
For example: -> export FINN_XILINX_PATH=/opt/Xilinx -> export FINN_XILINX_VERSION=2022.1 +```shell +export FINN_XILINX_PATH=/opt/Xilinx +export FINN_XILINX_VERSION=2022.1 +``` + - Set the env variable for your `finn` install top directory (where you cloned the FINN compiler repo): -> export FINN_ROOT=/home/foo/finn +```shell +export FINN_ROOT=/home/foo/finn +``` Then, change to `finn` install directory and invoke the build as follows: -> cd ${FINN_ROOT} -> ./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +```shell +cd ${FINN_ROOT} +./run-docker.sh build_custom ${FINN_ROOT}/tutorials/fpga_flow/ +``` Alternatively, since the tutorials folder is already part of the FINN compiler installation, you can invoke it from within the Docker container: -> cd ${FINN_ROOT} -> ./run-docker.sh -> cd tutorials/fpga_flow -> python build.py +```shell +cd ${FINN_ROOT} +./run-docker.sh +cd tutorials/fpga_flow +python build.py +``` The build should finish in about 10 minutes, and the FINN docker will close on success. @@ -59,12 +68,14 @@ The build should finish in about 10 minutes, and the FINN docker will close on s ### Examine the Stitched IP Navigate to the stitched IP project directory: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/stitched_ip +``` And, open the project: - -> vivado finn_vivado_stitch_proj.xpr +```shell +vivado finn_vivado_stitch_proj.xpr +``` Explore the IPI board design and note the interfaces. @@ -89,9 +100,10 @@ them under `${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim`. Let's ex the FINN compiler. Used for launching the testbench simulation. You can now launch the simulation as follows: - -> cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim -> vivado -mode gui -source make_sim_proj.tcl +```shell +cd ${FINN_ROOT}/tutorials/fpga_flow/output_tfc_w0a1_fpga/sim +vivado -mode gui -source make_sim_proj.tcl +``` The simulation should complete with: diff --git a/tutorials/fpga_flow/folding_config.json b/tutorials/fpga_flow/folding_config.json index 642200d02b..bf94f8058d 100644 --- a/tutorials/fpga_flow/folding_config.json +++ b/tutorials/fpga_flow/folding_config.json @@ -1,30 +1,29 @@ { "Defaults": {}, - "Thresholding_Batch_0": { - "PE": 49, - "ram_style": "block" + "Thresholding_rtl_0": { + "PE": 49 }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } }
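Note that the folding configuration keys now use the specialized node names produced by `SpecializeLayers` (e.g. `Thresholding_rtl_0`, `MVAU_hls_0`, `LabelSelect_hls_0`) instead of the old `Thresholding_Batch_*` / `MatrixVectorActivation_*` names. For orientation, a minimal sketch of how such a file is typically consumed by the dataflow builder; this is not the tutorial's actual `build.py`, and the model filename and FPGA part below are placeholders:

```python
import finn.builder.build_dataflow as build
import finn.builder.build_dataflow_config as build_cfg

cfg = build_cfg.DataflowBuildConfig(
    output_dir="output_tfc_w0a1_fpga",
    synth_clk_period_ns=10.0,
    fpga_part="xc7z020clg400-1",  # placeholder part, pick the one matching your board
    folding_config_file="folding_config.json",  # per-node PE/SIMD/ram_style settings
    generate_outputs=[
        build_cfg.DataflowOutputType.ESTIMATE_REPORTS,
        build_cfg.DataflowOutputType.STITCHED_IP,
    ],
)
build.build_dataflow_cfg("tfc_w0a1.onnx", cfg)  # placeholder model filename
```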