Skip to content

Commit

Permalink
Merge remote-tracking branch 'xilinx/dev' into feature/generalized_multi_threshold_layouts
Browse files Browse the repository at this point in the history
  • Loading branch information
iksnagreb committed Jan 20, 2025
2 parents 4a69267 + 88e207e commit a8bcfcb
Show file tree
Hide file tree
Showing 40 changed files with 1,289 additions and 367 deletions.
2 changes: 1 addition & 1 deletion docs/finn/faq.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Which data layout do FINN-generated accelerators use? Big-endian? Little-endian?
If you need to do this manually, first examine how the `FINN PYNQ Python drivers <https://github.com/Xilinx/finn-examples/blob/main/finn_examples/driver.py#L379>`_ do this – notice how the input data is
first reshaped to create the “folded input shape” that reflects the word size of the first layer based on how much it
was parallelized, then data packing is applied to obtain a raw byte array (with some reversals going on) that can be
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn-base/blob/dev/src/finn/util/data_packing.py#L289>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into pyverilator for rtl simulation.
fed directly to the hardware. Another example of this is the `npy_to_rtlsim_input <https://github.com/Xilinx/finn/blob/dev/src/finn/util/data_packing.py#L284>`_ function, which converts npy arrays to lists of Python arbitrary-precision integers that we feed into PyVerilator for RTL simulation.

Why does FIFO sizing take so long for my network? Is something wrong?
The automatic FIFO sizing in FINN can take quite long. It unfortunately doesn’t really parallelize on multiple cores since
Expand Down
4 changes: 2 additions & 2 deletions fetch-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
QONNX_COMMIT="2281a777d84aa5cbd7469085c2e534fb4a03ccf9"
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
Expand Down
144 changes: 72 additions & 72 deletions finn-rtllib/fifo/hdl/Q_srl.v
Original file line number Diff line number Diff line change
Expand Up @@ -184,58 +184,58 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
end // always @ (posedge clock or negedge reset)

always @* begin // - combi always
srlo_ <= 'bx;
shift_en_o_ <= 1'bx;
shift_en_ <= 1'bx;
addr_ <= 'bx;
state_ <= 2'bx;
srlo_ = 'bx;
shift_en_o_ = 1'bx;
shift_en_ = 1'bx;
addr_ = 'bx;
state_ = 2'bx;
case (state)

state_empty: begin // - (empty, will not produce)
if (i_v) begin // - empty & i_v => consume
srlo_ <= i_d;
shift_en_o_ <= 1;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_one;
srlo_ = i_d;
shift_en_o_ = 1;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_one;
end
else begin // - empty & !i_v => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_empty;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_empty;
end
end

state_one: begin // - (contains one)
if (i_v && o_b) begin // - one & i_v & o_b => consume
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1;
addr_ <= 0;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1;
addr_ = 0;
state_ = state_more;
end
else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod
srlo_ <= i_d;
shift_en_o_ <= 1;
shift_en_ <= 1;
addr_ <= 0;
state_ <= state_one;
srlo_ = i_d;
shift_en_o_ = 1;
shift_en_ = 1;
addr_ = 0;
state_ = state_one;
end
else if (!i_v && o_b) begin // - one & !i_v & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_one;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_one;
end
else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1'bx;
addr_ <= 0;
state_ <= state_empty;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1'bx;
addr_ = 0;
state_ = state_empty;
end
end // case: state_one

Expand All @@ -244,60 +244,60 @@ module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
// - (full, will not consume)
// - (full here if depth==2)
if (o_b) begin // - full & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 0;
addr_ <= addr;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 0;
addr_ = addr;
state_ = state_more;
end
else begin // - full & !o_b => produce
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 0;
// addr_ <= addr-1;
// state_ <= state_more;
addr_ <= addr_zero_ ? 0 : addr-1;
state_ <= addr_zero_ ? state_one : state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 0;
// addr_ = addr-1;
// state_ = state_more;
addr_ = addr_zero_ ? 0 : addr-1;
state_ = addr_zero_ ? state_one : state_more;
end
end
else begin // - (mid: neither empty nor full)
if (i_v && o_b) begin // - mid & i_v & o_b => consume
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 1;
addr_ <= addr+1;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 1;
addr_ = addr+1;
state_ = state_more;
end
else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 1;
addr_ <= addr;
state_ <= state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 1;
addr_ = addr;
state_ = state_more;
end
else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle
srlo_ <= 'bx;
shift_en_o_ <= 0;
shift_en_ <= 0;
addr_ <= addr;
state_ <= state_more;
srlo_ = 'bx;
shift_en_o_ = 0;
shift_en_ = 0;
addr_ = addr;
state_ = state_more;
end
else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce
srlo_ <= srl[addr];
shift_en_o_ <= 1;
shift_en_ <= 0;
addr_ <= addr_zero_ ? 0 : addr-1;
state_ <= addr_zero_ ? state_one : state_more;
srlo_ = srl[addr];
shift_en_o_ = 1;
shift_en_ = 0;
addr_ = addr_zero_ ? 0 : addr-1;
state_ = addr_zero_ ? state_one : state_more;
end
end // else: !if(addr_full)
end // case: state_more

default: begin
srlo_ <= 'bx;
shift_en_o_ <= 1'bx;
shift_en_ <= 1'bx;
addr_ <= 'bx;
state_ <= 2'bx;
srlo_ = 'bx;
shift_en_o_ = 1'bx;
shift_en_ = 1'bx;
addr_ = 'bx;
state_ = 2'bx;
end // case: default

endcase // case(state)
Expand Down
12 changes: 8 additions & 4 deletions finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ module mvu_8sx8u_dsp48 #(
return res;
endfunction : init_leave_loads

function int unsigned sum_width(input int unsigned n, input int unsigned w);
return w <= 16? $clog2(1 + n*(2**w - 1)) : w + $clog2(n);
endfunction : sum_width

// Pipeline for last indicator flag
logic [1:5] L = '0;
always_ff @(posedge clk) begin
Expand Down Expand Up @@ -445,7 +449,7 @@ module mvu_8sx8u_dsp48 #(
// Stage #4: Cross-SIMD Reduction

// Count leaves reachable from each node
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0 }; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop

// Range of Cross-lane Contribution Tracked in Hi4
/*
Expand All @@ -462,7 +466,7 @@ module mvu_8sx8u_dsp48 #(
* signed value is determined by its lower bound to be at least:
* 1 + $clog2(2^(w-1)+SIMD)
*/
localparam int unsigned HI_WIDTH = 1 + $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD);
localparam int unsigned HI_WIDTH = 1 + ($clog2(SIMD) < ACCU_WIDTH-D[1]? ACCU_WIDTH-D[1] : $clog2(2**(ACCU_WIDTH-D[1]-1)+SIMD));

uwire signed [ACCU_WIDTH -1:0] up4;
uwire signed [HI_WIDTH -1:0] hi4;
Expand Down Expand Up @@ -504,12 +508,12 @@ module mvu_8sx8u_dsp48 #(
// Conclusive low part accumulation
if(i >= PE_REM) begin : blkLo
// Adder Tree across all SIMD low contributions (all unsigned arithmetic)
localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
localparam int unsigned ROOT_WIDTH = sum_width(SIMD, LO_WIDTH);
uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
for(genvar n = 0; n < SIMD-1; n++) begin
// Sum truncated to actual maximum bit width at this node
localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
localparam int unsigned NODE_WIDTH = sum_width(LEAVE_LOAD[n], LO_WIDTH);
uwire [NODE_WIDTH-1:0] s = tree[2*n+1] + tree[2*n+2];
assign tree[n] = s;
end
Expand Down
Loading

0 comments on commit a8bcfcb

Please sign in to comment.