diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index 5126ed3ff4..bd951f7857 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -65,12 +65,18 @@ RUN apt-get update && \ python-is-python3 \ python3-pip \ python3-setuptools-scm \ - python3-venv + python3-venv \ + pybind11-dev \ + libfmt-dev \ + libboost-dev \ + libjansson-dev \ + libgetdata-dev \ + libtinfo5 RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN locale-gen "en_US.UTF-8" # install Verilator from source to get the right version -RUN apt-get install -y git perl make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev +RUN apt-get install -y git perl make autoconf g++-10 flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev RUN git clone https://github.com/verilator/verilator RUN cd verilator && \ git checkout v4.224 && \ diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index c7500bcaa6..af6b716cd7 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -105,6 +105,22 @@ else fi fi +if [ -z "${XILINX_VIVADO}" ]; then + yecho "pyxsi will be unavailable since Vivado was not found" +else + if [ -f "${FINN_ROOT}/deps/pyxsi/pyxsi.so" ]; then + gecho "Found pyxsi at ${FINN_ROOT}/deps/pyxsi/pyxsi.so" + else + OLDPWD=$(pwd) + cd ${FINN_ROOT}/deps/pyxsi + touch .dockerenv + make + cd $OLDPWD + fi + export PYTHONPATH=$PYTHONPATH:${FINN_ROOT}/deps/pyxsi:${FINN_ROOT}/deps/pyxsi/py + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lib/x86_64-linux-gnu/:${XILINX_VIVADO}/lib/lnx64.o +fi + if [ -f "$HLS_PATH/settings64.sh" ];then # source Vitis HLS env.vars source $HLS_PATH/settings64.sh @@ -129,6 +145,7 @@ if [ -d "$FINN_ROOT/.Xilinx" ]; then mkdir "$HOME/.Xilinx/Vivado/" cp "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" "$HOME/.Xilinx/Vivado/" gecho "Found Vivado_init.tcl and copied to $HOME/.Xilinx/Vivado/Vivado_init.tcl" + else yecho "Unable to find 
$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" fi diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile index 6d51fffd64..0b869b80a5 100644 --- a/docker/jenkins/Jenkinsfile +++ b/docker/jenkins/Jenkinsfile @@ -313,7 +313,7 @@ void createMultiMarkerScript(String markers, String testResultsFilename, String // Passing multiple markers when running ./run-docker.sh does not work with bash. // Therefore, create a script to maintain the single quotes that surround the markers sh """echo "#!/bin/bash -python -m pytest -m \'${markers}\' --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}" >> run-tests.sh +python -m pytest -m \'${markers}\' --forked --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}" >> run-tests.sh """ // Give permissions to script @@ -321,7 +321,7 @@ python -m pytest -m \'${markers}\' --junitxml=${testResultsFilename}.xml --html= } void runDockerPytestWithMarker(String marker, String testResultsFilename, String additionalOptions) { - sh """./run-docker.sh python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}""" + sh """./run-docker.sh python -m pytest -m ${marker} --forked --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}""" } def findBoardBuildFiles(String searchDir, String dirToFind) { diff --git a/fetch-repos.sh b/fetch-repos.sh index a4fc124fa4..0cd4cd6eb4 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -39,6 +39,7 @@ XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696" KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79" EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a" +PYXSI_COMMIT="4f4ec10a3631c4c44b5bc0ede698d41c924d2b86" 
QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" @@ -51,6 +52,7 @@ AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" RFSOC4x2_BDF_URL="https://github.com/RealDigitalOrg/RFSoC4x2-BSP.git" KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" +PYXSI_URL="https://github.com/maltanar/pyxsi.git" QONNX_DIR="qonnx" FINN_EXP_DIR="finn-experimental" @@ -63,6 +65,7 @@ AVNET_BDF_DIR="avnet-bdf" XIL_BDF_DIR="xil-bdf" RFSOC4x2_BDF_DIR="rfsoc4x2-bdf" KV260_SOM_BDF_DIR="kv260-som-bdf" +PYXSI_DIR="pyxsi" # absolute path to this script, e.g. /home/user/bin/foo.sh SCRIPT=$(readlink -f "$0") @@ -126,6 +129,7 @@ fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR fetch_repo $RFSOC4x2_BDF_URL $RFSOC4x2_BDF_COMMIT $RFSOC4x2_BDF_DIR fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR +fetch_repo $PYXSI_URL $PYXSI_COMMIT $PYXSI_DIR # Can skip downloading of board files entirely if desired if [ "$FINN_SKIP_BOARD_FILES" = "1" ]; then diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v index 22dc6bd8cd..bb657a7478 100644 --- a/finn-rtllib/swg/swg_template_wrapper.v +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -71,4 +71,8 @@ $TOP_MODULE_NAME$_impl #( .out_V_V_TREADY(out_V_TREADY) ); +if (OUT_WIDTH_PADDED > BUF_OUT_WIDTH) begin + assign out_V_TDATA[OUT_WIDTH_PADDED-1:BUF_OUT_WIDTH] = {(OUT_WIDTH_PADDED-BUF_OUT_WIDTH){1'b0}}; +end + endmodule : $TOP_MODULE_NAME$ diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v index 158f3132e3..7e49d3eafb 100644 --- a/finn-rtllib/swg/swg_template_wrapper_dynamic.v +++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v @@ -180,4 +180,8 @@ $TOP_MODULE_NAME$_impl #( .cfg_last_write(cfg_last_write) ); +if (OUT_WIDTH_PADDED > BUF_OUT_WIDTH) begin + assign 
out_V_TDATA[OUT_WIDTH_PADDED-1:BUF_OUT_WIDTH] = {(OUT_WIDTH_PADDED-BUF_OUT_WIDTH){1'b0}}; +end + endmodule : $TOP_MODULE_NAME$ diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv index 39756e5c2b..04c13424c9 100644 --- a/finn-rtllib/thresholding/hdl/thresholding_axi.sv +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -191,7 +191,10 @@ module thresholding_axi #( .cfg_rack, .cfg_q, .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat, - .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata[PE*O_BITS-1:0]) ); + if($bits(m_axis_tdata) > PE*O_BITS) begin : genPadOut + assign m_axis_tdata[$left(m_axis_tdata):PE*O_BITS] = '0; + end : genPadOut endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv index cfd875f5c4..1a2b8402a0 100644 --- a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -232,7 +232,7 @@ module thresholding_axi_tb #( end join_any done <= 1; - repeat(N+6) @(posedge clk); + repeat(2*N+8) @(posedge clk); assert(QW.size() == 0) else begin $error("Missing %0d outputs.", QW.size()); diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index aacd12ef05..e914781b21 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -404,6 +404,7 @@ "child_model = child_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))\n", "child_model = child_model.transform(PrepareRTLSim())\n", "child_model.set_metadata_prop(\"exec_mode\",\"rtlsim\")\n", + "child_model.set_metadata_prop(\"rtlsim_backend\",\"pyxsi\")\n", "child_model.save(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\");" ] }, 
diff --git a/requirements.txt b/requirements.txt index 1683695576..29d9e45b66 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,6 +12,7 @@ pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 pyscaffold==4.4 +pytest-forked==1.6.0 scipy==1.10.1 setupext-janitor>=1.1.2 sigtools==4.0.1 diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py index d6437a2e5c..ad572e5221 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -281,6 +281,14 @@ class DataflowBuildConfig: #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO + #: Enable input throttling for simulation-based FIFO sizing + #: Only relevant if auto_fifo_strategy = LARGEFIFO_RTLSIM + fifosim_input_throttle: Optional[bool] = True + + #: Enable saving waveforms from simulation-based FIFO sizing + #: Only relevant if auto_fifo_strategy = LARGEFIFO_RTLSIM + fifosim_save_waveform: Optional[bool] = False + #: Target clock frequency (in nanoseconds) for Vitis HLS synthesis. #: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock. 
#: If not specified it will default to synth_clk_period_ns diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index ab2280554c..edac225cbb 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -109,6 +109,7 @@ InsertAndSetFIFODepths, RemoveShallowFIFOs, SplitLargeFIFOs, + xsi_fifosim, ) from finn.transformation.fpgadataflow.set_folding import SetFolding from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers @@ -126,7 +127,6 @@ get_rtlsim_trace_depth, pyverilate_get_liveness_threshold_cycles, ) -from finn.util.pyverilator import verilator_fifosim from finn.util.test import execute_parent @@ -250,6 +250,8 @@ def prepare_for_stitched_ip_rtlsim(verify_model, cfg): # set top-level prop for stitched-ip rtlsim and launch verify_model.set_metadata_prop("exec_mode", "rtlsim") # TODO make configurable + verify_model.set_metadata_prop("rtlsim_backend", "pyxsi") + # TODO make configurable # verify_model.set_metadata_prop("rtlsim_trace", "trace.vcd") return verify_model @@ -583,6 +585,8 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "Multi-in/out streams currently not supported " + "in FINN C++ verilator driver, falling back to Python" ) + if cfg.fifosim_save_waveform: + model.set_metadata_prop("rtlsim_trace", "fifosim_trace.wdb") model = model.transform( InsertAndSetFIFODepths( cfg._resolve_fpga_part(), @@ -590,8 +594,12 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): swg_exception=cfg.default_swg_exception, vivado_ram_style=cfg.large_fifo_mem_style, force_python_sim=force_python_sim, + fifosim_input_throttle=cfg.fifosim_input_throttle, ) ) + if cfg.fifosim_save_waveform: + # un-set rtlsim_trace to remove unwanted traces in later steps + model.set_metadata_prop("rtlsim_trace", "") # InsertAndSetFIFODepths internally removes any shallow FIFOs # so no need to call RemoveShallowFIFOs here else: @@ -719,7 
+727,7 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"] else: - rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) + rtlsim_perf_dict = xsi_fifosim(model, rtlsim_bs) # keep keys consistent between the Python and C++-styles cycles = rtlsim_perf_dict["cycles"] clk_ns = float(model.get_metadata_prop("clk_ns")) diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py index 588e97e9e4..7c0d69e17a 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -52,44 +52,38 @@ def execute_onnx(model, input_dict, return_full_exec_context=False, start_node=N model_exec_mode = model.get_metadata_prop("exec_mode") if (model_exec_mode is None) or (model_exec_mode == ""): return execute_onnx_base(model, input_dict, return_full_exec_context, start_node, end_node) + elif model_exec_mode == "rtlsim": + # check sanity of model and then use stitched IP for rtlsim + if not model.check_all_tensor_shapes_specified(): + raise Exception("Found unspecified tensor shapes, try infer_shapes") + ret = model.analysis(ta.nodes_topologically_sorted) + assert ( + ret["nodes_topologically_sorted"] is True + ), """Nodes must be + topologically sorted.""" - if not model.check_all_tensor_shapes_specified(): - raise Exception("Found unspecified tensor shapes, try infer_shapes") - ret = model.analysis(ta.nodes_topologically_sorted) - assert ( - ret["nodes_topologically_sorted"] is True - ), """Nodes must be - topologically sorted.""" - - graph = model.graph - # first, we need to make sure that every variable required by the graph has - # some buffer associated with it. 
this includes graph inputs (which includes - # the input data as well as the trained parameters) and the graph ValueInfo - # (intermediate tensors between layers) - # this is provided by the execution_context, which is a dict of np.ndarray - execution_context = model.make_empty_exec_context() - # fill in any inputs provided to this function - for inp_name in input_dict.keys(): - if inp_name in execution_context: - if execution_context[inp_name].shape == input_dict[inp_name].shape: - execution_context[inp_name] = input_dict[inp_name] - else: - raise Exception( - "Shape mismatch for provided input %s: found %s expected %s " - % ( - inp_name, - str(execution_context[inp_name].shape), - str(input_dict[inp_name].shape), + graph = model.graph + # first, we need to make sure that every variable required by the graph has + # some buffer associated with it. this includes graph inputs (which includes + # the input data as well as the trained parameters) and the graph ValueInfo + # (intermediate tensors between layers) + # this is provided by the execution_context, which is a dict of np.ndarray + execution_context = model.make_empty_exec_context() + # fill in any inputs provided to this function + for inp_name in input_dict.keys(): + if inp_name in execution_context: + if execution_context[inp_name].shape == input_dict[inp_name].shape: + execution_context[inp_name] = input_dict[inp_name] + else: + raise Exception( + "Shape mismatch for provided input %s: found %s expected %s " + % ( + inp_name, + str(execution_context[inp_name].shape), + str(input_dict[inp_name].shape), + ) ) - ) - # check if model has an execution mode set - # if None, execute model node by node using execute_node() - # if set to "rtlsim" execute model using pyverilator - model_exec_mode = model.get_metadata_prop("exec_mode") - if (model_exec_mode is None) or (model_exec_mode == ""): - return execute_onnx_base() - elif model_exec_mode == "rtlsim": # use stitched IP for rtlsim rtlsim_exec(model, 
execution_context) else: diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index d45c972928..920b29aa80 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -26,11 +26,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np import os from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io from qonnx.custom_op.registry import getCustomOp -from finn.util.basic import pyverilate_get_liveness_threshold_cycles +from finn.util.basic import ( + get_finn_root, + get_vivado_root, + launch_process_helper, + make_build_dir, + pyverilate_get_liveness_threshold_cycles, +) from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy from finn.util.pyverilator import pyverilate_stitched_ip @@ -39,35 +46,13 @@ except ModuleNotFoundError: PyVerilator = None +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None -def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None): - """Use PyVerilator to execute given model with stitched IP. The execution - context contains the input values. 
Hook functions can be optionally - specified to observe/alter the state of the circuit, receiving the - PyVerilator sim object as their first argument: - - pre_hook : hook function to be called before sim start (after reset) - - post_hook : hook function to be called after sim end - """ - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - # ensure stitched ip project already exists - assert os.path.isfile( - model.get_metadata_prop("wrapper_filename") - ), """The - file name from metadata property "wrapper_filename" doesn't exist.""" - assert os.path.isdir( - model.get_metadata_prop("vivado_stitch_proj") - ), """The - directory from metadata property "vivado_stitch_proj" doesn't exist""" - trace_file = model.get_metadata_prop("rtlsim_trace") - if trace_file is None: - trace_file = "" - extra_verilator_args = model.get_metadata_prop("extra_verilator_args") - if extra_verilator_args is None: - extra_verilator_args = [] - else: - extra_verilator_args = eval(extra_verilator_args) +def prep_rtlsim_io_dict(model, execution_context): # extract i/o info to prepare io_dict io_dict = {"inputs": {}, "outputs": {}} if_dict = eval(model.get_metadata_prop("vivado_stitch_ifnames")) @@ -123,6 +108,301 @@ def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None): o_stream_w = last_node.get_outstream_width() o_tensor_info.append((o_stream_w, o_dt, o_folded_shape, o_shape)) num_out_values += batchsize * last_node.get_number_output_values() + return io_dict, if_dict, num_out_values, o_tensor_info + + +def file_to_basename(x): + return os.path.basename(os.path.realpath(x)) + + +def rtlsim_exec_cppxsi( + model, + execution_context, + dummy_data_mode=False, + postproc_cpp="", + timeout_cycles=None, + throttle_cycles=0, +): + """Use XSI C++ rtl simulation to execute given model with stitched IP. + The dummy_data_mode flag controls whether the simulation is driven by + dummy data or real data. 
The execution_context parameter must be formatted + according to whether dummy or real data is used. + Example with dummy_data = True: + execution_context = { + "inputs" : {"" : }, + "outputs" : {"" : }, + } + Example with dummy_data = False: + execution_context = { + "" : + } + + The postproc_cpp optional argument can be used to inject C++ code to retrieve + extra data when the simulation is finished. See the @POSTPROC_CPP@ template argument + in the xsi_simdriver.cpp file to see what context and functions are available. + If timeout_cycles is not None, the default value from pyverilate_get_liveness_threshold_cycles + will be used. + throttle_cycles will be used to pause the input stream every time an input frame is finished. + """ + # TODO: support running functional rtlsim with real I/O data + # TODO: support running with multiple inputs/outputs + # TODO: rename utility fxn to remove "pyverilate", used for other backends too + if timeout_cycles is None: + timeout_cycles = pyverilate_get_liveness_threshold_cycles() + + assert dummy_data_mode, "Only dummy_data_mode=True is supported for now" + + # ensure stitched ip project already exists + assert os.path.isfile( + model.get_metadata_prop("wrapper_filename") + ), """The + file name from metadata property "wrapper_filename" doesn't exist.""" + assert os.path.isdir( + model.get_metadata_prop("vivado_stitch_proj") + ), """The + directory from metadata property "vivado_stitch_proj" doesn't exist""" + trace_file = model.get_metadata_prop("rtlsim_trace") + if not dummy_data_mode: + io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict( + model, execution_context + ) + + # prepare rtlsim compiled object (unless it already exists) + rtlsim_so = model.get_metadata_prop("rtlsim_so") + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)): + vivado_stitch_proj_dir = 
model.get_metadata_prop("vivado_stitch_proj") + with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: + all_verilog_srcs = f.read().split() + single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_") + + rtlsim_so = pyxsi_utils.compile_sim_obj(top_module_name, all_verilog_srcs, single_src_dir) + # save generated lib filename in attribute + model.set_metadata_prop("rtlsim_so", rtlsim_so[0] + "/" + rtlsim_so[1]) + sim_base, sim_rel = rtlsim_so + # pass in correct tracefile from attribute + if trace_file == "default": + trace_file = top_module_file_name + ".wdb" + else: + sim_base, sim_rel = rtlsim_so.split("xsim.dir") + sim_rel = "xsim.dir" + sim_rel + # prepare the C++ sim driver template + fifosim_cpp_fname = get_finn_root() + "/src/finn/qnn-data/cpp/xsi_simdriver.cpp" + with open(fifosim_cpp_fname, "r") as f: + fifosim_cpp_template = f.read() + + instream_iters = [] + outstream_iters = [] + for top_inp in model.graph.input: + iname = top_inp.name + first_node = model.find_consumer(iname) + assert first_node is not None, "Failed to find consumer for " + iname + fnode_inst = getCustomOp(first_node) + top_ind = list(first_node.input).index(iname) + ishape_folded = fnode_inst.get_folded_input_shape(ind=top_ind) + instream_iters.append(np.prod(ishape_folded[:-1])) + for top_out in model.graph.output: + oname = top_out.name + last_node = model.find_producer(oname) + assert last_node is not None, "Failed to find producer for " + oname + lnode_inst = getCustomOp(last_node) + top_ind = list(last_node.output).index(oname) + oshape_folded = lnode_inst.get_folded_output_shape(ind=top_ind) + outstream_iters.append(np.prod(oshape_folded[:-1])) + + # retrieve the number of inputs from execution_context + n_inferences = execution_context[model.graph.input[0].name] + # determine according to presence of clk2x + ifnames = model.get_metadata_prop("vivado_stitch_ifnames") + assert not ( + ifnames is None + ), "Couldn't find stitched-IP interface names, did 
you run IP stitching first?" + ifnames = eval(ifnames) + if "clk2x" in ifnames.keys(): + is_double_pumped = ifnames["clk2x"] != [] + else: + is_double_pumped = False + clknames = "clk_and_clk2x" if is_double_pumped else "clk" + instream_names = [x[0] for x in ifnames["s_axis"]] + instream_names_str = "{" + ", ".join(['"' + x + '"' for x in instream_names]) + "}" + outstream_names = [x[0] for x in ifnames["m_axis"]] + outstream_names_str = "{" + ", ".join(['"' + x + '"' for x in outstream_names]) + "}" + instream_iters_str = "{" + ", ".join([str(x) for x in instream_iters]) + "}" + outstream_iters_str = "{" + ", ".join([str(x) for x in outstream_iters]) + "}" + # fill in the template arguments for sim driver + template_dict = { + # number of input transactions per inference + "ITERS_PER_INPUT": instream_iters_str, + # number of output transactions per inference + "ITERS_PER_OUTPUT": outstream_iters_str, + # number of inferences + "N_INFERENCES": n_inferences, + # max number of cycles to wait for output activity before timeout + "MAX_ITERS": timeout_cycles, + # name of the top-level HDL module + "TOP_MODULE_NAME": top_module_name, + # names of the top-level AXI streams and signals + "INSTREAM_NAME": instream_names_str, + "OUTSTREAM_NAME": outstream_names_str, + "CLK_NAME": "ap_clk", + "CLK2X_NAME": "ap_clk2x", + "CLKNAMES": clknames, + "NRST_NAME": "ap_rst_n", + # control tracing and trace filename + "TRACE_FILE": "NULL" if trace_file is None else f'"{trace_file}"', + "TRACE_CMD": "" if trace_file is None else "top->trace_all();", + # code to post-process final sim status to extract more data + "POSTPROC_CPP": postproc_cpp, + # sim kernel .so to use (depends on Vivado version) + "SIMKERNEL_SO": pyxsi_utils.get_simkernel_so(), + # input throttling for rate limit + "THROTTLE_CYCLES": throttle_cycles, + } + for key, val in template_dict.items(): + fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val)) + with open(sim_base + "/rtlsim_xsi.cpp", "w") as 
f: + f.write(fifosim_cpp_template) + + vivado_incl_dir = get_vivado_root() + "/data/xsim/include" + xsi_include_dir = get_finn_root() + "/deps/pyxsi/src" + # launch g++ to compile the rtlsim executable + build_cmd = [ + "g++", + f"-I{xsi_include_dir}", + f"-I{vivado_incl_dir}", + "-std=c++14", + "-O3", + "-o", + "rtlsim_xsi", + "rtlsim_xsi.cpp", + f"{xsi_include_dir}/xsi_loader.cpp", + "-ldl", + "-lrt", + ] + # write compilation command to a file for easy re-running/debugging + with open(sim_base + "/compile_rtlsim.sh", "w") as f: + f.write(" ".join(build_cmd)) + launch_process_helper(build_cmd, cwd=sim_base) + assert os.path.isfile(sim_base + "/rtlsim_xsi"), "Failed to compile rtlsim executable" + + # launch the rtlsim executable + # important to specify LD_LIBRARY_PATH here for XSI to work correctly + runsim_env = os.environ.copy() + runsim_env["LD_LIBRARY_PATH"] = get_vivado_root() + "/lib/lnx64.o" + runsim_cmd = ["./rtlsim_xsi"] + with open(sim_base + "/run_rtlsim.sh", "w") as f: + f.write(f"LD_LIBRARY_PATH={runsim_env['LD_LIBRARY_PATH']} ./rtlsim_xsi") + launch_process_helper(runsim_cmd, proc_env=runsim_env, cwd=sim_base) + + # parse results file and return dict + results_filename = sim_base + "/results.txt" + with open(results_filename, "r") as f: + results = f.read().strip().split("\n") + ret_dict = {} + for result_line in results: + key, val = result_line.split("\t") + ret_dict[key] = int(val) + if "TIMEOUT" in ret_dict.keys(): + assert ret_dict["TIMEOUT"] == 0, f"XSI C++ simulation timed out, see {results_filename}" + return ret_dict + + +def rtlsim_exec_pyxsi(model, execution_context, pre_hook=None, post_hook=None): + """Use PyXSI to execute given model with stitched IP. The execution + context contains the input values. 
Hook functions can be optionally + specified to observe/alter the state of the circuit, receiving the + PyXSI RPC sim handle as their first argument: + - pre_hook : hook function to be called before sim start (after reset) + - post_hook : hook function to be called after sim end + """ + # ensure stitched ip project already exists + assert os.path.isfile( + model.get_metadata_prop("wrapper_filename") + ), """The + file name from metadata property "wrapper_filename" doesn't exist.""" + assert os.path.isdir( + model.get_metadata_prop("vivado_stitch_proj") + ), """The + directory from metadata property "vivado_stitch_proj" doesn't exist""" + trace_file = model.get_metadata_prop("rtlsim_trace") + io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict(model, execution_context) + + # prepare rtlsim model + rtlsim_so = model.get_metadata_prop("rtlsim_so") + if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)): + vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj") + with open(vivado_stitch_proj_dir + "/all_verilog_srcs.txt", "r") as f: + all_verilog_srcs = f.read().split() + top_module_file_name = file_to_basename(model.get_metadata_prop("wrapper_filename")) + top_module_name = top_module_file_name.strip(".v") + single_src_dir = make_build_dir("rtlsim_" + top_module_name + "_") + + rtlsim_so = pyxsi_utils.compile_sim_obj(top_module_name, all_verilog_srcs, single_src_dir) + # save generated lib filename in attribute + model.set_metadata_prop("rtlsim_so", rtlsim_so[0] + "/" + rtlsim_so[1]) + sim_base, sim_rel = rtlsim_so + # pass in correct tracefile from attribute + if trace_file == "default": + trace_file = top_module_file_name + ".wdb" + sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, trace_file) + else: + sim_base, sim_rel = rtlsim_so.split("xsim.dir") + sim_rel = "xsim.dir" + sim_rel + sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, trace_file) + + # reset and call rtlsim, including any pre/post hooks + 
pyxsi_utils.reset_rtlsim(sim) + if pre_hook is not None: + pre_hook(sim) + n_cycles = pyxsi_utils.rtlsim_multi_io( + sim, + io_dict, + num_out_values, + sname="_", + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + if post_hook is not None: + post_hook(sim) + # important to call close_rtlsim for pyxsi to flush traces and stop + # the RPC server process + pyxsi_utils.close_rtlsim(sim) + + # unpack outputs and put back into execution context + for o, o_vi in enumerate(model.graph.output): + o_name = o_vi.name + if_name = if_dict["m_axis"][o][0] + o_stream_w, o_dt, o_folded_shape, o_shape = o_tensor_info[o] + packed_output = io_dict["outputs"][if_name] + o_folded_tensor = rtlsim_output_to_npy( + packed_output, None, o_dt, o_folded_shape, o_stream_w, o_dt.bitwidth() + ) + execution_context[o_name] = o_folded_tensor.reshape(o_shape) + + model.set_metadata_prop("cycles_rtlsim", str(n_cycles)) + + +def rtlsim_exec_pyverilator(model, execution_context, pre_hook=None, post_hook=None): + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + # ensure stitched ip project already exists + assert os.path.isfile( + model.get_metadata_prop("wrapper_filename") + ), """The + file name from metadata property "wrapper_filename" doesn't exist.""" + assert os.path.isdir( + model.get_metadata_prop("vivado_stitch_proj") + ), """The + directory from metadata property "vivado_stitch_proj" doesn't exist""" + trace_file = model.get_metadata_prop("rtlsim_trace") + if trace_file is None: + trace_file = "" + extra_verilator_args = model.get_metadata_prop("extra_verilator_args") + if extra_verilator_args is None: + extra_verilator_args = [] + else: + extra_verilator_args = eval(extra_verilator_args) + io_dict, if_dict, num_out_values, o_tensor_info = prep_rtlsim_io_dict(model, execution_context) # prepare pyverilator model rtlsim_so = model.get_metadata_prop("rtlsim_so") @@ -159,3 +439,21 @@ def rtlsim_exec(model, execution_context, 
pre_hook=None, post_hook=None): execution_context[o_name] = o_folded_tensor.reshape(o_shape) model.set_metadata_prop("cycles_rtlsim", str(n_cycles)) + + +def rtlsim_exec(model, execution_context, pre_hook=None, post_hook=None): + """Use PyVerilator or PyXSI to execute given model with stitched IP, depending + on the rtlsim_backend metadata_prop on the model. The execution + context contains the input values. Hook functions can be optionally + specified to observe/alter the state of the circuit, receiving the + PyVerilator sim object as their first argument: + - pre_hook : hook function to be called before sim start (after reset) + - post_hook : hook function to be called after sim end + """ + backend = model.get_metadata_prop("rtlsim_backend") + if backend == "pyverilator": + rtlsim_exec_pyverilator(model, execution_context, pre_hook, post_hook) + elif backend == "pyxsi": + rtlsim_exec_pyxsi(model, execution_context, pre_hook, post_hook) + else: + assert False, f"Unrecognized rtlsim_backend value: {backend}" diff --git a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py index a3f0e043f8..b713be14e5 100644 --- a/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -126,8 +126,12 @@ def execute_node(self, context, graph): "{}/input_1.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = {"inputs": {"in0": rtlsim_inp0, "in1": rtlsim_inp1}, "outputs": {"out": []}} + self.rtlsim_multi_io(sim, io_dict) + rtlsim_output = io_dict["outputs"]["out"] + super().close_rtlsim(sim) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py 
b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index 14efa113dd..c224cf64d4 100644 --- a/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -284,8 +284,15 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index 8a72ca3c6c..5bef15c66f 100644 --- a/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -188,12 +188,14 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) - super().toggle_clk(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) io_dict = { "inputs": {"in0": inp}, "outputs": {"out": []}, } self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py index 008fa9cee8..bf1f906b63 100644 --- a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py @@ -143,9 +143,10 @@ def execute_node(self, context, graph): ) 
io_dict["inputs"]["in%d" % i] = rtlsim_inp super().reset_rtlsim(sim) - super().toggle_clk(sim) - + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) rtlsim_output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() diff --git a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index 4a5c02ee06..0e45ea7ef5 100644 --- a/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -387,8 +387,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py index 56f472b9c0..df045583fc 100644 --- a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -138,8 +138,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + 
rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py index e19149435e..a9fbe3ddf0 100644 --- a/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py @@ -148,7 +148,8 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) rtlsim_dict = { "inputs": {"in0": rtlsim_inp}, "outputs": {}, @@ -156,6 +157,7 @@ def execute_node(self, context, graph): for i in range(n_outputs): rtlsim_dict["outputs"]["out%d" % i] = [] self.rtlsim_multi_io(sim, rtlsim_dict) + super().close_rtlsim(sim) odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py index d57699af05..6355acba9b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -185,8 +185,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py 
b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py index b7ba301fbc..a39b7e5b03 100644 --- a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -140,8 +140,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py index 9b2a7b25b0..0d2ba2ff0b 100644 --- a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -118,8 +118,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py index 1e2c0d034a..19e1318205 100644 --- a/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -120,8 +120,15 @@ def 
execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py index ba44deb898..98a04b0bc9 100644 --- a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -297,8 +297,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py index cae1c30eb6..a355445c48 100644 --- a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -542,7 +542,8 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) self.reset_rtlsim(sim) - self.toggle_clk(sim) + if 
self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) if mem_mode == "external" or mem_mode == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() @@ -556,10 +557,14 @@ def execute_node(self, context, graph): "inputs": {"in0": inp, "weights": wei * num_w_reps}, "outputs": {"out": []}, } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] else: - output = self.rtlsim(sim, inp) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 64c6ec33f8..2918f88a81 100644 --- a/src/finn/custom_op/fpgadataflow/hls/pool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -235,8 +235,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py index 4619a1756b..fb8ee42f5a 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -177,8 +177,15 @@ def execute_node(self, context, 
graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py index 0d618d832a..efa98f2ea6 100644 --- a/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -129,8 +129,15 @@ def execute_node(self, context, graph): "{}/input_1.npy".format(code_gen_dir), export_idt1, nbits1 ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp0, rtlsim_inp1) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp0, "in1": rtlsim_inp1}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py index 69db7b4606..c03d9a0ece 100755 --- a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -190,8 +190,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if 
self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index b753bc7a03..a2a53a6689 100644 --- a/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -336,7 +336,8 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) - super().toggle_clk(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) if self.get_nodeattr("mem_mode") == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() @@ -348,12 +349,16 @@ def execute_node(self, context, graph): "inputs": {"in0": inp, "weights": wei * num_w_reps}, "outputs": {"out": []}, } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] elif self.get_nodeattr("mem_mode") == "internal_embedded": - output = self.rtlsim(sim, inp) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } else: raise Exception("Unrecognized mem_mode") + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py index 05d26eddb2..0dfe9096b0 100644 --- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ 
-148,8 +148,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py index f9ba68e6b6..455d477c88 100644 --- a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -191,7 +191,8 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) - super().toggle_clk(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) if mem_mode == "external" or mem_mode == "internal_decoupled": wnbits = self.get_weightstream_width() @@ -208,10 +209,14 @@ def execute_node(self, context, graph): "inputs": {"in0": inp, "weights": wei * num_w_reps}, "outputs": {"out": []}, } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] else: - output = self.rtlsim(sim, inp) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index 4677960ea8..a0c61ec5b3 100644 --- 
a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -42,6 +42,11 @@ except ModuleNotFoundError: PyVerilator = None +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + class HLSBackend(ABC): """HLSBackend class all custom ops that correspond to a finn-hlslib @@ -67,8 +72,15 @@ def get_all_verilog_paths(self): ), """Node attribute "code_gen_dir_ipgen" is not set. Please run HLSSynthIP first.""" verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) - # default impl only returns the HLS verilog codegen dir - return [verilog_path] + subcore_verilog_path = "{}/project_{}/sol1/impl/ip/hdl/ip/".format( + code_gen_dir, self.onnx_node.name + ) + # default impl only returns the HLS verilog codegen dir and subcore (impl/ip/hdl/ip) dir + # if it exists + ret = [verilog_path] + if os.path.isdir(subcore_verilog_path): + ret += [subcore_verilog_path] + return ret def get_all_verilog_filenames(self, abspath=False): "Return list of all Verilog files used for this node." 
@@ -89,25 +101,39 @@ def prepare_rtlsim(self): for this node, sets the rtlsim_so attribute to its path and returns a PyVerilator wrapper around it.""" - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - + rtlsim_backend = self.get_nodeattr("rtlsim_backend") verilog_files = self.get_all_verilog_filenames(abspath=True) single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") - tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" - make_single_source_file(verilog_files, target_file) - - # build the Verilator emu library - sim = PyVerilator.build( - self.get_verilog_top_module_name() + ".v", - build_dir=tmp_build_dir, - verilog_path=[single_src_dir], - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) + if rtlsim_backend == "pyverilator": + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + + # build the Verilator emu library + sim = PyVerilator.build( + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + elif rtlsim_backend == "pyxsi": + ret = pyxsi_utils.compile_sim_obj( + self.get_verilog_top_module_name(), verilog_files, single_src_dir + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1]) + # TODO return val of this function is never used 
+ # refactor s.t. it does not return anything at all, + # consistently between pyverilator and pyxsi + sim = None + else: + assert False, "Unknown rtlsim_backend" + return sim def code_generation_ipgen(self, model, fpgapart, clk): @@ -244,6 +270,7 @@ def compile_singlenode_code(self): builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") builder.append_includes("-I$FINN_ROOT/custom_hls") builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("-I{}/include".format(os.environ["VITIS_PATH"])) builder.append_includes("--std=c++14") builder.append_includes("-O3") builder.append_sources(code_gen_dir + "/*.cpp") diff --git a/src/finn/custom_op/fpgadataflow/hwcustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py index b40b8f3074..491b31c482 100644 --- a/src/finn/custom_op/fpgadataflow/hwcustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -30,7 +30,7 @@ import os import warnings from abc import abstractmethod -from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io +from pyverilator.util.axi_utils import rtlsim_multi_io from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple @@ -41,6 +41,11 @@ except ModuleNotFoundError: PyVerilator = None +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + class HWCustomOp(CustomOp): """HWCustomOp class all custom ops that can be implemented with either @@ -67,6 +72,7 @@ def get_nodeattr_types(self): "res_estimate": ("s", False, ""), "res_synth": ("s", False, ""), "rtlsim_so": ("s", False, ""), + "rtlsim_backend": ("s", False, "pyxsi", {"pyverilator", "pyxsi"}), # partitioning info # ID of SLR to which the Op is attached in Vitis builds # Set to -1 as 'don't care' @@ -132,10 +138,36 @@ def get_rtlsim(self): rtlsim_so = self.get_nodeattr("rtlsim_so") assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library." 
- # create PyVerilator wrapper - sim = PyVerilator(rtlsim_so) + rtlsim_backend = self.get_nodeattr("rtlsim_backend") + + if rtlsim_backend == "pyverilator": + # create PyVerilator wrapper + sim = PyVerilator(rtlsim_so) + elif rtlsim_backend == "pyxsi": + sim_base, sim_rel = rtlsim_so.split("xsim.dir") + sim_rel = "xsim.dir" + sim_rel + # pass in correct tracefile from attribute + tracefile = self.get_nodeattr("rtlsim_trace") + if tracefile == "default": + tracefile = self.onnx_node.name + ".wdb" + sim = pyxsi_utils.load_sim_obj(sim_base, sim_rel, tracefile) + else: + assert False, "Unknown rtlsim_backend" + return sim + def close_rtlsim(self, sim): + "Close and free up resources for rtlsim." + rtlsim_backend = self.get_nodeattr("rtlsim_backend") + + if rtlsim_backend == "pyverilator": + # no action needed + pass + elif rtlsim_backend == "pyxsi": + pyxsi_utils.close_rtlsim(sim) + else: + assert False, "Unknown rtlsim_backend" + def node_res_estimation(self, fpgapart): """Returns summarized resource estimation of BRAMs and LUTs of the node as a dictionary.""" @@ -194,114 +226,58 @@ def get_op_and_param_counts(self): def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" - sim.io.ap_rst_n = 0 - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - sim.io.ap_rst_n = 1 + rtlsim_backend = self.get_nodeattr("rtlsim_backend") + if rtlsim_backend == "pyverilator": + sim.io.ap_rst_n = 0 + sim.io.ap_clk = 1 + sim.io.ap_clk = 0 + sim.io.ap_rst_n = 1 + elif rtlsim_backend == "pyxsi": + pyxsi_utils.reset_rtlsim(sim) + else: + assert False, f"Unknown rtlsim_backend {rtlsim_backend}" def toggle_clk(self, sim): """Toggles the clock input in pyverilator once.""" - sim.io.ap_clk = 1 - sim.io.ap_clk = 0 - - def rtlsim(self, sim, inp, inp2=None): - """Runs the pyverilator simulation by passing the input values to the simulation, - toggle the clock and observing the execution time. 
Function contains also an - observation loop that can abort the simulation if no output value is produced - after 100 cycles.""" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file != "": - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" - sim.start_vcd_trace(trace_file) - inputs = inp - outputs = [] - sname = self.hls_sname() - o_ready = "out_" + sname + "_TREADY" - o_valid = "out_" + sname + "_TVALID" - o_data = "out_" + sname + "_TDATA" - in0_ready = "in0_" + sname + "_TREADY" - in0_valid = "in0_" + sname + "_TVALID" - in0_data = "in0_" + sname + "_TDATA" - in1_ready = "in1_" + sname + "_TREADY" - in1_valid = "in1_" + sname + "_TVALID" - in1_data = "in1_" + sname + "_TDATA" - - sim.io[o_ready] = 1 - - # observe if output is completely calculated - # observation_count will contain the number of cycles the calculation ran - num_out_values = self.get_number_output_values() - output_observed = False - observation_count = 0 - - # avoid infinite looping of simulation by aborting when there is no change in - # output values after 100 cycles - no_change_count = 0 - old_outputs = outputs - liveness_threshold = pyverilate_get_liveness_threshold_cycles() - - while not (output_observed): - sim.io[in0_valid] = 1 if len(inputs) > 0 else 0 - sim.io[in0_data] = inputs[0] if len(inputs) > 0 else 0 - if sim.io[in0_ready] == 1 and sim.io[in0_valid] == 1: - inputs = inputs[1:] - - if inp2 is not None: - sim.io[in1_valid] = 1 if len(inp2) > 0 else 0 - sim.io[in1_data] = inp2[0] if len(inp2) > 0 else 0 - if sim.io[in1_ready] == 1 and sim.io[in1_valid] == 1: - inp2 = inp2[1:] - - if sim.io[o_valid] == 1 and sim.io[o_ready] == 1: - outputs = outputs + [sim.io[o_data]] + rtlsim_backend = self.get_nodeattr("rtlsim_backend") + if rtlsim_backend == "pyverilator": sim.io.ap_clk = 1 sim.io.ap_clk = 0 + elif rtlsim_backend == "pyxsi": + pyxsi_utils.toggle_clk(sim) + else: + assert False, f"Unknown rtlsim_backend {rtlsim_backend}" - observation_count = 
observation_count + 1 - no_change_count = no_change_count + 1 - - if len(outputs) == num_out_values: - self.set_nodeattr("cycles_rtlsim", observation_count) - output_observed = True - - if no_change_count == liveness_threshold: - if old_outputs == outputs: - if trace_file != "": - sim.flush_vcd_trace() - sim.stop_vcd_trace() - raise Exception( - "Error in simulation! Takes too long to produce output. " - "Consider setting the LIVENESS_THRESHOLD env.var. to a " - "larger value." - ) - else: - no_change_count = 0 - old_outputs = outputs - if trace_file != "": - sim.flush_vcd_trace() - sim.stop_vcd_trace() - return outputs - - def rtlsim_multi_io(self, sim, io_dict): + def rtlsim_multi_io(self, sim, io_dict, hook_postclk=None): "Run rtlsim for this node, supports multiple i/o streams." - - # signal name + # signal name suffix sname = "_" + self.hls_sname() + "_" - - trace_file = self.get_nodeattr("rtlsim_trace") - if trace_file == "default": - trace_file = self.onnx_node.name + ".vcd" + rtlsim_backend = self.get_nodeattr("rtlsim_backend") num_out_values = self.get_number_output_values() - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) + if rtlsim_backend == "pyverilator": + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + elif rtlsim_backend == "pyxsi": + total_cycle_count = pyxsi_utils.rtlsim_multi_io( + sim, + io_dict, + num_out_values, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + hook_postclk=hook_postclk, + ) + else: + assert False, f"Unknown rtlsim_backend {rtlsim_backend}" + self.set_nodeattr("cycles_rtlsim", total_cycle_count) def 
generate_params(self, model, path): @@ -382,8 +358,6 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): exp_cycles, ) sim = self.get_rtlsim() - # signal name - sname = "_" + self.hls_sname() + "_" if override_rtlsim_dict is not None: io_dict = override_rtlsim_dict else: @@ -398,33 +372,33 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): # note that we restrict key names to filter out weight streams etc txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} + # signal name + sname = "_" + self.hls_sname() + "_" def monitor_txns(sim_obj): for inp in txns_in: - in_ready = _read_signal(sim, inp + sname + "TREADY") == 1 - in_valid = _read_signal(sim, inp + sname + "TVALID") == 1 + in_ready = pyxsi_utils._read_signal(sim_obj, inp + sname + "TREADY") == 1 + in_valid = pyxsi_utils._read_signal(sim_obj, inp + sname + "TVALID") == 1 if in_ready and in_valid: txns_in[inp].append(1) else: txns_in[inp].append(0) for outp in txns_out: if ( - _read_signal(sim, outp + sname + "TREADY") == 1 - and _read_signal(sim, outp + sname + "TVALID") == 1 + pyxsi_utils._read_signal(sim_obj, outp + sname + "TREADY") == 1 + and pyxsi_utils._read_signal(sim_obj, outp + sname + "TVALID") == 1 ): txns_out[outp].append(1) else: txns_out[outp].append(0) - reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io( + self.reset_rtlsim(sim) + self.rtlsim_multi_io( sim, io_dict, - n_outs, - sname=sname, - liveness_threshold=period, - hook_preclk=monitor_txns, + hook_postclk=monitor_txns, ) + total_cycle_count = self.get_nodeattr("cycles_rtlsim") assert ( total_cycle_count <= period ), """Total cycle count from rtl simulation is higher than diff --git a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 321522e7ba..3c063c00d9 100755 --- 
a/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -40,14 +40,8 @@ ConvolutionInputGenerator, ) from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - # RTL Convolution Input Generator / Sliding Window Generator (SWG) # Matches and extends the functionality of all ConvolutionInputGenerator_* functions # in finn-hlslib by generating HDL code for two different implementation styles: @@ -336,8 +330,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -932,37 +933,23 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] + def get_rtl_file_list(self, abspath=False): + if abspath: + 
code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/swg/") + else: + code_gen_dir = "" + rtllib_dir = "" verilog_files = [ - "swg_pkg.sv", - self.get_nodeattr("gen_top_module") + "_wrapper.v", - self.get_nodeattr("gen_top_module") + "_impl.sv", - "swg_common.sv", + rtllib_dir + "swg_pkg.sv", + code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper.v", + code_gen_dir + self.get_nodeattr("gen_top_module") + "_impl.sv", + rtllib_dir + "swg_common.sv", ] if self.get_nodeattr("dynamic_mode"): - verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v") - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim + verilog_files.append(code_gen_dir + self.get_nodeattr("gen_top_module") + "_axilite.v") + + return verilog_files def code_generation_ipi(self): """Constructs and returns the TCL for node instantiation in Vivado IPI.""" diff --git a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index cc49446ea3..6ee1e27e2d 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -34,14 +34,8 @@ from finn.custom_op.fpgadataflow.fmpadding import FMPadding from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - class FMPadding_rtl(FMPadding, RTLBackend): """CustomOp wrapper for the 
finn-rtllib fmpadding_axi component @@ -96,8 +90,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -206,35 +207,21 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/fmpadding/hdl/") + else: + code_gen_dir = "" + rtllib_dir = "" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] verilog_files = [ - "fmpadding_axi.sv", - "fmpadding.sv", - "axi2we.sv", - self.get_nodeattr("gen_top_module") + ".v", + rtllib_dir + "fmpadding_axi.sv", + rtllib_dir + "fmpadding.sv", + rtllib_dir + "axi2we.sv", + code_gen_dir + self.get_nodeattr("gen_top_module") + ".v", ] - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - 
top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim + return verilog_files def code_generation_ipi(self): """Constructs and returns the TCL for node instantiation in Vivado IPI.""" diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py index d9ab501117..61797dd2fd 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -28,11 +28,10 @@ import numpy as np import os -from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend -from finn.util.basic import get_dsp_block, get_rtlsim_trace_depth, make_build_dir +from finn.util.basic import get_dsp_block from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy try: @@ -95,8 +94,9 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - reset_rtlsim(sim) - toggle_clk(sim) + super().reset_rtlsim(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) if mem_mode in ["external", "internal_decoupled"]: wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() @@ -108,10 +108,14 @@ def execute_node(self, context, graph): "inputs": {"in0": inp, "weights": wei * num_w_reps}, "outputs": {"out": []}, } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] else: - output = self.rtlsim(sim, inp) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = 
self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -282,28 +286,24 @@ def prepare_codegen_default(self, fpgapart, clk): return template_path, code_gen_dict - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] - verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] - - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + else: + code_gen_dir = "" + rtllib_dir = "" + verilog_files = [ + code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v", + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + return verilog_files - return sim + def get_verilog_paths(self): + verilog_paths = super().get_verilog_paths() + verilog_paths.append(os.environ["FINN_ROOT"] + "/finn-rtllib/mvu") + return verilog_paths diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py 
b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py index e79782eb6d..496e38acfc 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -34,14 +34,8 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( StreamingDataWidthConverter, ) -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend): """Class that corresponds to finn-rtllib datawidth converter @@ -100,8 +94,15 @@ def execute_node(self, context, graph): "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] odt = export_idt target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -167,34 +168,21 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ipgen_path", code_gen_dir) self.set_nodeattr("ip_path", code_gen_dir) - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + 
rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/dwc/hdl/") + else: + code_gen_dir = "" + rtllib_dir = "" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] verilog_files = [ - "dwc_axi.sv", - "dwc.sv", - self.get_nodeattr("gen_top_module") + ".v", + rtllib_dir + "dwc_axi.sv", + rtllib_dir + "dwc.sv", + code_gen_dir + self.get_nodeattr("gen_top_module") + ".v", ] - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim + return verilog_files def code_generation_ipi(self): """Constructs and returns the TCL for node instantiation in Vivado IPI.""" diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py index f8f27cb647..05b45f9e4b 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -33,14 +33,8 @@ from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO -from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - class StreamingFIFO_rtl(StreamingFIFO, RTLBackend): def __init__(self, onnx_node, **kwargs): @@ -152,8 +146,15 @@ def execute_node(self, context, graph): nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) + if 
self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = DataType[self.get_nodeattr("dataType")] target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -254,30 +255,23 @@ def code_generation_ipi(self): "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style ) + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/fifo/hdl/") + else: + code_gen_dir = "" + rtllib_dir = "" + + verilog_files = [ + rtllib_dir + "Q_srl.v", + code_gen_dir + self.get_nodeattr("gen_top_module") + ".v", + ] + return verilog_files + def prepare_rtlsim(self): assert self.get_nodeattr("impl_style") != "vivado", ( "StreamingFIFO impl_style " "cannot be vivado for rtlsim. Only impl_style=rtl supported." 
) - # Modified to use generated (System-)Verilog instead of HLS output products - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] - verilog_files = [ - "Q_srl.v", - self.get_nodeattr("gen_top_module") + ".v", - ] - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim + return super().prepare_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py index d1e9387b1b..0edc59d16c 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -30,19 +30,12 @@ import numpy as np import os import shutil -from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.thresholding import Thresholding -from finn.util.basic import ( - get_memutil_alternatives, - get_rtlsim_trace_depth, - make_build_dir, - mem_primitives_versal, - pyverilate_get_liveness_threshold_cycles, -) +from finn.util.basic import get_memutil_alternatives, mem_primitives_versal from finn.util.data_packing import ( npy_to_rtlsim_input, pack_innermost_dim_as_hex_string, @@ -245,9 +238,7 @@ def prepare_codegen_rtl_values(self, model): code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] # Identify the module name - code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ - 
self.get_verilog_top_module_name() + "_axi_wrapper" - ] + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] # Set the top module name - AXI wrapper code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] @@ -289,46 +280,22 @@ def prepare_codegen_rtl_values(self, model): code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)] return code_gen_dict - def get_rtl_file_list(self): + def get_rtl_file_list(self, abspath=False): """Thresholding binary search RTL file list""" - return [ - "axilite_if.v", - "thresholding.sv", - "thresholding_axi.sv", - "thresholding_template_wrapper.v", - ] + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/thresholding/hdl/") + else: + code_gen_dir = "" + rtllib_dir = "" - def get_rtl_file_paths(self): - """Get full path of all RTL files""" - rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" - rtl_file_list = self.get_rtl_file_list() - rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] - return rtl_file_paths - - def get_rtl_template_data(self, path): - """Return RTL file contents as a template""" - with open(path, "r") as f: - template = f.read() - return template - - def fill_in_rtl_template_data(self, replace_dict, template_data): - """Use attribute values to finn in RTL template placeholders""" - template_data_cp = template_data - for key in replace_dict: - replacement_line = "\n".join(replace_dict[key]) - template_data_cp = template_data_cp.replace(key, replacement_line) - return template_data_cp - - def dump_rtl_data(self, dest_dir, filename, data): - """Dump filled-in-template RTL files for future synthesis step""" - # when generating template files, handle a special case: - # if the filename contains the word "template", replace that - # with the node name to distinguish between instances - if "template" in filename: - filename = 
self.get_nodeattr("gen_top_module") + ".v" - with open(os.path.join(dest_dir, filename), "w") as f: - f.write(data) - return + verilog_files = [ + rtllib_dir + "axilite_if.v", + rtllib_dir + "thresholding.sv", + rtllib_dir + "thresholding_axi.sv", + code_gen_dir + self.get_nodeattr("gen_top_module") + ".v", + ] + return verilog_files def generate_hdl(self, model, fpgapart, clk): """Prepare HDL files from templates for synthesis""" @@ -342,14 +309,23 @@ def generate_hdl(self, model, fpgapart, clk): # by PyVerilator and IPI generation self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) - for rtl_file_path in self.get_rtl_file_paths(): - # read in original RTL template file - template_data = self.get_rtl_template_data(rtl_file_path) - # apply code generation to templates - data = self.fill_in_rtl_template_data(code_gen_dict, template_data) - # dump filled-in template to destination directory for compilation - file_only_path = rtl_file_path.split("/")[-1] - self.dump_rtl_data(code_gen_dir, file_only_path, data) + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl" + template_path = rtlsrc + "/thresholding_template_wrapper.v" + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + ".v"), + "w", + ) as f: + f.write(template_wrapper) + + sv_files = ["axilite_if.v", "thresholding.sv", "thresholding_axi.sv"] + for sv_file in sv_files: + shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain @@ -358,39 +334,6 @@ def generate_hdl(self, model, fpgapart, clk): self.set_nodeattr("ip_path", code_gen_dir) return - def prepare_rtlsim(self): - """Creates a Verilator 
emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_paths = [code_gen_dir] - verilog_files = [ - x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) - for x in self.get_rtl_file_list() - ] - dat_files = self.get_all_meminit_filenames(abspath=True) - single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - for dat_file in dat_files: - shutil.copy(dat_file, single_src_dir) - - # build the Verilator emulation library - sim = PyVerilator.build( - verilog_files, - build_dir=single_src_dir, - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_nodeattr("gen_top_module"), - auto_eval=False, - ) - - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -431,38 +374,23 @@ def execute_node(self, context, graph): # Create a PyVerilator wrapper of the RTLSim .so sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - io_names = self.get_verilog_top_module_intf_names() - istream_name = io_names["s_axis"][0][0] - ostream_name = io_names["m_axis"][0][0] + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) io_dict = { - "inputs": {istream_name: inp}, - "outputs": {ostream_name: []}, + "inputs": {"in0": rtlsim_inp}, + "outputs": {"out": []}, } - trace_file = self.get_nodeattr("rtlsim_trace") if trace_file == "default": trace_file = self.onnx_node.name + ".vcd" - sname = "_" - - # Change into so directory to ensure 
threshold files can be found - rtlsim_so = self.get_nodeattr("rtlsim_so") - so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) - olcwd = os.getcwd() - os.chdir(so_dir) - num_out_values = self.get_number_output_values() - reset_rtlsim(sim) - total_cycle_count = rtlsim_multi_io( - sim, - io_dict, - num_out_values, - trace_file=trace_file, - sname=sname, - liveness_threshold=pyverilate_get_liveness_threshold_cycles(), - ) - self.set_nodeattr("cycles_rtlsim", total_cycle_count) - os.chdir(olcwd) - output = io_dict["outputs"][ostream_name] + + super().reset_rtlsim(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + rtlsim_output = io_dict["outputs"]["out"] # Manage output data odt = self.get_output_datatype() @@ -471,7 +399,9 @@ def execute_node(self, context, graph): out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) # load and reshape output output = np.load(out_npy_path) @@ -489,10 +419,7 @@ def execute_node(self, context, graph): def code_generation_ipi(self): """Constructs and returns the TCL commands for node instantiation as an RTL block.""" - rtl_file_list = [ - x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) - for x in self.get_rtl_file_list() - ] + rtl_file_list = self.get_rtl_file_list() code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name cmd = ["file mkdir %s" % source_target] diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py index 32943d86cf..23ba4f5fc9 100644 --- a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py 
+++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -28,12 +28,11 @@ import numpy as np import os -from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk from qonnx.core.datatype import DataType from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU -from finn.util.basic import get_rtlsim_trace_depth, is_versal, make_build_dir +from finn.util.basic import is_versal from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy try: @@ -95,8 +94,9 @@ def execute_node(self, context, graph): sim = self.get_rtlsim() nbits = self.get_instream_width() inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) - reset_rtlsim(sim) - toggle_clk(sim) + super().reset_rtlsim(sim) + if self.get_nodeattr("rtlsim_backend") == "pyverilator": + super().toggle_clk(sim) if mem_mode in ["external", "internal_decoupled"]: wnbits = self.get_weightstream_width() @@ -115,10 +115,14 @@ def execute_node(self, context, graph): "inputs": {"in0": inp, "weights": wei * num_w_reps}, "outputs": {"out": []}, } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] else: - output = self.rtlsim(sim, inp) + io_dict = { + "inputs": {"in0": inp}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + super().close_rtlsim(sim) + output = io_dict["outputs"]["out"] odt = self.get_output_datatype() target_bits = odt.bitwidth() packed_bits = self.get_outstream_width() @@ -274,28 +278,25 @@ def prepare_codegen_default(self, fpgapart, clk): return template_path, code_gen_dict - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # 
Path to (System-)Verilog files used by top-module & path to top-module - verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] - verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + def get_rtl_file_list(self, abspath=False): + if abspath: + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + "/" + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + else: + code_gen_dir = "" + rtllib_dir = "" - # build the Verilator emu library - sim = PyVerilator.build( - verilog_files, - build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), - verilog_path=verilog_paths, - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) + verilog_files = [ + code_gen_dir + self.get_nodeattr("gen_top_module") + "_wrapper_sim.v", + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + return verilog_files - return sim + def get_verilog_paths(self): + verilog_paths = super().get_verilog_paths() + verilog_paths.append(os.environ["FINN_ROOT"] + "/finn-rtllib/mvu") + return verilog_paths diff --git a/src/finn/custom_op/fpgadataflow/rtlbackend.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py index 2e4d647b22..5aae52ad4b 100644 --- a/src/finn/custom_op/fpgadataflow/rtlbackend.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -28,6 +28,18 @@ from abc import ABC, abstractmethod +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + class RTLBackend(ABC): """RTLBackend class all custom ops that correspond to a module in finn-rtllib @@ -45,8 +57,56 @@ def 
get_nodeattr_types(self): def generate_hdl(self, model, fpgapart, clk): pass - @abstractmethod def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + verilog_paths = self.get_verilog_paths() + rtlsim_backend = self.get_nodeattr("rtlsim_backend") + if rtlsim_backend == "pyverilator": + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + verilog_files = self.get_rtl_file_list(abspath=False) + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_nodeattr("gen_top_module"), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + elif rtlsim_backend == "pyxsi": + verilog_files = self.get_rtl_file_list(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + ret = pyxsi_utils.compile_sim_obj( + self.get_verilog_top_module_name(), verilog_files, single_src_dir + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", ret[0] + "/" + ret[1]) + # TODO return val of this function is never used + # refactor s.t. it does not return anything at all, + # consistently between pyverilator and pyxsi + sim = None + else: + assert False, "Unknown rtlsim_backend" + return sim + + def get_verilog_paths(self): + """Returns path to code gen directory. Can be overwritten to + return additional paths to relevant verilog files""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + return [code_gen_dir] + + @abstractmethod + def get_rtl_file_list(self, abspath=False): + """Returns list of rtl files. 
Needs to be filled by each node.""" pass @abstractmethod diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index d2100a7516..8a23e05339 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -30,6 +30,7 @@ # template for single node execution docompute_template = """ #define AP_INT_MAX_W $AP_INT_MAX_W$ +#define HLS_NO_XIL_FPO_LIB #include "cnpy.h" #include "npy2apintstream.hpp" #include "npy2vectorstream.hpp" diff --git a/src/finn/qnn-data/cpp/xsi_simdriver.cpp b/src/finn/qnn-data/cpp/xsi_simdriver.cpp new file mode 100644 index 0000000000..61ec0f0af8 --- /dev/null +++ b/src/finn/qnn-data/cpp/xsi_simdriver.cpp @@ -0,0 +1,417 @@ +/* Copyright (C) 2024, Advanced Micro Devices, Inc. +All rights reserved. +# +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +# +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +# +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +# +* Neither the name of FINN nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. +# +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +/* C++ streaming rtlsim driver template for Verilog designs using XSI + - pushes input data into input AXI stream(s), either dummy or from file + - dumps output data from output AXI stream(s) if desired + - option to examine final simulation status to capture more info + +Note: all code template arguments formatted like @TEMPLATE@ must be filled in +prior to compilation +*/ + +#include +#include +#include +#include +// currently using the pyxsi version and not the original Vivado version +#include "xsi_loader.h" + +#include +#include +#include +#include +#include +#include + +using namespace std; + +// utility functions and other declarations: +// constant binary 1- and 0-values for control logic +const s_xsi_vlog_logicval one_val = {0X00000001, 0X00000000}; +const s_xsi_vlog_logicval zero_val = {0X00000000, 0X00000000}; + +// rounded-up integer division +size_t roundup_int_div(size_t dividend, size_t divisor) { + return (dividend + divisor - 1) / divisor; +} + +// clear bit of 32-bit value at given index +// index must be in range [0, 31] +void clear_bit_atindex(XSI_UINT32 &container, size_t ind) { + container = container & ~((XSI_UINT32)1 << ind); +} + + +// set bit of 32-bit value at given index +// index must be in range [0, 31] +void set_bit_atindex(XSI_UINT32 &container, size_t ind) { + container = container | ((XSI_UINT32)1 << ind); +} + +// test bit of 32-bit value at given index +// index must be in range [0, 31] +bool 
test_bit_atindex(XSI_UINT32 &container, size_t ind) { + return ((container & ((XSI_UINT32)1 << ind)) > 0 ? true : false); +} + +// set bit of given s_xsi_vlog_logicval (Verilog signal dtype) +// index must be in range [0, 31] +void set_logic_val_atindex(s_xsi_vlog_logicval &logicval, size_t ind, char val) { + switch(val) { + case '0': + clear_bit_atindex((logicval.aVal), ind); + clear_bit_atindex((logicval.bVal), ind); + break; + case '1': + set_bit_atindex((logicval.aVal), ind); + clear_bit_atindex((logicval.bVal), ind); + break; + case 'X': + set_bit_atindex((logicval.aVal), ind); + set_bit_atindex((logicval.bVal), ind); + break; + case 'Z': + clear_bit_atindex((logicval.aVal), ind); + set_bit_atindex((logicval.bVal), ind); + break; + default: + throw std::runtime_error("Unrecognized value for set_logic_val_atindex: "+val); + } +} + +// convert a given Verilog logic value string into an array of s_xsi_vlog_logicval +// string must be composed of Verilog logic values: 0, 1, X, Z +void string_to_logic_val(std::string str, s_xsi_vlog_logicval* value) { + size_t str_len = str.length(); + size_t num_words = roundup_int_div(str_len, 32); + memset(value, 0, sizeof(s_xsi_vlog_logicval)*num_words); + for(size_t i = 0; i < str_len; i++) { + size_t array_ind = i / 32; + size_t bit_ind = i % 32; + set_logic_val_atindex(value[array_ind], bit_ind, str[str_len-i-1]); + } +} + +// convert array of Verilog logic values to a string +// n_bits specifies how many actual bits of value the array contains +// length of returned string (in characters) will be equal to n_bits +std::string logic_val_to_string(s_xsi_vlog_logicval* value, size_t n_bits) { + std::string ret(n_bits, '?'); + for(size_t i = 0; i < n_bits; i++) { + size_t array_ind = i / 32; + size_t bit_ind = i % 32; + bool is_set_aVal = test_bit_atindex(value[array_ind].aVal, bit_ind); + bool is_set_bVal = test_bit_atindex(value[array_ind].bVal, bit_ind); + if(!is_set_aVal && !is_set_bVal) { + ret[n_bits-i-1] = '0'; + } else 
if(is_set_aVal && !is_set_bVal) { + ret[n_bits-i-1] = '1'; + } else if(!is_set_aVal && is_set_bVal) { + ret[n_bits-i-1] = 'Z'; // aVal=0,bVal=1 is high-impedance, same encoding as set_logic_val_atindex + } else { + ret[n_bits-i-1] = 'X'; // aVal=1,bVal=1 is unknown, same encoding as set_logic_val_atindex + } + } + //std::cout << "logic_val_to_string logicval.a=" << std::hex << value[0].aVal << " logicval.b=" << value[0].bVal << " retstr " << ret << std::dec << std::endl; + return ret; +} + +// top-level sim object for the simulation +Xsi::Loader *top; +// mapping of port names to port numbers +map<string, int> port_map; + +// walk the top-level IO interfaces to populate the port_map +void populate_port_map() { + for(int i=0; i<top->num_ports(); i++) { + string port_name = top->get_str_property_port(i, xsiNameTopPort); + port_map[port_name] = i; + } +} + +string read_signal_binstr(string name) { // current value of the named port as a binary string, most significant bit first + int port_id = port_map[name]; + int n_bits = top->get_int_property_port(port_id, xsiHDLValueSize); + size_t n_logicvals = roundup_int_div(n_bits, 32); + s_xsi_vlog_logicval *buf = new s_xsi_vlog_logicval[n_logicvals]; + top->get_value(port_id, buf); + string ret = logic_val_to_string(buf, n_bits); + delete [] buf; + return ret; +} + +unsigned int read_signal_uint(string name) { // current value of the named port parsed as an unsigned base-2 number + return stoul(read_signal_binstr(name), 0, 2); // stoul: stoi throws std::out_of_range once bit 31 is set +} + +// set the 1-bit signal with given name to 1 +void set_bool(string name) { + top->put_value(port_map[name], &one_val); +} + +// set the 1-bit signal with given name to 0 +void clear_bool(string name) { + top->put_value(port_map[name], &zero_val); +} + +// check the 1-bit signal with given name for equality to 1 +bool chk_bool(string name) { + s_xsi_vlog_logicval buf = {0X00000000, 0X00000000}; + top->get_value(port_map[name], &buf); + return logic_val_to_string(&buf, 1)[0] == '1'; +} + +// rising clock edge + high clock +inline void toggle_clk_1() { + set_bool("@CLK_NAME@"); + top->run(5); +} + +inline void toggle_clk_and_clk2x_1() { + set_bool("@CLK_NAME@"); + set_bool("@CLK2X_NAME@"); + top->run(5); + clear_bool("@CLK2X_NAME@"); + top->run(5); +} + +// falling clock edge + low clock +inline void
toggle_clk_0() { + clear_bool("@CLK_NAME@"); + top->run(5); +} + +inline void toggle_clk_and_clk2x_0() { + clear_bool("@CLK_NAME@"); + set_bool("@CLK2X_NAME@"); + top->run(5); + clear_bool("@CLK2X_NAME@"); + top->run(5); +} + +// drive simulation for 1 clock period +inline void toggle_clk() { + toggle_clk_0(); + toggle_clk_1(); +} + +inline void toggle_clk_and_clk2x() { + toggle_clk_and_clk2x_0(); + toggle_clk_and_clk2x_1(); +} + +// apply reset to the simulation +void reset() { + clear_bool("@CLK_NAME@"); + clear_bool("@NRST_NAME@"); + toggle_@CLKNAMES@(); + toggle_@CLKNAMES@(); + set_bool("@NRST_NAME@"); + toggle_@CLKNAMES@(); + toggle_@CLKNAMES@(); +} + +int main(int argc, char *argv[]) { + // load pre-compiled rtl simulation + std::string simengine_libname = "@SIMKERNEL_SO@"; + std::string design_libname = "xsim.dir/@TOP_MODULE_NAME@/xsimk.so"; + top = new Xsi::Loader(design_libname, simengine_libname); + s_xsi_setup_info info; + memset(&info, 0, sizeof(info)); + info.logFileName = NULL; + info.wdbFileName = @TRACE_FILE@; + top->open(&info); + @TRACE_CMD@ + + populate_port_map(); + + vector instream_names = @INSTREAM_NAME@; + vector outstream_names = @OUTSTREAM_NAME@; + // how much data to push into/pull out of sim + vector n_iters_per_input = @ITERS_PER_INPUT@; + vector n_iters_per_output = @ITERS_PER_OUTPUT@; + unsigned n_inferences = @N_INFERENCES@; + unsigned max_iters = @MAX_ITERS@; + + reset(); + + vector n_in_txns(instream_names.size(), 0), n_out_txns(outstream_names.size(), 0); + vector throttle_input_until_time(instream_names.size(), 0); + size_t total_n_in_txns = 0, total_n_out_txns = 0; + unsigned iters = 0, last_output_at = 0; + unsigned latency = 0; + unsigned cycles_since_last_output = 0; + size_t n_finished_instreams = 0, n_finished_outstreams = 0; + + bool exit_criterion = false; + + cout << "Simulation starting" << endl; + //cout << "Number of inputs to write " << n_iters_per_input * n_inputs << endl; + //cout << "Number of outputs to expect " 
<< n_iters_per_output * n_inputs << endl; + cout << "No-output timeout clock cycles " << max_iters << endl; + + chrono::steady_clock::time_point begin = chrono::steady_clock::now(); + + bool input_done = false; + bool output_done = false; + bool timeout = false; + + // enable reception on the output streams + for (auto & outstream_name : outstream_names) { + set_bool(outstream_name + "_tready"); + } + + while(!exit_criterion) { + // keep track of which signals to write + // actual writes will be done after rising clock edge + // TODO needs to be extended to non-bool signals for actual input data + map signals_to_write; + // toggle falling clock edge and drive low clock + toggle_@CLKNAMES@_0(); + // check for transactions on the input streams + for(size_t i = 0; i < instream_names.size(); i++) { + string instream_name = instream_names[i]; + if(chk_bool(instream_name+"_tready") && chk_bool(instream_name + "_tvalid")) { + n_in_txns[i]++; + // determine whether this input will be throttled for rate-limiting + // every time an input frame is finished, we throttle for @THROTTLE_CYCLES@ cycles + if(n_in_txns[i] % n_iters_per_input[i] == 0) throttle_input_until_time[i] = iters + @THROTTLE_CYCLES@; + total_n_in_txns++; + // determine whether we have more inputs to feed + if(n_in_txns[i] == n_iters_per_input[i] * n_inferences) { + signals_to_write[instream_name + "_tvalid"] = false; + n_finished_instreams++; + } + } + + if(n_in_txns[i] < n_iters_per_input[i] * n_inferences) { + bool enable_throttled_input = (iters >= throttle_input_until_time[i]); + signals_to_write[instream_name + "_tvalid"] = enable_throttled_input; + } else if(n_in_txns[i] > n_iters_per_input[i] * n_inferences) { + // more input transactions than specified, should never happen + // most likely a bug in the C++ driver code if this happens + cout << "WARNING: Unknown stream condition for input " << instream_name << endl; + signals_to_write[instream_name + "_tvalid"] = false; + } + } + + // check for 
transactions on the output streams + size_t n_outstreams_with_no_txn = 0; + for(size_t i = 0; i < outstream_names.size(); i++) { + string outstream_name = outstream_names[i]; + if(chk_bool(outstream_name+"_tready") && chk_bool(outstream_name + "_tvalid")) { + // reset the no-output timeout counter + cycles_since_last_output = 0; + // TODO add output data capture to file here + // (unless we are in dummy data mode) + n_out_txns[i]++; + total_n_out_txns++; + // determine whether we have more outputs to consume + if(n_out_txns[i] == n_iters_per_output[i] * n_inferences) { + signals_to_write[outstream_name + "_tready"] = false; + n_finished_outstreams++; + } + } else { + n_outstreams_with_no_txn++; + } + if(n_out_txns[i] < n_iters_per_output[i] * n_inferences) { + signals_to_write[outstream_name + "_tready"] = true; + } else if(n_out_txns[i] > n_iters_per_output[i] * n_inferences) { + // more output transactions than specified + cout << "WARNING: Unknown stream condition for output " << outstream_name << endl; + signals_to_write[outstream_name + "_tready"] = false; + } + } + if(n_outstreams_with_no_txn == outstream_names.size()) { + // if none of the output streams had any activity: + // keep track of no-activity cycles for timeout + cycles_since_last_output++; + } + + // toggle rising clock edge and drive high clock + toggle_@CLKNAMES@_1(); + // actually write the desired signals from the map + for (auto const& x : signals_to_write) + { + if(x.second) set_bool(x.first); + else clear_bool(x.first); + } + // keep track of elapsed clock cycles + iters++; + // show a progress message once in a while + if(iters % 1000 == 0) { + cout << "Elapsed iters " << iters << " inps " << total_n_in_txns << " outs " << total_n_out_txns << endl; + chrono::steady_clock::time_point end = chrono::steady_clock::now(); + cout << "Elapsed since last report = " << chrono::duration_cast(end - begin).count() << "[s]" << endl; + begin = end; + } + // check whether the exit criteria are reached + 
input_done = (n_finished_instreams == instream_names.size()); + output_done = (n_finished_outstreams == outstream_names.size()); + timeout = (cycles_since_last_output > max_iters); + exit_criterion = (input_done && output_done) || timeout; + // latency computation: first cycle at which every output has generated at least 1 full sample + if(latency == 0) { + size_t n_outputs_with_one_completion = 0; + for(size_t i = 0; i < outstream_names.size(); i++) { + if(n_out_txns[i] >= n_iters_per_output[i]) n_outputs_with_one_completion++; // >= so a stream that finished its first sample earlier still counts + } + if(n_outputs_with_one_completion == outstream_names.size()) { + cout << "All outputs have now produced a sample, latency = " << iters << " cycles" << endl; + latency = iters; + } + } + } + + // dump final simulation statistics to stdout and file + cout << "Simulation finished" << endl; + cout << "Number of inputs consumed " << total_n_in_txns << endl; + cout << "Number of outputs produced " << total_n_out_txns << endl; + cout << "Number of clock cycles " << iters << endl; + cout << "Input done? " << input_done << endl; + cout << "Output done? " << output_done << endl; + cout << "Timeout? " << timeout << endl; + + ofstream results_file; + results_file.open("results.txt", ios::out | ios::trunc); + results_file << "N_IN_TXNS" << "\t" << total_n_in_txns << endl; + results_file << "N_OUT_TXNS" << "\t" << total_n_out_txns << endl; + results_file << "cycles" << "\t" << iters << endl; + results_file << "N" << "\t" << n_inferences << endl; + results_file << "latency_cycles" << "\t" << latency << endl; + results_file << "TIMEOUT" << "\t" << (timeout ? 1 : 0) << endl; + results_file << "INPUT_DONE" << "\t" << (input_done ? 1 : 0) << endl; + results_file << "OUTPUT_DONE" << "\t" << (output_done ?
1 : 0) << endl; + // optionally, extract more data from final status + @POSTPROC_CPP@ + results_file.close(); + top->close(); + + return 0; +} diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 82ee536d50..276bb0b968 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -42,6 +42,7 @@ ) from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance +from finn.core.rtlsim_exec import rtlsim_exec_cppxsi from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -196,6 +197,53 @@ def apply(self, model): return (model, False) +def xsi_fifosim(model, n_inferences, max_iters=None, throttle_cycles=0): + """Create a XSI model of stitched IP and use a simple C++ + driver to drive the input stream. Useful for FIFO sizing, latency + and throughput measurement. If max_iters is None, use the default + liveness threshold instead. 
throttle_cycles can be used for throttling + the input stream every time a frame is finished.""" + + assert len(model.graph.input) == 1, "Only a single input stream is supported" + assert len(model.graph.output) == 1, "Only a single output stream is supported" + iname = model.graph.input[0].name + first_node = model.find_consumer(iname) + oname = model.graph.output[0].name + last_node = model.find_producer(oname) + assert (first_node is not None) and (last_node is not None), "Failed to find first/last nodes" + # define execution context for dummy data mode: + # only number of transactions, no real data + # TODO add support for multiple I/O streams + ctx = { + "global_in": n_inferences, + } + # create C++ code snippet for postprocessing: + # grab maxcount values from FIFOs, dump into existing results file + fifo_log = [] + fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' + fifo_log_templ += '<< to_string(read_signal_uint("maxcount%s")) << endl;' + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") + fifo_ind = 0 + for fifo_node in fifo_nodes: + fifo_node = getCustomOp(fifo_node) + if fifo_node.get_nodeattr("depth_monitor") == 1: + suffix = "" if fifo_ind == 0 else "_%d" % fifo_ind + fifo_log.append(fifo_log_templ % (suffix, suffix)) + fifo_ind += 1 + fifo_log = "\n".join(fifo_log) + # run XSI sim with postproc + ret_dict = rtlsim_exec_cppxsi( + model, + ctx, + dummy_data_mode=True, + postproc_cpp=fifo_log, + timeout_cycles=max_iters, + throttle_cycles=throttle_cycles, + ) + + return ret_dict + + class InsertAndSetFIFODepths(Transformation): """Insert appropriate-depth StreamingFIFOs through RTLSim that preserve throughput in the created accelerator. 
@@ -211,6 +259,8 @@ class InsertAndSetFIFODepths(Transformation): smaller where appropriate :parameter vivado_ram_style: the StreamingFIFO.ram_style attribute to be used for large FIFOs implemented by Vivado afterwards + :parameter fifosim_input_throttle: use input throttling based on dataflow analysis + while doing simulation-based FIFO sizing Assumed input graph properties: @@ -246,6 +296,7 @@ def __init__( swg_exception=False, vivado_ram_style="auto", force_python_sim=False, + fifosim_input_throttle=True, ): super().__init__() self.fpgapart = fpgapart @@ -255,6 +306,7 @@ def __init__( self.swg_exception = swg_exception self.vivado_ram_style = vivado_ram_style self.force_python_sim = force_python_sim + self.fifosim_input_throttle = fifosim_input_throttle def apply(self, model): # these optypes may potentially use external weights @@ -378,6 +430,8 @@ def apply(self, model): warnings.warn("No output detected, calculated FIFO depths may not be correct") else: # do rtlsim in C++ for FIFO sizing + # use the rtlsim_backend metadata_prop to decide which backend to use + backend = model.get_metadata_prop("rtlsim_backend") # determine # inputs for FIFO sizing according to topology type swg_nodes = [ x for x in model.graph.node if x.op_type.startswith("ConvolutionInputGenerator") @@ -385,13 +439,32 @@ def apply(self, model): if len(swg_nodes) == 0: # MLP, no layer overlap # assuming half the nodes are now FIFOs, use half the # of - # nodes as # inputs to drive the imulation - n_inputs = int(len(model.graph.node) / 2) + # nodes as # inputs to drive the simulation + n_inferences = int(len(model.graph.node) / 2) else: # convnet, two inputs are typically enough to fill entire # layer pipeline due to overlaps - n_inputs = 2 - sim = verilator_fifosim(model, n_inputs) + n_inferences = 2 + + # use the critical_path_cycles estimate to set the timeout limit for FIFO sim + max_iters = latency + + # set up rate limit for input throttling + if self.fifosim_input_throttle: + first_node 
= getCustomOp(model.graph.node[0]) + inp_fold = np.prod(first_node.get_folded_input_shape()[:-1]) + throttle_cycles = max(0, perf["max_cycles"] - inp_fold) + else: + throttle_cycles = 0 + + if backend in ["verilator", "pyverilator"]: + sim = verilator_fifosim(model, n_inferences, max_iters=max_iters) + elif backend is None or backend in ["xsi", "pyxsi"]: + sim = xsi_fifosim( + model, n_inferences, max_iters=max_iters, throttle_cycles=throttle_cycles + ) + else: + assert False, f"Unrecognized backend for InsertAndSetFIFODepths: {backend}" for ind, node in enumerate(fifo_nodes): maxcount_name = "maxcount_%d" % ind @@ -447,6 +520,15 @@ def apply(self, model): # remove shallow FIFOs model = model.transform(RemoveShallowFIFOs()) + # clean up references to stitched IP and rtlsim objects + # (the stitched IP needs to be re-done after FIFO sizing) + model.set_metadata_prop("rtlsim_trace", "") + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("vivado_stitch_proj", "") + model.set_metadata_prop("wrapper_filename", "") + model.set_metadata_prop("vivado_stitch_vlnv", "") + model.set_metadata_prop("vivado_stitch_ifnames", "") + # reflect final values in attributes for node in model.graph.node: if not node.op_type.startswith("StreamingFIFO"): diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 5eb72194ea..471c9a4dd3 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -126,11 +126,24 @@ def get_finn_root(): ) +def get_vivado_root(): + "Return the root directory that Vivado is installed into." + + try: + return os.environ["XILINX_VIVADO"] + except KeyError: + raise Exception( + """Environment variable XILINX_VIVADO must be set + correctly. Please ensure you have launched the Docker contaier correctly. 
+ """ + ) + + def pyverilate_get_liveness_threshold_cycles(): """Return the number of no-output cycles rtlsim will wait before assuming the simulation is not finishing and throwing an exception.""" - return int(os.getenv("LIVENESS_THRESHOLD", 10000)) + return int(os.getenv("LIVENESS_THRESHOLD", 1000000)) def make_build_dir(prefix=""): diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 0d3418624a..385bd66e3d 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -746,6 +746,7 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, board): model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1)) if rtlsim_trace: model.set_metadata_prop("rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 4c52277970..9bf9be617b 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -60,6 +60,7 @@ import finn.transformation.streamline.reorder as reorder from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx +from finn.core.throughput_test import throughput_test_rtlsim from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_dataflow_partition import ( @@ -89,7 +90,6 @@ from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.basic import get_finn_root from finn.util.pytorch import NormalizePreProc -from finn.util.pyverilator import verilator_fifosim from finn.util.test 
import ( crop_center, get_test_model_trained, @@ -502,6 +502,7 @@ def test_end2end_mobilenet_stitched_ip_rtlsim(): # set top-level prop for stitched-ip rtlsim and launch model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") ret_rtlsim_ip = execute_onnx(model, inp_dict, True) res_rtlsim_ip = ret_rtlsim_ip[out_name] np.save(build_dir + "/end2end_mobilenet_result_rtlsim_ip.npy", res_rtlsim_ip) @@ -527,7 +528,7 @@ def test_end2end_mobilenet_rtlsim_performance(): # multi-in/out streams currently not supported in our C++ verilator driver rtlsim_bs = 1 - rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) + rtlsim_perf_dict = throughput_test_rtlsim(model, batchsize=rtlsim_bs) # keep keys consistent between the Python and C++-styles cycles = rtlsim_perf_dict["cycles"] clk_ns = float(model.get_metadata_prop("clk_ns")) diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 484cbbe04a..3b12e86bfa 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -47,7 +47,7 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_addstreams_modelwrapper(ch, pe, idt): +def make_addstreams_modelwrapper(ch, pe, idt, rtlsim_backend): inp1 = helper.make_tensor_value_info("inp1", TensorProto.FLOAT, [1, ch]) inp2 = helper.make_tensor_value_info("inp2", TensorProto.FLOAT, [1, ch]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) @@ -62,6 +62,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): PE=pe, inputDataType=idt.name, preferred_impl_style="hls", + rtlsim_backend=rtlsim_backend, ) graph = helper.make_graph( nodes=[addstreams_node], @@ -91,20 +92,28 @@ def prepare_inputs(input1, input2): @pytest.mark.parametrize("fold", [-1, 2, 1]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# rtlsim_backend 
+@pytest.mark.parametrize("rtlsim_backend", ["pyverilator", "pyxsi"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): +def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode, rtlsim_backend): if fold == -1: pe = 1 else: pe = max(1, ch // fold) assert ch % pe == 0 + if exec_mode == "cppsim" and rtlsim_backend == "pyxsi": + pytest.skip( + """Skip combination of paramaters because rtlsim_backend + only influences rtlsim and not cppsim.""" + ) + # generate input data x1 = gen_finn_dt_tensor(idt, (1, ch)) x2 = gen_finn_dt_tensor(idt, (1, ch)) - model = make_addstreams_modelwrapper(ch, pe, idt) + model = make_addstreams_modelwrapper(ch, pe, idt, rtlsim_backend) # prepare input data input_dict = prepare_inputs(x1, x2) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index 817d13e13d..8198990512 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -31,7 +31,6 @@ import numpy as np from onnx import TensorProto, helper -from pyverilator.util.axi_utils import axilite_read, axilite_write from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -51,6 +50,11 @@ from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -182,6 +186,7 @@ def test_fpgadataflow_checksum(): model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") # define function to read out the checksums from axilite checksums = [] @@ -192,8 +197,8 @@ def 
read_checksum_and_drain(sim): drain_addr = 32 for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) - checksums.append(axilite_read(sim, chk_addr, basename=axi_name)) - drain.append(axilite_read(sim, drain_addr, basename=axi_name)) + checksums.append(pyxsi_utils.axilite_read(sim, chk_addr, basename=axi_name)) + drain.append(pyxsi_utils.axilite_read(sim, drain_addr, basename=axi_name)) drain_value = False @@ -201,7 +206,7 @@ def write_drain(sim): addr = 32 for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) - axilite_write(sim, addr, drain_value, basename=axi_name) + pyxsi_utils.axilite_write(sim, addr, drain_value, basename=axi_name) rtlsim_exec(model, inp, pre_hook=write_drain, post_hook=read_checksum_and_drain) checksum0_rtlsim = int(checksums[0]) diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index 25c738d049..2a6a19e4a3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -157,6 +157,7 @@ def test_fpgadataflow_concat_stitchedip(): ) ) model.set_metadata_prop("exec_mode", "rtlsim") - model.set_metadata_prop("rtlsim_trace", "trace.vcd") + model.set_metadata_prop("rtlsim_backend", "pyxsi") + model.set_metadata_prop("rtlsim_trace", "trace.wdb") ret_sim = execute_onnx(model, inp_dict) assert (exp_out == ret_sim[oname]).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 26ce8f5f0e..110c479a56 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -32,8 +32,8 @@ import numpy as np import onnx.parser as oprs import os +from bitstring import BitArray from onnx import TensorProto, helper -from pyverilator.util.axi_utils 
import axilite_write, reset_rtlsim from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim @@ -65,6 +65,11 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pyverilate_get_liveness_threshold_cycles +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise): np.random.seed(0) @@ -159,13 +164,18 @@ def config_hook(configs): return None def write_swg_config(sim): - reset_rtlsim(sim) + pyxsi_utils.reset_rtlsim(sim) for axi_name, config in configs: # Write config registers to the SWG/FMPadding dict # defines (addr, value) tuples for config_entry in config.values(): - axilite_write(sim, config_entry[0], config_entry[1], basename=axi_name) - reset_rtlsim(sim) + addr, val = config_entry + if val < 0: + # ensure any negative vals are expressed as two's complement, + # SWG control regs are currently always 32 bits + val = BitArray(int=val, length=32).uint + pyxsi_utils.axilite_write(sim, addr, val, basename=axi_name) + pyxsi_utils.reset_rtlsim(sim) return write_swg_config @@ -290,6 +300,7 @@ def test_fpgadataflow_conv_dynamic(cfg): model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5, vitis=do_synth)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") # loop through experiment configurations for exp_cfg in exp_cfgs: @@ -535,6 +546,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") # Simulate 1 FM for each dimension in the series for i, ifm_dim in enumerate(ifm_dim_series): diff --git 
a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 6b79a39ed5..6507bf6710 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -165,6 +165,7 @@ def test_fpgadataflow_dwc_stitched_rtlsim(config, impl_style): model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") y = oxe.execute_onnx(model, input_dict)["outp"] assert ( diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index 2061601b4a..84c9f7f362 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -53,7 +53,6 @@ from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map -from finn.util.pyverilator import pyverilate_stitched_ip from finn.util.test import load_test_checkpoint_or_skip test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -239,39 +238,9 @@ def test_fpgadataflow_ipstitch_rtlsim(mem_mode): model = load_test_checkpoint_or_skip( ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode ) - model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd") - sim = pyverilate_stitched_ip(model) - exp_io = [ - "ap_clk", - "ap_rst_n", - "s_axis_0_tdata", - "s_axis_0_tready", - "s_axis_0_tvalid", - "m_axis_0_tdata", - "m_axis_0_tkeep", - "m_axis_0_tlast", - "m_axis_0_tready", - "m_axis_0_tvalid", - "s_axi_control_0_araddr", - "s_axi_control_0_arready", - "s_axi_control_0_arvalid", - "s_axi_control_0_awaddr", - "s_axi_control_0_awready", - "s_axi_control_0_awvalid", - "s_axi_control_0_bready", - "s_axi_control_0_bresp", - "s_axi_control_0_bvalid", - 
"s_axi_control_0_rdata", - "s_axi_control_0_rready", - "s_axi_control_0_rresp", - "s_axi_control_0_rvalid", - "s_axi_control_0_wdata", - "s_axi_control_0_wready", - "s_axi_control_0_wstrb", - "s_axi_control_0_wvalid", - ] - assert sorted(dir(sim.io)) == sorted(exp_io) + model.set_metadata_prop("rtlsim_trace", "whole_trace.wdb") model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") idt = model.get_tensor_datatype("inp") ishape = model.get_tensor_shape("inp") x = gen_finn_dt_tensor(idt, ishape) diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index 1ec77f4eec..6237c97782 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -723,8 +723,8 @@ def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(part, clk_ns)) - model.set_metadata_prop("rtlsim_so", "") model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] assert ( diff --git a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py index e6175ac58b..cd5bda6c27 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding_runtime.py @@ -31,7 +31,6 @@ import numpy as np import os from onnx import TensorProto, helper -from pyverilator.util.axi_utils import axilite_read, axilite_write from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold @@ -47,6 +46,12 @@ from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +try: + 
import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + + test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -186,6 +191,7 @@ def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tenso model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model = model.transform(PrepareRTLSim()) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) @@ -199,7 +205,9 @@ def test_runtime_thresholds_read(impl_style, idt_act_cfg, cfg, narrow, per_tenso def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + extracted_weight_stream.append( + pyxsi_utils.axilite_read(sim, addr, basename="s_axilite_0_") + ) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) @@ -299,6 +307,7 @@ def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, per_tens model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model = model.transform(PrepareRTLSim()) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) @@ -311,7 +320,7 @@ def test_runtime_thresholds_write(impl_style, idt_act_cfg, cfg, narrow, per_tens def write_weights(sim): addr = 0 for nw in T_write_stream: - axilite_write(sim, addr, nw, basename="s_axilite_0_") + pyxsi_utils.axilite_write(sim, addr, nw, basename="s_axilite_0_") addr += 4 T_read_stream = [] @@ -319,7 +328,7 @@ def write_weights(sim): def read_weights(sim): addr = 
0 for i in range(len(T_write_stream)): - T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + T_read_stream.append(pyxsi_utils.axilite_read(sim, addr, basename="s_axilite_0_")) addr += 4 rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, post_hook=read_weights) diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index 236176faa6..d16226010e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -457,6 +457,7 @@ def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, pa partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5)) # set top-level prop for stitched-ip rtlsim and launch partitioned_model.set_metadata_prop("exec_mode", "rtlsim") + partitioned_model.set_metadata_prop("rtlsim_backend", "pyxsi") # transpose input since we're now simulating HW layers (NCHW --> NHWC) input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1)) output_vvau_stitched = oxe.execute_onnx( diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 4ca61578c3..b63b531ff7 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -31,7 +31,6 @@ import numpy as np import os -from pyverilator.util.axi_utils import axilite_read, axilite_write from qonnx.core.datatype import DataType from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames @@ -45,6 +44,12 @@ from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.create import hls_random_mlp_maker +try: + import pyxsi_utils +except ModuleNotFoundError: + pyxsi_utils = None + + test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -89,6 +94,7 @@ def test_runtime_weights_single_layer(): model = model.transform(HLSSynthIP()) model = 
model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") + model.set_metadata_prop("rtlsim_backend", "pyxsi") in_tensor = np.asarray(range(mw), dtype=np.float32) # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while @@ -100,7 +106,9 @@ def test_runtime_weights_single_layer(): def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) + extracted_weight_stream.append( + pyxsi_utils.axilite_read(sim, addr, basename="s_axilite_0_") + ) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) @@ -121,7 +129,7 @@ def read_weights(sim): def write_weights(sim): addr = 0 for nw in new_weight_stream: - axilite_write(sim, addr, nw, basename="s_axilite_0_") + pyxsi_utils.axilite_write(sim, addr, nw, basename="s_axilite_0_") addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=write_weights) diff --git a/tests/util/test_data_packing.py b/tests/util/test_data_packing.py index a718f171e2..e821a2c6fb 100644 --- a/tests/util/test_data_packing.py +++ b/tests/util/test_data_packing.py @@ -99,9 +99,9 @@ def test_npy2apintstream(test_shape, dtype): f.write("\n".join(test_app_string)) cmd_compile = """ g++ -o test_npy2apintstream test.cpp $FINN_ROOT/deps/cnpy/cnpy.cpp \ --I$FINN_ROOT/deps/cnpy/ -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ +-I$FINN_ROOT/deps/cnpy/ -I{}/include -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ --std=c++11 -lz""".format( - os.environ["HLS_PATH"] + os.environ["HLS_PATH"], os.environ["VITIS_PATH"] ) with open(test_dir + "/compile.sh", "w") as f: f.write(cmd_compile) diff --git a/tests/util/test_hls_vector.py b/tests/util/test_hls_vector.py index 35d9b1b2fc..20fa0bf072 100644 --- a/tests/util/test_hls_vector.py +++ b/tests/util/test_hls_vector.py @@ -95,9 +95,9 @@ def test_npy2vectorstream(test_shape, dtype): 
f.write("\n".join(test_app_string)) cmd_compile = """ g++ -o test_npy2vectorstream test.cpp $FINN_ROOT/deps/cnpy/cnpy.cpp \ --I$FINN_ROOT/deps/cnpy/ -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ +-I$FINN_ROOT/deps/cnpy/ -I{}/include -I{}/include -I$FINN_ROOT/src/finn/qnn-data/cpp \ --std=c++14 -lz """.format( - os.environ["HLS_PATH"] + os.environ["HLS_PATH"], os.environ["VITIS_PATH"] ) with open(test_dir + "/compile.sh", "w") as f: f.write(cmd_compile)