Integration of new deconvolution implementation based on the REVD2 algorithm (Ongoing) #1263

Draft: wants to merge 13 commits into base: feature/deconv
2 changes: 1 addition & 1 deletion fetch-repos.sh
@@ -33,7 +33,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="d56b1d0c1eeb844a873fb29a29240a86e00d9f80"
HLSLIB_COMMIT="16cfc4b3ab895babf30f7db7c4bcac27d68317a9"
OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
33 changes: 25 additions & 8 deletions src/finn/custom_op/fpgadataflow/hls/deconvolution_hls.py
@@ -89,7 +89,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
weight_tensor = self.get_hw_compatible_weight_tensor(weights)
export_wdt = self.get_weight_datatype()
if weight_file_mode == "hls_header":
-weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", True, True)
+weight_hls_code = numpy_to_hls_code1(weight_tensor, export_wdt, "weights", False, True)
# write weights into C++ header file as dictated by finn-hlslib
f_weights = open(weight_file_name, "w")
f_weights.write(
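Side note on the True → False flip above: assuming `numpy_to_hls_code1` mirrors the signature of stock FINN's `numpy_to_hls_code(ndarray, dtype, hls_var_name, pack_innermost_dim, no_decl)` from `finn.util.data_packing`, the fourth argument controls whether the innermost (SIMD) dimension gets packed into a single `ap_uint` word. A hedged sketch of the difference, using the stock helper:

```python
# Sketch only: numpy_to_hls_code1 is this PR's local variant; the stock
# helper is used here to illustrate what pack_innermost_dim toggles.
import numpy as np
from qonnx.core.datatype import DataType
from finn.util.data_packing import numpy_to_hls_code

w = np.asarray([[1, -2], [3, 0]], dtype=np.float32)
# pack_innermost_dim=True: each row is folded into one ap_uint word
print(numpy_to_hls_code(w, DataType["INT4"], "weights", True, True))
# pack_innermost_dim=False: each element becomes its own ap_int<4> literal
print(numpy_to_hls_code(w, DataType["INT4"], "weights", False, True))
```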
@@ -118,19 +118,21 @@ def get_hw_compatible_weight_tensor(self, orig_weight_matrix):
wmem = self.calc_wmem()
assert orig_weight_matrix.shape == (
ofm_ch,
-k_h * k_w * ifm_ch,
+k_h,
+k_w,
+ifm_ch,
), """Weights matrix doesn't
-have expected shape (k_h*k_w*ifm_ch, ofm_ch)"""
+have expected shape (ofm_ch, k_h, k_w, ifm_ch)"""
assert ofm_ch % pe == 0, "Requirement output channels divisible by PE is violated."
assert ifm_ch % simd == 0, "Requirement input channels divisible by SIMD is violated."
-# interleave rows between PEs and reshape
+# distribute rows between PEs
ret = orig_weight_matrix
+ret = ret.reshape(ofm_ch, k_h * k_w * ifm_ch)
ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
-# create SIMD as innermost dimension and add a dummy outer dim
+# create SIMD as innermost dimension
ret = ret.reshape(1, pe, wmem, simd)
+# reverse the SIMD dimension
+ret = np.flip(ret, axis=-1)
ret = ret.transpose(0, 2, 1, 3)
return ret

def global_includes(self):
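For review purposes, a self-contained NumPy sketch of the layout the updated `get_hw_compatible_weight_tensor` produces (toy sizes; `interleave_matrix_outer_dim_from_partitions` is the qonnx helper FINN custom ops already import):

```python
import numpy as np
from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions

# toy config: 4 output channels, 2x2 kernel, 2 input channels, PE=SIMD=2
ofm_ch, k_h, k_w, ifm_ch, pe, simd = 4, 2, 2, 2, 2, 2
wmem = ofm_ch * k_h * k_w * ifm_ch // (pe * simd)  # 8

w = np.arange(ofm_ch * k_h * k_w * ifm_ch).reshape(ofm_ch, k_h, k_w, ifm_ch)
ret = w.reshape(ofm_ch, k_h * k_w * ifm_ch)
# distribute output-channel rows round-robin across the PEs
ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
# SIMD becomes the innermost dimension, then it is reversed
ret = ret.reshape(1, pe, wmem, simd)
ret = np.flip(ret, axis=-1)
# swap the PE and WMEM axes (final layout: 1 x WMEM x PE x SIMD)
ret = ret.transpose(0, 2, 1, 3)
assert ret.shape == (1, wmem, pe, simd)
```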
@@ -196,12 +198,27 @@ def strm_decl(self):
def docompute(self):
odtype = self.get_output_datatype()
pe = self.get_nodeattr("PE")
-ishape = self.get_normal_input_shape()
+simd = self.get_nodeattr("SIMD")
+i_ch = self.get_nodeattr("IFMChannels")
+k_h, k_w = self.get_nodeattr("KernelDim")
+s_h, s_w = self.get_nodeattr("Stride")
+i_h, i_w = self.get_nodeattr("IFMDim")
+p_h, p_w = self.get_nodeattr("Padding")
+if p_w >= k_w - s_w:
+    padup = 0
+else:
+    padup = (k_w - p_w - 1) // s_w
+crop = s_w * padup - ((k_w - s_w) - p_w)
+sf = i_ch // simd
+w_eff = padup + i_w + padup
+wo_eff = (w_eff - 1) * s_w + k_w
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream<hls::vector<{},{}>> strm;".format(odtype.get_hls_datatype_str(), pe)
]
self.code_gen_dict["$DOCOMPUTE$"].append("unsigned timeout = 0;")
self.code_gen_dict["$DOCOMPUTE$"].append("while(timeout < %s) {" % np.prod(ishape))
self.code_gen_dict["$DOCOMPUTE$"].append(
"while(timeout < %s) {" % (wo_eff * (crop + 1) * ((k_w / s_w) ** 2) * 2 * sf + 50)
)
self.code_gen_dict["$DOCOMPUTE$"].append(
"""deconv<Kernel, Stride, Padding, IFMH, IFMW, OCH, ICH, PE1, SIMD1>
(weights, in0_{}, out_{});""".format(
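Worked example of the new cppsim timeout bound, plugged with the defaults from the updated test below (k=4, stride=2, padding=1, idim=8, ifm_ch=2, simd=1); integer division assumed throughout:

```python
k_w, s_w, p_w, i_w, i_ch, simd = 4, 2, 1, 8, 2, 1

padup = 0 if p_w >= k_w - s_w else (k_w - p_w - 1) // s_w  # (4-1-1)//2 = 1
crop = s_w * padup - ((k_w - s_w) - p_w)                   # 2*1 - (2-1) = 1
sf = i_ch // simd                                          # SIMD folding = 2
w_eff = padup + i_w + padup                                # 10
wo_eff = (w_eff - 1) * s_w + k_w                           # 22
timeout = wo_eff * (crop + 1) * ((k_w // s_w) ** 2) * 2 * sf + 50
print(timeout)  # 22 * 2 * 4 * 2 * 2 + 50 = 754
```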
88 changes: 39 additions & 49 deletions tests/fpgadataflow/test_fpgadataflow_deconv.py
@@ -41,35 +41,23 @@
import finn.core.onnx_exec as oxe
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
-from finn.transformation.fpgadataflow.convert_to_hw_layers import (
-    InferConvInpGen,
-    InferQuantizedMatrixVectorActivation,
-)
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
-from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import (
-    InferPixelPaddingDeconv,
-)
-from finn.transformation.fpgadataflow.minimize_accumulator_width import (
-    MinimizeAccumulatorWidth,
-)
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
from finn.util.basic import pynq_part_map

test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
test_fpga_part = pynq_part_map[test_pynq_board]
target_clk_ns = 10


-def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def set_up_reference_model(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
idim_h, idim_w = idim
stride_h, stride_w = stride
odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
-odt = DataType["INT32"]

inp = helper.make_tensor_value_info(
"inp",
@@ -120,10 +108,10 @@ def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding):

model = model.transform(InferShapes())

-return model
+return model, w_tensor
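Both helpers rely on the standard ConvTranspose output-size relation (no dilation, no output_padding); with the values the updated test ends up using, it works out to 16:

```python
# odim = (idim - 1) * stride - 2 * padding + (k - 1) + 1
idim, stride, padding, k = 8, 2, 1, 4  # configuration from the test below
odim = (idim - 1) * stride - 2 * padding + (k - 1) + 1
assert odim == 16  # 7*2 - 2 + 3 + 1
```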


-def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
+def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor):
idim_h, idim_w = idim
stride_h, stride_w = stride
odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
@@ -140,8 +128,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
],
)
outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, ofm_ch])

W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch])
W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ofm_ch, k, k, ifm_ch])

Deconv = helper.make_node(
"Deconvolution_hls",
@@ -154,6 +141,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
OFMChannels=ofm_ch,
IFMDim=idim,
Stride=[stride_h, stride_w],
+Padding=[padding, padding],
PE=1,
SIMD=1,
inputDataType=idt.name,
@@ -180,7 +168,7 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):
model.set_tensor_datatype(model.graph.output[0].name, odt)
model.set_tensor_datatype("W", wdt)

-w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch * k * k, ofm_ch])
+w_tensor = w_tensor.transpose(1, 2, 3, 0)
model.set_initializer("W", w_tensor)

model = model.transform(InferShapes())
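The `transpose(1, 2, 3, 0)` above relies on `w_tensor` arriving in ONNX ConvTranspose layout, (IFMChannels, OFMChannels, kH, kW); the transpose reorders it into the (OFMChannels, kH, kW, IFMChannels) layout that the new `W` value_info declares. A quick shape check with toy values:

```python
import numpy as np

ifm_ch, ofm_ch, k = 2, 3, 4
w_ref = np.zeros((ifm_ch, ofm_ch, k, k))  # ONNX ConvTranspose layout
w_hls = w_ref.transpose(1, 2, 3, 0)       # layout for Deconvolution_hls
assert w_hls.shape == (ofm_ch, k, k, ifm_ch)
```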
@@ -189,33 +177,37 @@ def create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding):


# input image dimension
@pytest.mark.parametrize("idim", [[8, 8], [10, 8]])
@pytest.mark.parametrize("idim", [[8, 8]])
# number of rows and number of cols to add
@pytest.mark.parametrize("stride", [[2, 2], [2, 3]])
@pytest.mark.parametrize("stride", [[2, 2]])
# number of channels
@pytest.mark.parametrize("ifm_ch", [2])
# number of channels
@pytest.mark.parametrize("ofm_ch", [4])
@pytest.mark.parametrize("ofm_ch", [3])
# Input parallelism
@pytest.mark.parametrize("simd", [1, 2])
@pytest.mark.parametrize("simd", [1])
# PE
@pytest.mark.parametrize("pe", [1, 2])
@pytest.mark.parametrize("pe", [1])
# kernel size
@pytest.mark.parametrize("k", [2])
@pytest.mark.parametrize("k", [4])
# padding
@pytest.mark.parametrize("padding", [0, 1])
@pytest.mark.parametrize("padding", [1])
# exec mode
@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
@pytest.mark.parametrize("exec_mode", ["cppsim"])
@pytest.mark.fpgadataflow
@pytest.mark.slow
@pytest.mark.vivado
def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode):
idt = wdt = DataType["INT4"]
idt = wdt = DataType["INT8"]
wdt = idt
odt = DataType["INT32"]
idim_h, idim_w = idim
stride_h, stride_w = stride

-ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding)
+ref_model, w_tensor = set_up_reference_model(
+    idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding
+)
+model = create_deconv_node(idt, wdt, odt, k, idim, ifm_ch, ofm_ch, stride, padding, w_tensor)

odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1
odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1
@@ -225,34 +217,31 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,

y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"]

-model = ref_model.transform(InferPixelPaddingDeconv())
-model = model.transform(InferConvInpGen())
-model = model.transform(InferQuantizedMatrixVectorActivation())
-model = model.transform(InferShapes())
-model = model.transform(GiveUniqueNodeNames())
-
-y_produced = oxe.execute_onnx(model, input_dict)["outp"]
-assert (y_produced == y_expected).all()
+# model = model.transform(InferShapes())
+# model = model.transform(GiveUniqueNodeNames())
+input_tensor_nhwc = input_tensor.transpose(0, 2, 3, 1)
+input_dict_nhwc = {"inp": input_tensor_nhwc}
+# y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
+# assert (y_produced == y_expected).all()

-model = model.transform(SpecializeLayers(test_fpga_part))
-model = model.transform(MinimizeAccumulatorWidth())
+# model = model.transform(SpecializeLayers(test_fpga_part))
+# model = model.transform(MinimizeAccumulatorWidth())

for n in model.graph.node:
if n.op_type.startswith("ConvolutionInputGenerator"):
convinputgen_node = getCustomOp(n)
convinputgen_node.set_nodeattr("SIMD", simd)
elif n.op_type.startswith("MVAU"):
mvau_node = getCustomOp(n)
mvau_node.set_nodeattr("PE", pe)
mvau_node.set_nodeattr("SIMD", simd)

expected_oshape = (1, ofm_ch, odim_h, odim_w)
if n.op_type.startswith("Deconvolution_hls"):
deconv_node = getCustomOp(n)
deconv_node.set_nodeattr("PE", pe)
deconv_node.set_nodeattr("SIMD", simd)

expected_oshape = (1, odim_h, odim_w, ofm_ch)
# model.save("deconv.onnx")
# cppsim
if exec_mode == "cppsim":
model = model.transform(GiveUniqueNodeNames())
model = model.transform(PrepareCppSim())
model = model.transform(CompileCppSim())
model = model.transform(SetExecMode("cppsim"))
+# breakpoint()

# rtlsim
else:
@@ -262,12 +251,13 @@ def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding,
model = model.transform(PrepareRTLSim())
model = model.transform(SetExecMode("rtlsim"))

y_produced = oxe.execute_onnx(model, input_dict)["outp"]
y_produced = oxe.execute_onnx(model, input_dict_nhwc)["outp"]
assert y_produced.shape == expected_oshape
y_produced = y_produced.transpose(0, 3, 1, 2)
assert (y_produced == y_expected).all()

if exec_mode == "rtlsim":
node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0]
node = model.get_nodes_by_op_type("Deconvolution_hls")[0]
inst = getCustomOp(node)
cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim")
exp_cycles_dict = model.analysis(exp_cycles_per_layer)
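A closing note on the layout juggling in the test: the ONNX reference model executes in NCHW, while the Deconvolution_hls node consumes and produces NHWC, hence the transpose before `execute_onnx` and the transpose back before comparing against `y_expected`. A minimal round-trip sketch:

```python
import numpy as np

x_nchw = np.random.rand(1, 3, 8, 8)    # reference-model layout
x_nhwc = x_nchw.transpose(0, 2, 3, 1)  # NCHW -> NHWC for the HLS node
x_back = x_nhwc.transpose(0, 3, 1, 2)  # NHWC -> NCHW for the comparison
assert np.array_equal(x_back, x_nchw)
```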