Skip to content

Commit

Permalink
northd: Fix broadcast on transit spine switches.
Browse files Browse the repository at this point in the history
Currently, OVN can't learn FDB entries from remote ports, because
learning is happening in the ingress pipeline, while only the egress
pipeline is executed in the destination availability zone.  So, the
destination zone can't learn the source MAC address and has to
broadcast replies.  The source zone also can't learn from replies,
so has to broadcast all the subsequent packets as well.

Fix that by introducing FDB learning in the egress pipeline for
remote ports.

Note: For some reason remote ports were not tracked by ovn-controller.
This wasn't a big problem because we didn't use inport/outport matches
on the traffic from remote ports in most cases.  However, FDB lookups
need to match on the input port, and ovn-controller will skip those
flows if ports are not related, so marking them as such. (The removal
is already handled in a generic way.)

Fixes: 67100f0 ("ic: Add support for spine-leaf topology for transit switches.")
Signed-off-by: Ilya Maximets <[email protected]>
  • Loading branch information
igsilya committed Feb 11, 2025
1 parent be08a37 commit 46fda79
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 39 deletions.
7 changes: 6 additions & 1 deletion controller/binding.c
Original file line number Diff line number Diff line change
Expand Up @@ -2219,7 +2219,9 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
}

case LP_REMOTE:
/* Nothing to be done for REMOTE type. */
/* Remote ports are related, since we can have rules in the
* egress pipeline matching on traffic coming from another AZ. */
update_related_lport(pb, b_ctx_out);
break;

case LP_UNKNOWN: {
Expand Down Expand Up @@ -3131,6 +3133,9 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in,
}

case LP_REMOTE:
update_related_lport(pb, b_ctx_out);
break;

case LP_UNKNOWN:
break;
}
Expand Down
2 changes: 1 addition & 1 deletion lib/ovn-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ BUILD_ASSERT_DECL(
#define SCTP_ABORT_CHUNK_FLAG_T (1 << 0)

/* The number of tables for the ingress and egress pipelines. */
#define LOG_PIPELINE_LEN 30
#define LOG_PIPELINE_LEN 33

static inline uint32_t
hash_add_in6_addr(uint32_t hash, const struct in6_addr *addr)
Expand Down
18 changes: 14 additions & 4 deletions northd/northd.c
Original file line number Diff line number Diff line change
Expand Up @@ -5763,8 +5763,9 @@ build_lswitch_learn_fdb_op(
return;
}

if (!strcmp(op->nbsp->type, "")
|| lsp_is_switch(op->nbsp)
bool remote = lsp_is_remote(op->nbsp);

if (remote || !strcmp(op->nbsp->type, "") || lsp_is_switch(op->nbsp)
|| (lsp_is_localnet(op->nbsp) && localnet_can_learn_mac(op->nbsp))) {
ds_clear(match);
ds_clear(actions);
Expand All @@ -5775,15 +5776,19 @@ build_lswitch_learn_fdb_op(
ds_put_format(actions, REGBIT_LKUP_FDB
" = lookup_fdb(inport, eth.src); next;");
ovn_lflow_add_with_lport_and_hint(lflows, op->od,
S_SWITCH_IN_LOOKUP_FDB, 100,
remote ? S_SWITCH_OUT_LOOKUP_FDB
: S_SWITCH_IN_LOOKUP_FDB,
100,
ds_cstr(match), ds_cstr(actions),
op->key, &op->nbsp->header_,
op->lflow_ref);

ds_put_cstr(match, " && "REGBIT_LKUP_FDB" == 0");
ds_clear(actions);
ds_put_cstr(actions, "put_fdb(inport, eth.src); next;");
ovn_lflow_add_with_lport_and_hint(lflows, op->od, S_SWITCH_IN_PUT_FDB,
ovn_lflow_add_with_lport_and_hint(lflows, op->od,
remote ? S_SWITCH_OUT_PUT_FDB
: S_SWITCH_IN_PUT_FDB,
100, ds_cstr(match),
ds_cstr(actions), op->key,
&op->nbsp->header_,
Expand All @@ -5803,6 +5808,11 @@ build_lswitch_learn_fdb_od(
lflow_ref);
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
"outport = get_fdb(eth.dst); next;", lflow_ref);

ovn_lflow_add(lflows, od, S_SWITCH_OUT_LOOKUP_FDB, 0, "1", "next;",
lflow_ref);
ovn_lflow_add(lflows, od, S_SWITCH_OUT_PUT_FDB, 0, "1", "next;",
lflow_ref);
}

/* Egress tables 8: Egress port security - IP (priority 0)
Expand Down
24 changes: 13 additions & 11 deletions northd/northd.h
Original file line number Diff line number Diff line change
Expand Up @@ -498,17 +498,19 @@ enum ovn_stage {
PIPELINE_STAGE(SWITCH, IN, L2_UNKNOWN, 29, "ls_in_l2_unknown") \
\
/* Logical switch egress stages. */ \
PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 0, "ls_out_pre_acl") \
PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 1, "ls_out_pre_lb") \
PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \
PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 3, "ls_out_acl_hint") \
PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL, 4, "ls_out_acl_eval") \
PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE, 5, "ls_out_acl_sample") \
PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION, 6, "ls_out_acl_action") \
PIPELINE_STAGE(SWITCH, OUT, QOS, 7, "ls_out_qos") \
PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 8, "ls_out_stateful") \
PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC, 9, "ls_out_check_port_sec") \
PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 10, "ls_out_apply_port_sec") \
PIPELINE_STAGE(SWITCH, OUT, LOOKUP_FDB, 0, "ls_out_lookup_fdb") \
PIPELINE_STAGE(SWITCH, OUT, PUT_FDB, 1, "ls_out_put_fdb") \
PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 2, "ls_out_pre_acl") \
PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 3, "ls_out_pre_lb") \
PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 4, "ls_out_pre_stateful") \
PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 5, "ls_out_acl_hint") \
PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL, 6, "ls_out_acl_eval") \
PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE, 7, "ls_out_acl_sample") \
PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION, 8, "ls_out_acl_action") \
PIPELINE_STAGE(SWITCH, OUT, QOS, 9, "ls_out_qos") \
PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 10, "ls_out_stateful") \
PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC, 11, "ls_out_check_port_sec") \
PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 12, "ls_out_apply_port_sec") \
\
/* Logical router ingress stages. */ \
PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \
Expand Down
93 changes: 71 additions & 22 deletions tests/ovn-ic.at
Original file line number Diff line number Diff line change
Expand Up @@ -2779,6 +2779,26 @@ ovn_as az3
check ovn-nbctl --wait=hv sync
ovn-sbctl dump-flows > az3/sbflows

dnl Check that FDB learning is enabled for remote ports in the egress pipeline.
AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az1/sbflows | ovn_strip_lflows], [0], [dnl
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
])
AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az2/sbflows | ovn_strip_lflows], [0], [dnl
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
])
AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az3/sbflows | ovn_strip_lflows], [0], [dnl
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
])

check ovn-ic-nbctl --wait=sb sync

ovn-ic-nbctl show > ic-nbctl.dump
Expand Down Expand Up @@ -2836,15 +2856,13 @@ ovn_as az1
wait_row_count FDB 1
check ovn-nbctl --wait=hv sync

# Only one entry is expected in the other zones as well - the entry for
# the ls[23]-to-spine port in ls[23] switches. Technically, we also need
# an entry for a remote spine-to-ls1 port, but learning from remote ports
# is not implemented yet.
# Two entries are expected in the other zones - one for the remote port on
# the spine switch and one for the switch port on the leaf.
ovn_as az2
wait_row_count FDB 1
wait_row_count FDB 2
check ovn-nbctl --wait=hv sync
ovn_as az3
wait_row_count FDB 1
wait_row_count FDB 2
check ovn-nbctl --wait=hv sync

# FDB entry was created from the userspace() action in the datapath, but
Expand Down Expand Up @@ -2887,9 +2905,10 @@ OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])
reply=$(fmt_pkt "Ether(dst='${src_mac}', src='${dst_mac}')/ \
IP(src='${dst_ip}', dst='${src_ip}')/ \
UDP(sport=4369, dport=1538)")
# Reply packet is still learned and broadcasted in the spine switch, because
# learning from remote ports is not implemented, so we don't know where the
# vif1 is located, even though we received some traffic from it.
# For the reply packet we expect only one userspace action for FDB update on
# the spine switch and only one tunnel push and send, because we already
# learned that MAC of vif1 is behind spine-ls1 and no longer need to broadcast
# to zone 3.
AT_CHECK([ovn_as az2 as hv2 ovs-appctl ofproto/trace --names \
br-int in_port=vif3 $reply > ofproto-trace-2])
AT_CAPTURE_FILE([ofproto-trace-2])
Expand All @@ -2899,34 +2918,64 @@ Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:
AT_CHECK([cat ofproto-trace-2 | tail -1 \
| grep -oE 'tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
userspace
clone
tnl_push
br-phys_n1
tnl_push
br-phys_n1
])

# Now actually send it.
check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply

# Zones 1 and 2 should have 2 FDB entries now each. One per side of a
# switch-switch port connecting ls[12] with the spine. Zone 3 only has two
# entries on ls3 for traffic broadcasted in the spine from both vif1 and vif3.
ovn_as az1 wait_row_count FDB 2
ovn_as az2 wait_row_count FDB 2
ovn_as az3 wait_row_count FDB 2

AT_CHECK([echo $reply > reply])
# Check that it is delivered where needed and not delivered where not.
# While the traffic is broadcasted within the spine and arrives in zone 3, the
# packets must be dropped, because ls3 learned that their destination addresses
# are behind the spine switch, so no new packets should be seen on vif5.
OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])

# Zones 1 and 2 should have 2 FDB entries now each. One for a remote port and
# one per side of a switch-switch port connecting ls[12] with the spine. Zone
# 3 still only has two entries created for the original request from vif1.
ovn_as az1
wait_row_count FDB 3
check ovn-nbctl --wait=hv sync
ovn_as az2
wait_row_count FDB 3
check ovn-nbctl --wait=hv sync
ovn_as az3
wait_row_count FDB 2
check ovn-nbctl --wait=hv sync

# Packets should flow directly to the destination (via tunnels) in both
# directions now.
AT_CHECK([as hv1 ovs-appctl ofproto/trace --names \
br-int in_port=vif1 $packet | tail -2 \
| grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
Megaflow: recirc_id=0,eth,ip,in_port=vif1,dl_src=f0:00:00:01:02:01,dl_dst=f0:00:00:01:02:03,nw_ecn=0,nw_frag=no
tnl_push
br-phys_n1
])
AT_CHECK([as hv2 ovs-appctl ofproto/trace --names \
br-int in_port=vif3 $reply | tail -2 \
| grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:00:01:02:01,nw_ecn=0,nw_frag=no
tnl_push
br-phys_n1
])

# Send and check one more time.
check as hv1 ovs-appctl netdev-dummy/receive vif1 $packet
check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply

AT_CHECK([cp expected expected-vif5])
AT_CHECK([echo $packet >> expected])
AT_CHECK([echo $reply >> reply])
OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected-vif5])

OVN_CLEANUP_IC([az1], [az2], [az3])
AT_CLEANUP
])

0 comments on commit 46fda79

Please sign in to comment.