Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xdp-forward: Introduce xdp-fwd-flowtable support #441

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions headers/linux/hlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@

struct list_head;

/*
 * Singly-linked list node: holds only a pointer to the next rhash_head.
 * NOTE(review): presumably a minimal stand-in for the kernel's rhashtable
 * node so that structures embedding it keep the same layout - confirm.
 */
struct rhash_head {
struct rhash_head *next;
};

#define HLIST_POISON_POINTER_DELTA 0
#define HLIST_POISON1 ((void *) 0x100 + HLIST_POISON_POINTER_DELTA)
#define HLIST_POISON2 ((void *) 0x200 + HLIST_POISON_POINTER_DELTA)
Expand Down
114 changes: 114 additions & 0 deletions headers/linux/netfilter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#ifndef _LINUX_NETFILTER_H
#define _LINUX_NETFILTER_H

#include <stdbool.h>
#include <linux/types.h>
#include <bpf/bpf_helpers.h>
#include <xdp/parsing_helpers.h>

#include "hlist.h"

/* Source/destination port pair, both held as big-endian 16-bit values. */
struct flow_ports {
__be16 source, dest;
};

/*
 * Direction of a connection-tracking tuple. IP_CT_DIR_MAX comes right after
 * the two real directions, so its value (2) is the number of directions.
 */
enum ip_conntrack_dir {
IP_CT_DIR_ORIGINAL,
IP_CT_DIR_REPLY,
IP_CT_DIR_MAX
};

/*
 * Flow offload directions. Each enumerator is defined in terms of the matching
 * ip_conntrack_dir value, so the two enums are numerically identical.
 * FLOW_OFFLOAD_DIR_MAX is used as the size of flow_offload.tuplehash[].
 */
enum flow_offload_tuple_dir {
FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX,
};

/*
 * Kinds of flow offload entry.
 * NOTE(review): presumably the values stored in flow_offload.type - confirm.
 */
enum flow_offload_type {
NF_FLOW_OFFLOAD_UNSPEC,
NF_FLOW_OFFLOAD_ROUTE,
};

/*
 * Flag identifiers for a flow. The enumerators are sequential integers
 * (0, 1, 2, ...), not bit masks.
 * NOTE(review): presumably bit numbers within flow_offload.flags - confirm
 * against the kernel definition.
 */
enum nf_flow_flags {
NF_FLOW_SNAT,
NF_FLOW_DNAT,
NF_FLOW_TEARDOWN,
NF_FLOW_HW,
NF_FLOW_HW_DYING,
NF_FLOW_HW_DEAD,
NF_FLOW_HW_PENDING,
NF_FLOW_HW_BIDIRECTIONAL,
NF_FLOW_HW_ESTABLISHED,
};

/*
 * Transmit path selectors for an offloaded flow.
 * NOTE(review): presumably the values held in flow_offload_tuple.xmit_type
 * (a 3-bit field, wide enough for all enumerators here) - confirm.
 */
enum flow_offload_xmit_type {
FLOW_OFFLOAD_XMIT_UNSPEC,
FLOW_OFFLOAD_XMIT_NEIGH,
FLOW_OFFLOAD_XMIT_XFRM,
FLOW_OFFLOAD_XMIT_DIRECT,
FLOW_OFFLOAD_XMIT_TC,
};

/* Number of entries in flow_offload_tuple.encap[]. */
#define NF_FLOW_TABLE_ENCAP_MAX 2
/*
 * One direction of an offloaded flow.
 *
 * The members declared before __hash form the lookup key; the members after
 * it carry per-tuple data that is not part of the key.
 * NOTE(review): the layout presumably has to match the kernel's
 * struct flow_offload_tuple - do not reorder or resize members without
 * confirming.
 */
struct flow_offload_tuple {
/* Source address; the IPv4 and IPv6 forms share the same storage. */
union {
struct in_addr src_v4;
struct in6_addr src_v6;
};
/* Destination address; the IPv4 and IPv6 forms share the same storage. */
union {
struct in_addr dst_v4;
struct in6_addr dst_v6;
};
/* Layer-4 ports, big-endian. */
struct {
__be16 src_port;
__be16 dst_port;
};

/* Input interface index (part of the key). */
int iifidx;

/* NOTE(review): l3proto presumably tells which address union member is valid - confirm. */
__u8 l3proto;
__u8 l4proto;
/* Encapsulation headers: an id and a big-endian protocol per entry. */
struct {
__u16 id;
__be16 proto;
} encap[NF_FLOW_TABLE_ENCAP_MAX];

/* All members above are keys for lookups, see flow_offload_hash(). */
/* Zero-size marker member separating the key from the remaining data. */
struct { } __hash;

/* Bit-fields declared on a __u8 base type; the widths add up to 9 bits. */
__u8 dir:2,
xmit_type:3,
encap_num:2,
in_vlan_ingress:2;
__u16 mtu;
/*
 * Output information: three alternative layouts sharing storage.
 * NOTE(review): presumably xmit_type selects which layout is valid - confirm.
 */
union {
struct {
struct dst_entry *dst_cache;
__u32 dst_cookie;
};
struct {
__u32 ifidx;
__u32 hw_ifidx;
__u8 h_source[ETH_ALEN];
__u8 h_dest[ETH_ALEN];
} out;
struct {
__u32 iifidx;
} tc;
};
};

/* A flow tuple together with the rhash_head node that links it. */
struct flow_offload_tuple_rhash {
struct rhash_head node;
struct flow_offload_tuple tuple;
};

/*
 * A flow entry: one tuplehash slot per direction (FLOW_OFFLOAD_DIR_MAX
 * entries), a pointer to a struct nf_conn, a flags word, a type and a timeout.
 * NOTE(review): flags presumably holds nf_flow_flags bits and type a
 * flow_offload_type value - confirm.
 */
struct flow_offload {
struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
struct nf_conn *ct;
unsigned long flags;
__u16 type;
__u32 timeout;
};

#endif /* _LINUX_NETFILTER_H */
2 changes: 1 addition & 1 deletion lib/testing/test_runner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ ALL_TESTS=""
VERBOSE_TESTS=${V:-0}
NUM_NS=2

NEEDED_TOOLS="capinfos ethtool ip ping sed tc tcpdump timeout nc tshark"
NEEDED_TOOLS="capinfos ethtool ip ping sed tc tcpdump timeout nc tshark nft"

if [ -f "$TEST_CONFIG" ]; then
source "$TEST_CONFIG"
Expand Down
2 changes: 1 addition & 1 deletion xdp-forward/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0

XDP_TARGETS := xdp_forward.bpf
XDP_TARGETS := xdp_forward.bpf xdp_flowtable.bpf xdp_flowtable_sample.bpf
BPF_SKEL_TARGETS := $(XDP_TARGETS)

XDP_OBJ_INSTALL :=
Expand Down
61 changes: 52 additions & 9 deletions xdp-forward/README.org
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ xdp-forward is an XDP forwarding plane, which will accelerate packet forwarding
using XDP. To use it, simply load it on the set of interfaces to accelerate
forwarding between. The userspace component of xdp-forward will then configure
and load XDP programs on those interfaces, and forward packets between them
using XDP_REDIRECT, using the kernel routing table to determine the destination
if each packet.
using XDP_REDIRECT, using the kernel routing table or netfilter flowtable to
determine the destination for each packet.

Any packets that xdp-forward does not know how to forward will be passed up to
the networking stack and handled by the kernel like normal. Depending on the
Expand Down Expand Up @@ -56,7 +56,11 @@ Specifies which forwarding mode =xdp-forward= should operate in. Depending on
the mode selected, =xdp-forward= will perform forwarding in different ways,
which can lead to different behaviour, including which subset of kernel
configuration (such as firewall rules) is respected during forwarding. See the
section *OPERATING MODES* below for a full description of each mode.
section *FORWARDING MODES* below for a full description of each mode.

** -F, --fib-mode <mode>
Specifies how =xdp-forward= performs routing table lookups in the Linux kernel.
See the section *FIB MODES* below for a full description of each mode.

** -m, --mode <mode>
Specifies which mode the XDP program should be loaded in. The valid values
Expand Down Expand Up @@ -98,12 +102,12 @@ Enable debug logging. Specify twice for even more verbosity.
** -h, --help
Display a summary of the available options

* OPERATING MODES
The =xdp-forward= utility supports the following operating modes (selected by
* FORWARDING MODES
The =xdp-forward= utility supports the following forwarding modes (selected by
the =--fwd-mode= parameter to =xdp-forward load=).

** fib-full (default)
In the =fib-full= operating mode, =xdp-forward= will perform a full lookup in
** fib (default)
In the =fib= forwarding mode, =xdp-forward= will perform a lookup in
the kernel routing table (or FIB) for each packet, and forward packets between
the configured interfaces based on the result of the lookup. Any packet where
the lookup fails will be passed up to the stack. This includes packets that
Expand All @@ -115,12 +119,51 @@ Note that no checks other than the FIB lookup is performed; in particular, this
completely bypasses the netfilter subsystem, so firewall rules will not be
checked before forwarding.

** fib-direct
The =fib-direct= mode functions like =fib-full=, except it passes the
** flowtable
The =flowtable= forwarding mode offloads the netfilter software flowtable logic
to the XDP layer if the hardware flowtable is not available.
At the moment, =xdp-forward= is only able to offload TCP and UDP netfilter
flowtable entries to XDP. The user is expected to configure the flowtable
separately.
LorenzoBianconi marked this conversation as resolved.
Show resolved Hide resolved

* FIB MODES
The =xdp-forward= utility supports the following fib modes (selected by
the =--fib-mode= parameter to =xdp-forward load=).

** full (default)
In the =full= FIB mode, =xdp-forward= will perform a full lookup in
the kernel routing table (or FIB) for each packet, and forward packets between
the configured interfaces based on the result of the lookup. In particular,
it will apply any policy routing rules configured by the user.

** direct
The =direct= mode functions like =full=, except it passes the
=BPF_FIB_LOOKUP_DIRECT= flag to the FIB lookup routine. This means that any
policy routing rules configured will be skipped during the lookup, which can
improve performance (but won't obey the policy of those rules, obviously).

* Examples

In order to enable flowtable offloading for tcp and udp traffic between NICs
n0 and n1, issue the following commands:
LorenzoBianconi marked this conversation as resolved.
Show resolved Hide resolved

#+begin_src sh
#nft -f /dev/stdin <<EOF
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { n0, n1 }
}
chain forward {
type filter hook forward priority filter
meta l4proto { tcp, udp } flow add @ft
}
}
EOF

#xdp-forward load -f flowtable n0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, so upon seeing this, I realised there's a semantic difference between the flowtable XDP program and the fib XDP program: The latter uses bpf_redirect_map(), and explicitly sets up the map with only the interfaces passed on the command line. Whereas the flowtable program uses bpf_redirect(), and will redirect to any interface that the fib lookup succeeds against.

This is pretty inconsistent, and we should not have this difference only based on the mode the program is loaded in. Also, the way the fib program works is deliberate: the user should be able to select only a subset of interfaces to forward between (in the future I am planning to add an auto mode that uses XDP feature flags to automatically select compatible interfaces). So please change the flowtable program to behave the same as the other one.

And sorry for not realising this sooner. I did notice the difference in helpers, but didn't realise the implications until seeing this documentation change just now :(

#+end_src

* SEE ALSO
=libxdp(3)= for details on the XDP loading semantics and kernel compatibility
requirements.
Expand Down
91 changes: 85 additions & 6 deletions xdp-forward/tests/test-xdp-forward.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
XDP_LOADER=${XDP_LOADER:-./xdp-loader}
XDP_FORWARD=${XDP_FORWARD:-./xdp-forward}
ALL_TESTS="test_ping test_load test_fwd_full test_fwd_direct"

ALL_TESTS="test_ping test_load test_fwd_full test_fwd_direct test_flowtable"

test_ping()
{
Expand All @@ -27,7 +26,7 @@ test_fwd_full()
# veth NAPI GRO support added this symbol; forwarding won't work without it
skip_if_missing_kernel_symbol veth_set_features

check_run $XDP_FORWARD load -f fib-full ${NS_NAMES[@]}
check_run $XDP_FORWARD load -f fib -F full ${NS_NAMES[@]}
for ip in "${ALL_INSIDE_IP4[@]}"; do
check_run ns_exec ping -c 1 -W 2 $ip
done
Expand All @@ -42,7 +41,7 @@ test_fwd_direct()
# veth NAPI GRO support added this symbol; forwarding won't work without it
skip_if_missing_kernel_symbol veth_set_features

check_run $XDP_FORWARD load -f fib-direct ${NS_NAMES[@]}
check_run $XDP_FORWARD load -f fib -F direct ${NS_NAMES[@]}
for ip in "${ALL_INSIDE_IP4[@]}"; do
check_run ns_exec ping -c 1 -W 2 $ip
done
Expand All @@ -52,8 +51,88 @@ test_fwd_direct()
check_run $XDP_FORWARD unload ${NS_NAMES[@]}
}

# End-to-end test of the "flowtable" forwarding mode.
#
# Steps, in order:
#  - guard on the veth_set_features and bpf_xdp_flow_lookup kernel symbols
#    via skip_if_missing_kernel_symbol
#  - turn off tx/rx checksum offload on both ends of every test interface
#  - in the main namespace, install nft rules that DNAT TCP port 12345 arriving
#    on the first interface to port 10000 on the last inside address,
#    masquerade traffic leaving via the last interface, and add TCP flows to a
#    flowtable spanning the first and last interfaces
#  - in the last namespace, install an input filter (policy drop) that only
#    accepts TCP port 10000 from the outside address to the last inside address
#  - load xdp-forward in flowtable mode on the first interface
#  - push a 256 KiB random file through nc from the first namespace to the
#    outside address on port 12345, once over IPv4 and once over IPv6
test_flowtable()
{
local INPUT_FILE="${STATEDIR}/in_$$_$RANDOM"

# veth NAPI GRO support added this symbol; forwarding won't work without it
skip_if_missing_kernel_symbol veth_set_features
# check if bpf flowtable lookup is available
skip_if_missing_kernel_symbol bpf_xdp_flow_lookup

# disable {tx,rx} checksum offload since it is not currently supported
# by XDP_REDIRECT
for n in ${NS_NAMES[@]}; do
ip netns exec $n ethtool -K veth0 tx-checksumming off rx-checksumming off
ethtool -K $n tx-checksumming off rx-checksumming off
done

# create data to send via tcp (32 blocks of 8192 bytes = 256 KiB)
dd if=/dev/urandom of="${INPUT_FILE}" bs=8192 count=32 status=none

# create flowtable configuration in the main namespace
check_run nft -f /dev/stdin <<EOF
table inet nat {
# enable DNAT to server <ip:port> in pre-routing chain
chain prerouting {
type nat hook prerouting priority filter; policy accept;
iifname == "${NS_NAMES[0]}" meta nfproto ipv4 tcp dport 12345 dnat ip to ${ALL_INSIDE_IP4[-1]}:10000
iifname == "${NS_NAMES[0]}" meta nfproto ipv6 tcp dport 12345 dnat ip6 to [${ALL_INSIDE_IP6[-1]}]:10000
LorenzoBianconi marked this conversation as resolved.
Show resolved Hide resolved
}
# enable SNAT of the client ip via masquerading in post-routing chain
chain postrouting {
type nat hook postrouting priority filter; policy accept;
oifname "${NS_NAMES[-1]}" masquerade
}
}
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { ${NS_NAMES[0]}, ${NS_NAMES[-1]} }
}
chain forward {
type filter hook forward priority filter
meta l4proto { tcp } flow add @ft
}
}
EOF

# Add some nft rules inside the last test namespace to check that the
# {dnat/snat} configured in the main namespace is done properly: only
# traffic with the translated addresses and port is accepted there
check_run ip netns exec ${NS_NAMES[-1]} nft -f /dev/stdin <<EOF
table inet filter {
chain input {
type filter hook input priority 0; policy drop
ip saddr $OUTSIDE_IP4 ip daddr ${ALL_INSIDE_IP4[-1]} tcp dport 10000 accept
ip6 saddr $OUTSIDE_IP6 ip6 daddr ${ALL_INSIDE_IP6[-1]} tcp dport 10000 accept
}
}
EOF
# wait a bit to configure nft
sleep 2

check_run $XDP_FORWARD load -f flowtable ${NS_NAMES[0]}

# IPv4: listen on the DNAT target port, then send the file to the pre-DNAT port
PID=$(start_background_ns_devnull "nc -4 -l --no-shutdown 10000")
check_run ip netns exec ${NS_NAMES[0]} nc -w 1 -4 ${OUTSIDE_IP4} 12345 < ${INPUT_FILE}
stop_background $PID

# IPv6: same transfer over IPv6
PID=$(start_background_ns_devnull "nc -6 -l --no-shutdown 10000")
check_run ip netns exec ${NS_NAMES[0]} nc -w 1 -6 ${OUTSIDE_IP6} 12345 < ${INPUT_FILE}
stop_background $PID
}

cleanup_tests()
{
$XDP_FORWARD unload ${NS_NAMES[@]} >/dev/null 2>&1
$XDP_LOADER unload $NS --all >/dev/null 2>&1
# enable {tx,rx} checksum offload
for n in ${NS_NAMES[@]}; do
ip netns exec $n ethtool -K veth0 tx-checksumming on rx-checksumming on
ethtool -K $n tx-checksumming on rx-checksumming on
done >/dev/null 2>&1
{
$XDP_FORWARD unload ${NS_NAMES[@]}
$XDP_LOADER unload $NS --all
check_run ip netns exec ${NS_NAMES[-1]} nft flush ruleset
check_run nft flush ruleset
} >/dev/null 2>&1
}
Loading