From 0b59c3b2a7249e2569e6e2b3f7ffabb591bdc608 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Thu, 14 Sep 2017 14:30:05 +0300 Subject: [PATCH 1/2] UCT/UD: Filter incoming packets by DGID When UD is used with RoCE, packets intended for other peers may arrive (as a result of unicast flooding). These packets needs to be dropped. --- src/uct/ib/ud/accel/ud_mlx5.c | 8 ++++ src/uct/ib/ud/base/ud_iface.c | 68 +++++++++++++++++++++++++++++++++- src/uct/ib/ud/base/ud_iface.h | 37 ++++++++++++++++++ src/uct/ib/ud/verbs/ud_verbs.c | 5 +++ 4 files changed, 116 insertions(+), 2 deletions(-) diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 907c187abcb..3e254103533 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -402,6 +402,14 @@ ucs_status_t uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) len = ntohl(cqe->byte_cnt); VALGRIND_MAKE_MEM_DEFINED(packet, len); + + if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + ucs_mpool_put_inline(desc); + status = UCS_ERR_NO_PROGRESS; + goto out; + } + uct_ib_mlx5_log_rx(&iface->super.super, IBV_QPT_UD, cqe, packet, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c index 378cfbb88b1..f162a698551 100644 --- a/src/uct/ib/ud/base/ud_iface.c +++ b/src/uct/ib/ud/base/ud_iface.c @@ -14,8 +14,22 @@ #include #include #include +#include +#define UCT_UD_IPV4_ADDR_LEN sizeof(struct in_addr) +#define UCT_UD_IPV6_ADDR_LEN sizeof(struct in6_addr) + +#if ENABLE_STATS +static ucs_stats_class_t uct_ud_iface_stats_class = { + .name = "ud_iface", + .num_counters = UCT_UD_IFACE_STAT_LAST, + .counter_names = { + [UCT_UD_IFACE_STAT_RX_DROP] = "rx_drop" + } +}; +#endif + SGLIB_DEFINE_LIST_FUNCTIONS(uct_ud_iface_peer_t, uct_ud_iface_peer_cmp, next) SGLIB_DEFINE_HASHED_CONTAINER_FUNCTIONS(uct_ud_iface_peer_t, UCT_UD_HASH_SIZE, @@ -387,6 +401,39 @@ void uct_ud_iface_remove_async_handlers(uct_ud_iface_t *iface) ucs_async_remove_handler(iface->async.timer_id, 1); } +/* Calculate real GIDs len. Can be either 16 (RoCEv1 or RoCEv2/IPv6) + * or 4 (RoCEv2/IPv4). This len is used for packets filtering by DGIDs. + * + * According to Annex17_RoCEv2 (A17.4.5.2): + * "The first 40 bytes of user posted UD Receive Buffers are reserved for the L3 + * header of the incoming packet (as per the InfiniBand Spec Section 11.4.1.2). + * In RoCEv2, this area is filled up with the IP header. IPv6 header uses the + * entire 40 bytes. IPv4 headers use the 20 bytes in the second half of the + * reserved 40 bytes area (i.e. offset 20 from the beginning of the receive + * buffer). In this case, the content of the first 20 bytes is undefined." */ +static void uct_ud_iface_calc_gid_len(uct_ud_iface_t *iface) +{ + uint16_t *local_gid_u16 = (uint16_t*)iface->super.gid.raw; + + /* Make sure that daddr in IPv4 resides in the last 4 bytes in GRH */ + UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == + UCT_UD_IPV4_ADDR_LEN); + + /* Make sure that dgid resides in the last 16 bytes in GRH */ + UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid)) == + UCT_UD_IPV6_ADDR_LEN); + + /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:????:????, + * so check for leading zeroes and verify that 11-12 bytes are 0xff. + * Otherwise either RoCEv1 or RoCEv2/IPv6 are used. */ + if (local_gid_u16[0] == 0x0000) { + ucs_assert_always(local_gid_u16[5] == 0xffff); + iface->config.gid_len = UCT_UD_IPV4_ADDR_LEN; + } else { + iface->config.gid_len = UCT_UD_IPV6_ADDR_LEN; + } +} + UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, unsigned ud_rx_priv_len, @@ -434,6 +481,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->rx.available = config->super.rx.queue_len; self->config.tx_qp_len = config->super.tx.queue_len; self->config.peer_timeout = ucs_time_from_sec(config->peer_timeout); + self->config.check_grh_dgid = (config->dgid_check && + (self->super.addr_type == UCT_IB_ADDRESS_TYPE_ETH)); if (config->slow_timer_backoff <= 0.) { ucs_error("The slow timer back off should be > 0 (%lf)", @@ -472,7 +521,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, &config->super.tx.mp, self->config.tx_qp_len, uct_ud_iface_send_skb_init, "ud_tx_skb"); if (status != UCS_OK) { - goto err_mpool; + goto err_rx_mpool; } self->tx.skb = ucs_mpool_get(&self->tx.mp); self->tx.skb_inl.super.len = sizeof(uct_ud_neth_t); @@ -485,9 +534,19 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, ucs_queue_head_init(&self->rx.pending_q); + uct_ud_iface_calc_gid_len(self); + + status = UCS_STATS_NODE_ALLOC(&self->stats, &uct_ud_iface_stats_class, + self->super.super.stats); + if (status != UCS_OK) { + goto err_tx_mpool; + } + return UCS_OK; -err_mpool: +err_tx_mpool: + ucs_mpool_cleanup(&self->tx.mp, 1); +err_rx_mpool: ucs_mpool_cleanup(&self->rx.mp, 1); err_qp: ibv_destroy_qp(self->qp); @@ -514,6 +573,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ud_iface_t) ucs_ptr_array_cleanup(&self->eps); ucs_arbiter_cleanup(&self->tx.pending_q); ucs_assert(self->tx.pending_q_len == 0); + UCS_STATS_NODE_FREE(self->stats); uct_ud_leave(self); } @@ -527,6 +587,10 @@ ucs_config_field_t uct_ud_iface_config_table[] = { {"SLOW_TIMER_BACKOFF", "2.0", "Timeout multiplier for resending trigger", ucs_offsetof(uct_ud_iface_config_t, slow_timer_backoff), UCS_CONFIG_TYPE_DOUBLE}, + {"ETH_DGID_CHECK", "y", + "Enable checking destination GID for incoming packets of Ethernet network\n" + "Mismatched packets are silently dropped.", + ucs_offsetof(uct_ud_iface_config_t, dgid_check), UCS_CONFIG_TYPE_BOOL}, {NULL} }; diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h index 9ddff8902d4..dda9579401d 100644 --- a/src/uct/ib/ud/base/ud_iface.h +++ b/src/uct/ib/ud/base/ud_iface.h @@ -23,12 +23,18 @@ #define UCT_UD_MIN_INLINE 48 +enum { + UCT_UD_IFACE_STAT_RX_DROP, + UCT_UD_IFACE_STAT_LAST +}; + /* TODO: maybe tx_moderation can be defined at compile-time since tx completions are used only to know how much space is there in tx qp */ typedef struct uct_ud_iface_config { uct_ib_iface_config_t super; double peer_timeout; double slow_timer_backoff; + int dgid_check; } uct_ud_iface_config_t; struct uct_ud_iface_peer { @@ -123,7 +129,12 @@ struct uct_ud_iface { double slow_timer_backoff; unsigned tx_qp_len; unsigned max_inline; + int check_grh_dgid; + unsigned gid_len; } config; + + UCS_STATS_NODE_DECLARE(stats); + ucs_ptr_array_t eps; uct_ud_iface_peer_t *peers[UCT_UD_HASH_SIZE]; struct { @@ -215,6 +226,32 @@ static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface) UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); } +static UCS_F_ALWAYS_INLINE int +uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *grh_end, int is_grh_present) +{ + void *dest_gid, *local_gid; + + if (!iface->config.check_grh_dgid) { + return 1; + } + + if (ucs_unlikely(!is_grh_present)) { + ucs_warn("RoCE packet does not contain GRH"); + return 1; + } + + local_gid = (char*)iface->super.gid.raw + (16 - iface->config.gid_len); + dest_gid = (char*)grh_end - iface->config.gid_len; + + if (memcmp(local_gid, dest_gid, iface->config.gid_len)) { + UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_UD_IFACE_STAT_RX_DROP, 1); + ucs_trace_data("Drop packet with wrong dgid"); + return 0; + } + + return 1; +} + /* management of connecting endpoints (cep) diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c index b09c93f3091..bfda4f2951b 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.c +++ b/src/uct/ib/ud/verbs/ud_verbs.c @@ -323,6 +323,11 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) } UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) { + if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + wc[i].wc_flags & IBV_WC_GRH)) { + ucs_mpool_put_inline((void*)wc[i].wr_id); + continue; + } uct_ib_log_recv_completion(&iface->super.super, IBV_QPT_UD, &wc[i], packet, wc[i].byte_len, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, From 0e471cbc4a9e4ca9040ca784aab73a7e988658a0 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Sun, 12 Nov 2017 16:51:45 +0200 Subject: [PATCH 2/2] TEST/JENKINS: Add debug build to test_jenkins --- contrib/test_jenkins.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh index fffc52d0a57..a9f9ce7833c 100755 --- a/contrib/test_jenkins.sh +++ b/contrib/test_jenkins.sh @@ -218,6 +218,17 @@ build_icc() { fi } +# +# Build debug version +# +build_debug() { + echo "==== Build with --enable-debug option ====" + ../contrib/configure-devel --prefix=$ucx_inst --enable-debug + $MAKE clean + $MAKE + $MAKE distclean +} + run_hello() { api=$1 shift @@ -480,6 +491,7 @@ run_tests() { export UCX_ERROR_MAIL_FOOTER=$JOB_URL/$BUILD_NUMBER/console do_distributed_task 0 4 build_icc + do_distributed_task 1 4 build_debug # all are running mpi tests run_mpi_tests