From 7d8d33953fe7e569cd09d6e19883b7bdd3833da0 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Thu, 14 Sep 2017 14:30:05 +0300 Subject: [PATCH 1/2] UCT/UD: Filter incoming packets by DGID When UD is used with RoCE, packets intended for other peers may arrive (as a result of unicast flooding). These packets needs to be dropped. --- src/uct/ib/ud/accel/ud_mlx5.c | 8 +++++-- src/uct/ib/ud/base/ud_iface.c | 42 ++++++++++++++++++++++++++++++++++ src/uct/ib/ud/base/ud_iface.h | 30 ++++++++++++++++++++++++ src/uct/ib/ud/verbs/ud_verbs.c | 5 ++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 426f6d3be9b..852ee1ce96a 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -400,6 +400,12 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) iface->super.rx.available++; iface->rx.wq.cq_wqe_counter++; + count = 1; + + if (uct_ud_iface_filter_dgid(&iface->super, packet + UCT_IB_GRH_LEN, desc, + (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + goto out; + } len = ntohl(cqe->byte_cnt); VALGRIND_MAKE_MEM_DEFINED(packet, len); @@ -409,8 +415,6 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN), len - UCT_IB_GRH_LEN, (uct_ud_recv_skb_t *)desc, is_async); - count = 1; - out: if (iface->super.rx.available >= iface->super.super.config.rx_max_batch) { /* we need to try to post buffers always. Otherwise it is possible diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c index 2b12a77df0c..acce53221b6 100644 --- a/src/uct/ib/ud/base/ud_iface.c +++ b/src/uct/ib/ud/base/ud_iface.c @@ -14,6 +14,7 @@ #include #include #include +#include SGLIB_DEFINE_LIST_FUNCTIONS(uct_ud_iface_peer_t, uct_ud_iface_peer_cmp, next) @@ -371,6 +372,40 @@ void uct_ud_iface_remove_async_handlers(uct_ud_iface_t *iface) ucs_async_remove_handler(iface->async.timer_id, 1); } +/* Calculate real GIDs len. Can be either 16 (RoCEv1 or RoCEv2/IPv6) + * or 4 (RoCEv2/IPv4). This len is used for packets filtering by DGIDs. + * + * According to Annex17_RoCEv2 (A17.4.5.2): + * "The first 40 bytes of user posted UD Receive Buffers are reserved for the L3 + * header of the incoming packet (as per the InfiniBand Spec Section 11.4.1.2). + * In RoCEv2, this area is filled up with the IP header. IPv6 header uses the + * entire 40 bytes. IPv4 headers use the 20 bytes in the second half of the + * reserved 40 bytes area (i.e. offset 20 from the beginning of the receive + * buffer). In this case, the content of the first 20 bytes is undefined." */ +static void uct_ud_iface_parse_gid_format(uct_ud_iface_t *iface) +{ + const int ipv4_len = 4; + const int ipv6_len = 16; + uint16_t *first = (uint16_t*)iface->super.gid.raw; + uint16_t *filled = (uint16_t*)iface->super.gid.raw + 5; + + /* Make sure that daddr in IPv4 takes last 4 bytes in GRH */ + UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == ipv4_len); + + /* Make sure that dgid takes last 16 bytes in GRH */ + UCS_STATIC_ASSERT(UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid) == ipv6_len); + + /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:ac10:6510, + * so check for leading zeroes and verify that 11-12 bytes are 0xff. + * Otherwise either RoCEv1 or RoCEv2/IPv6 are used. */ + if (!(*first)) { + ucs_assert_always((*filled) == 0xffff); + iface->config.dgid_len = ipv4_len; + } else { + iface->config.dgid_len = ipv6_len; + } +} + UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, unsigned ud_rx_priv_len, @@ -418,6 +453,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->rx.available = config->super.rx.queue_len; self->config.tx_qp_len = config->super.tx.queue_len; self->config.peer_timeout = ucs_time_from_sec(config->peer_timeout); + self->config.filter_enabled = (config->filter_enable && + (self->super.addr_type == UCT_IB_ADDRESS_TYPE_ETH)); if (config->slow_timer_backoff <= 0.) { ucs_error("The slow timer back off should be > 0 (%lf)", @@ -469,6 +506,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, ucs_queue_head_init(&self->rx.pending_q); + uct_ud_iface_parse_gid_format(self); + return UCS_OK; err_mpool: @@ -511,6 +550,9 @@ ucs_config_field_t uct_ud_iface_config_table[] = { {"SLOW_TIMER_BACKOFF", "2.0", "Timeout multiplier for resending trigger", ucs_offsetof(uct_ud_iface_config_t, slow_timer_backoff), UCS_CONFIG_TYPE_DOUBLE}, + {"ETH_GID_FILTER_ENABLE", "y", + "Enable packets filtering by destination GIDs on an Ethernet network", + ucs_offsetof(uct_ud_iface_config_t, filter_enable), UCS_CONFIG_TYPE_BOOL}, {NULL} }; diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h index 04167d47f1b..cd91c480ccf 100644 --- a/src/uct/ib/ud/base/ud_iface.h +++ b/src/uct/ib/ud/base/ud_iface.h @@ -30,6 +30,7 @@ typedef struct uct_ud_iface_config { uct_ib_iface_config_t super; double peer_timeout; double slow_timer_backoff; + int filter_enable; } uct_ud_iface_config_t; struct uct_ud_iface_peer { @@ -123,6 +124,8 @@ struct uct_ud_iface { double slow_timer_backoff; unsigned tx_qp_len; unsigned max_inline; + int filter_enabled; + unsigned dgid_len; } config; ucs_ptr_array_t eps; uct_ud_iface_peer_t *peers[UCT_UD_HASH_SIZE]; @@ -215,6 +218,33 @@ static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface) UCS_ASYNC_UNBLOCK(iface->super.super.worker->async); } +static UCS_F_ALWAYS_INLINE int +uct_ud_iface_filter_dgid(uct_ud_iface_t *iface,void *grh_end, + void *desc, int is_grh_present) +{ + void *dgid, *sgid; + + if (!iface->config.filter_enabled) { + return 0; + } + + if (ucs_unlikely(!is_grh_present)) { + ucs_error("RoCE packet does not contain GRH"); + return 0; + } + + sgid = (char*)iface->super.gid.raw + (16 - iface->config.dgid_len); + dgid = (char*)grh_end - iface->config.dgid_len; + + if (memcmp(sgid, dgid, iface->config.dgid_len)) { + ucs_mpool_put_inline(desc); + ucs_trace_data("Drop packet with wrong dgid"); + return 1; + } + + return 0; +} + /* management of connecting endpoints (cep) diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c index f02ee2aa133..efd901d46f7 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.c +++ b/src/uct/ib/ud/verbs/ud_verbs.c @@ -326,6 +326,11 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) } UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) { + if (uct_ud_iface_filter_dgid(&iface->super, packet + UCT_IB_GRH_LEN, + (void*)wc[i].wr_id, + wc[i].wc_flags & IBV_WC_GRH)) { + continue; + } uct_ib_log_recv_completion(&iface->super.super, IBV_QPT_UD, &wc[i], packet, wc[i].byte_len, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, From 5e00645393b4ceccb7deda371cece9dae93a5d03 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Wed, 20 Sep 2017 17:56:54 +0300 Subject: [PATCH 2/2] UCT/UD: CR comments p1 --- src/uct/ib/ud/accel/ud_mlx5.c | 9 +++++---- src/uct/ib/ud/base/ud_iface.c | 34 +++++++++++++++++----------------- src/uct/ib/ud/base/ud_iface.h | 30 ++++++++++++++---------------- src/uct/ib/ud/verbs/ud_verbs.c | 6 +++--- 4 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 852ee1ce96a..90070b7406c 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -401,14 +401,15 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) iface->super.rx.available++; iface->rx.wq.cq_wqe_counter++; count = 1; + len = ntohl(cqe->byte_cnt); + VALGRIND_MAKE_MEM_DEFINED(packet, len); - if (uct_ud_iface_filter_dgid(&iface->super, packet + UCT_IB_GRH_LEN, desc, - (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + ucs_mpool_put_inline(desc); goto out; } - len = ntohl(cqe->byte_cnt); - VALGRIND_MAKE_MEM_DEFINED(packet, len); uct_ib_mlx5_log_rx(&iface->super.super, IBV_QPT_UD, cqe, packet, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c index acce53221b6..d47a4fd9123 100644 --- a/src/uct/ib/ud/base/ud_iface.c +++ b/src/uct/ib/ud/base/ud_iface.c @@ -382,27 +382,26 @@ void uct_ud_iface_remove_async_handlers(uct_ud_iface_t *iface) * entire 40 bytes. IPv4 headers use the 20 bytes in the second half of the * reserved 40 bytes area (i.e. offset 20 from the beginning of the receive * buffer). In this case, the content of the first 20 bytes is undefined." */ -static void uct_ud_iface_parse_gid_format(uct_ud_iface_t *iface) +static void uct_ud_iface_calc_gid_len(uct_ud_iface_t *iface) { - const int ipv4_len = 4; - const int ipv6_len = 16; - uint16_t *first = (uint16_t*)iface->super.gid.raw; - uint16_t *filled = (uint16_t*)iface->super.gid.raw + 5; + const int ipv4_len = sizeof(struct in_addr); + const int ipv6_len = sizeof(struct in6_addr); + uint16_t *local_gid_u16 = (uint16_t*)iface->super.gid.raw; - /* Make sure that daddr in IPv4 takes last 4 bytes in GRH */ + /* Make sure that daddr in IPv4 resides in the last 4 bytes in GRH */ UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == ipv4_len); - /* Make sure that dgid takes last 16 bytes in GRH */ + /* Make sure that dgid resides in the last 16 bytes in GRH */ UCS_STATIC_ASSERT(UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid) == ipv6_len); - /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:ac10:6510, + /* IPv4 mapped to IPv6 looks like: 0000:0000:0000:0000:0000:ffff:????:????, * so check for leading zeroes and verify that 11-12 bytes are 0xff. * Otherwise either RoCEv1 or RoCEv2/IPv6 are used. */ - if (!(*first)) { - ucs_assert_always((*filled) == 0xffff); - iface->config.dgid_len = ipv4_len; + if (local_gid_u16[0] == 0x0000) { + ucs_assert_always(local_gid_u16[5] == 0xffff); + iface->config.gid_len = ipv4_len; } else { - iface->config.dgid_len = ipv6_len; + iface->config.gid_len = ipv6_len; } } @@ -453,7 +452,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, self->rx.available = config->super.rx.queue_len; self->config.tx_qp_len = config->super.tx.queue_len; self->config.peer_timeout = ucs_time_from_sec(config->peer_timeout); - self->config.filter_enabled = (config->filter_enable && + self->config.check_grh_dgid = (config->dgid_check && (self->super.addr_type == UCT_IB_ADDRESS_TYPE_ETH)); if (config->slow_timer_backoff <= 0.) { @@ -506,7 +505,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md, ucs_queue_head_init(&self->rx.pending_q); - uct_ud_iface_parse_gid_format(self); + uct_ud_iface_calc_gid_len(self); return UCS_OK; @@ -550,9 +549,10 @@ ucs_config_field_t uct_ud_iface_config_table[] = { {"SLOW_TIMER_BACKOFF", "2.0", "Timeout multiplier for resending trigger", ucs_offsetof(uct_ud_iface_config_t, slow_timer_backoff), UCS_CONFIG_TYPE_DOUBLE}, - {"ETH_GID_FILTER_ENABLE", "y", - "Enable packets filtering by destination GIDs on an Ethernet network", - ucs_offsetof(uct_ud_iface_config_t, filter_enable), UCS_CONFIG_TYPE_BOOL}, + {"ETH_DGID_CHECK", "y", + "Enable checking destination GID for incoming packets of Ethernet network\n" + "Mismatched packets are silently dropped.", + ucs_offsetof(uct_ud_iface_config_t, dgid_check), UCS_CONFIG_TYPE_BOOL}, {NULL} }; diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h index cd91c480ccf..da1e1e4633f 100644 --- a/src/uct/ib/ud/base/ud_iface.h +++ b/src/uct/ib/ud/base/ud_iface.h @@ -30,7 +30,7 @@ typedef struct uct_ud_iface_config { uct_ib_iface_config_t super; double peer_timeout; double slow_timer_backoff; - int filter_enable; + int dgid_check; } uct_ud_iface_config_t; struct uct_ud_iface_peer { @@ -124,8 +124,8 @@ struct uct_ud_iface { double slow_timer_backoff; unsigned tx_qp_len; unsigned max_inline; - int filter_enabled; - unsigned dgid_len; + int check_grh_dgid; + unsigned gid_len; } config; ucs_ptr_array_t eps; uct_ud_iface_peer_t *peers[UCT_UD_HASH_SIZE]; @@ -219,30 +219,28 @@ static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface) } static UCS_F_ALWAYS_INLINE int -uct_ud_iface_filter_dgid(uct_ud_iface_t *iface,void *grh_end, - void *desc, int is_grh_present) +uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *grh_end, int is_grh_present) { - void *dgid, *sgid; + void *dest_gid, *local_gid; - if (!iface->config.filter_enabled) { - return 0; + if (!iface->config.check_grh_dgid) { + return 1; } if (ucs_unlikely(!is_grh_present)) { - ucs_error("RoCE packet does not contain GRH"); - return 0; + ucs_warn("RoCE packet does not contain GRH"); + return 1; } - sgid = (char*)iface->super.gid.raw + (16 - iface->config.dgid_len); - dgid = (char*)grh_end - iface->config.dgid_len; + local_gid = (char*)iface->super.gid.raw + (16 - iface->config.gid_len); + dest_gid = (char*)grh_end - iface->config.gid_len; - if (memcmp(sgid, dgid, iface->config.dgid_len)) { - ucs_mpool_put_inline(desc); + if (memcmp(local_gid, dest_gid, iface->config.gid_len)) { ucs_trace_data("Drop packet with wrong dgid"); - return 1; + return 0; } - return 0; + return 1; } /* diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c index efd901d46f7..a8a46c6bca6 100644 --- a/src/uct/ib/ud/verbs/ud_verbs.c +++ b/src/uct/ib/ud/verbs/ud_verbs.c @@ -326,9 +326,9 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async) } UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) { - if (uct_ud_iface_filter_dgid(&iface->super, packet + UCT_IB_GRH_LEN, - (void*)wc[i].wr_id, - wc[i].wc_flags & IBV_WC_GRH)) { + if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + wc[i].wc_flags & IBV_WC_GRH)) { + ucs_mpool_put_inline((void*)wc[i].wr_id); continue; } uct_ib_log_recv_completion(&iface->super.super, IBV_QPT_UD, &wc[i],