diff --git a/src/uct/ib/ud/accel/ud_mlx5.c b/src/uct/ib/ud/accel/ud_mlx5.c index 426f6d3be9b..90070b7406c 100644 --- a/src/uct/ib/ud/accel/ud_mlx5.c +++ b/src/uct/ib/ud/accel/ud_mlx5.c @@ -400,17 +400,22 @@ uct_ud_mlx5_iface_poll_rx(uct_ud_mlx5_iface_t *iface, int is_async) iface->super.rx.available++; iface->rx.wq.cq_wqe_counter++; - - len = ntohl(cqe->byte_cnt); + count = 1; + len = ntohl(cqe->byte_cnt); VALGRIND_MAKE_MEM_DEFINED(packet, len); + + if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN, + (ntohl(cqe->flags_rqpn) >> 28) & 3)) { + ucs_mpool_put_inline(desc); + goto out; + } + uct_ib_mlx5_log_rx(&iface->super.super, IBV_QPT_UD, cqe, packet, uct_ud_dump_packet); uct_ud_ep_process_rx(&iface->super, (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN), len - UCT_IB_GRH_LEN, (uct_ud_recv_skb_t *)desc, is_async); - count = 1; - out: if (iface->super.rx.available >= iface->super.super.config.rx_max_batch) { /* we need to try to post buffers always. Otherwise it is possible diff --git a/src/uct/ib/ud/base/ud_iface.c b/src/uct/ib/ud/base/ud_iface.c index 2b12a77df0c..d47a4fd9123 100644 --- a/src/uct/ib/ud/base/ud_iface.c +++ b/src/uct/ib/ud/base/ud_iface.c @@ -14,6 +14,7 @@ #include #include #include +#include SGLIB_DEFINE_LIST_FUNCTIONS(uct_ud_iface_peer_t, uct_ud_iface_peer_cmp, next) @@ -371,6 +372,39 @@ void uct_ud_iface_remove_async_handlers(uct_ud_iface_t *iface) ucs_async_remove_handler(iface->async.timer_id, 1); } +/* Calculate real GIDs len. Can be either 16 (RoCEv1 or RoCEv2/IPv6) + * or 4 (RoCEv2/IPv4). This len is used for packets filtering by DGIDs. + * + * According to Annex17_RoCEv2 (A17.4.5.2): + * "The first 40 bytes of user posted UD Receive Buffers are reserved for the L3 + * header of the incoming packet (as per the InfiniBand Spec Section 11.4.1.2). + * In RoCEv2, this area is filled up with the IP header. IPv6 header uses the + * entire 40 bytes. IPv4 headers use the 20 bytes in the second half of the + * reserved 40 bytes area (i.e. 
offset 20 from the beginning of the receive
+ * buffer). In this case, the content of the first 20 bytes is undefined." */
+static void uct_ud_iface_calc_gid_len(uct_ud_iface_t *iface)
+{
+    const int ipv4_len            = sizeof(struct in_addr);
+    const int ipv6_len            = sizeof(struct in6_addr);
+    const uint16_t *local_gid_u16 = (const uint16_t*)iface->super.gid.raw;
+
+    /* Make sure that daddr in IPv4 resides in the last 4 bytes in GRH */
+    UCS_STATIC_ASSERT((UCT_IB_GRH_LEN - (20 + offsetof(struct iphdr, daddr))) == ipv4_len);
+
+    /* Make sure that dgid resides in the last 16 bytes in GRH */
+    UCS_STATIC_ASSERT(UCT_IB_GRH_LEN - offsetof(struct ibv_grh, dgid) == ipv6_len);
+
+    /* An IPv4-mapped GID is 0000:0000:0000:0000:0000:ffff:????:????. Test all
+     * ten leading zero bytes plus the 0xffff marker; every other GID (RoCEv1,
+     * RoCEv2/IPv6, or an unset all-zero one) is matched by its full 16 bytes. */
+    if (((local_gid_u16[0] | local_gid_u16[1] | local_gid_u16[2] |
+          local_gid_u16[3] | local_gid_u16[4]) == 0) && (local_gid_u16[5] == 0xffff)) {
+        iface->config.gid_len = ipv4_len;
+    } else {
+        iface->config.gid_len = ipv6_len;
+    }
+}
+
 UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md,
                     uct_worker_h worker, const uct_iface_params_t *params,
                     unsigned ud_rx_priv_len,
@@ -418,6 +452,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md,
     self->rx.available = config->super.rx.queue_len;
     self->config.tx_qp_len = config->super.tx.queue_len;
     self->config.peer_timeout = ucs_time_from_sec(config->peer_timeout);
+    self->config.check_grh_dgid = (config->dgid_check &&
+                                   (self->super.addr_type == UCT_IB_ADDRESS_TYPE_ETH));
 
     if (config->slow_timer_backoff <= 0.) 
{
         ucs_error("The slow timer back off should be > 0 (%lf)",
@@ -469,6 +505,8 @@ UCS_CLASS_INIT_FUNC(uct_ud_iface_t, uct_ud_iface_ops_t *ops, uct_md_h md,
 
     ucs_queue_head_init(&self->rx.pending_q);
 
+    uct_ud_iface_calc_gid_len(self);
+
     return UCS_OK;
 
 err_mpool:
@@ -511,6 +549,10 @@ ucs_config_field_t uct_ud_iface_config_table[] = {
     {"SLOW_TIMER_BACKOFF", "2.0",
      "Timeout multiplier for resending trigger",
      ucs_offsetof(uct_ud_iface_config_t, slow_timer_backoff), UCS_CONFIG_TYPE_DOUBLE},
+    {"ETH_DGID_CHECK", "y",
+     "Enable checking destination GID for incoming packets of Ethernet network\n"
+     "Mismatched packets are silently dropped.",
+     ucs_offsetof(uct_ud_iface_config_t, dgid_check), UCS_CONFIG_TYPE_BOOL},
     {NULL}
 };
 
diff --git a/src/uct/ib/ud/base/ud_iface.h b/src/uct/ib/ud/base/ud_iface.h
index 04167d47f1b..da1e1e4633f 100644
--- a/src/uct/ib/ud/base/ud_iface.h
+++ b/src/uct/ib/ud/base/ud_iface.h
@@ -30,6 +30,7 @@ typedef struct uct_ud_iface_config {
     uct_ib_iface_config_t super;
     double peer_timeout;
     double slow_timer_backoff;
+    int dgid_check;
 } uct_ud_iface_config_t;
 
 struct uct_ud_iface_peer {
@@ -123,6 +124,8 @@ struct uct_ud_iface {
         double slow_timer_backoff;
         unsigned tx_qp_len;
         unsigned max_inline;
+        int check_grh_dgid;
+        unsigned gid_len;
     } config;
     ucs_ptr_array_t eps;
     uct_ud_iface_peer_t *peers[UCT_UD_HASH_SIZE];
@@ -215,6 +218,47 @@ static UCS_F_ALWAYS_INLINE void uct_ud_leave(uct_ud_iface_t *iface)
     UCS_ASYNC_UNBLOCK(iface->super.super.worker->async);
 }
 
+/**
+ * Filter an incoming packet by the destination GID found in its GRH.
+ *
+ * Only the trailing iface->config.gid_len bytes of the local GID and of the
+ * GRH are compared.
+ *
+ * @param iface           Interface whose local GID and filter settings are used.
+ * @param grh_end         Pointer just past the end of the packet's GRH.
+ * @param is_grh_present  Nonzero when the completion reports a GRH.
+ *
+ * @return 0 when the packet should be dropped because of a GID mismatch,
+ *         1 otherwise (filtering disabled, GRH missing, or GIDs match).
+ */
+static UCS_F_ALWAYS_INLINE int
+uct_ud_iface_check_grh(uct_ud_iface_t *iface, void *grh_end, int is_grh_present)
+{
+    const char *own_gid_tail, *pkt_gid_tail;
+    unsigned cmp_len;
+
+    if (!iface->config.check_grh_dgid) {
+        return 1;
+    }
+
+    if (ucs_unlikely(!is_grh_present)) {
+        ucs_warn("RoCE packet does not contain GRH");
+        return 1;
+    }
+
+    /* Line up the last cmp_len bytes of the 16-byte local GID with the last
+     * cmp_len bytes that precede grh_end */
+    cmp_len      = iface->config.gid_len;
+    own_gid_tail = (const char*)iface->super.gid.raw + (16 - cmp_len);
+    pkt_gid_tail = (const char*)grh_end - cmp_len;
+    if (memcmp(own_gid_tail, pkt_gid_tail, cmp_len) == 0) {
+        return 1;
+    }
+
+    ucs_trace_data("Drop packet with wrong dgid");
+    return 0;
+}
+
 /*
 management of connecting endpoints (cep)
 
diff --git a/src/uct/ib/ud/verbs/ud_verbs.c b/src/uct/ib/ud/verbs/ud_verbs.c
index f02ee2aa133..a8a46c6bca6 100644
--- a/src/uct/ib/ud/verbs/ud_verbs.c
+++ b/src/uct/ib/ud/verbs/ud_verbs.c
@@ -326,6 +326,11 @@ uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface, int is_async)
     }
 
     UCT_IB_IFACE_VERBS_FOREACH_RXWQE(&iface->super.super, i, packet, wc, num_wcs) {
+        if (!uct_ud_iface_check_grh(&iface->super, packet + UCT_IB_GRH_LEN,
+                                    wc[i].wc_flags & IBV_WC_GRH)) {
+            ucs_mpool_put_inline((void*)wc[i].wr_id);
+            continue;
+        }
         uct_ib_log_recv_completion(&iface->super.super, IBV_QPT_UD, &wc[i],
                                    packet, wc[i].byte_len, uct_ud_dump_packet);
         uct_ud_ep_process_rx(&iface->super,