From 460afd4e6dfcdf2b53efa98d1f7ce15bcbb1cf29 Mon Sep 17 00:00:00 2001 From: Mikhail Brinskii Date: Fri, 15 Dec 2023 17:25:09 +0200 Subject: [PATCH] UCT/IB: Add FLID based routing support --- src/uct/ib/base/ib_device.h | 1 + src/uct/ib/base/ib_iface.c | 83 ++++++++++++++++++++++++++---- src/uct/ib/base/ib_iface.h | 7 +++ src/uct/ib/dc/dc_mlx5.c | 8 ++- src/uct/ib/rc/accel/rc_mlx5_devx.c | 2 + 5 files changed, 88 insertions(+), 13 deletions(-) diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 55d30f704d2..8b0057a0f6c 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -52,6 +52,7 @@ #define UCT_IB_LINK_LOCAL_PREFIX be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */ #define UCT_IB_SITE_LOCAL_PREFIX be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */ #define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */ +#define UCT_IB_SITE_LOCAL_FLID_MASK be64toh(0xffffffff00000000ul) /* site-local + flid */ #define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */ #define UCT_IB_ROCE_UDP_SRC_PORT_BASE 0xC000 #define UCT_IB_CQE_SL_PKTYPE_MASK 0x7 /* SL for IB or packet type diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index bac12b51936..f392fa5217c 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -134,6 +134,10 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "Force interface to use global routing.", ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL}, + {"FLID_ROUTE", "y", + "Enable FLID based routing with site-local GIDs.", + ucs_offsetof(uct_ib_iface_config_t, flid_enabled), UCS_CONFIG_TYPE_BOOL}, + {"SL", "auto", "InfiniBand: Service level. 'auto' will select a value matching UCX_IB_AR configuration.\n" "RoCEv2: Ethernet Priority. 
'auto' will select 0 by default.", @@ -342,6 +346,18 @@ size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params) return size; } +static int uct_ib_address_gid_is_site_local(const union ibv_gid *gid) +{ + return (gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == + UCT_IB_SITE_LOCAL_PREFIX; +} + +static int uct_ib_address_gid_is_global(const union ibv_gid *gid) +{ + return !uct_ib_address_gid_is_site_local(gid) && + (gid->global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX); +} + void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, uct_ib_address_t *ib_addr) { @@ -375,14 +391,13 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params, } if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) { - if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) == - UCT_IB_SITE_LOCAL_PREFIX) { + if (uct_ib_address_gid_is_site_local(&params->gid)) { /* Site-local */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16; *ucs_serialize_next(&ptr, uint16_t) = params->gid.global.subnet_prefix >> 48; - } else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) { - /* Global */ + } else if (uct_ib_address_gid_is_global(&params->gid)) { + /* Global or site local GID with non-zero FLID */ ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64; *ucs_serialize_next(&ptr, uint64_t) = params->gid.global.subnet_prefix; @@ -691,6 +706,22 @@ int uct_ib_iface_is_same_device(const uct_ib_address_t *ib_addr, uint16_t dlid, (params.gid.global.interface_id == dgid->global.interface_id); } +static int uct_ib_iface_gid_extract_flid(const union ibv_gid *gid) +{ + if ((gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_FLID_MASK) != + UCT_IB_SITE_LOCAL_PREFIX) { + return 0; + } + + return ntohs(*((uint16_t*)UCS_PTR_BYTE_OFFSET(gid->raw, 4))); +} + +static int uct_ib_iface_is_flid_enabled(const uct_ib_iface_t *iface) +{ + return iface->config.flid_enabled && + (uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) != 0); +} + static int 
uct_ib_iface_dev_addr_is_reachable(uct_ib_iface_t *iface, const uct_ib_address_t *ib_addr) { @@ -707,9 +738,14 @@ static int uct_ib_iface_dev_addr_is_reachable(uct_ib_iface_t *iface, } if (!is_local_eth && !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) { - /* same subnet prefix */ - return params.gid.global.subnet_prefix == - iface->gid_info.gid.global.subnet_prefix; + if (params.gid.global.subnet_prefix == + iface->gid_info.gid.global.subnet_prefix) { + return 1; + } + + /* Check FLID route: is enabled locally, and remote GID has it */ + return (uct_ib_iface_is_flid_enabled(iface) && + uct_ib_iface_gid_extract_flid(&params.gid) != 0); } else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) { /* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH * flag should be on. If reachable, the remote and local RoCE versions @@ -809,6 +845,27 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr)); } +static uint16_t uct_ib_gid_site_local_subnet_prefix(const union ibv_gid *gid) +{ + return be64toh(gid->global.subnet_prefix) & 0xffff; +} + +uint16_t uct_ib_iface_resolve_remote_flid(const uct_ib_iface_t *iface, + const union ibv_gid *gid) +{ + if (!uct_ib_iface_is_flid_enabled(iface)) { + return 0; + } + + if (uct_ib_gid_site_local_subnet_prefix(gid) == + uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid)) { + /* On the same subnet, no need to use FLID*/ + return 0; + } + + return uct_ib_iface_gid_extract_flid(gid); +} + void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, const uct_ib_address_t *ib_addr, unsigned path_index, @@ -816,6 +873,7 @@ void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, enum ibv_mtu *path_mtu) { union ibv_gid *gid = NULL; + uint16_t lid, flid = 0; uct_ib_address_pack_params_t params; ucs_assert(!uct_ib_iface_is_roce(iface) == @@ -840,12 +898,13 @@ void 
uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface, UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID | UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) || params.flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) { - gid = &params.gid; + gid = &params.gid; + flid = uct_ib_iface_resolve_remote_flid(iface, gid); } - uct_ib_iface_fill_ah_attr_from_gid_lid(iface, params.lid, gid, - params.gid_index, path_index, - ah_attr); + lid = (flid == 0) ? params.lid : flid; + uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, gid, params.gid_index, + path_index, ah_attr); } static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface, @@ -1364,6 +1423,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *tl_ops, self->config.hop_limit = config->hop_limit; self->release_desc.cb = uct_ib_iface_release_desc; self->config.qp_type = init_attr->qp_type; + self->config.flid_enabled = config->flid_enabled; uct_ib_iface_set_path_mtu(self, config); if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { @@ -1438,6 +1498,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *tl_ops, /* Address scope and size */ if (uct_ib_iface_is_roce(self) || config->is_global || uct_ib_grh_required(uct_ib_iface_port_attr(self)) || + uct_ib_address_gid_is_global(&self->gid_info.gid) || /* check ADDR_TYPE for backward compatibility */ (config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) || (config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) { diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index 3fabdf6e199..2881af83530 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -157,6 +157,9 @@ struct uct_ib_iface_config { /* Force global routing */ int is_global; + /* Use FLID based routing */ + int flid_enabled; + /* IB SL to use (default: AUTO) */ unsigned long sl; @@ -296,6 +299,7 @@ struct uct_ib_iface { uint8_t hop_limit; uint8_t qp_type; uint8_t force_global_addr; + uint8_t flid_enabled; enum ibv_mtu path_mtu; uint8_t counter_set_id; } config; @@ -578,6 
+582,9 @@ void uct_ib_iface_fill_attr(uct_ib_iface_t *iface, uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config); +uint16_t uct_ib_iface_resolve_remote_flid(const uct_ib_iface_t *iface, + const union ibv_gid *gid); + #define UCT_IB_IFACE_FMT \ "%s:%d/%s" #define UCT_IB_IFACE_ARG(_iface) \ diff --git a/src/uct/ib/dc/dc_mlx5.c b/src/uct/ib/dc/dc_mlx5.c index b2051d786bf..c7d3f139854 100644 --- a/src/uct/ib/dc/dc_mlx5.c +++ b/src/uct/ib/dc/dc_mlx5.c @@ -1153,9 +1153,10 @@ uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_num, { uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, uct_dc_mlx5_iface_t); uint8_t fc_hdr = uct_rc_fc_get_fc_hdr(hdr->am_id); + const union ibv_gid *gid; uct_dc_fc_sender_data_t *sender; uct_dc_fc_request_t *dc_req; - int16_t cur_wnd; + int16_t cur_wnd, flid; ucs_status_t status; uct_dc_mlx5_ep_t *ep; ucs_arbiter_t *waitq; @@ -1178,9 +1179,12 @@ uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_num, dc_req->super.super.func = uct_dc_mlx5_iface_fc_grant; dc_req->super.ep = &ep->super.super; dc_req->dct_num = imm_data; - dc_req->lid = lid; dc_req->sender = *((uct_dc_fc_sender_data_t*)(hdr + 1)); + gid = ucs_unaligned_ptr(&dc_req->sender.payload.gid); + flid = uct_ib_iface_resolve_remote_flid(&rc_iface->super, gid); + dc_req->lid = (flid == 0) ? 
lid : htons(flid); /* dc_req->lid is BE */ + status = uct_dc_mlx5_iface_fc_grant(&dc_req->super.super); if (status == UCS_ERR_NO_RESOURCE){ uct_dc_mlx5_ep_do_pending_fc(ep, dc_req); diff --git a/src/uct/ib/rc/accel/rc_mlx5_devx.c b/src/uct/ib/rc/accel/rc_mlx5_devx.c index 29286f7854b..852fd7573d6 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_devx.c +++ b/src/uct/ib/rc/accel/rc_mlx5_devx.c @@ -445,6 +445,8 @@ ucs_status_t uct_rc_mlx5_iface_common_devx_connect_qp( iface->super.super.config.sl); if (ah_attr->is_global) { + UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.src_addr_index, + ah_attr->grh.sgid_index); UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit, ah_attr->grh.hop_limit); memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),