Skip to content

Commit

Permalink
Merge pull request #9557 from yosefe/topic/uct-ib-add-flid-based-routing-support-v1.16.x
Browse files Browse the repository at this point in the history

UCT/IB: Add FLID based routing support - v1.16.x
  • Loading branch information
yosefe authored Dec 25, 2023
2 parents 4efdff9 + 460afd4 commit 76758f8
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 13 deletions.
1 change: 1 addition & 0 deletions src/uct/ib/base/ib_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#define UCT_IB_LINK_LOCAL_PREFIX be64toh(0xfe80000000000000ul) /* IBTA 4.1.1 12a */
#define UCT_IB_SITE_LOCAL_PREFIX be64toh(0xfec0000000000000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */
#define UCT_IB_SITE_LOCAL_FLID_MASK be64toh(0xffffffff00000000ul) /* site-local + flid */
#define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */
#define UCT_IB_ROCE_UDP_SRC_PORT_BASE 0xC000
#define UCT_IB_CQE_SL_PKTYPE_MASK 0x7 /* SL for IB or packet type
Expand Down
83 changes: 72 additions & 11 deletions src/uct/ib/base/ib_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,10 @@ ucs_config_field_t uct_ib_iface_config_table[] = {
"Force interface to use global routing.",
ucs_offsetof(uct_ib_iface_config_t, is_global), UCS_CONFIG_TYPE_BOOL},

{"FLID_ROUTE", "y",
"Enable FLID based routing with site-local GIDs.",
ucs_offsetof(uct_ib_iface_config_t, flid_enabled), UCS_CONFIG_TYPE_BOOL},

{"SL", "auto",
"InfiniBand: Service level. 'auto' will select a value matching UCX_IB_AR configuration.\n"
"RoCEv2: Ethernet Priority. 'auto' will select 0 by default.",
Expand Down Expand Up @@ -342,6 +346,18 @@ size_t uct_ib_address_size(const uct_ib_address_pack_params_t *params)
return size;
}

/*
 * Return non-zero if the subnet prefix of the given GID, after masking with
 * UCT_IB_SITE_LOCAL_MASK, is equal to UCT_IB_SITE_LOCAL_PREFIX.
 */
static int uct_ib_address_gid_is_site_local(const union ibv_gid *gid)
{
    uint64_t masked_prefix = gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK;

    return masked_prefix == UCT_IB_SITE_LOCAL_PREFIX;
}

/*
 * Return non-zero if the given GID is neither site-local (as decided by
 * uct_ib_address_gid_is_site_local) nor has the link-local subnet prefix.
 */
static int uct_ib_address_gid_is_global(const union ibv_gid *gid)
{
    if (uct_ib_address_gid_is_site_local(gid)) {
        return 0;
    }

    return gid->global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX;
}

void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
uct_ib_address_t *ib_addr)
{
Expand Down Expand Up @@ -375,14 +391,13 @@ void uct_ib_address_pack(const uct_ib_address_pack_params_t *params,
}

if (params->flags & UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) {
if ((params->gid.global.subnet_prefix & UCT_IB_SITE_LOCAL_MASK) ==
UCT_IB_SITE_LOCAL_PREFIX) {
if (uct_ib_address_gid_is_site_local(&params->gid)) {
/* Site-local */
ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET16;
*ucs_serialize_next(&ptr, uint16_t) =
params->gid.global.subnet_prefix >> 48;
} else if (params->gid.global.subnet_prefix != UCT_IB_LINK_LOCAL_PREFIX) {
/* Global */
} else if (uct_ib_address_gid_is_global(&params->gid)) {
/* Global or site local GID with non-zero FLID */
ib_addr->flags |= UCT_IB_ADDRESS_FLAG_SUBNET64;
*ucs_serialize_next(&ptr, uint64_t) =
params->gid.global.subnet_prefix;
Expand Down Expand Up @@ -691,6 +706,22 @@ int uct_ib_iface_is_same_device(const uct_ib_address_t *ib_addr, uint16_t dlid,
(params.gid.global.interface_id == dgid->global.interface_id);
}

/**
 * Extract the FLID field from a GID.
 *
 * @param [in] gid  GID to inspect.
 *
 * @return The 16-bit value stored in network byte order in bytes 4-5 of the
 *         GID, or 0 if the part of the subnet prefix selected by
 *         UCT_IB_SITE_LOCAL_FLID_MASK is not UCT_IB_SITE_LOCAL_PREFIX.
 */
static int uct_ib_iface_gid_extract_flid(const union ibv_gid *gid)
{
    if ((gid->global.subnet_prefix & UCT_IB_SITE_LOCAL_FLID_MASK) !=
        UCT_IB_SITE_LOCAL_PREFIX) {
        return 0;
    }

    /* Assemble the big-endian 16-bit field byte by byte. Unlike dereferencing
     * a casted uint16_t pointer, this keeps the const qualifier, does not
     * type-pun the byte array, and has no alignment requirement on the
     * 16-bit field */
    return ((int)gid->raw[4] << 8) | gid->raw[5];
}

/*
 * Return non-zero if FLID routing is enabled in the interface configuration
 * and a non-zero FLID can be extracted from the interface's own GID.
 */
static int uct_ib_iface_is_flid_enabled(const uct_ib_iface_t *iface)
{
    if (!iface->config.flid_enabled) {
        return 0;
    }

    return uct_ib_iface_gid_extract_flid(&iface->gid_info.gid) != 0;
}

static int uct_ib_iface_dev_addr_is_reachable(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr)
{
Expand All @@ -707,9 +738,14 @@ static int uct_ib_iface_dev_addr_is_reachable(uct_ib_iface_t *iface,
}

if (!is_local_eth && !(ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
/* same subnet prefix */
return params.gid.global.subnet_prefix ==
iface->gid_info.gid.global.subnet_prefix;
if (params.gid.global.subnet_prefix ==
iface->gid_info.gid.global.subnet_prefix) {
return 1;
}

/* Check FLID route: is enabled locally, and remote GID has it */
return (uct_ib_iface_is_flid_enabled(iface) &&
uct_ib_iface_gid_extract_flid(&params.gid) != 0);
} else if (is_local_eth && (ib_addr->flags & UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH)) {
/* there shouldn't be a lid and the UCT_IB_ADDRESS_FLAG_LINK_LAYER_ETH
* flag should be on. If reachable, the remote and local RoCE versions
Expand Down Expand Up @@ -809,13 +845,35 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid,
uct_ib_ah_attr_str(buf, sizeof(buf), ah_attr));
}

/*
 * Return the low 16 bits of the GID subnet prefix, after converting the
 * prefix from big-endian to host byte order.
 */
static uint16_t uct_ib_gid_site_local_subnet_prefix(const union ibv_gid *gid)
{
    uint64_t host_prefix = be64toh(gid->global.subnet_prefix);

    return host_prefix & 0xffff;
}

/*
 * Resolve the FLID to use for reaching a remote GID.
 *
 * Returns 0 when FLID routing is not enabled on the local interface, or when
 * the low 16 bits of the remote subnet prefix match those of the local GID.
 * Otherwise returns the FLID extracted from the remote GID (which may itself
 * be 0).
 */
uint16_t uct_ib_iface_resolve_remote_flid(const uct_ib_iface_t *iface,
                                          const union ibv_gid *gid)
{
    uint16_t local_subnet, remote_subnet;

    if (!uct_ib_iface_is_flid_enabled(iface)) {
        return 0;
    }

    remote_subnet = uct_ib_gid_site_local_subnet_prefix(gid);
    local_subnet  = uct_ib_gid_site_local_subnet_prefix(&iface->gid_info.gid);

    /* On the same subnet, no need to use FLID */
    return (remote_subnet == local_subnet) ?
           0 : uct_ib_iface_gid_extract_flid(gid);
}

void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
const uct_ib_address_t *ib_addr,
unsigned path_index,
struct ibv_ah_attr *ah_attr,
enum ibv_mtu *path_mtu)
{
union ibv_gid *gid = NULL;
uint16_t lid, flid = 0;
uct_ib_address_pack_params_t params;

ucs_assert(!uct_ib_iface_is_roce(iface) ==
Expand All @@ -840,12 +898,13 @@ void uct_ib_iface_fill_ah_attr_from_addr(uct_ib_iface_t *iface,
UCT_IB_ADDRESS_PACK_FLAG_INTERFACE_ID |
UCT_IB_ADDRESS_PACK_FLAG_SUBNET_PREFIX) ||
params.flags & UCT_IB_ADDRESS_PACK_FLAG_ETH) {
gid = &params.gid;
gid = &params.gid;
flid = uct_ib_iface_resolve_remote_flid(iface, gid);
}

uct_ib_iface_fill_ah_attr_from_gid_lid(iface, params.lid, gid,
params.gid_index, path_index,
ah_attr);
lid = (flid == 0) ? params.lid : flid;
uct_ib_iface_fill_ah_attr_from_gid_lid(iface, lid, gid, params.gid_index,
path_index, ah_attr);
}

static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
Expand Down Expand Up @@ -1364,6 +1423,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *tl_ops,
self->config.hop_limit = config->hop_limit;
self->release_desc.cb = uct_ib_iface_release_desc;
self->config.qp_type = init_attr->qp_type;
self->config.flid_enabled = config->flid_enabled;
uct_ib_iface_set_path_mtu(self, config);

if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) {
Expand Down Expand Up @@ -1438,6 +1498,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_iface_ops_t *tl_ops,
/* Address scope and size */
if (uct_ib_iface_is_roce(self) || config->is_global ||
uct_ib_grh_required(uct_ib_iface_port_attr(self)) ||
uct_ib_address_gid_is_global(&self->gid_info.gid) ||
/* check ADDR_TYPE for backward compatibility */
(config->addr_type == UCT_IB_ADDRESS_TYPE_SITE_LOCAL) ||
(config->addr_type == UCT_IB_ADDRESS_TYPE_GLOBAL)) {
Expand Down
7 changes: 7 additions & 0 deletions src/uct/ib/base/ib_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ struct uct_ib_iface_config {
/* Force global routing */
int is_global;

/* Use FLID based routing */
int flid_enabled;

/* IB SL to use (default: AUTO) */
unsigned long sl;

Expand Down Expand Up @@ -296,6 +299,7 @@ struct uct_ib_iface {
uint8_t hop_limit;
uint8_t qp_type;
uint8_t force_global_addr;
uint8_t flid_enabled;
enum ibv_mtu path_mtu;
uint8_t counter_set_id;
} config;
Expand Down Expand Up @@ -578,6 +582,9 @@ void uct_ib_iface_fill_attr(uct_ib_iface_t *iface,

uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config);

uint16_t uct_ib_iface_resolve_remote_flid(const uct_ib_iface_t *iface,
const union ibv_gid *gid);

#define UCT_IB_IFACE_FMT \
"%s:%d/%s"
#define UCT_IB_IFACE_ARG(_iface) \
Expand Down
8 changes: 6 additions & 2 deletions src/uct/ib/dc/dc_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -1153,9 +1153,10 @@ uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_num,
{
uct_dc_mlx5_iface_t *iface = ucs_derived_of(rc_iface, uct_dc_mlx5_iface_t);
uint8_t fc_hdr = uct_rc_fc_get_fc_hdr(hdr->am_id);
const union ibv_gid *gid;
uct_dc_fc_sender_data_t *sender;
uct_dc_fc_request_t *dc_req;
int16_t cur_wnd;
int16_t cur_wnd, flid;
ucs_status_t status;
uct_dc_mlx5_ep_t *ep;
ucs_arbiter_t *waitq;
Expand All @@ -1178,9 +1179,12 @@ uct_dc_mlx5_iface_fc_handler(uct_rc_iface_t *rc_iface, unsigned qp_num,
dc_req->super.super.func = uct_dc_mlx5_iface_fc_grant;
dc_req->super.ep = &ep->super.super;
dc_req->dct_num = imm_data;
dc_req->lid = lid;
dc_req->sender = *((uct_dc_fc_sender_data_t*)(hdr + 1));

gid = ucs_unaligned_ptr(&dc_req->sender.payload.gid);
flid = uct_ib_iface_resolve_remote_flid(&rc_iface->super, gid);
dc_req->lid = (flid == 0) ? lid : htons(flid); /* dc_req->lid is BE */

status = uct_dc_mlx5_iface_fc_grant(&dc_req->super.super);
if (status == UCS_ERR_NO_RESOURCE){
uct_dc_mlx5_ep_do_pending_fc(ep, dc_req);
Expand Down
2 changes: 2 additions & 0 deletions src/uct/ib/rc/accel/rc_mlx5_devx.c
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,8 @@ ucs_status_t uct_rc_mlx5_iface_common_devx_connect_qp(
iface->super.super.config.sl);

if (ah_attr->is_global) {
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.src_addr_index,
ah_attr->grh.sgid_index);
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.hop_limit,
ah_attr->grh.hop_limit);
memcpy(UCT_IB_MLX5DV_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
Expand Down

0 comments on commit 76758f8

Please sign in to comment.