From 023c9b8804c5c7a89c19120f9e1a74bec0e30c1e Mon Sep 17 00:00:00 2001 From: binl Date: Fri, 5 Mar 2021 09:34:19 +0200 Subject: [PATCH] # This is a combination of 2 commits. # The first commit's message is: UCT/IB/BASE: use random roce path factor to achieve high reliability. # This is the 2nd commit message: UCT/IB/BASE: Fixed 1st CR comments. --- src/uct/ib/base/ib_device.h | 1 + src/uct/ib/base/ib_iface.c | 26 +++++++++++++++++++++++++- src/uct/ib/base/ib_iface.h | 5 +++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/base/ib_device.h b/src/uct/ib/base/ib_device.h index 733266e99ce9..427d99160f88 100644 --- a/src/uct/ib/base/ib_device.h +++ b/src/uct/ib/base/ib_device.h @@ -47,6 +47,7 @@ #define UCT_IB_SITE_LOCAL_MASK be64toh(0xffffffffffff0000ul) /* IBTA 4.1.1 12b */ #define UCT_IB_DEFAULT_ROCEV2_DSCP 106 /* Default DSCP for RoCE v2 */ #define UCT_IB_ROCE_UDP_SRC_PORT_BASE 0xC000 +#define UCT_IB_ROCE_MAX_PATH_FACTOR 0x400 #define UCT_IB_DEVICE_SYSFS_PFX "/sys/class/infiniband/%s" #define UCT_IB_DEVICE_SYSFS_FMT UCT_IB_DEVICE_SYSFS_PFX "/device/%s" #define UCT_IB_DEVICE_SYSFS_GID_ATTR_PFX UCT_IB_DEVICE_SYSFS_PFX "/ports/%d/gid_attrs" diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index 3e27d9104186..d08f1925a36a 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -176,6 +176,10 @@ ucs_config_field_t uct_ib_iface_config_table[] = { "path for the same pair of endpoints.", ucs_offsetof(uct_ib_iface_config_t, roce_path_factor), UCS_CONFIG_TYPE_UINT}, + {"ROCE_RANDOM_PATH", "n", + "Enable/Disable random RoCE path generation.", + ucs_offsetof(uct_ib_iface_config_t, roce_random_path), UCS_CONFIG_TYPE_BOOL}, + {"LID_PATH_BITS", "0", "List of IB Path bits separated by comma (a,b,c) " "which will be the low portion of the LID, according to the LMC in the fabric.", @@ -529,7 +533,17 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, ah_attr->grh.traffic_class = iface->config.traffic_class; if (uct_ib_iface_is_roce(iface)) { - udp_sport = iface->config.roce_path_factor * path_index; + if (iface->config.roce_random_path) { + if (path_index == 0) { + rand_r(&iface->rand_value); + } + udp_sport = (iface->rand_value % + (UCT_IB_ROCE_MAX_PATH_FACTOR + 1 - + iface->config.roce_path_factor)) + + iface->config.roce_path_factor * path_index; + } else { + udp_sport = iface->config.roce_path_factor * path_index; + } /* older drivers use dlid for udp.sport, new drivers use flow_label when its nonzero */ ah_attr->dlid = UCT_IB_ROCE_UDP_SRC_PORT_BASE | udp_sport; @@ -1003,6 +1017,11 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, uint8_t port_num; size_t inl; + if (config->roce_path_factor > UCT_IB_ROCE_MAX_PATH_FACTOR) { + status = UCS_ERR_INVALID_PARAM; + goto err; + } + if (!(params->open_mode & UCT_IFACE_OPEN_MODE_DEVICE)) { return UCS_ERR_UNSUPPORTED; } @@ -1041,6 +1060,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, self->config.rx_headroom_offset = self->config.rx_payload_offset - rx_headroom; self->config.seg_size = init_attr->seg_size; + self->config.roce_random_path = config->roce_random_path; self->config.roce_path_factor = config->roce_path_factor; self->config.tx_max_poll = config->tx.max_poll; self->config.rx_max_poll = config->rx.max_poll; @@ -1053,6 +1073,10 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, self->config.enable_res_domain = config->enable_res_domain; self->config.qp_type = init_attr->qp_type; + if (config->roce_random_path) { + self->rand_value = ucs_generate_uuid(0); + } + if (ucs_derived_of(worker, uct_priv_worker_t)->thread_mode == UCS_THREAD_MODE_MULTI) { ucs_error("IB transports do not support multi-threaded worker"); return UCS_ERR_INVALID_PARAM; diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index 2be85507647e..e2b3a59fb6ae 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -122,6 +122,9 @@ struct uct_ib_iface_config { /* Multiplier for RoCE LAG UDP source port calculation */ unsigned roce_path_factor; + /* Enable random path factor for RoCE LAG UDP source port calculation */ + int roce_random_path; + /* Ranges of path bits */ UCS_CONFIG_ARRAY_FIELD(ucs_range_spec_t, ranges) lid_path_bits; @@ -195,6 +198,7 @@ struct uct_ib_iface { uint16_t pkey_value; uint8_t addr_size; uct_ib_device_gid_info_t gid_info; + unsigned rand_value; struct { unsigned rx_payload_offset; /* offset from desc to payload */ @@ -205,6 +209,7 @@ struct uct_ib_iface { unsigned tx_max_poll; unsigned seg_size; unsigned roce_path_factor; + uint8_t roce_random_path; uint8_t max_inl_resp; uint8_t port_num; uint8_t sl;