From 7c81ab1260e828ee0f87b97cd00a328ae97171a4 Mon Sep 17 00:00:00 2001 From: dmitrygx Date: Wed, 16 Dec 2020 11:02:25 +0200 Subject: [PATCH] UCT/IB: Fixes for SL selection --- src/uct/ib/base/ib_iface.c | 14 +++-- src/uct/ib/base/ib_iface.h | 3 +- src/uct/ib/mlx5/ib_mlx5.c | 24 ++++++-- test/gtest/uct/ib/test_ib.cc | 105 ++++++++++++++++++++++++++--------- 4 files changed, 107 insertions(+), 39 deletions(-) diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index f998a70184a8..cd56f23934f7 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -647,7 +647,7 @@ void uct_ib_iface_fill_ah_attr_from_gid_lid(uct_ib_iface_t *iface, uint16_t lid, memset(ah_attr, 0, sizeof(*ah_attr)); - ucs_assert(iface->config.sl != UCT_IB_SL_INVALID); + ucs_assert(iface->config.sl < UCT_IB_SL_NUM); ah_attr->sl = iface->config.sl; ah_attr->port_num = iface->config.port_num; @@ -1179,9 +1179,12 @@ static void uct_ib_iface_set_path_mtu(uct_ib_iface_t *iface, uint8_t uct_ib_iface_config_select_sl(const uct_ib_iface_config_t *ib_config) { - ucs_assert((ib_config->sl <= UCT_IB_SL_MAX) || - (ib_config->sl == UCS_ULUNITS_AUTO)); - return (ib_config->sl == UCS_ULUNITS_AUTO) ? 0 : (uint8_t)ib_config->sl; + if (ib_config->sl == UCS_ULUNITS_AUTO) { + return 0; + } + + ucs_assert(ib_config->sl < UCT_IB_SL_NUM); + return (uint8_t)ib_config->sl; } UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, @@ -1244,7 +1247,8 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, self->config.rx_max_batch = ucs_min(config->rx.max_batch, config->rx.queue_len / 4); self->config.port_num = port_num; - self->config.sl = UCT_IB_SL_INVALID; + /* initialize to invalid value */ + self->config.sl = UCT_IB_SL_NUM; self->config.hop_limit = config->hop_limit; self->release_desc.cb = uct_ib_iface_release_desc; self->config.enable_res_domain = config->enable_res_domain; diff --git a/src/uct/ib/base/ib_iface.h b/src/uct/ib/base/ib_iface.h index c778c090d243..22dbe10adbca 100644 --- a/src/uct/ib/base/ib_iface.h +++ b/src/uct/ib/base/ib_iface.h @@ -26,8 +26,7 @@ #define UCT_IB_ADDRESS_INVALID_PATH_MTU ((enum ibv_mtu)0) #define UCT_IB_ADDRESS_INVALID_PKEY 0 #define UCT_IB_ADDRESS_DEFAULT_PKEY 0xffff -#define UCT_IB_SL_MAX 15 -#define UCT_IB_SL_INVALID (UCT_IB_SL_MAX + 1) +#define UCT_IB_SL_NUM 16 /* Forward declarations */ typedef struct uct_ib_iface_config uct_ib_iface_config_t; diff --git a/src/uct/ib/mlx5/ib_mlx5.c b/src/uct/ib/mlx5/ib_mlx5.c index 5081aaf68e9f..715cd237f823 100644 --- a/src/uct/ib/mlx5/ib_mlx5.c +++ b/src/uct/ib/mlx5/ib_mlx5.c @@ -736,7 +736,7 @@ uct_ib_mlx5_select_sl(const uct_ib_iface_config_t *ib_config, /* which SLs are allowed by user config */ sl_allow_mask = (ib_config->sl == UCS_ULUNITS_AUTO) ? - UCS_MASK(UCT_IB_SL_MAX) : UCS_BIT(ib_config->sl); + UCS_MASK(UCT_IB_SL_NUM) : UCS_BIT(ib_config->sl); if (have_sl_mask_cap) { sls_with_ar = sl_allow_mask & hw_sl_mask; @@ -814,8 +814,19 @@ uct_ib_mlx5_iface_select_sl(uct_ib_iface_t *iface, #endif uint16_t ooo_sl_mask = 0; ucs_status_t status; + ucs_ternary_auto_value_t ar_enable; + int have_ooo_sl_mask; - ucs_assert(iface->config.sl == UCT_IB_SL_INVALID); + ucs_assert(iface->config.sl == UCT_IB_SL_NUM); + + if (uct_ib_device_is_port_roce(uct_ib_iface_device(iface), + iface->config.port_num)) { + /* Ethernet priority for RoCE devices can't be selected regardless + * AR support requested by user, pass empty ooo_sl_mask */ + ar_enable = UCS_NO; + have_ooo_sl_mask = 1; + goto select_sl; + } #if HAVE_DEVX status = uct_ib_mlx5_devx_query_ooo_sl_mask(md, iface->config.port_num, @@ -827,8 +838,11 @@ uct_ib_mlx5_iface_select_sl(uct_ib_iface_t *iface, status = UCS_ERR_UNSUPPORTED; #endif - return uct_ib_mlx5_select_sl(ib_config, ib_mlx5_config->ar_enable, - ooo_sl_mask, status == UCS_OK, - UCT_IB_IFACE_ARG(iface), + ar_enable = ib_mlx5_config->ar_enable; + have_ooo_sl_mask = status == UCS_OK; + +select_sl: + return uct_ib_mlx5_select_sl(ib_config, ar_enable, ooo_sl_mask, + have_ooo_sl_mask, UCT_IB_IFACE_ARG(iface), &iface->config.sl); } diff --git a/test/gtest/uct/ib/test_ib.cc b/test/gtest/uct/ib/test_ib.cc index e83fb55a09fb..15b633fcbac1 100644 --- a/test/gtest/uct/ib/test_ib.cc +++ b/test/gtest/uct/ib/test_ib.cc @@ -519,10 +519,39 @@ class test_uct_ib_sl_utils : public test_uct_ib_utils { return status; } + static ucs_log_func_rc_t + wrap_errors_check_sl_masks_logger(const char *file, unsigned line, + const char *function, + ucs_log_level_t level, + const ucs_log_component_config_t * + comp_conf, + const char *message, va_list ap) + { + if (level == UCS_LOG_LEVEL_ERROR) { + std::string err_str = format_message(message, ap); + + for (uint8_t sl = 0; sl < UCT_IB_SL_NUM; ++sl) { + std::string sl_val = ucs::to_string(static_cast(sl)); + + if ((err_str.find(sl_val + ", ") == std::string::npos) && + (err_str.find(sl_val + " }") == std::string::npos)) { + return UCS_LOG_FUNC_RC_CONTINUE; + } + } + + return UCS_LOG_FUNC_RC_STOP; + } + + return UCS_LOG_FUNC_RC_CONTINUE; + } + ucs_status_t select_sl_nok(ucs_ternary_auto_value_t ar_enable, - uint64_t ooo_sl_mask, + unsigned long config_sl, uint64_t ooo_sl_mask, const uct_ib_iface_config_t &config) const { - scoped_log_handler slh(wrap_errors_logger); + scoped_log_handler slh(((ooo_sl_mask != m_ooo_sl_mask_not_detected) && + (config_sl == UCS_ULUNITS_AUTO)) ? + wrap_errors_check_sl_masks_logger : + wrap_errors_logger); uint8_t sl; EXPECT_NE(UCS_AUTO, ar_enable); @@ -540,7 +569,7 @@ class test_uct_ib_sl_utils : public test_uct_ib_utils { if (exp_status == UCS_OK) { status = select_sl_ok(ar_enable, config_sl, ooo_sl_mask, config); } else { - status = select_sl_nok(ar_enable, ooo_sl_mask, config); + status = select_sl_nok(ar_enable, config_sl, ooo_sl_mask, config); } EXPECT_EQ(exp_status, status); } @@ -574,30 +603,52 @@ UCS_TEST_F(test_uct_ib_sl_utils, sl_selection) { /* select the default SL, with OOO SL mask { 4 } */ select_sl(ar_enable, UCS_ULUNITS_AUTO, UCS_BIT(4), UCS_OK); - /* select SL=8, with empty OOO SL mask */ - select_sl(ar_enable, 8, 0, - (ar_enable == UCS_YES) ? err_status : UCS_OK); - - /* select SL=8, without OOO SL mask (not detected) */ - select_sl(ar_enable, 8, m_ooo_sl_mask_not_detected, - ((ar_enable != UCS_TRY) && (ar_enable != UCS_AUTO)) ? - err_status : UCS_OK); - - /* select SL=8, with OOO SL mask { 8 } */ - select_sl(ar_enable, 8, UCS_BIT(8), - (ar_enable == UCS_NO) ? err_status : UCS_OK); - - /* select SL=8, with OOO SL mask { 0 } */ - select_sl(ar_enable, 8, UCS_BIT(0), - (ar_enable == UCS_YES) ? err_status : UCS_OK); - - /* select SL=8, with OOO SL mask { 0, 4, 8 } */ - select_sl(ar_enable, 8, UCS_BIT(0) | UCS_BIT(4) | UCS_BIT(8), - (ar_enable == UCS_NO) ? err_status : UCS_OK); - - /* select SL=8, with OOO SL mask { 0, 8, 11 } */ - select_sl(ar_enable, 8, UCS_BIT(0) | UCS_BIT(4) | UCS_BIT(11), - (ar_enable == UCS_YES) ? err_status : UCS_OK); + for (uint8_t sl = 0; sl < UCT_IB_SL_NUM; ++sl) { + /* select SL=, with empty OOO SL mask */ + select_sl(ar_enable, sl, 0, + (ar_enable == UCS_YES) ? err_status : UCS_OK); + + /* select SL=, without OOO SL mask (not detected) */ + select_sl(ar_enable, sl, m_ooo_sl_mask_not_detected, + ((ar_enable != UCS_TRY) && (ar_enable != UCS_AUTO)) ? + err_status : UCS_OK); + + /* select SL=, with OOO SL mask { sl } */ + select_sl(ar_enable, sl, UCS_BIT(sl), + (ar_enable == UCS_NO) ? err_status : UCS_OK); + + /* select SL=, with OOO SL mask { UCT_IB_SL_NUM - 1 - sl }, + * i.e. OOO SL mask that doesn't contain */ + select_sl(ar_enable, sl, UCS_BIT(UCT_IB_SL_NUM - 1 - sl), + (ar_enable == UCS_YES) ? err_status : UCS_OK); + + /* select SL=, with OOO SL mask { (sl - 1) % UCT_IB_SL_NUM, + sl, + (sl + 1) % UCT_IB_SL_NUM } */ + select_sl(ar_enable, sl, + UCS_BIT((sl - 1) % UCT_IB_SL_NUM) | + UCS_BIT(sl) | + UCS_BIT((sl + 1) % UCT_IB_SL_NUM), + (ar_enable == UCS_NO) ? err_status : UCS_OK); + + /* select SL=, with OOO SL mask { (sl - 1) % UCT_IB_SL_NUM, + UCT_IB_SL_NUM - 1 - sl, + (sl + 1) % UCT_IB_SL_NUM }, + * i.e. OOO SL mask that doesn't contain */ + select_sl(ar_enable, sl, + UCS_BIT((sl - 1) % UCT_IB_SL_NUM) | + UCS_BIT(UCT_IB_SL_NUM - 1 - sl) | + UCS_BIT((sl + 1) % UCT_IB_SL_NUM), + (ar_enable == UCS_YES) ? err_status : UCS_OK); + + /* select SL=, with full OOO SL mask */ + select_sl(ar_enable, sl, UCS_MASK(UCT_IB_SL_NUM), + (ar_enable == UCS_NO) ? err_status : UCS_OK); + + /* select SL=, with full OOO SL mask, except */ + select_sl(ar_enable, sl, UCS_MASK(UCT_IB_SL_NUM) & ~UCS_BIT(sl), + (ar_enable == UCS_YES) ? err_status : UCS_OK); + } } }