Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/IB: Use separate resource domain for mlx5 trasnports #2338

Merged
merged 8 commits into from
Mar 26, 2018
10 changes: 10 additions & 0 deletions config/m4/ib.m4
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,16 @@ AS_IF([test "x$with_ib" == xyes],
[[#include <infiniband/mlx5_hw.h>]])],
[])

AC_CHECK_DECLS([IBV_EXP_QP_INIT_ATTR_RES_DOMAIN,
IBV_EXP_RES_DOMAIN_THREAD_MODEL,
ibv_exp_create_res_domain,
ibv_exp_destroy_res_domain],
[AC_DEFINE([HAVE_IBV_EXP_RES_DOMAIN], 1, [IB resource domain])],
[AC_MSG_WARN([Cannot use mlx5 accel because resource domains are not supported])
AC_MSG_WARN([Please upgrade MellanoxOFED to 3.1 or above])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add URL

with_mlx5_hw=no],
[[#include <infiniband/verbs_exp.h>]])

AS_IF([test "x$with_mlx5_hw" == xyes],
[AC_MSG_NOTICE([Compiling with mlx5 bare-metal support])
AC_DEFINE([HAVE_MLX5_HW], 1, [mlx5 bare-metal support])],
Expand Down
3 changes: 2 additions & 1 deletion contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,8 @@ run_gtest() {
fi

export VALGRIND_EXTRA_ARGS="--xml=yes --xml-file=valgrind.xml --child-silent-after-fork=yes"
$AFFINITY $TIMEOUT_VALGRIND make -C test/gtest test_valgrind
$AFFINITY $TIMEOUT_VALGRIND make -C test/gtest test_valgrind \
VALGRIND_EXTRA_ARGS="--gen-suppressions=all"
(cd test/gtest && rename .tap _vg.tap *.tap && mv *.tap $GTEST_REPORT_DIR)
module unload tools/valgrind-latest
else
Expand Down
6 changes: 6 additions & 0 deletions contrib/valgrind.supp
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,9 @@
...
fun:ibv_exp_free_dm
}
{
res_domain_leak
Memcheck:Leak
...
fun:ibv_exp_create_res_domain
}
87 changes: 85 additions & 2 deletions src/uct/ib/base/ib_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,67 @@ static ucs_status_t uct_ib_iface_set_moderation(struct ibv_cq *cq,
return UCS_OK;
}

static int uct_ib_iface_res_domain_cmp(uct_ib_iface_res_domain_t *res_domain,
uct_ib_iface_t *iface)
{
#if HAVE_IBV_EXP_RES_DOMAIN
uct_ib_device_t *dev = uct_ib_iface_device(iface);

return res_domain->ibv_domain->context == dev->ibv_context;
#else
return 1;
#endif
}

static ucs_status_t
uct_ib_iface_res_domain_init(uct_ib_iface_res_domain_t *res_domain,
uct_ib_iface_t *iface)
{
#if HAVE_IBV_EXP_RES_DOMAIN
uct_ib_device_t *dev = uct_ib_iface_device(iface);
struct ibv_exp_res_domain_init_attr attr;

attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
IBV_EXP_RES_DOMAIN_MSG_MODEL;
attr.msg_model = IBV_EXP_MSG_LOW_LATENCY;

switch (iface->super.worker->thread_mode) {
case UCS_THREAD_MODE_SINGLE:
attr.thread_model = IBV_EXP_THREAD_SINGLE;
break;
case UCS_THREAD_MODE_SERIALIZED:
attr.thread_model = IBV_EXP_THREAD_UNSAFE;
break;
default:
attr.thread_model = IBV_EXP_THREAD_SAFE;
break;
}

res_domain->ibv_domain = ibv_exp_create_res_domain(dev->ibv_context, &attr);
if (res_domain->ibv_domain == NULL) {
ucs_error("ibv_exp_create_res_domain() on %s failed: %m",
uct_ib_device_name(dev));
return UCS_ERR_IO_ERROR;
}
#endif
return UCS_OK;
}

static void uct_ib_iface_res_domain_cleanup(uct_ib_iface_res_domain_t *res_domain)
{
#if HAVE_IBV_EXP_RES_DOMAIN
struct ibv_exp_destroy_res_domain_attr attr;
int ret;

attr.comp_mask = 0;
ret = ibv_exp_destroy_res_domain(res_domain->ibv_domain->context,
res_domain->ibv_domain, &attr);
if (ret != 0) {
ucs_warn("ibv_exp_destroy_res_domain() failed: %m");
}
#endif
}

/**
* @param rx_headroom Headroom requested by the user.
* @param rx_priv_len Length of transport private data to reserve (0 if unused)
Expand All @@ -544,7 +605,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
unsigned rx_priv_len, unsigned rx_hdr_len,
unsigned tx_cq_len, unsigned rx_cq_len, size_t mss,
const uct_ib_iface_config_t *config)
uint32_t res_domain_key, const uct_ib_iface_config_t *config)
{
uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
int preferred_cpu = ucs_cpu_set_find_lcs(&params->cpu_mask);
Expand Down Expand Up @@ -607,11 +668,26 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
goto err;
}

if (res_domain_key == UCT_IB_IFACE_NULL_RES_DOMAIN_KEY) {
self->res_domain = NULL;
} else {
self->res_domain = uct_worker_tl_data_get(self->super.worker,
res_domain_key,
uct_ib_iface_res_domain_t,
uct_ib_iface_res_domain_cmp,
uct_ib_iface_res_domain_init,
self);
if (UCS_PTR_IS_ERR(self->res_domain)) {
status = UCS_PTR_STATUS(self->res_domain);
goto err_free_path_bits;
}
}

self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
if (self->comp_channel == NULL) {
ucs_error("ibv_create_comp_channel() failed: %m");
status = UCS_ERR_IO_ERROR;
goto err_free_path_bits;
goto err_put_res_domain;
}

status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
Expand Down Expand Up @@ -675,6 +751,10 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
ibv_destroy_comp_channel(self->comp_channel);
err_put_res_domain:
if (self->res_domain != NULL) {
uct_worker_tl_data_put(self->res_domain, uct_ib_iface_res_domain_cleanup);
}
err_free_path_bits:
ucs_free(self->path_bits);
err:
Expand All @@ -700,6 +780,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)
ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret);
}

if (self->res_domain != NULL) {
uct_worker_tl_data_put(self->res_domain, uct_ib_iface_res_domain_cleanup);
}
ucs_free(self->path_bits);
}

Expand Down
13 changes: 11 additions & 2 deletions src/uct/ib/base/ib_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include <ucs/datastruct/mpool.inl>
#include <ucs/sys/math.h>

#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u


/* Forward declarations */
typedef struct uct_ib_iface_config uct_ib_iface_config_t;
Expand Down Expand Up @@ -98,6 +100,12 @@ struct uct_ib_iface_ops {
};


typedef struct uct_ib_iface_res_domain {
uct_worker_tl_data_t super;
struct ibv_exp_res_domain *ibv_domain;
} uct_ib_iface_res_domain_t;


struct uct_ib_iface {
uct_base_iface_t super;

Expand All @@ -113,6 +121,7 @@ struct uct_ib_iface {
uct_ib_address_type_t addr_type;
uint8_t addr_size;
union ibv_gid gid;
uct_ib_iface_res_domain_t *res_domain;

struct {
unsigned rx_payload_offset; /* offset from desc to payload */
Expand All @@ -135,7 +144,7 @@ struct uct_ib_iface {
};
UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h,
const uct_iface_params_t*, unsigned, unsigned, unsigned,
unsigned, size_t, const uct_ib_iface_config_t*)
unsigned, size_t, uint32_t, const uct_ib_iface_config_t*)


/*
Expand Down
10 changes: 10 additions & 0 deletions src/uct/ib/base/ib_verbs.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,16 @@ static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq)
#endif


/*
* Resource domain
*/
#if !HAVE_IBV_EXP_RES_DOMAIN

struct ibv_exp_res_domain {
};

#endif


typedef uint8_t uct_ib_uint24_t[3];

Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/cm/cm_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker,
1 /* tx_cq_len */,
config->super.rx.queue_len /* rx_cq_len */,
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, /* mss */
UCT_IB_IFACE_NULL_RES_DOMAIN_KEY,
&config->super);

if (self->super.super.worker->async == NULL) {
Expand Down
12 changes: 11 additions & 1 deletion src/uct/ib/dc/accel/dc_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -1141,6 +1141,15 @@ static ucs_status_t uct_dc_mlx5_iface_init_dcis(uct_dc_mlx5_iface_t *iface)
return UCS_OK;
}

static void uct_dc_mlx5_iface_cleanup_dcis(uct_dc_mlx5_iface_t *iface)
{
int i;

for (i = 0; i < iface->super.tx.ndci; i++) {
uct_ib_mlx5_txwq_cleanup(&iface->dci_wqs[i]);
}
}

static ucs_status_t uct_dc_mlx5_iface_tag_init(uct_dc_mlx5_iface_t *iface,
uct_rc_iface_config_t *rc_config)
{
Expand Down Expand Up @@ -1198,7 +1207,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker
ucs_trace_func("");
UCS_CLASS_CALL_SUPER_INIT(uct_dc_iface_t, &uct_dc_mlx5_iface_ops, md,
worker, params, 0, &config->super,
IBV_EXP_TM_CAP_DC);
IBV_EXP_TM_CAP_DC, UCT_IB_MLX5_RES_DOMAIN_KEY);

status = uct_dc_mlx5_iface_tag_init(self, &config->super.super);
if (status != UCS_OK) {
Expand Down Expand Up @@ -1250,6 +1259,7 @@ static UCS_CLASS_CLEANUP_FUNC(uct_dc_mlx5_iface_t)
ucs_trace_func("");
uct_base_iface_progress_disable(&self->super.super.super.super.super,
UCT_PROGRESS_SEND | UCT_PROGRESS_RECV);
uct_dc_mlx5_iface_cleanup_dcis(self);
uct_rc_mlx5_iface_common_cleanup(&self->mlx5_common);
uct_dc_mlx5_iface_tag_cleanup(self);
}
Expand Down
5 changes: 3 additions & 2 deletions src/uct/ib/dc/base/dc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -242,14 +242,15 @@ static void uct_dc_iface_init_version(uct_dc_iface_t *iface, uct_md_h md)
UCS_CLASS_INIT_FUNC(uct_dc_iface_t, uct_dc_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
unsigned rx_priv_len, uct_dc_iface_config_t *config,
int tm_cap_bit)
int tm_cap_bit, uint32_t res_domain_key)
{
ucs_status_t status;
ucs_trace_func("");

UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &ops->super, md, worker, params,
&config->super, rx_priv_len,
sizeof(uct_dc_fc_request_t), tm_cap_bit);
sizeof(uct_dc_fc_request_t), tm_cap_bit,
res_domain_key);
if (config->ndci < 1) {
ucs_error("dc interface must have at least 1 dci (requested: %d)",
config->ndci);
Expand Down
2 changes: 1 addition & 1 deletion src/uct/ib/dc/base/dc_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct uct_dc_iface {

UCS_CLASS_DECLARE(uct_dc_iface_t, uct_dc_iface_ops_t*, uct_md_h,
uct_worker_h, const uct_iface_params_t*, unsigned,
uct_dc_iface_config_t*, int)
uct_dc_iface_config_t*, int, uint32_t)

extern ucs_config_field_t uct_dc_iface_config_table[];

Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/dc/verbs/dc_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1114,7 +1114,8 @@ static UCS_CLASS_INIT_FUNC(uct_dc_verbs_iface_t, uct_md_h md, uct_worker_h worke

UCS_CLASS_CALL_SUPER_INIT(uct_dc_iface_t, &uct_dc_verbs_iface_ops, md,
worker, params, 0, &config->super,
IBV_EXP_TM_CAP_DC);
IBV_EXP_TM_CAP_DC,
UCT_IB_IFACE_NULL_RES_DOMAIN_KEY);

uct_dc_verbs_iface_init_wrs(self);

Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/mlx5/ib_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#define UCT_IB_MLX5_CQE128_SIZE_LOG 7
#define UCT_IB_MLX5_MAX_BB 4
#define UCT_IB_MLX5_WORKER_BF_KEY 0x00c1b7e8u
#define UCT_IB_MLX5_RES_DOMAIN_KEY 0x1b1bda7aU
#define UCT_IB_MLX5_WORKER_DM_KEY 0xacdf1245u
#define UCT_IB_MLX5_EXTENDED_UD_AV 0x80 /* htonl(0x80000000) */
#define UCT_IB_MLX5_BF_REG_SIZE 256
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/rc/accel/rc_mlx5_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, uct_md_h md, uct_worker_h worker

UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_mlx5_iface_ops, md, worker,
params, &config->super, 0,
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC);
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC,
UCT_IB_MLX5_RES_DOMAIN_KEY);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it ok that all accelerated transports use the same domain key?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes because they share the BF register structure as well (with the same key)



self->tx.bb_max = ucs_min(config->tx_max_bb, UINT16_MAX);
Expand Down
3 changes: 1 addition & 2 deletions src/uct/ib/rc/base/rc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ static void uct_rc_ep_tag_qp_destroy(uct_rc_ep_t *ep)
#endif
}

static ucs_status_t uct_rc_ep_tag_qp_create(uct_rc_iface_t *iface,
uct_rc_ep_t *ep)
static ucs_status_t uct_rc_ep_tag_qp_create(uct_rc_iface_t *iface, uct_rc_ep_t *ep)
{
#if IBV_EXP_HW_TM
struct ibv_qp_cap cap;
Expand Down
12 changes: 10 additions & 2 deletions src/uct/ib/rc/base/rc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,8 @@ unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface)
UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
const uct_rc_iface_config_t *config, unsigned rx_priv_len,
unsigned fc_req_size, int tm_cap_flag)
unsigned fc_req_size, int tm_cap_flag,
uint32_t res_domain_key)
{
uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
unsigned tx_cq_len = config->tx.cq_len;
Expand All @@ -694,7 +695,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md,

UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, params,
rx_priv_len, sizeof(uct_rc_hdr_t), tx_cq_len,
rx_cq_len, SIZE_MAX, &config->super);
rx_cq_len, SIZE_MAX, res_domain_key, &config->super);

self->tx.cq_available = tx_cq_len - 1;
self->rx.srq.available = 0;
Expand Down Expand Up @@ -930,6 +931,13 @@ ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, int qp_type,
qp_init_attr.max_inl_recv = iface->config.rx_inline;
# endif

#if HAVE_IBV_EXP_RES_DOMAIN
if (iface->super.res_domain != NULL) {
qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_RES_DOMAIN;
qp_init_attr.res_domain = iface->super.res_domain->ibv_domain;
}
#endif

qp = ibv_exp_create_qp(dev->ibv_context, &qp_init_attr);
#else
qp = ibv_create_qp(uct_ib_iface_md(&iface->super)->pd, &qp_init_attr);
Expand Down
6 changes: 3 additions & 3 deletions src/uct/ib/rc/base/rc_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,9 +299,9 @@ struct uct_rc_iface {
/* Progress function (either regular or TM aware) */
ucs_callback_t progress;
};
UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h,
uct_worker_h, const uct_iface_params_t*,
const uct_rc_iface_config_t*, unsigned, unsigned, int)
UCS_CLASS_DECLARE(uct_rc_iface_t, uct_rc_iface_ops_t*, uct_md_h, uct_worker_h,
const uct_iface_params_t*, const uct_rc_iface_config_t*,
unsigned, unsigned, int, uint32_t)


struct uct_rc_iface_send_op {
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/rc/verbs/rc_verbs_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_verbs_iface_t, uct_md_h md, uct_worker_h worke

UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_verbs_iface_ops, md,
worker, params, &config->super, 0,
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC);
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC,
UCT_IB_IFACE_NULL_RES_DOMAIN_KEY);

self->config.tx_max_wr = ucs_min(config->verbs_common.tx_max_wr,
self->super.config.tx_qp_len);
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/ud/accel/ud_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,8 @@ static UCS_CLASS_INIT_FUNC(uct_ud_mlx5_iface_t,
ucs_trace_func("");

UCS_CLASS_CALL_SUPER_INIT(uct_ud_iface_t, &uct_ud_mlx5_iface_ops,
md, worker, params, 0, &config->super);
md, worker, params, 0, UCT_IB_MLX5_RES_DOMAIN_KEY,
&config->super);

uct_ib_iface_set_max_iov(&self->super.super, UCT_IB_MLX5_AM_ZCOPY_MAX_IOV);
self->super.config.max_inline = UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE);
Expand Down
Loading