Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/IB: Use separate resource domain for mlx5 trasnports #2338

Merged
merged 8 commits into from
Mar 26, 2018
10 changes: 10 additions & 0 deletions config/m4/ib.m4
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,16 @@ AS_IF([test "x$with_ib" == xyes],
[[#include <infiniband/mlx5_hw.h>]])],
[])

AC_CHECK_DECLS([IBV_EXP_QP_INIT_ATTR_RES_DOMAIN,
IBV_EXP_RES_DOMAIN_THREAD_MODEL,
ibv_exp_create_res_domain,
ibv_exp_destroy_res_domain],
[AC_DEFINE([HAVE_IBV_EXP_RES_DOMAIN], 1, [IB resource domain])],
[AC_MSG_WARN([Cannot use mlx5 accel because resource domains are not supported])
AC_MSG_WARN([Please upgrade MellanoxOFED to 3.1 or above])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add URL

with_mlx5_hw=no],
[[#include <infiniband/verbs_exp.h>]])

AS_IF([test "x$with_mlx5_hw" == xyes],
[AC_MSG_NOTICE([Compiling with mlx5 bare-metal support])
AC_DEFINE([HAVE_MLX5_HW], 1, [mlx5 bare-metal support])],
Expand Down
22 changes: 17 additions & 5 deletions src/uct/base/uct_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ typedef struct uct_worker_progress {
#define uct_worker_tl_data_get(_worker, _key, _type, _cmp_fn, _init_fn, ...) \
({ \
uct_worker_tl_data_t *data; \
_type *result; \
ucs_status_t status; \
\
ucs_list_for_each(data, &(_worker)->tl_data, list) { \
if ((data->key == (_key)) && _cmp_fn(ucs_derived_of(data, _type), \
Expand All @@ -50,16 +52,26 @@ typedef struct uct_worker_progress {
} \
} \
\
if (&data->list == &(_worker)->tl_data) { \
if (&data->list == &(_worker)->tl_data) { /* not found */ \
data = ucs_malloc(sizeof(_type), UCS_PP_QUOTE(_type)); \
if (data != NULL) { \
if (data == NULL) { \
result = UCS_STATUS_PTR(UCS_ERR_NO_MEMORY); \
} else { \
data->key = (_key); \
data->refcount = 1; \
_init_fn(ucs_derived_of(data, _type), ## __VA_ARGS__); \
ucs_list_add_tail(&(_worker)->tl_data, &data->list); \
status = _init_fn(ucs_derived_of(data, _type), ## __VA_ARGS__); \
if (status != UCS_OK) { \
ucs_free(data); \
result = UCS_STATUS_PTR(status); \
} else { \
ucs_list_add_tail(&(_worker)->tl_data, &data->list); \
result = ucs_derived_of(data, _type); \
} \
} \
} else { \
result = ucs_derived_of(data, _type); \
} \
ucs_derived_of(data, _type); \
result; \
})


Expand Down
80 changes: 78 additions & 2 deletions src/uct/ib/base/ib_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,60 @@ static ucs_status_t uct_ib_iface_set_moderation(struct ibv_cq *cq,
return UCS_OK;
}

static int uct_ib_iface_res_domain_cmp(uct_ib_iface_res_domain_t *res_domain,
uct_ib_iface_t *iface)
{
uct_ib_device_t *dev = uct_ib_iface_device(iface);

return res_domain->ibv_domain->context == dev->ibv_context;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing #if HAVE_IBV_EXP_RES_DOMAIN

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

static ucs_status_t
uct_ib_iface_res_domain_init(uct_ib_iface_res_domain_t *res_domain,
uct_ib_iface_t *iface)
{
uct_ib_device_t *dev = uct_ib_iface_device(iface);
struct ibv_exp_res_domain_init_attr attr;

attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL |
IBV_EXP_RES_DOMAIN_MSG_MODEL;
attr.msg_model = IBV_EXP_MSG_LOW_LATENCY;

switch (iface->super.worker->thread_mode) {
case UCS_THREAD_MODE_SINGLE:
attr.thread_model = IBV_EXP_THREAD_SINGLE;
break;
case UCS_THREAD_MODE_SERIALIZED:
attr.thread_model = IBV_EXP_THREAD_UNSAFE;
break;
default:
attr.thread_model = IBV_EXP_THREAD_SAFE;
break;
}

res_domain->ibv_domain = ibv_exp_create_res_domain(dev->ibv_context, &attr);
if (res_domain->ibv_domain == NULL) {
ucs_error("ibv_exp_create_res_domain() on %s failed: %m",
uct_ib_device_name(dev));
return UCS_ERR_IO_ERROR;
}

return UCS_OK;
}

static void uct_ib_iface_res_domain_cleanup(uct_ib_iface_res_domain_t *res_domain)
{
struct ibv_exp_destroy_res_domain_attr attr;
int ret;

attr.comp_mask = 0;
ret = ibv_exp_destroy_res_domain(res_domain->ibv_domain->context,
res_domain->ibv_domain, &attr);
if (ret != 0) {
ucs_warn("ibv_exp_destroy_res_domain() failed: %m");
}
}

/**
* @param rx_headroom Headroom requested by the user.
* @param rx_priv_len Length of transport private data to reserve (0 if unused)
Expand All @@ -544,7 +598,7 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
unsigned rx_priv_len, unsigned rx_hdr_len,
unsigned tx_cq_len, unsigned rx_cq_len, size_t mss,
const uct_ib_iface_config_t *config)
uint32_t res_domain_key, const uct_ib_iface_config_t *config)
{
uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
int preferred_cpu = ucs_cpu_set_find_lcs(&params->cpu_mask);
Expand Down Expand Up @@ -607,11 +661,26 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
goto err;
}

if (res_domain_key == UCT_IB_IFACE_NULL_RES_DOMAIN_KEY) {
self->res_domain = NULL;
} else {
self->res_domain = uct_worker_tl_data_get(self->super.worker,
res_domain_key,
uct_ib_iface_res_domain_t,
uct_ib_iface_res_domain_cmp,
uct_ib_iface_res_domain_init,
self);
if (UCS_PTR_IS_ERR(self->res_domain)) {
status = UCS_PTR_STATUS(self->res_domain);
goto err_free_path_bits;
}
}

self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
if (self->comp_channel == NULL) {
ucs_error("ibv_create_comp_channel() failed: %m");
status = UCS_ERR_IO_ERROR;
goto err_free_path_bits;
goto err_put_res_domain;
}

status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
Expand Down Expand Up @@ -675,6 +744,10 @@ UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
ibv_destroy_comp_channel(self->comp_channel);
err_put_res_domain:
if (self->res_domain != NULL) {
uct_worker_tl_data_put(self->res_domain, uct_ib_iface_res_domain_cleanup);
}
err_free_path_bits:
ucs_free(self->path_bits);
err:
Expand All @@ -700,6 +773,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ib_iface_t)
ucs_warn("ibv_destroy_comp_channel(comp_channel) returned %d: %m", ret);
}

if (self->res_domain != NULL) {
uct_worker_tl_data_put(self->res_domain, uct_ib_iface_res_domain_cleanup);
}
ucs_free(self->path_bits);
}

Expand Down
13 changes: 11 additions & 2 deletions src/uct/ib/base/ib_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
#include <ucs/datastruct/mpool.inl>
#include <ucs/sys/math.h>

#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_MAX_IOV 8UL
#define UCT_IB_IFACE_NULL_RES_DOMAIN_KEY 0u


/* Forward declarations */
typedef struct uct_ib_iface_config uct_ib_iface_config_t;
Expand Down Expand Up @@ -98,6 +100,12 @@ struct uct_ib_iface_ops {
};


typedef struct uct_ib_iface_res_domain {
uct_worker_tl_data_t super;
struct ibv_exp_res_domain *ibv_domain;
} uct_ib_iface_res_domain_t;


struct uct_ib_iface {
uct_base_iface_t super;

Expand All @@ -113,6 +121,7 @@ struct uct_ib_iface {
uct_ib_address_type_t addr_type;
uint8_t addr_size;
union ibv_gid gid;
uct_ib_iface_res_domain_t *res_domain;

struct {
unsigned rx_payload_offset; /* offset from desc to payload */
Expand All @@ -135,7 +144,7 @@ struct uct_ib_iface {
};
UCS_CLASS_DECLARE(uct_ib_iface_t, uct_ib_iface_ops_t*, uct_md_h, uct_worker_h,
const uct_iface_params_t*, unsigned, unsigned, unsigned,
unsigned, size_t, const uct_ib_iface_config_t*)
unsigned, size_t, uint32_t, const uct_ib_iface_config_t*)


/*
Expand Down
10 changes: 10 additions & 0 deletions src/uct/ib/base/ib_verbs.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,16 @@ static inline int ibv_exp_cq_ignore_overrun(struct ibv_cq *cq)
#endif


/*
* Resource domain
*/
#if !HAVE_IBV_EXP_RES_DOMAIN

struct ibv_exp_res_domain {
};

#endif


typedef uint8_t uct_ib_uint24_t[3];

Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/cm/cm_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ static UCS_CLASS_INIT_FUNC(uct_cm_iface_t, uct_md_h md, uct_worker_h worker,
1 /* tx_cq_len */,
config->super.rx.queue_len /* rx_cq_len */,
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, /* mss */
UCT_IB_IFACE_NULL_RES_DOMAIN_KEY,
&config->super);

if (self->super.super.worker->async == NULL) {
Expand Down
2 changes: 1 addition & 1 deletion src/uct/ib/dc/accel/dc_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -1046,7 +1046,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h md, uct_worker_h worker
ucs_trace_func("");
UCS_CLASS_CALL_SUPER_INIT(uct_dc_iface_t, &uct_dc_mlx5_iface_ops, md,
worker, params, 0, &config->super,
IBV_EXP_TM_CAP_DC);
IBV_EXP_TM_CAP_DC, UCT_IB_MLX5_RES_DOMAIN_KEY);

status = uct_dc_mlx5_iface_tag_init(self, &config->super.super);
if (status != UCS_OK) {
Expand Down
5 changes: 3 additions & 2 deletions src/uct/ib/dc/base/dc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -245,14 +245,15 @@ static ucs_status_t uct_dc_iface_init_version(uct_dc_iface_t *iface,
UCS_CLASS_INIT_FUNC(uct_dc_iface_t, uct_dc_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
unsigned rx_priv_len, uct_dc_iface_config_t *config,
int tm_cap_bit)
int tm_cap_bit, uint32_t res_domain_key)
{
ucs_status_t status;
ucs_trace_func("");

UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &ops->super, md, worker, params,
&config->super, rx_priv_len,
sizeof(uct_dc_fc_request_t), tm_cap_bit);
sizeof(uct_dc_fc_request_t), tm_cap_bit,
res_domain_key);
if (config->ndci < 1) {
ucs_error("dc interface must have at least 1 dci (requested: %d)",
config->ndci);
Expand Down
2 changes: 1 addition & 1 deletion src/uct/ib/dc/base/dc_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ struct uct_dc_iface {

UCS_CLASS_DECLARE(uct_dc_iface_t, uct_dc_iface_ops_t*, uct_md_h,
uct_worker_h, const uct_iface_params_t*, unsigned,
uct_dc_iface_config_t*, int)
uct_dc_iface_config_t*, int, uint32_t)

extern ucs_config_field_t uct_dc_iface_config_table[];

Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/dc/verbs/dc_verbs.c
Original file line number Diff line number Diff line change
Expand Up @@ -1114,7 +1114,8 @@ static UCS_CLASS_INIT_FUNC(uct_dc_verbs_iface_t, uct_md_h md, uct_worker_h worke

UCS_CLASS_CALL_SUPER_INIT(uct_dc_iface_t, &uct_dc_verbs_iface_ops, md,
worker, params, 0, &config->super,
IBV_EXP_TM_CAP_DC);
IBV_EXP_TM_CAP_DC,
UCT_IB_IFACE_NULL_RES_DOMAIN_KEY);

uct_dc_verbs_iface_init_wrs(self);

Expand Down
10 changes: 8 additions & 2 deletions src/uct/ib/mlx5/ib_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,13 +276,15 @@ void uct_ib_mlx5_check_completion(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq,

static int uct_ib_mlx5_bf_cmp(uct_ib_mlx5_bf_t *bf, uintptr_t addr, unsigned bf_size)
{
return ((bf->reg.addr & ~UCT_IB_MLX5_BF_REG_SIZE) == (addr & ~UCT_IB_MLX5_BF_REG_SIZE));
return (bf->reg.addr & ~UCT_IB_MLX5_BF_REG_SIZE) == (addr & ~UCT_IB_MLX5_BF_REG_SIZE);
}

static void uct_ib_mlx5_bf_init(uct_ib_mlx5_bf_t *bf, uintptr_t addr, unsigned bf_size)
static ucs_status_t uct_ib_mlx5_bf_init(uct_ib_mlx5_bf_t *bf, uintptr_t addr,
unsigned bf_size)
{
bf->reg.addr = addr;
bf->enable_bf = bf_size;
return UCS_OK;
}

static void uct_ib_mlx5_bf_cleanup(uct_ib_mlx5_bf_t *bf)
Expand Down Expand Up @@ -334,6 +336,10 @@ ucs_status_t uct_ib_mlx5_txwq_init(uct_priv_worker_t *worker,
uct_ib_mlx5_bf_init,
(uintptr_t)qp_info.bf.reg,
qp_info.bf.size);
if (UCS_PTR_IS_ERR(txwq->bf)) {
return UCS_PTR_STATUS(txwq->bf);
}

txwq->dbrec = &qp_info.dbrec[MLX5_SND_DBR];
/* need to reserve 2x because:
* - on completion we only get the index of last wqe and we do not
Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/mlx5/ib_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#define UCT_IB_MLX5_CQE128_SIZE_LOG 7
#define UCT_IB_MLX5_MAX_BB 4
#define UCT_IB_MLX5_WORKER_BF_KEY 0x00c1b7e8u
#define UCT_IB_MLX5_RES_DOMAIN_KEY 0x1b1bda7aU
#define UCT_IB_MLX5_EXTENDED_UD_AV 0x80 /* htonl(0x80000000) */
#define UCT_IB_MLX5_BF_REG_SIZE 256
#define UCT_IB_MLX5_CQE_VENDOR_SYND_ODP 0x93
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/rc/accel/rc_mlx5_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_mlx5_iface_t, uct_md_h md, uct_worker_h worker

UCS_CLASS_CALL_SUPER_INIT(uct_rc_iface_t, &uct_rc_mlx5_iface_ops, md, worker,
params, &config->super, 0,
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC);
sizeof(uct_rc_fc_request_t), IBV_EXP_TM_CAP_RC,
UCT_IB_MLX5_RES_DOMAIN_KEY);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it ok that all accelerated transports use the same domain key?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes because they share the BF register structure as well (with the same key)



self->tx.bb_max = ucs_min(config->tx_max_bb, UINT16_MAX);
Expand Down
3 changes: 1 addition & 2 deletions src/uct/ib/rc/base/rc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,7 @@ static void uct_rc_ep_tag_qp_destroy(uct_rc_ep_t *ep)
#endif
}

static ucs_status_t uct_rc_ep_tag_qp_create(uct_rc_iface_t *iface,
uct_rc_ep_t *ep)
static ucs_status_t uct_rc_ep_tag_qp_create(uct_rc_iface_t *iface, uct_rc_ep_t *ep)
{
#if IBV_EXP_HW_TM
struct ibv_qp_cap cap;
Expand Down
14 changes: 11 additions & 3 deletions src/uct/ib/rc/base/rc_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,7 +681,8 @@ unsigned uct_rc_iface_do_progress(uct_iface_h tl_iface)
UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md,
uct_worker_h worker, const uct_iface_params_t *params,
const uct_rc_iface_config_t *config, unsigned rx_priv_len,
unsigned fc_req_size, int tm_cap_flag)
unsigned fc_req_size, int tm_cap_flag,
uint32_t res_domain_key)
{
uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
unsigned tx_cq_len = config->tx.cq_len;
Expand All @@ -693,7 +694,7 @@ UCS_CLASS_INIT_FUNC(uct_rc_iface_t, uct_rc_iface_ops_t *ops, uct_md_h md,

UCS_CLASS_CALL_SUPER_INIT(uct_ib_iface_t, &ops->super, md, worker, params,
rx_priv_len, sizeof(uct_rc_hdr_t), tx_cq_len,
rx_cq_len, SIZE_MAX, &config->super);
rx_cq_len, SIZE_MAX, res_domain_key, &config->super);

self->tx.cq_available = tx_cq_len - 1;
self->rx.srq.available = 0;
Expand Down Expand Up @@ -919,7 +920,7 @@ ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, int qp_type,

# if HAVE_DECL_IBV_EXP_ATOMIC_HCA_REPLY_BE
if (dev->dev_attr.exp_atomic_cap == IBV_EXP_ATOMIC_HCA_REPLY_BE) {
qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

previous alignment was correct

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

qp_init_attr.exp_create_flags = IBV_EXP_QP_CREATE_ATOMIC_BE_REPLY;
}
# endif
Expand All @@ -929,6 +930,13 @@ ucs_status_t uct_rc_iface_qp_create(uct_rc_iface_t *iface, int qp_type,
qp_init_attr.max_inl_recv = iface->config.rx_inline;
# endif

#if HAVE_IBV_EXP_RES_DOMAIN
if (iface->super.res_domain != NULL) {
qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_RES_DOMAIN;
qp_init_attr.res_domain = iface->super.res_domain->ibv_domain;
}
#endif

qp = ibv_exp_create_qp(dev->ibv_context, &qp_init_attr);
#else
qp = ibv_create_qp(uct_ib_iface_md(&iface->super)->pd, &qp_init_attr);
Expand Down
Loading