Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/DC/MLX5: Create DCI via DevX with full handshake option - v1.10.x #6750

Merged
merged 4 commits into from
May 4, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion src/uct/ib/base/ib_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ static const char *uct_ib_devx_objs[] = {
[UCT_IB_DEVX_OBJ_RCSRQ] = "rcsrq",
[UCT_IB_DEVX_OBJ_DCT] = "dct",
[UCT_IB_DEVX_OBJ_DCSRQ] = "dcsrq",
[UCT_IB_DEVX_OBJ_DCI] = "dci",
NULL
};

Expand Down Expand Up @@ -166,7 +167,7 @@ static ucs_config_field_t uct_ib_md_config_table[] = {
"DEVX support\n",
ucs_offsetof(uct_ib_md_config_t, devx), UCS_CONFIG_TYPE_TERNARY},

{"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq",
{"MLX5_DEVX_OBJECTS", "rcqp,rcsrq,dct,dcsrq,dci",
"Objects to be created by DevX\n",
ucs_offsetof(uct_ib_md_config_t, devx_objs),
UCS_CONFIG_TYPE_BITMAP(uct_ib_devx_objs)},
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/base/ib_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ enum {
UCT_IB_DEVX_OBJ_RCQP,
UCT_IB_DEVX_OBJ_RCSRQ,
UCT_IB_DEVX_OBJ_DCT,
UCT_IB_DEVX_OBJ_DCSRQ
UCT_IB_DEVX_OBJ_DCSRQ,
UCT_IB_DEVX_OBJ_DCI
};

typedef struct uct_ib_md_ext_config {
Expand Down
101 changes: 71 additions & 30 deletions src/uct/ib/dc/dc_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,21 @@ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = {
ucs_offsetof(uct_dc_mlx5_iface_config_t, tx_policy),
UCS_CONFIG_TYPE_ENUM(uct_dc_tx_policy_names)},

{"DCI_FULL_HANDSHAKE", "n",
"Force full-handshake protocol for DC initiator. Enabling this mode\n"
"increases network latency, but is more resilient to packet drops.",
ucs_offsetof(uct_dc_mlx5_iface_config_t, dci_full_handshake),
UCS_CONFIG_TYPE_BOOL},

{"DCI_KA_FULL_HANDSHAKE", "n",
"Force full-handshake protocol for DC keepalive initiator.",
ucs_offsetof(uct_dc_mlx5_iface_config_t, dci_ka_full_handshake),
UCS_CONFIG_TYPE_BOOL},

{"DCT_FULL_HANDSHAKE", "n", "Force full-handshake protocol for DC target.",
ucs_offsetof(uct_dc_mlx5_iface_config_t, dct_full_handshake),
UCS_CONFIG_TYPE_BOOL},

{"RAND_DCI_SEED", "0",
"Seed for DCI allocation when \"rand\" dci policy is used (0 - use default).",
ucs_offsetof(uct_dc_mlx5_iface_config_t, rand_seed), UCS_CONFIG_TYPE_UINT},
Expand Down Expand Up @@ -268,12 +283,14 @@ static unsigned uct_dc_mlx5_iface_progress_tm(void *arg)

static void UCS_CLASS_DELETE_FUNC_NAME(uct_dc_mlx5_iface_t)(uct_iface_t*);

static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface,
struct ibv_qp_cap *cap,
uct_dc_dci_t *dci)
static ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
int full_handshake,
uct_dc_dci_t *dci)
{
uct_ib_iface_t *ib_iface = &iface->super.super.super;
uct_ib_mlx5_qp_attr_t attr = {};
uct_ib_mlx5_md_t *md = ucs_derived_of(ib_iface->super.md,
uct_ib_mlx5_md_t);
ucs_status_t status;
#if HAVE_DC_DV
uct_ib_device_t *dev = uct_ib_iface_device(ib_iface);
Expand All @@ -283,6 +300,19 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface,
uct_rc_mlx5_iface_fill_attr(&iface->super, &attr,
iface->super.super.config.tx_qp_len,
&iface->super.rx.srq);

if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCI) {
attr.super.max_inl_cqe[UCT_IB_DIR_RX] = 0;
attr.full_handshake = full_handshake;
status = uct_ib_mlx5_devx_create_qp(ib_iface, &dci->txwq.super,
&dci->txwq, &attr);
if (status != UCS_OK) {
return status;
}

goto init_qp;
}

status = uct_ib_mlx5_iface_fill_attr(ib_iface, &dci->txwq.super, &attr);
if (status != UCS_OK) {
return status;
Expand Down Expand Up @@ -315,6 +345,7 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface,
}
#endif

init_qp:
status = uct_rc_txqp_init(&dci->txqp, &iface->super.super,
dci->txwq.super.qp_num
UCS_STATS_ARG(iface->super.super.stats));
Expand All @@ -337,21 +368,22 @@ static ucs_status_t uct_dc_mlx5_iface_create_qp(uct_dc_mlx5_iface_t *iface,
dci->flags = 0;
#endif

status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker,
iface->super.tx.mmio_mode, &dci->txwq,
dci->txwq.super.verbs.qp);
if (status != UCS_OK) {
goto err;
if (dci->txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS) {
status = uct_ib_mlx5_txwq_init(iface->super.super.super.super.worker,
iface->super.tx.mmio_mode, &dci->txwq,
dci->txwq.super.verbs.qp);
if (status != UCS_OK) {
goto err;
}
}

uct_rc_txqp_available_set(&dci->txqp, dci->txwq.bb_max);
*cap = attr.super.ibv.cap;
return UCS_OK;

err:
uct_rc_txqp_cleanup(&iface->super.super, &dci->txqp);
err_qp:
ibv_destroy_qp(dci->txwq.super.verbs.qp);
uct_ib_mlx5_destroy_qp(md, &dci->txwq.super);
return status;
}

Expand Down Expand Up @@ -421,7 +453,9 @@ ucs_status_t uct_dc_mlx5_iface_dci_connect(uct_dc_mlx5_iface_t *iface,
return UCS_OK;
}

ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface)
ucs_status_t
uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config)
{
uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md,
uct_ib_mlx5_md_t);
Expand All @@ -432,7 +466,8 @@ ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface)
int ret;

if (md->flags & UCT_IB_MLX5_MD_FLAG_DEVX_DCT) {
return uct_dc_mlx5_iface_devx_create_dct(iface);
return uct_dc_mlx5_iface_devx_create_dct(iface,
config->dct_full_handshake);
}

init_attr.comp_mask = IBV_QP_INIT_ATTR_PD;
Expand Down Expand Up @@ -520,9 +555,10 @@ void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface)

static void uct_dc_mlx5_iface_cleanup_dcis(uct_dc_mlx5_iface_t *iface)
{
int num_dcis = uct_dc_mlx5_iface_total_ndci(iface);
int i;

for (i = 0; i < iface->tx.ndci; i++) {
for (i = 0; i < num_dcis; i++) {
if (uct_dc_mlx5_iface_is_dci_rand(iface)) {
ucs_arbiter_group_cleanup(&iface->tx.dcis[i].arb_group);
}
Expand Down Expand Up @@ -636,7 +672,9 @@ void uct_dc_mlx5_cleanup_rx(uct_rc_iface_t *rc_iface)
}

#ifdef HAVE_DC_EXP
ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface)
ucs_status_t
uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config)
{
struct ibv_exp_dct_init_attr init_attr;

Expand Down Expand Up @@ -742,23 +780,19 @@ void uct_dc_mlx5_destroy_dct(uct_dc_mlx5_iface_t *iface)

void uct_dc_mlx5_iface_dcis_destroy(uct_dc_mlx5_iface_t *iface, int max)
{
uct_ib_mlx5_md_t *md = ucs_derived_of(iface->super.super.super.super.md,
uct_ib_mlx5_md_t);
int i;

for (i = 0; i < max; i++) {
uct_rc_txqp_cleanup(&iface->super.super, &iface->tx.dcis[i].txqp);
ucs_assert(iface->tx.dcis[i].txwq.super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS);
uct_ib_destroy_qp(iface->tx.dcis[i].txwq.super.verbs.qp);
uct_ib_mlx5_destroy_qp(md, &iface->tx.dcis[i].txwq.super);
}
}

static ucs_status_t uct_dc_mlx5_iface_create_dci(uct_dc_mlx5_iface_t *iface,
uct_dc_dci_t *dci)
{
struct ibv_qp_cap cap = {};

return uct_dc_mlx5_iface_create_qp(iface, &cap, dci);
}

static ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface)
static ucs_status_t
uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config)
{
ucs_status_t status;
int i;
Expand All @@ -768,7 +802,8 @@ static ucs_status_t uct_dc_mlx5_iface_create_dcis(uct_dc_mlx5_iface_t *iface)

iface->tx.stack_top = 0;
for (i = 0; i < iface->tx.ndci; i++) {
status = uct_dc_mlx5_iface_create_dci(iface, &iface->tx.dcis[i]);
status = uct_dc_mlx5_iface_create_dci(iface, config->dci_full_handshake,
&iface->tx.dcis[i]);
if (status != UCS_OK) {
goto err;
}
Expand Down Expand Up @@ -1196,14 +1231,18 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor
uct_dc_mlx5_iface_dci_do_rand_pending_tx :
uct_dc_mlx5_iface_dci_do_dcs_pending_tx;

if (config->dci_ka_full_handshake) {
self->flags |= UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE;
}

/* create DC target */
status = uct_dc_mlx5_iface_create_dct(self);
status = uct_dc_mlx5_iface_create_dct(self, config);
if (status != UCS_OK) {
goto err;
}

/* create DC initiators */
status = uct_dc_mlx5_iface_create_dcis(self);
status = uct_dc_mlx5_iface_create_dcis(self, config);
if (status != UCS_OK) {
goto err_destroy_dct;
}
Expand Down Expand Up @@ -1332,6 +1371,8 @@ uct_dc_mlx5_dci_keepalive_handle_failure(uct_dc_mlx5_iface_t *iface,

ucs_status_t uct_dc_mlx5_iface_keepalive_init(uct_dc_mlx5_iface_t *iface)
{
int full_handshake = iface->flags &
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE;
ucs_status_t status;

if (ucs_likely(iface->flags & UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE)) {
Expand All @@ -1340,7 +1381,8 @@ ucs_status_t uct_dc_mlx5_iface_keepalive_init(uct_dc_mlx5_iface_t *iface)

ucs_assert(iface->tx.ndci <= UCT_DC_MLX5_IFACE_MAX_USER_DCIS);

status = uct_dc_mlx5_iface_create_dci(iface, &iface->tx.dcis[iface->tx.ndci]);
status = uct_dc_mlx5_iface_create_dci(iface, full_handshake,
&iface->tx.dcis[iface->tx.ndci]);
if (status != UCS_OK) {
return status;
}
Expand All @@ -1364,7 +1406,6 @@ void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface, uint8_t dci,
uct_rc_txqp_available_set(txqp, (int16_t)iface->super.super.config.tx_qp_len);
uct_rc_txqp_purge_outstanding(&iface->super.super, txqp, ep_status,
txwq->sw_pi, 0);
ucs_assert(txwq->super.type == UCT_IB_MLX5_OBJ_TYPE_VERBS);

/* Synchronize CQ index with the driver, since it would remove pending
* completions for this QP (both send and receive) during ibv_destroy_qp().
Expand Down
16 changes: 13 additions & 3 deletions src/uct/ib/dc/dc_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ typedef enum {


typedef enum {
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE = UCS_BIT(0) /**< keepalive dci is created */
/** Keepalive dci is created */
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE = UCS_BIT(0),

/** Enable full handshake for keepalive DCI */
UCT_DC_MLX5_IFACE_FLAG_KEEPALIVE_FULL_HANDSHAKE = UCS_BIT(1)
} uct_dc_mlx5_iface_flags_t;


Expand Down Expand Up @@ -121,6 +125,9 @@ typedef struct uct_dc_mlx5_iface_config {
uct_ud_iface_common_config_t ud_common;
int ndci;
int tx_policy;
int dci_full_handshake;
int dci_ka_full_handshake;
int dct_full_handshake;
unsigned quota;
unsigned rand_seed;
uct_ud_mlx5_iface_common_config_t mlx5_ud;
Expand Down Expand Up @@ -218,7 +225,9 @@ struct uct_dc_mlx5_iface {

extern ucs_config_field_t uct_dc_mlx5_iface_config_table[];

ucs_status_t uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface);
ucs_status_t
uct_dc_mlx5_iface_create_dct(uct_dc_mlx5_iface_t *iface,
const uct_dc_mlx5_iface_config_t *config);

int uct_dc_mlx5_iface_is_reachable(const uct_iface_h tl_iface,
const uct_device_addr_t *dev_addr,
Expand Down Expand Up @@ -263,7 +272,8 @@ void uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface,

#if HAVE_DEVX

ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface);
ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface,
int full_handshake);

ucs_status_t uct_dc_mlx5_iface_devx_set_srq_dc_params(uct_dc_mlx5_iface_t *iface);

Expand Down
4 changes: 3 additions & 1 deletion src/uct/ib/dc/dc_mlx5_devx.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
#include <ucs/arch/bitops.h>


ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface)
ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface,
int full_handshake)
{
uct_ib_device_t *dev = uct_ib_iface_device(&iface->super.super.super);
struct mlx5dv_pd dvpd = {};
Expand Down Expand Up @@ -47,6 +48,7 @@ ucs_status_t uct_dc_mlx5_iface_devx_create_dct(uct_dc_mlx5_iface_t *iface)
UCT_IB_MLX5DV_SET(dctc, dctc, rre, true);
UCT_IB_MLX5DV_SET(dctc, dctc, rwe, true);
UCT_IB_MLX5DV_SET(dctc, dctc, rae, true);
UCT_IB_MLX5DV_SET(dctc, dctc, force_full_handshake, !!full_handshake);
UCT_IB_MLX5DV_SET(dctc, dctc, cs_res, uct_ib_mlx5_qpc_cs_res(
iface->super.super.super.config.max_inl_cqe[UCT_IB_DIR_RX], 1));
UCT_IB_MLX5DV_SET(dctc, dctc, atomic_mode, UCT_IB_MLX5_ATOMIC_MODE);
Expand Down
39 changes: 25 additions & 14 deletions src/uct/ib/mlx5/dv/ib_mlx5_dv.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,16 @@ ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,

UCT_IB_MLX5DV_SET(create_qp_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_QP);
qpc = UCT_IB_MLX5DV_ADDR_OF(create_qp_in, in, qpc);
UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_RC);
if (attr->super.qp_type == UCT_IB_QPT_DCI) {
UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_DCI);
UCT_IB_MLX5DV_SET(qpc, qpc, full_handshake, !!attr->full_handshake);
} else if (attr->super.qp_type == IBV_QPT_RC) {
UCT_IB_MLX5DV_SET(qpc, qpc, st, UCT_IB_MLX5_QPC_ST_RC);
} else {
ucs_error("create qp failed: unknown type %d", attr->super.qp_type);
status = UCS_ERR_UNSUPPORTED;
goto err_free_db;
}
UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED);
UCT_IB_MLX5DV_SET(qpc, qpc, pd, dvpd.pdn);
UCT_IB_MLX5DV_SET(qpc, qpc, uar_page, uar->uar->page_id);
Expand Down Expand Up @@ -155,20 +164,22 @@ ucs_status_t uct_ib_mlx5_devx_create_qp(uct_ib_iface_t *iface,

qp->qp_num = UCT_IB_MLX5DV_GET(create_qp_out, out, qpn);

qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc);
UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP);
UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num);
UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED);
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->super.port);
UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true);
if (attr->super.qp_type == IBV_QPT_RC) {
qpc = UCT_IB_MLX5DV_ADDR_OF(rst2init_qp_in, in_2init, qpc);
UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, opcode, UCT_IB_MLX5_CMD_OP_RST2INIT_QP);
UCT_IB_MLX5DV_SET(rst2init_qp_in, in_2init, qpn, qp->qp_num);
UCT_IB_MLX5DV_SET(qpc, qpc, pm_state, UCT_IB_MLX5_QPC_PM_STATE_MIGRATED);
UCT_IB_MLX5DV_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->super.port);
UCT_IB_MLX5DV_SET(qpc, qpc, rwe, true);

ret = mlx5dv_devx_obj_modify(qp->devx.obj, in_2init, sizeof(in_2init),
out_2init, sizeof(out_2init));
if (ret) {
ucs_error("mlx5dv_devx_obj_modify(2INIT_QP) failed, syndrome %x: %m",
UCT_IB_MLX5DV_GET(rst2init_qp_out, out_2init, syndrome));
status = UCS_ERR_IO_ERROR;
goto err_free;
ret = mlx5dv_devx_obj_modify(qp->devx.obj, in_2init, sizeof(in_2init),
out_2init, sizeof(out_2init));
if (ret) {
ucs_error("mlx5dv_devx_obj_modify(2INIT_QP) failed, syndrome %x: %m",
UCT_IB_MLX5DV_GET(rst2init_qp_out, out_2init, syndrome));
status = UCS_ERR_IO_ERROR;
goto err_free;
}
}

qp->type = UCT_IB_MLX5_OBJ_TYPE_DEVX;
Expand Down
9 changes: 6 additions & 3 deletions src/uct/ib/mlx5/dv/ib_mlx5_ifc.h
Original file line number Diff line number Diff line change
Expand Up @@ -759,8 +759,9 @@ struct uct_ib_mlx5_dctc_bits {
uint8_t atomic_like_write_en[0x1];
uint8_t latency_sensitive[0x1];
uint8_t rlky[0x1];
uint8_t free_ar[0x1];
uint8_t reserved_at_73[0xd];
uint8_t force_full_handshake[0x1];
uint8_t multi_path[0x1];
uint8_t reserved_at_73[0xc];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

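This field should be renamed to `reserved_at_74`: `force_full_handshake` and `multi_path` now occupy bits 0x72 and 0x73, so the remaining 0xc reserved bits start at offset 0x74 (and still end at 0x80).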


uint8_t reserved_at_80[0x8];
uint8_t cs_res[0x8];
Expand Down Expand Up @@ -1211,7 +1212,9 @@ struct uct_ib_mlx5_qpc_bits {
uint8_t counter_set_id[0x8];
uint8_t uar_page[0x18];

uint8_t reserved_at_80[0x8];
uint8_t reserved_at_80[0x3];
uint8_t full_handshake[0x1];
uint8_t cnak_reverse_sl[0x4];
uint8_t user_index[0x18];

uint8_t reserved_at_a0[0x3];
Expand Down
Loading