Skip to content

Commit

Permalink
UCT/IB/MLX5: Fix CQ and QP resources cleanup during RC EP destroy
Browse files Browse the repository at this point in the history
- Avoid of increasing cq.available before completions arrive, since it
  can cause CQ overrun.
- Move QP to error state and wait for all completions for this QP to
  arrive, so all operations on the QP are completed.
- If needed, post NOP operation to flush any unsignaled sends.
- Wait for LAST_WQE_REACHED async event, before destroying the QP.
  Otherwise some SRQ segments could be not released.
  • Loading branch information
yosefe committed Apr 29, 2020
1 parent dbfbea5 commit f372b50
Show file tree
Hide file tree
Showing 10 changed files with 216 additions and 79 deletions.
3 changes: 1 addition & 2 deletions src/uct/ib/dc/dc_mlx5.c
Original file line number Diff line number Diff line change
Expand Up @@ -282,8 +282,7 @@ ucs_status_t uct_dc_mlx5_iface_reset_dci(uct_dc_mlx5_iface_t *iface,
uct_rc_mlx5_iface_common_sync_cqs_ci(&iface->super,
&iface->super.super.super);

uct_rc_mlx5_iface_commom_clean(&iface->super.cq[UCT_IB_DIR_TX], NULL,
dci->txwq.super.qp_num);
uct_rc_mlx5_iface_commom_cq_clean_tx(&iface->super, &dci->txqp, &dci->txwq);

/* Resume posting from to the beginning of the QP */
uct_ib_mlx5_txwq_reset(&dci->txwq);
Expand Down
7 changes: 7 additions & 0 deletions src/uct/ib/mlx5/ib_mlx5.inl
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ uct_ib_mlx5_poll_cq(uct_ib_iface_t *iface, uct_ib_mlx5_cq_t *cq)
}


static UCS_F_ALWAYS_INLINE uint32_t
uct_ib_mlx5_cqe_get_qpn(const struct mlx5_cqe64 *cqe)
{
return ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
}


static UCS_F_ALWAYS_INLINE uint16_t
uct_ib_mlx5_txwq_update_bb(uct_ib_mlx5_txwq_t *wq, uint16_t hw_ci)
{
Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/rc/accel/rc_mlx5.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ ucs_status_t uct_rc_mlx5_ep_tag_rndv_request(uct_ep_h tl_ep, uct_tag_t tag,
ucs_status_t uct_rc_mlx5_ep_get_address(uct_ep_h tl_ep, uct_ep_addr_t *addr);

ucs_status_t uct_rc_mlx5_ep_handle_failure(uct_rc_mlx5_ep_t *ep,
struct mlx5_cqe64 *cqe,
ucs_status_t status);

ucs_status_t uct_rc_mlx5_ep_set_failed(uct_ib_iface_t *iface, uct_ep_h ep,
Expand Down
4 changes: 2 additions & 2 deletions src/uct/ib/rc/accel/rc_mlx5.inl
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ static UCS_F_ALWAYS_INLINE uct_rc_mlx5_mp_context_t*
uct_rc_mlx5_iface_rx_mp_context_from_ep(uct_rc_mlx5_iface_common_t *iface,
struct mlx5_cqe64 *cqe, unsigned *flags)
{
uint32_t qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
uint32_t qp_num = uct_ib_mlx5_cqe_get_qpn(cqe);
uct_rc_mlx5_ep_t *ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super,
qp_num),
uct_rc_mlx5_ep_t);
Expand Down Expand Up @@ -387,7 +387,7 @@ uct_rc_mlx5_iface_common_am_handler(uct_rc_mlx5_iface_common_t *iface,
uct_rc_mlx5_common_packet_dump);

if (ucs_unlikely(hdr->rc_hdr.am_id & UCT_RC_EP_FC_MASK)) {
qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
qp_num = uct_ib_mlx5_cqe_get_qpn(cqe);
rc_ops = ucs_derived_of(iface->super.super.ops, uct_rc_iface_ops_t);

/* coverity[overrun-buffer-val] */
Expand Down
140 changes: 106 additions & 34 deletions src/uct/ib/rc/accel/rc_mlx5_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,18 @@ void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface,
uct_rc_mlx5_tag_query(iface, iface_attr, max_inline, max_tag_eager_iov);
}

void uct_rc_mlx5_common_post_nop(uct_rc_mlx5_iface_common_t *iface,
uct_rc_txqp_t *txqp,
uct_ib_mlx5_txwq_t *txwq)
{
uct_rc_mlx5_txqp_inline_post(iface, IBV_QPT_RC, txqp, txwq,
MLX5_OPCODE_NOP, NULL, 0,
0, 0, 0,
0, 0,
NULL, NULL, 0, 0,
INT_MAX);
}

void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
uct_ib_iface_t *ib_iface)
{
Expand All @@ -1070,59 +1082,119 @@ void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
#endif
}

int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq,
uct_ib_mlx5_srq_t *srq, uint32_t qpn)
void uct_rc_mlx5_iface_commom_cq_clean(uct_rc_mlx5_iface_common_t *iface,
uct_ib_dir_t dir, uint32_t qp_num,
uct_rc_mlx5_cq_clean_callback_t cb,
void *arg)
{
const size_t cqe_sz = 1ul << mlx5_cq->cqe_size_log;
uct_ib_mlx5_cq_t *cq = &iface->cq[dir];
const size_t cqe_sz = 1ul << cq->cqe_size_log;
volatile struct mlx5_cqe64 *vcqe;
struct mlx5_cqe64 *cqe, *dest;
uct_ib_mlx5_srq_seg_t *seg;
unsigned pi, idx;
uint8_t owner_bit;
unsigned pi;
int nfreed;

pi = mlx5_cq->cq_ci;
for (;;) {
cqe = uct_ib_mlx5_get_cqe(mlx5_cq, pi);
if (uct_ib_mlx5_cqe_is_hw_owned(cqe->op_own, pi, mlx5_cq->cq_length)) {
break;
}

ucs_assert((cqe->op_own >> 4) != MLX5_CQE_INVALID);

++pi;
if (pi == (mlx5_cq->cq_ci + mlx5_cq->cq_length - 1)) {
break;
int done;

pi = cq->cq_ci;
do {
vcqe = cqe = uct_ib_mlx5_get_cqe(cq, pi);
if (uct_ib_mlx5_cqe_is_hw_owned(vcqe->op_own, pi, cq->cq_length)) {
/* CQE not available, check if can complete */
done = cb(iface, NULL, arg);
} else {
ucs_memory_cpu_load_fence();
ucs_assert((vcqe->op_own >> 4) != MLX5_CQE_INVALID);
if (uct_ib_mlx5_cqe_get_qpn(cqe) == qp_num) {
/* CQE on the relevant QP */
done = cb(iface, cqe, arg);
} else {
/* CQE on other QP */
done = 0;
}
++pi;
}
}

ucs_memory_cpu_load_fence();
} while (!done && (pi != (cq->cq_ci + cq->cq_length - 1)));

/* Remove CQEs of the destroyed QP, so the driver would not see them and try
* to remove them itself, creating a mess with the free-list.
*/
nfreed = 0;
while ((int)--pi - (int)mlx5_cq->cq_ci >= 0) {
cqe = uct_ib_mlx5_get_cqe(mlx5_cq, pi);
if ((ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER)) == qpn) {
idx = ntohs(cqe->wqe_counter);
if (srq) {
seg = uct_ib_mlx5_srq_get_wqe(srq, idx);
seg->srq.free = 1;
ucs_trace("cq %p: freed srq seg[%d] of qpn 0x%x",
mlx5_cq, idx, qpn);
}
while ((int)--pi - (int)cq->cq_ci >= 0) {
cqe = uct_ib_mlx5_get_cqe(cq, pi);
if (uct_ib_mlx5_cqe_get_qpn(cqe) == qp_num) {
++nfreed;
} else if (nfreed) {
dest = uct_ib_mlx5_get_cqe(mlx5_cq, pi + nfreed);
dest = uct_ib_mlx5_get_cqe(cq, pi + nfreed);
owner_bit = dest->op_own & MLX5_CQE_OWNER_MASK;
memcpy(UCS_PTR_BYTE_OFFSET(dest + 1, -cqe_sz),
UCS_PTR_BYTE_OFFSET(cqe + 1, -cqe_sz), cqe_sz);
dest->op_own = (dest->op_own & ~MLX5_CQE_OWNER_MASK) | owner_bit;
}
}

mlx5_cq->cq_ci += nfreed;
cq->cq_ci += nfreed;
}

typedef struct {
uct_rc_txqp_t *txqp;
uct_ib_mlx5_txwq_t *txwq;
int post_nop;
} uct_rc_mlx5_common_clean_tx_cq_ctx_t;

return nfreed;
static int uct_rc_mlx5_common_clean_tx_cq_cb(uct_rc_mlx5_iface_common_t *iface,
const struct mlx5_cqe64 *cqe,
void *arg)
{
uct_rc_mlx5_common_clean_tx_cq_ctx_t *ctx = arg;

if (cqe != NULL) {
uct_rc_mlx5_common_update_tx_res(&iface->super, ctx->txwq, ctx->txqp,
htons(cqe->wqe_counter));
}

if (uct_rc_txqp_available(ctx->txqp) == ctx->txwq->bb_max) {
return 1;
}

/* If not posted NOP already, and have the resources, post it to flush
* any unsignaled sends
*/
if (ctx->post_nop && (iface->super.tx.cq_available > 0) &&
(uct_rc_txqp_available(ctx->txqp) > 0))
{
ucs_trace("qp 0x%x: posted NOP", ctx->txwq->super.qp_num);
uct_rc_mlx5_common_post_nop(iface, ctx->txqp, ctx->txwq);
ucs_assert(uct_rc_txqp_unsignaled(ctx->txqp) == 0);
ctx->post_nop = 0;
}

return 0;
}

void uct_rc_mlx5_iface_commom_cq_clean_tx(uct_rc_mlx5_iface_common_t *iface,
uct_rc_txqp_t *txqp,
uct_ib_mlx5_txwq_t *txwq)
{
uct_rc_mlx5_common_clean_tx_cq_ctx_t ctx = {
.txqp = txqp,
.txwq = txwq,
.post_nop = (uct_rc_txqp_unsignaled(txqp) > 0)
};
uct_rc_mlx5_iface_commom_cq_clean(iface, UCT_IB_DIR_TX, txwq->super.qp_num,
uct_rc_mlx5_common_clean_tx_cq_cb, &ctx);
}

void uct_rc_mlx5_iface_print(uct_rc_mlx5_iface_common_t *mlx5_iface,
const char *title)
{
ucs_trace("%s: txcq [n 0x%x avail %d ci 0x%x] rcxq [n 0x%x ci 0x%x] "
"srq [n 0x%x avail %d]", title,
mlx5_iface->cq[UCT_IB_DIR_TX].cq_num,
mlx5_iface->super.tx.cq_available,
mlx5_iface->cq[UCT_IB_DIR_TX].cq_ci,
mlx5_iface->cq[UCT_IB_DIR_RX].cq_num,
mlx5_iface->cq[UCT_IB_DIR_RX].cq_ci,
mlx5_iface->rx.srq.srq_num,
mlx5_iface->super.rx.srq.available);
}
33 changes: 31 additions & 2 deletions src/uct/ib/rc/accel/rc_mlx5_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,22 @@ UCS_CLASS_DECLARE(uct_rc_mlx5_iface_common_t,
uct_ib_iface_init_attr_t*);


/**
* Callback for cleaning the completion queue.
*
* @param [in] iface Interface which is being cleaned
* @param [in] cqe Completion entry being cleaned. If NULL, the callback
* should just return whether more polling is required.
* @param [in] arg User-defined argument
*
* @return Nonzero if cleaning should be completed, zero if cleaning should
* continue and poll for more completions.
*/
typedef int (uct_rc_mlx5_cq_clean_callback_t)(uct_rc_mlx5_iface_common_t *iface,
const struct mlx5_cqe64 *cqe,
void *arg);


#define UCT_RC_MLX5_TM_STAT(_iface, _op) \
UCS_STATS_UPDATE_COUNTER((_iface)->tm.stats, UCT_RC_MLX5_STAT_TAG_##_op, 1)

Expand Down Expand Up @@ -615,14 +631,24 @@ void uct_rc_mlx5_iface_common_query(uct_ib_iface_t *ib_iface,
uct_iface_attr_t *iface_attr,
size_t max_inline, size_t max_tag_eager_iov);

void uct_rc_mlx5_common_post_nop(uct_rc_mlx5_iface_common_t *iface,
uct_rc_txqp_t *txqp,
uct_ib_mlx5_txwq_t *txwq);

void uct_rc_mlx5_iface_common_update_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
uct_ib_iface_t *ib_iface);

void uct_rc_mlx5_iface_common_sync_cqs_ci(uct_rc_mlx5_iface_common_t *iface,
uct_ib_iface_t *ib_iface);

int uct_rc_mlx5_iface_commom_clean(uct_ib_mlx5_cq_t *mlx5_cq,
uct_ib_mlx5_srq_t *srq, uint32_t qpn);
void uct_rc_mlx5_iface_commom_cq_clean(uct_rc_mlx5_iface_common_t *iface,
uct_ib_dir_t dir, uint32_t qp_num,
uct_rc_mlx5_cq_clean_callback_t cb,
void *arg);

void uct_rc_mlx5_iface_commom_cq_clean_tx(uct_rc_mlx5_iface_common_t *iface,
uct_rc_txqp_t *txqp,
uct_ib_mlx5_txwq_t *txwq);

static UCS_F_MAYBE_UNUSED void
uct_rc_mlx5_iface_tm_set_cmd_qp_len(uct_rc_mlx5_iface_common_t *iface)
Expand Down Expand Up @@ -735,4 +761,7 @@ uct_rc_mlx5_common_iface_init_rx(uct_rc_mlx5_iface_common_t *iface,

void uct_rc_mlx5_destroy_srq(uct_ib_mlx5_srq_t *srq);

void uct_rc_mlx5_iface_print(uct_rc_mlx5_iface_common_t *mlx5_iface,
const char *title);

#endif
Loading

0 comments on commit f372b50

Please sign in to comment.