Skip to content

Commit

Permalink
Merge pull request #2428 from yosefe/topic/uct-rc-ml5-cleanup-qp-rese…
Browse files Browse the repository at this point in the history
…t-v1.3.x

UCT/IB/MLX5: Move QP to RESET (instead of error) before cleaning the SRQ - v1.3.x
  • Loading branch information
yosefe authored Mar 20, 2018
2 parents 9b49f00 + d040641 commit e2e8196
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 6 deletions.
3 changes: 3 additions & 0 deletions config/m4/ib.m4
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,9 @@ AS_IF([test "x$with_ib" == xyes],
[], [#include <infiniband/verbs_exp.h>])
])
AC_CHECK_DECLS([ibv_cmd_modify_qp],
[], [], [[#include <infiniband/driver.h>]])
mlnx_valg_libdir=$with_verbs/lib${libsuff}/mlnx_ofed/valgrind
AC_MSG_NOTICE([Checking OFED valgrind libs $mlnx_valg_libdir])
Expand Down
35 changes: 30 additions & 5 deletions src/uct/ib/rc/accel/rc_mlx5_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

#include "rc_mlx5.h"

#if HAVE_DECL_IBV_CMD_MODIFY_QP
#include <infiniband/driver.h>
#endif
#include <uct/ib/mlx5/ib_mlx5_log.h>
#include <ucs/arch/cpu.h>
#include <ucs/sys/compiler.h>
Expand Down Expand Up @@ -567,18 +570,40 @@ UCS_CLASS_INIT_FUNC(uct_rc_mlx5_ep_t, uct_iface_h tl_iface)
return UCS_OK;
}

/*
 * Take the endpoint's QP out of its operational state prior to SRQ cleanup.
 *
 * The intent is to have the HW produce CQEs for every SRQ receive that is
 * still in progress on this QP, so they can all be cleaned before
 * ibv_modify_qp() gets a chance to see them.
 *
 * When ibv_cmd_modify_qp() is declared, the QP is moved to RESET through the
 * verbs command interface, going around the mlx5 driver so that the driver
 * does not clean the CQ. Otherwise the QP is moved to ERR with
 * uct_rc_modify_qp(). A failure is only logged (or ignored in the fallback
 * path); nothing is reported to the caller.
 */
static void uct_rc_mlx5_ep_reset_qp(uct_rc_mlx5_ep_t *ep)
{
#if HAVE_DECL_IBV_CMD_MODIFY_QP
    struct ibv_qp        *qp = ep->super.txqp.qp;
    struct ibv_modify_qp  modify_cmd;
    struct ibv_qp_attr    attr;

    /* Only the state field is meaningful, as selected by IBV_QP_STATE */
    memset(&attr, 0, sizeof(attr));
    attr.qp_state = IBV_QPS_RESET;

    /* Direct command-interface call: skips the mlx5 driver's CQ cleaning */
    if (ibv_cmd_modify_qp(qp, &attr, IBV_QP_STATE, &modify_cmd,
                          sizeof(modify_cmd)) != 0) {
        ucs_warn("modify qp 0x%x to RESET failed: %m", qp->qp_num);
    }
#else
    /* No direct command interface available: fall back to the ERR state */
    (void)uct_rc_modify_qp(&ep->super.txqp, IBV_QPS_ERR);
#endif
}

static UCS_CLASS_CLEANUP_FUNC(uct_rc_mlx5_ep_t)
{
uct_rc_mlx5_iface_t *iface = ucs_derived_of(self->super.super.super.iface,
uct_rc_mlx5_iface_t);

uct_ib_mlx5_txwq_cleanup(&self->tx.wq);

/* Modify QP to error to make HW generate CQEs for all in-progress SRQ
* receives from the QP, so we clean them all before ibv_modify_qp() can
* see them.
*/
(void)uct_rc_modify_qp(&self->super.txqp, IBV_QPS_ERR);
uct_rc_mlx5_ep_reset_qp(self);
uct_rc_mlx5_iface_commom_clean_srq(&iface->mlx5_common, &iface->super,
self->qp_num);

Expand Down
3 changes: 2 additions & 1 deletion src/uct/ib/rc/base/rc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,8 @@ ucs_status_t uct_rc_modify_qp(uct_rc_txqp_t *txqp, enum ibv_qp_state state)
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.qp_state = state;
if (ibv_modify_qp(txqp->qp, &qp_attr, IBV_QP_STATE)) {
ucs_warn("modify qp 0x%x to RESET failed: %m", txqp->qp->qp_num);
/* Arguments follow the format string: QP number (0x%x) first, then the
 * requested target state (%d). */
ucs_warn("modify qp 0x%x to state %d failed: %m", txqp->qp->qp_num,
state);
return UCS_ERR_IO_ERROR;
}

Expand Down

0 comments on commit e2e8196

Please sign in to comment.