From 37123836bd7b077c3d458493693168094eb4f293 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Sun, 12 Sep 2021 11:48:38 +0300 Subject: [PATCH] UCT/RC/MLX5: Fix keepalive send condition with CQ moderation - Send FC grant message (which is also used for keepalive) with CQ signaling enabled. - Skip sending a keepalive message only if there are no unsignaled sends: if there are unsignaled sends, they could be completed already, so skipping a keepalive message could fail to detect a dead connection. --- src/uct/ib/rc/accel/rc_mlx5_ep.c | 2 +- src/uct/ib/rc/accel/rc_mlx5_iface.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/uct/ib/rc/accel/rc_mlx5_ep.c b/src/uct/ib/rc/accel/rc_mlx5_ep.c index db6d6b9cd9e..a0cbe3b773d 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_ep.c +++ b/src/uct/ib/rc/accel/rc_mlx5_ep.c @@ -589,7 +589,7 @@ ucs_status_t uct_rc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op, NULL, 0, UCT_RC_EP_FC_PURE_GRANT, 0, 0, 0, 0, - NULL, NULL, 0, 0, + NULL, NULL, 0, MLX5_WQE_CTRL_CQ_UPDATE, INT_MAX); return UCS_OK; } diff --git a/src/uct/ib/rc/accel/rc_mlx5_iface.c b/src/uct/ib/rc/accel/rc_mlx5_iface.c index ce8082e2a80..ddb8a6685a8 100644 --- a/src/uct/ib/rc/accel/rc_mlx5_iface.c +++ b/src/uct/ib/rc/accel/rc_mlx5_iface.c @@ -173,10 +173,13 @@ uct_rc_mlx5_common_ka_progress(uct_rc_mlx5_iface_common_t *iface) ucs_spin_lock(&iface->super.ep_list_lock); ucs_list_for_each(ep, &iface->super.ep_list, super.list) { - if (ep->super.txqp.available < ep->tx.wq.bb_max) { - /* have outstanding operations */ + if ((ep->super.txqp.available < ep->tx.wq.bb_max) && + (ep->super.txqp.unsignaled == 0)) { + /* Have outstanding uncompleted operations - no need to send + keepalive message */ continue; } + ucs_trace("send keepalive grant on ep %p", ep); uct_rc_ep_fc_send_grant(&ep->super); }