From 6f32ff219a00ba4f4369fea98d099301b2bff1ea Mon Sep 17 00:00:00 2001
From: Evgeny Leksikov
Date: Wed, 22 Dec 2021 08:15:24 +0200
Subject: [PATCH] UCT/IB: complete RX CQ cleanup on device failure

---
 src/uct/ib/base/ib_device.c      | 4 ++++
 src/uct/ib/rc/accel/rc_mlx5_ep.c | 7 +++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/uct/ib/base/ib_device.c b/src/uct/ib/base/ib_device.c
index d14e702e15d6..f05ff9192215 100644
--- a/src/uct/ib/base/ib_device.c
+++ b/src/uct/ib/base/ib_device.c
@@ -298,6 +298,10 @@ static void uct_ib_async_event_handler(int fd, void *arg)
         level = UCS_LOG_LEVEL_DEBUG;
         break;
     case IBV_EVENT_DEVICE_FATAL:
+        ucs_assert(event.element.port_num == 0);
+        uct_ib_device_async_event_dispatch(dev, event.event_type,
+                                           event.element.port_num);
+        /* fallthrough */
     case IBV_EVENT_PORT_ERR:
         snprintf(event_info, sizeof(event_info), "%s on port %d",
                  ibv_event_type_str(event.event_type), event.element.port_num);
diff --git a/src/uct/ib/rc/accel/rc_mlx5_ep.c b/src/uct/ib/rc/accel/rc_mlx5_ep.c
index a0cbe3b773de..cace652664b1 100644
--- a/src/uct/ib/rc/accel/rc_mlx5_ep.c
+++ b/src/uct/ib/rc/accel/rc_mlx5_ep.c
@@ -955,10 +955,13 @@ static int uct_rc_mlx5_ep_clean_rx_cq_cb(uct_rc_mlx5_iface_common_t *iface,
     unsigned count;
 
     if (cqe == NULL) {
-        /* Check that last WQE reached event has arrived */
+        /* Check that last WQE reached event has arrived, or device fatal */
         count = uct_ib_device_async_event_get_count(dev,
                     IBV_EVENT_QP_LAST_WQE_REACHED,
-                    qp->qp_num);
+                    qp->qp_num) +
+                uct_ib_device_async_event_get_count(dev,
+                    IBV_EVENT_DEVICE_FATAL,
+                    0 /* port_num */);
         return count > 0;
     }
 