Skip to content

Commit

Permalink
UCT/IB: complete RX CQ cleanup on device failure
Browse files (browse the repository at this point in the history)
  • Loading branch information
evgeny-leksikov committed Dec 22, 2021
1 parent e4b9d9e commit 6f32ff2
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
4 changes: 4 additions & 0 deletions src/uct/ib/base/ib_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ static void uct_ib_async_event_handler(int fd, void *arg)
level = UCS_LOG_LEVEL_DEBUG;
break;
case IBV_EVENT_DEVICE_FATAL:
+ ucs_assert(event.element.port_num == 0);
+ uct_ib_device_async_event_dispatch(dev, event.event_type,
+ event.element.port_num);
+ /* fallthrough */
case IBV_EVENT_PORT_ERR:
snprintf(event_info, sizeof(event_info), "%s on port %d",
ibv_event_type_str(event.event_type), event.element.port_num);
Expand Down
7 changes: 5 additions & 2 deletions src/uct/ib/rc/accel/rc_mlx5_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -955,10 +955,13 @@ static int uct_rc_mlx5_ep_clean_rx_cq_cb(uct_rc_mlx5_iface_common_t *iface,
unsigned count;

if (cqe == NULL) {
- /* Check that last WQE reached event has arrived */
+ /* Check that last WQE reached event has arrived, or device fatal */
  count = uct_ib_device_async_event_get_count(dev,
  IBV_EVENT_QP_LAST_WQE_REACHED,
- qp->qp_num);
+ qp->qp_num) +
+ uct_ib_device_async_event_get_count(dev,
+ IBV_EVENT_DEVICE_FATAL,
+ 0 /* port_num */);
return count > 0;
}

Expand Down

0 comments on commit 6f32ff2

Please sign in to comment.