Skip to content

Commit

Permalink
Merge pull request #231 from evgeny-leksikov/integration3
Browse files (browse the repository at this point in the history)
UCT/IB: complete RX CQ cleanup on device failure
  • Loading branch information
yosefe authored Jan 10, 2022
2 parents 12ab072 + 984f799 commit 0610d17
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/uct/ib/base/ib_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ static void uct_ib_async_event_handler(int fd, void *arg)
  level = UCS_LOG_LEVEL_DEBUG;
  break;
  case IBV_EVENT_DEVICE_FATAL:
+ uct_ib_device_async_event_dispatch(dev, event.event_type, 0);
+ /* fallthrough */
  case IBV_EVENT_PORT_ERR:
  snprintf(event_info, sizeof(event_info), "%s on port %d",
  ibv_event_type_str(event.event_type), event.element.port_num);
Expand Down
7 changes: 5 additions & 2 deletions src/uct/ib/rc/accel/rc_mlx5_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -955,10 +955,13 @@ static int uct_rc_mlx5_ep_clean_rx_cq_cb(uct_rc_mlx5_iface_common_t *iface,
  unsigned count;

  if (cqe == NULL) {
- /* Check that last WQE reached event has arrived */
+ /* Check that last WQE reached event has arrived, or device fatal */
  count = uct_ib_device_async_event_get_count(dev,
  IBV_EVENT_QP_LAST_WQE_REACHED,
- qp->qp_num);
+ qp->qp_num) +
+ uct_ib_device_async_event_get_count(dev,
+ IBV_EVENT_DEVICE_FATAL,
+ 0 /* port_num */);
  return count > 0;
  }

Expand Down

0 comments on commit 0610d17

Please sign in to comment.