From ddc025eb92448d39fdb7eb654f8b21d3c6da16fe Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Tue, 15 Jun 2021 23:05:21 +0300 Subject: [PATCH] UCT/DC/MLX5: Fix deadlock between DCI resources and RDMA_READ credits If we check for RDMA_READ credits before DCI credits, we may put the pending request on DCI-wait arbiter (because ep->dci==-1) while it's actually waiting for RDMA_READ resources on the iface. As a result, when RDMA_READ credits do arrive, but for a **different** pool, we will not dispatch this pending operation and it will be stuck forever. The fix is to always check for DCI resources before TX resources (iface or QP) because pending requests always start from DCI-wait arbiter and then move to TX-wait arbiter. TX resources will always arrive because it's driven by HW progress. (Also, we can not add an ep without allocated DCI directly to TX-wait arbiter). --- src/uct/ib/dc/dc_mlx5_ep.h | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/src/uct/ib/dc/dc_mlx5_ep.h b/src/uct/ib/dc/dc_mlx5_ep.h index c251133a17d..3ef779abc16 100644 --- a/src/uct/ib/dc/dc_mlx5_ep.h +++ b/src/uct/ib/dc/dc_mlx5_ep.h @@ -294,6 +294,24 @@ uct_dc_mlx5_iface_dci_can_alloc(uct_dc_mlx5_iface_t *iface, uint8_t pool_index) return iface->tx.dci_pool[pool_index].stack_top < iface->tx.ndci; } +static UCS_F_ALWAYS_INLINE void +uct_dc_mlx5_iface_check_tx(uct_dc_mlx5_iface_t *iface) +{ +#if UCS_ENABLE_ASSERT + uint8_t pool_index; + + for (pool_index = 0; pool_index < iface->tx.num_dci_pools; ++pool_index) { + if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index)) { + ucs_assertv(ucs_arbiter_is_empty( + uct_dc_mlx5_iface_dci_waitq(iface, pool_index)), + "dc_iface %p pool %d: can allocate dci, but pending is " + "not empty", + iface, pool_index); + } + } +#endif +} + static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface, uint8_t pool_index) @@ -322,6 +340,7 @@ uct_dc_mlx5_iface_progress_pending(uct_dc_mlx5_iface_t *iface, } while (ucs_unlikely(!ucs_arbiter_is_empty(dci_waitq) && uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index))); + uct_dc_mlx5_iface_check_tx(iface); } static inline int uct_dc_mlx5_iface_dci_ep_can_send(uct_dc_mlx5_ep_t *ep) @@ -523,10 +542,7 @@ uct_dc_mlx5_iface_dci_get(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep) return UCS_OK; } - /* Do not alloc dci if no TX desc resources, - * otherwise this dci may never be released. */ - if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index) && - uct_dc_mlx5_iface_has_tx_resources(iface)) { + if (uct_dc_mlx5_iface_dci_can_alloc(iface, pool_index)) { uct_dc_mlx5_iface_dci_alloc(iface, ep); return UCS_OK; } @@ -562,15 +578,14 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) #define UCT_DC_CHECK_RES_PTR(_iface, _ep) \ { \ - UCT_RC_CHECK_NUM_RDMA_READ_RET(&(_iface)->super.super, \ - UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) \ { \ - ucs_status_t status = \ - uct_dc_mlx5_iface_dci_get(_iface, _ep); \ + ucs_status_t status = uct_dc_mlx5_iface_dci_get(_iface, _ep); \ if (ucs_unlikely(status != UCS_OK)) { \ return UCS_STATUS_PTR(status); \ } \ } \ + UCT_RC_CHECK_NUM_RDMA_READ_RET(&(_iface)->super.super, \ + UCS_STATUS_PTR(UCS_ERR_NO_RESOURCE)) \ } @@ -582,9 +597,9 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) */ #define UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ { \ + UCT_DC_MLX5_CHECK_DCI_RES(_iface, _ep) \ UCT_RC_CHECK_NUM_RDMA_READ_RET(&(_iface)->super.super, \ UCS_ERR_NO_RESOURCE) \ - UCT_DC_MLX5_CHECK_DCI_RES(_iface, _ep) \ } @@ -594,6 +609,7 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) * available TX resources. */ #define UCT_DC_CHECK_RES_AND_FC(_iface, _ep) \ { \ + UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ if (ucs_unlikely((_ep)->fc.fc_wnd <= \ (_iface)->super.super.config.fc_hard_thresh)) { \ ucs_status_t _status = uct_dc_mlx5_ep_check_fc(_iface, _ep); \ @@ -607,7 +623,6 @@ static inline struct mlx5_grh_av *uct_dc_mlx5_ep_get_grh(uct_dc_mlx5_ep_t *ep) return _status; \ } \ } \ - UCT_DC_MLX5_CHECK_RES(_iface, _ep) \ if (!uct_dc_mlx5_iface_is_dci_rand(_iface)) { \ uct_rc_iface_check_pending(&(_iface)->super.super, \ &(_ep)->arb_group); \