From 28b0f30b5a736af1ec438cae27cd92ee1f5de671 Mon Sep 17 00:00:00 2001
From: dmitrygx
Date: Sun, 5 Jun 2022 19:01:10 +0300
Subject: [PATCH] UCT/IB/DC: Always schedule DCI allocation during FC_HARD_REQ
 progress

---
 src/uct/ib/dc/dc_mlx5.inl  |  7 ++++---
 src/uct/ib/dc/dc_mlx5_ep.c | 14 ++++++++++++--
 src/uct/ib/dc/dc_mlx5_ep.h | 11 ++++++-----
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/uct/ib/dc/dc_mlx5.inl b/src/uct/ib/dc/dc_mlx5.inl
index 26718778394..e88eb637740 100644
--- a/src/uct/ib/dc/dc_mlx5.inl
+++ b/src/uct/ib/dc/dc_mlx5.inl
@@ -41,7 +41,8 @@ uct_dc_mlx5_get_arbiter_params(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep,
 }
 
 static UCS_F_ALWAYS_INLINE void
-uct_dc_mlx5_ep_schedule(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
+uct_dc_mlx5_ep_schedule(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep,
+                        int force)
 {
     if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
         /* no dci:
@@ -49,7 +50,7 @@ uct_dc_mlx5_ep_schedule(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
          * arbiter. This way we can assure fairness between all eps waiting for
          * dci allocation. Relevant for dcs and dcs_quota policies.
          */
-        uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep);
+        uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep, force);
     } else {
         uct_dc_mlx5_iface_dci_sched_tx(iface, ep);
     }
@@ -83,5 +84,5 @@ uct_dc_mlx5_ep_pending_common(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep,
         return;
     }
 
-    uct_dc_mlx5_ep_schedule(iface, ep);
+    uct_dc_mlx5_ep_schedule(iface, ep, 0);
 }
diff --git a/src/uct/ib/dc/dc_mlx5_ep.c b/src/uct/ib/dc/dc_mlx5_ep.c
index 5a044ba993c..c5fc641cee2 100644
--- a/src/uct/ib/dc/dc_mlx5_ep.c
+++ b/src/uct/ib/dc/dc_mlx5_ep.c
@@ -1504,6 +1504,7 @@ static unsigned uct_dc_mlx5_ep_fc_hard_req_progress(void *arg)
     ucs_time_t now = ucs_get_time();
     uint64_t ep_key;
     uct_dc_mlx5_ep_t *ep;
+    ucs_status_t UCS_V_UNUSED status;
 
     if (ucs_likely(now < iface->tx.fc_hard_req_resend_time)) {
         return 0;
@@ -1516,7 +1517,16 @@ static unsigned uct_dc_mlx5_ep_fc_hard_req_progress(void *arg)
      * packet in case of failure on the remote FC endpoint */
     kh_foreach_key(&iface->tx.fc_hash, ep_key, {
         ep = (uct_dc_mlx5_ep_t*)ep_key;
-        uct_dc_mlx5_ep_schedule(iface, ep);
+
+        /* Allocate DCI for the endpoint to schedule the endpoint to DCI wait
+         * queue if there is free DCI */
+        status = uct_dc_mlx5_iface_dci_get(iface, ep);
+        ucs_assertv((status == UCS_OK) || (status == UCS_ERR_NO_RESOURCE),
+                    "%s", ucs_status_string(status));
+
+        /* Force DCI scheduling, since FC resources may never become available
+         * unless we send FC_HARD_REQ packet */
+        uct_dc_mlx5_ep_schedule(iface, ep, 1);
     })
 
     return 1;
@@ -1636,7 +1646,7 @@ void uct_dc_mlx5_ep_handle_failure(uct_dc_mlx5_ep_t *ep, void *arg,
         /* Since DCI isn't assigned for the FC endpoint, schedule DCI
          * allocation for progressing possible FC_PURE_GRANT re-sending
         * operation which are scheduled on the pending queue */
-        uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep);
+        uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep, 0);
     }
 }
 
diff --git a/src/uct/ib/dc/dc_mlx5_ep.h b/src/uct/ib/dc/dc_mlx5_ep.h
index b83d87f8670..5bcc7da4443 100644
--- a/src/uct/ib/dc/dc_mlx5_ep.h
+++ b/src/uct/ib/dc/dc_mlx5_ep.h
@@ -381,13 +381,14 @@ static inline int uct_dc_mlx5_iface_dci_ep_can_send(uct_dc_mlx5_ep_t *ep)
 }
 
 static UCS_F_ALWAYS_INLINE
-void uct_dc_mlx5_iface_schedule_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
+void uct_dc_mlx5_iface_schedule_dci_alloc(uct_dc_mlx5_iface_t *iface,
+                                          uct_dc_mlx5_ep_t *ep, int force)
 {
     ucs_arbiter_t *waitq;
 
-    /* If FC window is empty the group will be scheduled when
-     * grant is received */
-    if (uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
+    /* If FC window is empty and force scheduling wasn't requested, the group
+     * will be scheduled when grant is received */
+    if (force || uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
         waitq = uct_dc_mlx5_iface_dci_waitq(iface, uct_dc_mlx5_ep_pool_index(ep));
         ucs_arbiter_group_schedule(waitq, &ep->arb_group);
     }
@@ -475,7 +476,7 @@ uct_dc_mlx5_iface_dci_put(uct_dc_mlx5_iface_t *iface, uint8_t dci_index)
      * move the group to the 'wait for dci alloc' state */
     ucs_arbiter_group_desched(uct_dc_mlx5_iface_tx_waitq(iface),
                               &ep->arb_group);
-    uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep);
+    uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep, 0);
 }
 
 static inline void uct_dc_mlx5_iface_dci_alloc(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep)
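Editor's note: the sketch below is a minimal standalone illustration (not UCX code) of the decision this patch changes in uct_dc_mlx5_iface_schedule_dci_alloc(). The toy_* type and helpers are assumptions made for the example; only the condition `force || fc_has_resources` mirrors the patch: when the FC window is exhausted, the endpoint is still queued for DCI allocation if the caller forces it, because FC resources cannot recover unless the FC_HARD_REQ packet eventually goes out.

/* Standalone C sketch of the patched scheduling condition.
 * Build: cc -std=c99 -Wall sketch.c && ./a.out */
#include <stdio.h>

typedef struct {
    int fc_wnd; /* remaining flow-control window (illustrative field) */
} toy_ep_t;

/* stands in for uct_rc_fc_has_resources(): non-zero while the FC window is open */
static int toy_fc_has_resources(const toy_ep_t *ep)
{
    return ep->fc_wnd > 0;
}

/* mirrors the patched condition in uct_dc_mlx5_iface_schedule_dci_alloc():
 * schedule DCI allocation when FC resources are available OR when forced */
static int toy_schedule_dci_alloc(const toy_ep_t *ep, int force)
{
    return force || toy_fc_has_resources(ep);
}

int main(void)
{
    toy_ep_t ep = { .fc_wnd = 0 }; /* FC window exhausted */

    /* before the patch (equivalent to force == 0): never scheduled,
     * so the FC_HARD_REQ resend could starve */
    printf("fc_wnd=0, force=0 -> schedule=%d\n", toy_schedule_dci_alloc(&ep, 0));

    /* FC_HARD_REQ resend progress passes force == 1: scheduled anyway */
    printf("fc_wnd=0, force=1 -> schedule=%d\n", toy_schedule_dci_alloc(&ep, 1));

    ep.fc_wnd = 8; /* open FC window: scheduled regardless of force */
    printf("fc_wnd=8, force=0 -> schedule=%d\n", toy_schedule_dci_alloc(&ep, 0));
    return 0;
}

In the patch itself, force=1 is passed only from uct_dc_mlx5_ep_fc_hard_req_progress(); every other call site passes 0, so normal traffic still honors the FC window.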