diff --git a/src/uct/base/uct_iface.c b/src/uct/base/uct_iface.c index 23840016b729..1e767090c13c 100644 --- a/src/uct/base/uct_iface.c +++ b/src/uct/base/uct_iface.c @@ -325,6 +325,9 @@ void uct_set_ep_failed(ucs_class_t *cls, uct_ep_h tl_ep, uct_iface_h tl_iface) if (iface->err_handler) { iface->err_handler(iface->err_handler_arg, tl_ep, UCS_ERR_ENDPOINT_TIMEOUT); + } else { + ucs_warn("Error %s was not handled for ep %p", + ucs_status_string(UCS_ERR_ENDPOINT_TIMEOUT), tl_ep); } } diff --git a/src/uct/ib/ud/base/ud_ep.c b/src/uct/ib/ud/base/ud_ep.c index 21ac47923877..e687b4f5c16a 100644 --- a/src/uct/ib/ud/base/ud_ep.c +++ b/src/uct/ib/ud/base/ud_ep.c @@ -125,8 +125,9 @@ static void uct_ud_ep_slow_timer(ucs_wtimer_t *self) iface->super.ops->handle_failure(&iface->super, ep); return; } else if (diff > 3*uct_ud_slow_tick()) { - ucs_trace("sceduling resend now: %lu send_time: %lu diff: %lu tick: %lu", - now, ep->tx.send_time, now - ep->tx.send_time, uct_ud_slow_tick()); + ucs_trace("scheduling resend now: %lu send_time: %lu diff: %lu tick: %lu", + now, ep->tx.send_time, now - ep->tx.send_time, + ep->tx.slow_tick); uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ); uct_ud_ep_ca_drop(ep); uct_ud_ep_resend_start(iface, ep); @@ -138,8 +139,10 @@ static void uct_ud_ep_slow_timer(ucs_wtimer_t *self) uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_ACK_REQ); } - ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, - uct_ud_slow_tick()); + /* Cool down the timer on rescheduling/resending */ + ep->tx.slow_tick *= 2; + ep->tx.slow_tick = ucs_min(ep->tx.slow_tick, iface->config.peer_timeout/3); + ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, ep->tx.slow_tick); } UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface) @@ -153,6 +156,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface) uct_ud_ep_reset(self); ucs_list_head_init(&self->cep_list); uct_ud_iface_add_ep(iface, self); + self->tx.slow_tick = uct_ud_slow_tick(); ucs_wtimer_init(&self->slow_timer, uct_ud_ep_slow_timer); ucs_arbiter_group_init(&self->tx.pending.group); ucs_arbiter_elem_init(&self->tx.pending.elem); @@ -402,6 +406,7 @@ uct_ud_ep_process_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep, ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group); + ep->tx.slow_tick = uct_ud_slow_tick(); ep->tx.send_time = uct_ud_iface_get_async_time(iface); } diff --git a/src/uct/ib/ud/base/ud_ep.h b/src/uct/ib/ud/base/ud_ep.h index c554c7bd9d41..f2a30f84b94a 100644 --- a/src/uct/ib/ud/base/ud_ep.h +++ b/src/uct/ib/ud/base/ud_ep.h @@ -215,6 +215,7 @@ struct uct_ud_ep { ucs_queue_head_t window; /* send window: [acked_psn+1, psn-1] */ uct_ud_ep_pending_op_t pending; /* pending ops */ ucs_time_t send_time; /* tx time of last packet */ + ucs_time_t slow_tick; /* timeout to trigger slow timer */ UCS_STATS_NODE_DECLARE(stats); UCT_UD_EP_HOOK_DECLARE(tx_hook); } tx; diff --git a/src/uct/ib/ud/base/ud_inl.h b/src/uct/ib/ud/base/ud_inl.h index 1b9941abc38f..70fbd4456d56 100644 --- a/src/uct/ib/ud/base/ud_inl.h +++ b/src/uct/ib/ud/base/ud_inl.h @@ -134,6 +134,7 @@ uct_ud_iface_complete_tx_inl(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_iface_get_async_time(iface) - ucs_twheel_get_time(&iface->async.slow_timer) + uct_ud_slow_tick()); + ep->tx.slow_tick = uct_ud_slow_tick(); ep->tx.send_time = uct_ud_iface_get_async_time(iface); } @@ -148,6 +149,7 @@ uct_ud_iface_complete_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep, uct_ud_iface_get_async_time(iface) - ucs_twheel_get_time(&iface->async.slow_timer) + uct_ud_slow_tick()); + ep->tx.slow_tick = uct_ud_slow_tick(); ep->tx.send_time = uct_ud_iface_get_async_time(iface); }