Skip to content

Commit

Permalink
Browse files (browse the repository at this point in the history)
- Fix hang in MPI_Finalize with UCX_TLS=rc[_x],sm
  • Loading branch information
evgeny-leksikov committed May 21, 2017
1 parent 6f9d5bc commit 7bc4db7
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 4 deletions.
3 changes: 3 additions & 0 deletions src/uct/base/uct_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,9 @@ void uct_set_ep_failed(ucs_class_t *cls, uct_ep_h tl_ep, uct_iface_h tl_iface)
if (iface->err_handler) {
iface->err_handler(iface->err_handler_arg, tl_ep,
UCS_ERR_ENDPOINT_TIMEOUT);
} else {
ucs_warn("Error %s was not handled for ep %p",
ucs_status_string(UCS_ERR_ENDPOINT_TIMEOUT), tl_ep);
}
}

Expand Down
13 changes: 9 additions & 4 deletions src/uct/ib/ud/base/ud_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,9 @@ static void uct_ud_ep_slow_timer(ucs_wtimer_t *self)
iface->super.ops->handle_failure(&iface->super, ep);
return;
} else if (diff > 3*uct_ud_slow_tick()) {
ucs_trace("sceduling resend now: %lu send_time: %lu diff: %lu tick: %lu",
now, ep->tx.send_time, now - ep->tx.send_time, uct_ud_slow_tick());
ucs_trace("scheduling resend now: %lu send_time: %lu diff: %lu tick: %lu",
now, ep->tx.send_time, now - ep->tx.send_time,
ep->tx.slow_tick);
uct_ud_ep_ctl_op_del(ep, UCT_UD_EP_OP_ACK_REQ);
uct_ud_ep_ca_drop(ep);
uct_ud_ep_resend_start(iface, ep);
Expand All @@ -138,8 +139,10 @@ static void uct_ud_ep_slow_timer(ucs_wtimer_t *self)
uct_ud_ep_ctl_op_add(iface, ep, UCT_UD_EP_OP_ACK_REQ);
}

ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer,
uct_ud_slow_tick());
/* Back off the timer on rescheduling/resending: double the interval, capped at peer_timeout/3 */
ep->tx.slow_tick *= 2;
ep->tx.slow_tick = ucs_min(ep->tx.slow_tick, iface->config.peer_timeout/3);
ucs_wtimer_add(&iface->async.slow_timer, &ep->slow_timer, ep->tx.slow_tick);
}

UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface)
Expand All @@ -153,6 +156,7 @@ UCS_CLASS_INIT_FUNC(uct_ud_ep_t, uct_ud_iface_t *iface)
uct_ud_ep_reset(self);
ucs_list_head_init(&self->cep_list);
uct_ud_iface_add_ep(iface, self);
self->tx.slow_tick = uct_ud_slow_tick();
ucs_wtimer_init(&self->slow_timer, uct_ud_ep_slow_timer);
ucs_arbiter_group_init(&self->tx.pending.group);
ucs_arbiter_elem_init(&self->tx.pending.elem);
Expand Down Expand Up @@ -402,6 +406,7 @@ uct_ud_ep_process_ack(uct_ud_iface_t *iface, uct_ud_ep_t *ep,

ucs_arbiter_group_schedule(&iface->tx.pending_q, &ep->tx.pending.group);

ep->tx.slow_tick = uct_ud_slow_tick();
ep->tx.send_time = uct_ud_iface_get_async_time(iface);
}

Expand Down
1 change: 1 addition & 0 deletions src/uct/ib/ud/base/ud_ep.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ struct uct_ud_ep {
ucs_queue_head_t window; /* send window: [acked_psn+1, psn-1] */
uct_ud_ep_pending_op_t pending; /* pending ops */
ucs_time_t send_time; /* tx time of last packet */
ucs_time_t slow_tick; /* timeout to trigger slow timer */
UCS_STATS_NODE_DECLARE(stats);
UCT_UD_EP_HOOK_DECLARE(tx_hook);
} tx;
Expand Down
2 changes: 2 additions & 0 deletions src/uct/ib/ud/base/ud_inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ uct_ud_iface_complete_tx_inl(uct_ud_iface_t *iface, uct_ud_ep_t *ep,
uct_ud_iface_get_async_time(iface) -
ucs_twheel_get_time(&iface->async.slow_timer) +
uct_ud_slow_tick());
ep->tx.slow_tick = uct_ud_slow_tick();
ep->tx.send_time = uct_ud_iface_get_async_time(iface);
}

Expand All @@ -148,6 +149,7 @@ uct_ud_iface_complete_tx_skb(uct_ud_iface_t *iface, uct_ud_ep_t *ep,
uct_ud_iface_get_async_time(iface) -
ucs_twheel_get_time(&iface->async.slow_timer) +
uct_ud_slow_tick());
ep->tx.slow_tick = uct_ud_slow_tick();
ep->tx.send_time = uct_ud_iface_get_async_time(iface);
}

Expand Down

0 comments on commit 7bc4db7

Please sign in to comment.