From 31754bfd7fc7e819c63e3f8b6dec18ebfa3a6bdc Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Wed, 10 Jun 2020 15:24:07 +0300 Subject: [PATCH 1/3] UCP/FLUSH: Support set_ep_failed() during flush --- src/ucp/rma/flush.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index fed93c3f984..025805d8863 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -32,8 +32,9 @@ static int ucp_ep_flush_is_completed(ucp_request_t *req) static void ucp_ep_flush_progress(ucp_request_t *req) { - ucp_ep_h ep = req->send.ep; - ucp_lane_map_t all_lanes = UCS_MASK(ucp_ep_num_lanes(ep)); + ucp_ep_h ep = req->send.ep; + unsigned num_lanes = ucp_ep_num_lanes(ep); + ucp_lane_map_t all_lanes = UCS_MASK(num_lanes); ucp_ep_flush_state_t *flush_state; ucp_lane_index_t lane; ucs_status_t status; @@ -42,19 +43,23 @@ static void ucp_ep_flush_progress(ucp_request_t *req) /* If the number of lanes changed since flush operation was submitted, adjust * the number of expected completions */ - if (ucs_unlikely(req->send.flush.num_lanes != ucp_ep_num_lanes(ep))) { - diff = ucp_ep_num_lanes(ep) - - req->send.flush.num_lanes; - if (diff < 0) { - ucs_fatal("ep %p: unsupported endpoint reconfiguration from %d to %d" - " lanes during flush", ep, req->send.flush.num_lanes, - ucp_ep_num_lanes(ep)); + if (ucs_unlikely(req->send.flush.num_lanes != num_lanes)) { + req->send.flush.num_lanes = num_lanes; + diff = num_lanes - req->send.flush.num_lanes; + if (diff >= 0) { + ucp_trace_req(req, + "ep %p: adjusting expected flush completion count by %d", + ep, diff); + req->send.state.uct_comp.count += diff; + } else { + /* If we have less lanes, it means we are in error flow and + * ucp_worker_set_ep_failed() was completed, so we should have + * completed the flush on all lanes. + */ + ucs_assertv(req->send.state.uct_comp.count == 0, + "uct_comp.count=%d num_lanes=%d", + req->send.state.uct_comp.count, num_lanes); } - - req->send.state.uct_comp.count += diff; - req->send.flush.num_lanes = ucp_ep_num_lanes(ep); - ucs_trace_req("flush req %p: adjusting expected completion count by %d", - req, diff); } ucs_trace("ep %p: progress flush req %p, started_lanes 0x%x count %d", ep, From 868b5be1da10c94022625ef9f662ab57d512a960 Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Wed, 10 Jun 2020 21:44:52 +0300 Subject: [PATCH 2/3] UCP/FLUSH: Fix assignment order --- src/ucp/rma/flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index 025805d8863..5757b8d2214 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -44,8 +44,8 @@ static void ucp_ep_flush_progress(ucp_request_t *req) /* If the number of lanes changed since flush operation was submitted, adjust * the number of expected completions */ if (ucs_unlikely(req->send.flush.num_lanes != num_lanes)) { - req->send.flush.num_lanes = num_lanes; diff = num_lanes - req->send.flush.num_lanes; + req->send.flush.num_lanes = num_lanes; if (diff >= 0) { ucp_trace_req(req, "ep %p: adjusting expected flush completion count by %d", From 417839dae431ca96e157ab509f76ac46770db90f Mon Sep 17 00:00:00 2001 From: Yossi Itigin Date: Thu, 11 Jun 2020 11:00:22 +0300 Subject: [PATCH 3/3] UCP/FLUSH: Add log for num_lanes changed during flush --- src/ucp/rma/flush.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ucp/rma/flush.c b/src/ucp/rma/flush.c index 5757b8d2214..8ba086be64f 100644 --- a/src/ucp/rma/flush.c +++ b/src/ucp/rma/flush.c @@ -44,6 +44,8 @@ static void ucp_ep_flush_progress(ucp_request_t *req) /* If the number of lanes changed since flush operation was submitted, adjust * the number of expected completions */ if (ucs_unlikely(req->send.flush.num_lanes != num_lanes)) { + ucp_trace_req(req, "ep %p: number of lanes changed from %d to %d", + ep, req->send.flush.num_lanes, num_lanes); diff = num_lanes - req->send.flush.num_lanes; req->send.flush.num_lanes = num_lanes; if (diff >= 0) {