From 3d25dd1fae9b49d9fa0aca96f17aa146524d3734 Mon Sep 17 00:00:00 2001 From: Evgeny Leksikov Date: Wed, 27 Jan 2021 09:18:12 +0200 Subject: [PATCH] UCP/UCT/TOOLS: convert err to diag for uct_cm_open - UCT/RDMACM: decrease log level from error to diag if IB device is not present - UCP/WORKER: do not fail ucp_worker_create by uct_cm_open - TOOLS/UCX_INFO: return status from print_ucp_info --- src/tools/info/proto_info.c | 16 +++++++++++----- src/tools/info/ucx_info.c | 6 ++++-- src/tools/info/ucx_info.h | 10 ++++++---- src/ucp/core/ucp_worker.c | 10 +++++----- src/uct/ib/rdmacm/rdmacm_cm.c | 14 +++++++++++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/tools/info/proto_info.c b/src/tools/info/proto_info.c index 4c74cd0dc8c..506db8d065f 100644 --- a/src/tools/info/proto_info.c +++ b/src/tools/info/proto_info.c @@ -93,10 +93,12 @@ static void print_resource_usage(const resource_usage_t *usage_before, printf("#\n"); } -void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, - uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, size_t estimated_num_ppn, - unsigned dev_type_bitmap, const char *mem_size) +ucs_status_t print_ucp_info(int print_opts, + ucs_config_print_flags_t print_flags, + uint64_t ctx_features, + const ucp_ep_params_t *base_ep_params, + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size) { ucp_config_t *config; ucs_status_t status; @@ -113,7 +115,7 @@ void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, status = ucp_config_read(NULL, NULL, &config); if (status != UCS_OK) { - return; + goto out; } memset(&params, 0, sizeof(params)); @@ -200,6 +202,8 @@ void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, } while (status == UCS_INPROGRESS); ucp_request_release(status_ptr); } + + status = UCS_OK; } out_destroy_worker: @@ -208,4 +212,6 @@ void print_ucp_info(int print_opts, 
ucs_config_print_flags_t print_flags, ucp_cleanup(context); out_release_config: ucp_config_release(config); +out: + return status; } diff --git a/src/tools/info/ucx_info.c b/src/tools/info/ucx_info.c index 27be23d238d..dd2d7917af9 100644 --- a/src/tools/info/ucx_info.c +++ b/src/tools/info/ucx_info.c @@ -212,8 +212,10 @@ int main(int argc, char **argv) usage(); return -1; } - print_ucp_info(print_opts, print_flags, ucp_features, &ucp_ep_params, - ucp_num_eps, ucp_num_ppn, dev_type_bitmap, mem_size); + + return print_ucp_info(print_opts, print_flags, ucp_features, + &ucp_ep_params, ucp_num_eps, ucp_num_ppn, + dev_type_bitmap, mem_size); } return 0; diff --git a/src/tools/info/ucx_info.h b/src/tools/info/ucx_info.h index 037de535c6f..7596bcd7311 100644 --- a/src/tools/info/ucx_info.h +++ b/src/tools/info/ucx_info.h @@ -35,9 +35,11 @@ void print_uct_info(int print_opts, ucs_config_print_flags_t print_flags, void print_type_info(const char * tl_name); -void print_ucp_info(int print_opts, ucs_config_print_flags_t print_flags, - uint64_t ctx_features, const ucp_ep_params_t *base_ep_params, - size_t estimated_num_eps, size_t estimated_num_ppn, - unsigned dev_type_bitmap, const char *mem_size); +ucs_status_t print_ucp_info(int print_opts, + ucs_config_print_flags_t print_flags, + uint64_t ctx_features, + const ucp_ep_params_t *base_ep_params, + size_t estimated_num_eps, size_t estimated_num_ppn, + unsigned dev_type_bitmap, const char *mem_size); #endif diff --git a/src/ucp/core/ucp_worker.c b/src/ucp/core/ucp_worker.c index dcea5adda36..5c1adf0d5c3 100644 --- a/src/ucp/core/ucp_worker.c +++ b/src/ucp/core/ucp_worker.c @@ -1428,11 +1428,12 @@ static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) } status = uct_cm_open(cmpt, worker->uct, cm_config, &worker->cms[i].cm); + uct_config_release(cm_config); if (status != UCS_OK) { - ucs_error("failed to open CM on component %s with status %s", - context->tl_cmpts[cmpt_index].attr.name, - ucs_status_string(status)); 
- goto err_free_cms; + ucs_diag("failed to open CM on component %s with status %s", + context->tl_cmpts[cmpt_index].attr.name, + ucs_status_string(status)); + continue; } worker->cms[i].attr.field_mask = UCT_CM_ATTR_FIELD_MAX_CONN_PRIV; @@ -1445,7 +1446,6 @@ static ucs_status_t ucp_worker_add_resource_cms(ucp_worker_h worker) goto err_free_cms; } - uct_config_release(cm_config); worker->cms[i++].cmpt_idx = cmpt_index; } diff --git a/src/uct/ib/rdmacm/rdmacm_cm.c b/src/uct/ib/rdmacm/rdmacm_cm.c index 558208f9772..5fdc8a9dd28 100644 --- a/src/uct/ib/rdmacm/rdmacm_cm.c +++ b/src/uct/ib/rdmacm/rdmacm_cm.c @@ -626,6 +626,7 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_t, uct_component_h component, { uct_priv_worker_t *worker_priv; ucs_status_t status; + ucs_log_level_t log_lvl; UCS_CLASS_CALL_SUPER_INIT(uct_cm_t, &uct_rdmacm_cm_ops, &uct_rdmacm_cm_iface_ops, worker, component, @@ -633,10 +634,17 @@ UCS_CLASS_INIT_FUNC(uct_rdmacm_cm_t, uct_component_h component, kh_init_inplace(uct_rdmacm_cm_cqs, &self->cqs); - self->ev_ch = rdma_create_event_channel(); + self->ev_ch = rdma_create_event_channel(); if (self->ev_ch == NULL) { - ucs_error("rdma_create_event_channel failed: %m"); - status = UCS_ERR_IO_ERROR; + if (errno == ENODEV) { + status = UCS_ERR_NO_DEVICE; + log_lvl = UCS_LOG_LEVEL_DIAG; + } else { + status = UCS_ERR_IO_ERROR; + log_lvl = UCS_LOG_LEVEL_ERROR; + } + + ucs_log(log_lvl, "rdma_create_event_channel failed: %m"); goto err; }