From 6f5e377fe0e4eb467c80d6dda60bf29eb86efaa9 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Fri, 19 Aug 2016 19:07:14 -0700 Subject: [PATCH] btl/usnic: update for libfabric v1.4 With libfabric v1.4, the usnic provider changed the values of its fabric and domain name strings (compared to libfabric <= v1.3). --- opal/mca/btl/usnic/btl_usnic_cclient.c | 2 +- opal/mca/btl/usnic/btl_usnic_compat.c | 6 +- opal/mca/btl/usnic/btl_usnic_component.c | 113 +++++++++++++++-------- opal/mca/btl/usnic/btl_usnic_hwloc.c | 4 +- opal/mca/btl/usnic/btl_usnic_map.c | 14 +-- opal/mca/btl/usnic/btl_usnic_module.c | 62 +++++++------ opal/mca/btl/usnic/btl_usnic_module.h | 2 + opal/mca/btl/usnic/btl_usnic_proc.c | 6 +- opal/mca/btl/usnic/btl_usnic_send.h | 2 +- opal/mca/btl/usnic/btl_usnic_stats.c | 4 +- 10 files changed, 130 insertions(+), 85 deletions(-) diff --git a/opal/mca/btl/usnic/btl_usnic_cclient.c b/opal/mca/btl/usnic/btl_usnic_cclient.c index 3f0279d463..77615937e4 100644 --- a/opal/mca/btl/usnic/btl_usnic_cclient.c +++ b/opal/mca/btl/usnic/btl_usnic_cclient.c @@ -197,7 +197,7 @@ int opal_btl_usnic_connectivity_listen(opal_btl_usnic_module_t *module) /* Ensure to NULL-terminate the passed strings */ strncpy(cmd.nodename, opal_process_info.nodename, CONNECTIVITY_NODENAME_LEN - 1); - strncpy(cmd.usnic_name, module->fabric_info->fabric_attr->name, + strncpy(cmd.usnic_name, module->linux_device_name, CONNECTIVITY_IFNAME_LEN - 1); if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) { diff --git a/opal/mca/btl/usnic/btl_usnic_compat.c b/opal/mca/btl/usnic/btl_usnic_compat.c index 2346dc56c5..de649cb514 100644 --- a/opal/mca/btl/usnic/btl_usnic_compat.c +++ b/opal/mca/btl/usnic/btl_usnic_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2014-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. 
* $COPYRIGHT$ * @@ -536,7 +536,7 @@ opal_btl_usnic_prepare_src( #if MSGDEBUG2 opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", - module->fabric_info->fabric_attr->name, + module->linux_device_name, (reserve + *size) <= module->max_frag_payload?"small":"large", (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, (void *)convertor); @@ -723,7 +723,7 @@ opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module, #if MSGDEBUG2 opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", - module->fabric_info->fabric_attr->name, + module->linux_device_name, (reserve + *size) <= module->max_frag_payload?"small":"large", (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, (void *)convertor); diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index a063b72da6..a0523691e8 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -378,7 +378,7 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, "not enough usnic resources", true, opal_process_info.nodename, - info->fabric_attr->name, + module->linux_device_name, str); return OPAL_ERROR; } @@ -543,10 +543,12 @@ static bool filter_module(opal_btl_usnic_module_t *module, struct fi_usnic_info *uip; struct fi_info *info; bool match; + const char *linux_device_name; info = module->fabric_info; uip = &module->usnic_info; src = info->src_addr; + linux_device_name = module->linux_device_name; module_mask = src->sin_addr.s_addr & uip->ui.v1.ui_netmask_be; match = false; for (i = 0; i < filter->n_elt; ++i) { @@ -559,7 +561,7 @@ static bool filter_module(opal_btl_usnic_module_t *module, } } else { - if (strcmp(filter->elts[i].if_name, info->fabric_attr->name) == 0) { + if (strcmp(filter->elts[i].if_name, linux_device_name) == 0) { match = true; break; } @@ -590,6 +592,25 @@ static void free_filter(usnic_if_filter_t *filter) free(filter); } +static int 
do_fi_getinfo(uint32_t version, struct fi_info **info_list) +{ + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + + /* We only want providers named "usnic" that are of type EP_DGRAM */ + fabric_attr.prov_name = "usnic"; + ep_attr.type = FI_EP_DGRAM; + + hints.caps = FI_MSG; + hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; + hints.addr_format = FI_SOCKADDR; + hints.ep_attr = &ep_attr; + hints.fabric_attr = &fabric_attr; + + return fi_getinfo(version, NULL, 0, 0, &hints, info_list); +} + /* * UD component initialization: * (1) read interface list from kernel and compare against component @@ -611,9 +632,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, int min_distance, num_local_procs; struct fi_info *info_list; struct fi_info *info; - struct fi_info hints = {0}; - struct fi_ep_attr ep_attr = {0}; - struct fi_fabric_attr fabric_attr = {0}; struct fid_fabric *fabric; struct fid_domain *domain; int ret; @@ -636,19 +654,9 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t); - /* We only want providers named "usnic that are of type EP_DGRAM */ - fabric_attr.prov_name = "usnic"; - ep_attr.type = FI_EP_DGRAM; - - hints.caps = FI_MSG; - hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; - hints.addr_format = FI_SOCKADDR; - hints.ep_attr = &ep_attr; - hints.fabric_attr = &fabric_attr; - - /* This code understands libfabric API v1.0 and v1.1. Even if we - were compiled with libfabric API v1.0, we still want to request - v1.1 -- here's why: + /* This code understands libfabric API versions v1.0, v1.1, and + v1.4. 
Even if we were compiled with libfabric API v1.0, we + still want to request v1.1 -- here's why: - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did not check the value of the "version" parameter passed into @@ -664,6 +672,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, So never request API v1.0 -- always request a minimum of v1.1. + The usnic provider changed the strings in the fabric and domain + names in API v1.4. With API <= v1.3: + + - fabric name is "usnic_X" (device name) + - domain name is NULL + + With libfabric API >= v1.4: + + - fabric name is "a.b.c.d/e" (CIDR notation of network) + - domain name is "usnic_X" (device name) + NOTE: The configure.m4 in this component will require libfabric >= v1.1.0 (i.e., it won't accept v1.0.0) because of a critical bug in the usnic provider in libfabric v1.0.0. However, the @@ -677,9 +696,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, Someday, #2 may no longer be true, and we may therefore rip out the libfabric v1.0.0 compatibility code. */ + + /* First try API version 1.4. If that doesn't work, try API + version 1.1. */ uint32_t libfabric_api; - libfabric_api = FI_VERSION(1, 1); - ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list); + libfabric_api = FI_VERSION(1, 4); + ret = do_fi_getinfo(libfabric_api, &info_list); + // Libfabric core will return -FI_ENOSYS if it is too old + if (-FI_ENOSYS == ret) { + libfabric_api = FI_VERSION(1, 1); + ret = do_fi_getinfo(libfabric_api, &info_list); + } if (0 != ret) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret); @@ -800,13 +827,21 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, i < mca_btl_usnic_component.max_modules); ++i, info = info->next) { + // The fabric/domain names changed at libfabric API v1.4 (see above). 
+ char *linux_device_name; + if (libfabric_api <= FI_VERSION(1, 3)) { + linux_device_name = info->fabric_attr->name; + } else { + linux_device_name = info->domain_attr->name; + } + ret = fi_fabric(info->fabric_attr, &fabric, NULL); if (0 != ret) { opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - info->fabric_attr->name, + linux_device_name, "fi_fabric()", __FILE__, __LINE__, ret, strerror(-ret)); @@ -820,7 +855,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, "libfabric API failed", true, opal_process_info.nodename, - info->fabric_attr->name, + linux_device_name, "fi_domain()", __FILE__, __LINE__, ret, strerror(-ret)); @@ -829,8 +864,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_memchecker_base_mem_defined(&domain, sizeof(domain)); opal_output_verbose(5, USNIC_OUT, - "btl:usnic: found: usNIC direct device %s", - info->fabric_attr->name); + "btl:usnic: found: usNIC device %s", + linux_device_name); /* Save a little info on the module that we have already gathered. The rest of the module will be filled in @@ -841,6 +876,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, module->fabric = fabric; module->domain = domain; module->fabric_info = info; + module->libfabric_api = libfabric_api; + module->linux_device_name = strdup(linux_device_name); + if (NULL == module->linux_device_name) { + OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); + goto error; + } /* Obtain usnic-specific device info (e.g., netmask) that doesn't come in the normal fi_getinfo(). 
This allows us to @@ -850,7 +891,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, if (ret != 0) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s fabric_open_ops failed %d (%s)", - info->fabric_attr->name, ret, fi_strerror(-ret)); + module->linux_device_name, ret, fi_strerror(-ret)); fi_close(&domain->fid); fi_close(&fabric->fid); continue; @@ -863,14 +904,14 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, if (ret != 0) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s usnic_getinfo failed %d (%s)", - info->fabric_attr->name, ret, fi_strerror(-ret)); + module->linux_device_name, ret, fi_strerror(-ret)); fi_close(&domain->fid); fi_close(&fabric->fid); continue; } opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s usnic_info: link speed=%d, netmask=0x%x, ifname=%s, num_vf=%d, qp/vf=%d, cq/vf=%d", - info->fabric_attr->name, + module->linux_device_name, (unsigned int) module->usnic_info.ui.v1.ui_link_speed, (unsigned int) module->usnic_info.ui.v1.ui_netmask_be, module->usnic_info.ui.v1.ui_ifname, @@ -884,7 +925,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s %s due to %s", (keep_module ? "keeping" : "skipping"), - info->fabric_attr->name, + module->linux_device_name, (filter_incl ? 
"if_include" : "if_exclude")); if (!keep_module) { fi_close(&domain->fid); @@ -902,7 +943,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s is not provisioned with enough resources -- skipping", - info->fabric_attr->name); + module->linux_device_name); fi_close(&domain->fid); fi_close(&fabric->fid); @@ -916,7 +957,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s looks good!", - info->fabric_attr->name); + module->linux_device_name); /* Let this module advance to the next round! */ btls[j++] = &(module->super); @@ -966,7 +1007,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, btls[num_final_modules++] = &(module->super); /* Output all of this module's values. */ - const char *devname = module->fabric_info->fabric_attr->name; + const char *devname = module->linux_device_name; opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d", devname, @@ -1212,7 +1253,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, if (cq_ret != -FI_EAVAIL) { BTL_ERROR(("%s: cq_read ret = %d (%s)", - module->fabric_info->fabric_attr->name, cq_ret, + module->linux_device_name, cq_ret, fi_strerror(-cq_ret))); channel->chan_error = true; } @@ -1222,7 +1263,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, return; } else if (rc != mca_btl_usnic_component.cq_readerr_success_value) { BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)", - module->fabric_info->fabric_attr->name, rc, + module->linux_device_name, rc, (int) mca_btl_usnic_component.cq_readerr_success_value)); channel->chan_error = true; } @@ -1235,7 +1276,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, static int once = 0; if (once++ == 0) { BTL_ERROR(("%s: Channel %d, %s", - 
module->fabric_info->fabric_attr->name, + module->linux_device_name, channel->chan_index, FI_ECRC == err_entry.prov_errno ? "CRC error" : "message truncation")); @@ -1256,7 +1297,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, } } else { BTL_ERROR(("%s: CQ[%d] prov_err = %d", - module->fabric_info->fabric_attr->name, channel->chan_index, + module->linux_device_name, channel->chan_index, err_entry.prov_errno)); channel->chan_error = true; } @@ -1469,7 +1510,7 @@ void opal_btl_usnic_component_debug(void) module = mca_btl_usnic_component.usnic_active_modules[i]; opal_output(0, "active_modules[%d]=%p %s max{frag,chunk,tiny}=%llu,%llu,%llu\n", - i, (void *)module, module->fabric_info->fabric_attr->name, + i, (void *)module, module->linux_device_name, (unsigned long long)module->max_frag_payload, (unsigned long long)module->max_chunk_payload, (unsigned long long)module->max_tiny_payload); diff --git a/opal/mca/btl/usnic/btl_usnic_hwloc.c b/opal/mca/btl/usnic/btl_usnic_hwloc.c index ff9442eef3..78ef4c3abc 100644 --- a/opal/mca/btl/usnic/btl_usnic_hwloc.c +++ b/opal/mca/btl/usnic/btl_usnic_hwloc.c @@ -162,7 +162,7 @@ static hwloc_obj_t find_device_numa(opal_btl_usnic_module_t *module) if (obj->type != HWLOC_OBJ_NODE) { opal_output_verbose(5, USNIC_OUT, "btl:usnic:filter_numa: could not find NUMA node for %s; filtering by NUMA distance not possible", - module->fabric_info->fabric_attr->name); + module->linux_device_name); return NULL; } @@ -218,7 +218,7 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module) opal_output_verbose(5, USNIC_OUT, "btl:usnic:filter_numa: %s is distance %d from me", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->numa_distance); } diff --git a/opal/mca/btl/usnic/btl_usnic_map.c b/opal/mca/btl/usnic/btl_usnic_map.c index ce2aca6abe..c9cbd8a83c 100644 --- a/opal/mca/btl/usnic/btl_usnic_map.c +++ b/opal/mca/btl/usnic/btl_usnic_map.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, 
Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved * $COPYRIGHT$ * @@ -30,8 +30,8 @@ static int map_compare_modules(const void *aa, const void *bb) opal_btl_usnic_module_t *a = *((opal_btl_usnic_module_t**) aa); opal_btl_usnic_module_t *b = *((opal_btl_usnic_module_t**) bb); - return strcmp(a->fabric_info->fabric_attr->name, - b->fabric_info->fabric_attr->name); + return strcmp(a->linux_device_name, + b->linux_device_name); } /* @@ -74,7 +74,7 @@ static int map_output_modules(FILE *fp) prefix_len); fprintf(fp, "device=%s,ip=%s,mss=%" PRIsize_t "\n", - modules[i]->fabric_info->fabric_attr->name, + modules[i]->linux_device_name, ipv4, modules[i]->fabric_info->ep_attr->max_msg_size); } @@ -102,8 +102,8 @@ static int map_compare_endpoints(const void *aa, const void *bb) return -1; } - return strcmp(a->endpoint_module->fabric_info->fabric_attr->name, - b->endpoint_module->fabric_info->fabric_attr->name); + return strcmp(a->endpoint_module->linux_device_name, + b->endpoint_module->linux_device_name); } /* @@ -148,7 +148,7 @@ static int map_output_endpoints(FILE *fp, opal_btl_usnic_proc_t *proc) eps[i]->endpoint_remote_modex.netmask); fprintf(fp, "device=%s@peer_ip=%s", - eps[i]->endpoint_module->fabric_info->fabric_attr->name, + eps[i]->endpoint_module->linux_device_name, ipv4); ++num_output; } diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 7cca5354ab..b92bcb8196 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -102,7 +102,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (opal_proc == my_proc) { opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: not connecting to self", - module->fabric_info->fabric_attr->name); + module->linux_device_name); continue; } @@ -110,7 +110,7 @@ static int 
add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: not connecting to %s on same server", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name)); continue; } @@ -126,7 +126,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, skip it */ opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name), opal_get_proc_hostname(opal_proc)); continue; @@ -142,7 +142,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (OPAL_SUCCESS != rc) { opal_output_verbose(5, USNIC_OUT, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name), opal_get_proc_hostname(opal_proc)); OBJ_RELEASE(usnic_proc); @@ -161,7 +161,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, opal_output_verbose(5, USNIC_OUT, "btl:usnic:add_procs:%s: new usnic peer endpoint: %s, proirity port %d, data port %d", - module->fabric_info->fabric_attr->name, + module->linux_device_name, str, modex->ports[USNIC_PRIORITY_CHANNEL], modex->ports[USNIC_DATA_CHANNEL]); @@ -197,14 +197,14 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module, opal_output_verbose(15, USNIC_OUT, "btl:usnic: %s (which is %s) couldn't reach peer %s", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->if_ipv4_addr_str, remote); opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP", true, opal_process_info.nodename, module->if_ipv4_addr_str, - module->fabric_info->fabric_attr->name, + 
module->linux_device_name, opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), remote); } @@ -303,7 +303,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "async insertion result", __FILE__, __LINE__, err_entry.err, "Failed to insert address to AV"); @@ -327,7 +327,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_readerr()", __FILE__, __LINE__, ret, "Returned != sizeof(err_entry)"); @@ -348,7 +348,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_sread()", __FILE__, __LINE__, ret, "Returned != (sizeof(entry) or -FI_EAVAIL)"); @@ -904,6 +904,8 @@ static int usnic_finalize(struct mca_btl_base_module_t* btl) fi_close(&module->domain->fid); fi_close(&module->fabric->fid); + free(module->linux_device_name); + return OPAL_SUCCESS; } @@ -1423,7 +1425,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_read()", __FILE__, __LINE__, ret, "Failed to get domain event"); @@ -1442,7 +1444,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) ignore it. 
*/ opal_output_verbose(10, USNIC_OUT, "btl:usnic: got LINK_UP on %s", - module->fabric_info->fabric_attr->name); + module->linux_device_name); break; case 1: // USD_EVENT_LINK_DOWN: @@ -1461,7 +1463,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) opal_show_help("help-mpi-btl-usnic.txt", "async event", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, str, entry.data); fatal = true; } @@ -1492,7 +1494,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_dupinfo() failed", __FILE__, __LINE__, -1, "Unknown"); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1510,14 +1512,14 @@ static int create_ep(opal_btl_usnic_module_t* module, opal_process_info.my_local_rank); } - rc = fi_getinfo(FI_VERSION(1, 1), NULL, 0, 0, hint, &channel->info); + rc = fi_getinfo(module->libfabric_api, NULL, 0, 0, hint, &channel->info); fi_freeinfo(hint); if (0 != rc) { opal_show_help("help-mpi-btl-usnic.txt", "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_getinfo() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1553,7 +1555,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_endpoint() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1566,7 +1568,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() SCQ to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1577,7 +1579,7 @@ 
static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() RCQ to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1588,7 +1590,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() AV to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1601,7 +1603,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_enable() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1623,7 +1625,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_getname() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1714,7 +1716,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "failed to create CQ", __FILE__, __LINE__); goto error; } @@ -1770,7 +1772,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "Failed to get receive buffer from freelist", __FILE__, __LINE__); goto error; @@ -1786,7 +1788,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + 
module->linux_device_name, "Failed to post receive buffer", __FILE__, __LINE__); goto error; @@ -1853,7 +1855,7 @@ static void init_local_modex_part1(opal_btl_usnic_module_t *module) opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s IP charactertics: %s, %u Mbps", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->if_ipv4_addr_str, modex->link_speed_mbps); } @@ -2074,7 +2076,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "create rcache", __FILE__, __LINE__); return OPAL_ERROR; } @@ -2082,7 +2084,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) mca_mpool_base_module_lookup (mca_btl_usnic_component.usnic_mpool_hints); #else asprintf(&mpool_resources.pool_name, "%s", - module->fabric_info->fabric_attr->name); + module->linux_device_name); module->super.btl_mpool = mca_mpool_base_module_create(mca_btl_usnic_component.usnic_mpool_name, &module->super, &mpool_resources); @@ -2092,7 +2094,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "create mpool", __FILE__, __LINE__); return OPAL_ERROR; } @@ -2205,7 +2207,7 @@ static void init_async_event(opal_btl_usnic_module_t *module) "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_control(eq, FI_GETWAIT)", __FILE__, __LINE__, ret, fi_strerror(-ret)); diff --git a/opal/mca/btl/usnic/btl_usnic_module.h b/opal/mca/btl/usnic/btl_usnic_module.h index b7f49c596b..aa0e9c3ee3 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.h +++ b/opal/mca/btl/usnic/btl_usnic_module.h @@ -103,8 +103,10 @@ typedef struct opal_btl_usnic_module_t { /* Cache for use during component_init to associate a module with the libfabric device that it came 
from. */ + uint32_t libfabric_api; struct fid_fabric *fabric; struct fid_domain *domain; + char *linux_device_name; struct fi_info *fabric_info; struct fi_usnic_ops_fabric *usnic_fabric_ops; struct fi_usnic_ops_av *usnic_av_ops; diff --git a/opal/mca/btl/usnic/btl_usnic_proc.c b/opal/mca/btl/usnic/btl_usnic_proc.c index 9d71a6ed9d..f0fefbff96 100644 --- a/opal/mca/btl/usnic/btl_usnic_proc.c +++ b/opal/mca/btl/usnic/btl_usnic_proc.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved * $COPYRIGHT$ * @@ -643,7 +643,7 @@ static int match_modex(opal_btl_usnic_module_t *module, opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->fabric_info->ep_attr->max_msg_size, (NULL == proc->proc_opal->proc_hostname) ? 
"unknown" : proc->proc_opal->proc_hostname, @@ -700,7 +700,7 @@ static int start_av_insert(opal_btl_usnic_module_t *module, opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_av_insert()", __FILE__, __LINE__, ret, "Failed to initiate AV insert"); diff --git a/opal/mca/btl/usnic/btl_usnic_send.h b/opal/mca/btl/usnic/btl_usnic_send.h index 2020544f20..86676a35b9 100644 --- a/opal/mca/btl/usnic/btl_usnic_send.h +++ b/opal/mca/btl/usnic/btl_usnic_send.h @@ -216,7 +216,7 @@ opal_btl_usnic_endpoint_send_segment( "CHUNK" : "FRAG", sseg->ss_base.us_btl_header->pkt_seq, sseg->ss_base.us_btl_header->sender, - endpoint->endpoint_module->fabric_info->fabric_attr->name, + endpoint->endpoint_module->linux_device_name, local_ip, module->local_modex.ports[sseg->ss_channel], (void*)sseg, diff --git a/opal/mca/btl/usnic/btl_usnic_stats.c b/opal/mca/btl/usnic/btl_usnic_stats.c index 9c3acac868..a0c3393cc7 100644 --- a/opal/mca/btl/usnic/btl_usnic_stats.c +++ b/opal/mca/btl/usnic/btl_usnic_stats.c @@ -86,7 +86,7 @@ void opal_btl_usnic_print_stats( prefix, opal_proc_local_get()->proc_name.vpid, - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->stats.num_total_sends, module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends, @@ -394,7 +394,7 @@ static void setup_mpit_pvars_enum(void) devices[i].value = i; rc = asprintf(&str, "%s,%hhu.%hhu.%hhu.%hhu/%" PRIu32, - m->fabric_info->fabric_attr->name, + m->linux_device_name, c[0], c[1], c[2], c[3], usnic_netmask_to_cidrlen(sin->sin_addr.s_addr)); assert(rc > 0);