From e88b6c6737ab8c6ad8806e1c0ce3f8b1ee2f294a Mon Sep 17 00:00:00 2001 From: Matt Baker Date: Fri, 5 May 2017 11:10:02 -0400 Subject: [PATCH 1/4] UCT/UGNI: Start using spinlocks to protect critical structures --- src/uct/ugni/base/ugni_device.c | 99 +++++++++++++++++++- src/uct/ugni/base/ugni_device.h | 2 + src/uct/ugni/base/ugni_ep.c | 7 +- src/uct/ugni/base/ugni_iface.c | 137 +++++----------------------- src/uct/ugni/base/ugni_iface.h | 20 ++-- src/uct/ugni/base/ugni_md.c | 23 +++-- src/uct/ugni/base/ugni_md.h | 1 + src/uct/ugni/base/ugni_types.h | 48 +++++----- src/uct/ugni/rdma/ugni_rdma_ep.c | 2 +- src/uct/ugni/rdma/ugni_rdma_iface.c | 16 +--- src/uct/ugni/smsg/ugni_smsg_ep.c | 6 +- src/uct/ugni/smsg/ugni_smsg_iface.c | 101 +++----------------- src/uct/ugni/udt/ugni_udt_iface.c | 29 ++---- src/uct/ugni/udt/ugni_udt_iface.h | 4 + 14 files changed, 207 insertions(+), 288 deletions(-) diff --git a/src/uct/ugni/base/ugni_device.c b/src/uct/ugni/base/ugni_device.c index 612d4cf7d3e..88f46a8c43a 100644 --- a/src/uct/ugni/base/ugni_device.c +++ b/src/uct/ugni/base/ugni_device.c @@ -9,10 +9,33 @@ #endif #include "ugni_device.h" +#include "ugni_md.h" #include "ugni_iface.h" #include #include +#if ENABLE_MT +#define uct_ugni_check_lock_needed(_cdm) UCS_THREAD_MODE_MULTI == _cdm->thread_mode +#define uct_ugni_device_init_lock(_dev) ucs_spinlock_init(&_dev->lock) +#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&_dev->lock) +#define uct_ugni_device_lock(_cdm) \ +if (uct_ugni_check_lock_needed(_cdm)) { \ + ucs_spin_lock(&cdm->dev->lock); \ +} +#define uct_ugni_device_unlock(_cdm) \ +if (uct_ugni_check_lock_needed(_cdm)) { \ + ucs_spin_unlock(&cdm->dev->lock); \ +} +#else +#define uct_ugni_device_init_lock(x) UCS_OK +#define uct_ugni_device_destroy_lock(x) UCS_OK +#define uct_ugni_device_lock(x) +#define uct_ugni_device_unlock(x) +#define uct_ugni_check_lock_needed(x) 0 +#endif + +uint16_t ugni_domain_counter = 0; + void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev, uct_tl_resource_desc_t *resource) { @@ -106,21 +129,93 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d", dev_p->type_name, dev_p->device_index); + status = uct_ugni_device_init_lock(dev_p); + if (UCS_OK != status) { + ucs_error("Couldn't initalize device lock."); + return status; + } dev_p->attached = false; return UCS_OK; } void uct_ugni_device_destroy(uct_ugni_device_t *dev) { - /* Nop */ + ucs_status_t status; + + status = uct_ugni_device_destroy_lock(dev); + if (UCS_OK != status) { + ucs_error("Couldn't destroy device lock."); + } } ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr) { uct_ugni_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_iface_t); uct_devaddr_ugni_t *ugni_dev_addr = (uct_devaddr_ugni_t *)addr; + uct_ugni_device_t *dev = uct_ugni_iface_device(iface); + + ugni_dev_addr->nic_addr = dev->address; + + return UCS_OK; +} + +ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode) +{ + uct_ugni_job_info_t *job_info; + int modes; + gni_return_t ugni_rc; + ucs_status_t status = UCS_OK; - ugni_dev_addr->nic_addr = iface->dev->address; + job_info = uct_ugni_get_job_info(); + if (NULL == job_info) { + return UCS_ERR_IO_ERROR; + } + + cdm->thread_mode = thread_mode; + cdm->dev = device; + uct_ugni_device_lock(cdm); + cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ugni_domain_counter++; + ucs_debug("Creating new command domain with id %d (%d + %d * %d)", + cdm->domain_id, job_info->pmi_rank_id, + job_info->pmi_num_of_ranks, ugni_domain_counter); + modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | + GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL; + ugni_rc = GNI_CdmCreate(cdm->domain_id, job_info->ptag, job_info->cookie, + modes, &cdm->cdm_handle); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CdmCreate failed, Error status: %s %d", + gni_err_str[ugni_rc], ugni_rc); + status = UCS_ERR_NO_DEVICE; + goto out_unlock; + } + + ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id, + &cdm->address, &cdm->nic_handle); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d", + cdm->domain_id, ugni_domain_counter, gni_err_str[ugni_rc], ugni_rc); + GNI_CdmDestroy(cdm->cdm_handle); + status = UCS_ERR_NO_DEVICE; + } +out_unlock: + uct_ugni_device_unlock(cdm); + if (UCS_OK == status) { + ucs_debug("Made ugni cdm. nic_addr = %i domain_id = %i", device->address, cdm->domain_id); + } + return status; +} + +ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm) +{ + gni_return_t ugni_rc; + + ucs_debug("MD GNI_CdmDestroy"); + ugni_rc = GNI_CdmDestroy(cdm->cdm_handle); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CdmDestroy error status: %s (%d)", + gni_err_str[ugni_rc], ugni_rc); + return UCS_ERR_IO_ERROR; + } return UCS_OK; } diff --git a/src/uct/ugni/base/ugni_device.h b/src/uct/ugni/base/ugni_device.h index 71c5b964214..defb78d6762 100644 --- a/src/uct/ugni/base/ugni_device.h +++ b/src/uct/ugni/base/ugni_device.h @@ -15,4 +15,6 @@ void uct_ugni_device_destroy(uct_ugni_device_t *dev); void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev, uct_tl_resource_desc_t *resource); ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr); +ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode); +ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm); #endif diff --git a/src/uct/ugni/base/ugni_ep.c b/src/uct/ugni/base/ugni_ep.c index 3ac3e26ea47..4ec309e7147 100644 --- a/src/uct/ugni/base/ugni_ep.c +++ b/src/uct/ugni/base/ugni_ep.c @@ -187,6 +187,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface, const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)dev_addr; ucs_status_t rc = UCS_OK; gni_return_t ugni_rc; + uint32_t *big_hash; self->arb_sched = 0; UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super); @@ -196,7 +197,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface, self->flush_group->parent = NULL; #endif - ugni_rc = GNI_EpCreate(iface->nic_handle, iface->local_cq, &self->ep); + ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_CdmCreate failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); @@ -208,11 +209,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface, } ucs_arbiter_group_init(&self->arb_group); - - uint32_t *big_hash; big_hash = (void *)&self->ep; self->hash_key = big_hash[0]; - if (GNI_DEVICE_ARIES == iface->dev->type) { + if (uct_ugni_check_device_type(iface, GNI_DEVICE_ARIES)) { self->hash_key &= 0x00FFFFFF; } ucs_debug("Adding ep hash %x to iface %p", self->hash_key, iface); diff --git a/src/uct/ugni/base/ugni_iface.c b/src/uct/ugni/base/ugni_iface.c index 53ceda9a075..d4d12acdd33 100644 --- a/src/uct/ugni/base/ugni_iface.c +++ b/src/uct/ugni/base/ugni_iface.c @@ -10,8 +10,6 @@ #include "ugni_iface.h" #include -static uint16_t ugni_domain_global_counter = 0; - void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk) { uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj; @@ -132,7 +130,7 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface, uct_ugni_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_iface_t); uct_sockaddr_ugni_t *iface_addr = (uct_sockaddr_ugni_t*)addr; - iface_addr->domain_id = iface->domain_id; + iface_addr->domain_id = iface->cdm.domain_id; return UCS_OK; } @@ -183,7 +181,7 @@ static ucs_status_t get_ptag(uint8_t *ptag) return UCS_OK; } -static ucs_status_t uct_ugni_fetch_pmi() +ucs_status_t uct_ugni_fetch_pmi() { int spawned = 0, rc; @@ -234,108 +232,6 @@ static ucs_status_t uct_ugni_fetch_pmi() return UCS_OK; } -ucs_status_t uct_ugni_init_nic(int device_index, - uint16_t *domain_id, - gni_cdm_handle_t *cdm_handle, - gni_nic_handle_t *nic_handle, - uint32_t *address) -{ - int modes; - ucs_status_t status; - gni_return_t ugni_rc = GNI_RC_SUCCESS; - - status = uct_ugni_fetch_pmi(); - if (UCS_OK != status) { - ucs_error("Failed to activate context, Error status: %d", status); - return status; - } - - *domain_id = job_info.pmi_rank_id + job_info.pmi_num_of_ranks * ugni_domain_global_counter; - modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | - GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL; - ucs_debug("Creating new command domain with id %d (%d + %d * %d)", - *domain_id, job_info.pmi_rank_id, - job_info.pmi_num_of_ranks, ugni_domain_global_counter); - ugni_rc = GNI_CdmCreate(*domain_id, job_info.ptag, job_info.cookie, - modes, cdm_handle); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_error("GNI_CdmCreate failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; - } - - /* For now we use the first device for allocation of the domain */ - ugni_rc = GNI_CdmAttach(*cdm_handle, job_info.devices[device_index].device_id, - address, nic_handle); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d", - *domain_id, ugni_domain_global_counter, gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; - } - - ++ugni_domain_global_counter; - return UCS_OK; -} - -ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface) -{ - ucs_status_t status; - gni_return_t ugni_rc; - uint32_t pe_address; - - if(iface->activated) { - return UCS_OK; - } - - status = uct_ugni_init_nic(0, &iface->domain_id, - &iface->cdm_handle, &iface->nic_handle, - &pe_address); - if (UCS_OK != status) { - ucs_error("Failed to UGNI NIC, Error status: %d", status); - return status; - } - - ucs_debug("Made ugni interface. iface->dev->nic_addr = %i iface->domain_id = %i", iface->dev->address, iface->domain_id); - - ugni_rc = GNI_CqCreate(iface->nic_handle, UCT_UGNI_LOCAL_CQ, 0, - GNI_CQ_NOBLOCK, - NULL, NULL, &iface->local_cq); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_error("GNI_CqCreate failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; - } - iface->activated = true; - - /* iface is activated */ - return UCS_OK; -} - -ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface) -{ - gni_return_t ugni_rc; - - if(!iface->activated) { - return UCS_OK; - } - - ugni_rc = GNI_CqDestroy(iface->local_cq); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_warn("GNI_CqDestroy failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_IO_ERROR; - } - ugni_rc = GNI_CdmDestroy(iface->cdm_handle); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_warn("GNI_CdmDestroy error status: %s (%d)", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_IO_ERROR; - } - - iface->activated = false ; - return UCS_OK; -} - static ucs_mpool_ops_t uct_ugni_flush_mpool_ops = { .chunk_alloc = ucs_mpool_chunk_malloc, .chunk_release = ucs_mpool_chunk_free, @@ -350,20 +246,32 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker, UCS_STATS_ARG(ucs_stats_node_t *stats_parent)) { uct_ugni_device_t *dev; + gni_return_t ugni_rc; ucs_status_t status; uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t); unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow; + UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker, + params, tl_config UCS_STATS_ARG(params->stats_root) + UCS_STATS_ARG(UCT_UGNI_MD_NAME)); dev = uct_ugni_device_by_name(params->dev_name); if (NULL == dev) { ucs_error("No device was found: %s", params->dev_name); return UCS_ERR_NO_DEVICE; } - UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker, - params, tl_config UCS_STATS_ARG(params->stats_root) - UCS_STATS_ARG(UCT_UGNI_MD_NAME)); - self->dev = dev; - self->activated = false; + status = uct_ugni_create_cdm(&self->cdm, dev, worker->thread_mode); + if (UCS_OK != status) { + ucs_error("Failed to UGNI NIC, Error status: %d", status); + return status; + } + ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0, + GNI_CQ_NOBLOCK, + NULL, NULL, &self->local_cq); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CqCreate failed, Error status: %s %d", + gni_err_str[ugni_rc], ugni_rc); + return UCS_ERR_NO_DEVICE; + } self->outstanding = 0; sglib_hashed_uct_ugni_ep_t_init(self->eps); ucs_arbiter_init(&self->arbiter); @@ -386,9 +294,10 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h, const uct_iface_params_t*, uct_iface_ops_t *, const uct_iface_config_t * UCS_STATS_ARG(ucs_stats_node_t *)); -static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t){ - - ugni_deactivate_iface(self); +static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t) +{ + GNI_CqDestroy(self->local_cq); + uct_ugni_destroy_cdm(&self->cdm); ucs_arbiter_cleanup(&self->arbiter); } diff --git a/src/uct/ugni/base/ugni_iface.h b/src/uct/ugni/base/ugni_iface.h index 0f7c30e7c34..d6c14bf2c77 100644 --- a/src/uct/ugni/base/ugni_iface.h +++ b/src/uct/ugni/base/ugni_iface.h @@ -23,22 +23,24 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t * int uct_ugni_iface_is_reachable(uct_iface_h tl_iface, const uct_device_addr_t *dev_addr, const uct_iface_addr_t *iface_addr); void uct_ugni_progress(void *arg); -ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface); -ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface); -ucs_status_t uct_ugni_init_nic(int device_index, - uint16_t *domain_id, - gni_cdm_handle_t *cdm_handle, - gni_nic_handle_t *nic_handle, - uint32_t *address); +ucs_status_t uct_ugni_fetch_pmi(); void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk); void uct_ugni_base_desc_key_init(uct_iface_h iface, void *obj, uct_mem_h memh); ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, uct_tl_resource_desc_t **resource_p, unsigned *num_resources_p); - static inline uct_ugni_device_t *uct_ugni_iface_device(uct_ugni_iface_t *iface) { - return iface->dev; + return iface->cdm.dev; +} +static inline gni_nic_handle_t uct_ugni_iface_nic_handle(uct_ugni_iface_t *iface) +{ + return iface->cdm.nic_handle; +} +static inline int uct_ugni_check_device_type(uct_ugni_iface_t *iface, gni_nic_device_t type) +{ + uct_ugni_device_t *dev = uct_ugni_iface_device(iface); + return dev->type == type; } #endif diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c index e23af39cabc..93cf4f1f97b 100644 --- a/src/uct/ugni/base/ugni_md.c +++ b/src/uct/ugni/base/ugni_md.c @@ -37,6 +37,18 @@ uct_ugni_job_info_t job_info = { .initialized = false, }; +uct_ugni_job_info_t *uct_ugni_get_job_info() +{ + ucs_status_t status; + + status = uct_ugni_fetch_pmi(); + if (UCS_OK != status) { + ucs_error("Could not fetch PMI info."); + return NULL; + } + return &job_info; +} + static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_t *md_attr) { md_attr->rkey_packed_size = 3 * sizeof(uint64_t); @@ -72,7 +84,7 @@ static ucs_status_t uct_ugni_mem_reg(uct_md_h md, void *address, size_t length, goto mem_err; } - ugni_rc = GNI_MemRegister(ugni_md->nic_handle, (uint64_t)address, + ugni_rc = GNI_MemRegister(ugni_md->cdm.nic_handle, (uint64_t)address, length, NULL, GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING, -1, mem_hndl); @@ -104,7 +116,7 @@ static ucs_status_t uct_ugni_mem_dereg(uct_md_h md, uct_mem_h memh) pthread_mutex_lock(&uct_ugni_global_lock); - ugni_rc = GNI_MemDeregister(ugni_md->nic_handle, mem_hndl); + ugni_rc = GNI_MemDeregister(ugni_md->cdm.nic_handle, mem_hndl); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_MemDeregister failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); @@ -250,7 +262,7 @@ static void uct_ugni_md_close(uct_md_h md) pthread_mutex_lock(&uct_ugni_global_lock); ugni_md->ref_count--; if (!ugni_md->ref_count) { - ugni_rc = GNI_CdmDestroy(ugni_md->cdm_handle); + ugni_rc = GNI_CdmDestroy(ugni_md->cdm.cdm_handle); if (GNI_RC_SUCCESS != ugni_rc) { ucs_warn("GNI_CdmDestroy error status: %s (%d)", gni_err_str[ugni_rc], ugni_rc); @@ -263,7 +275,6 @@ static void uct_ugni_md_close(uct_md_h md) static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t *md_config, uct_md_h *md_p) { - uint16_t domain_id; ucs_status_t status = UCS_OK; pthread_mutex_lock(&uct_ugni_global_lock); @@ -291,9 +302,7 @@ static ucs_status_t uct_ugni_md_open(const char *md_name, const uct_md_config_t ucs_error("Failed to init device list, Error status: %d", status); goto error; } - status = uct_ugni_init_nic(0, &domain_id, - &md.cdm_handle, &md.nic_handle, - &md.address); + status = uct_ugni_create_cdm(&md.cdm, &job_info.devices[0], UCS_THREAD_MODE_MULTI); if (UCS_OK != status) { ucs_error("Failed to UGNI NIC, Error status: %d", status); goto error; diff --git a/src/uct/ugni/base/ugni_md.h b/src/uct/ugni/base/ugni_md.h index d1b03340037..f2b02f13874 100644 --- a/src/uct/ugni/base/ugni_md.h +++ b/src/uct/ugni/base/ugni_md.h @@ -27,6 +27,7 @@ typedef struct uct_ugni_job_info { bool initialized; /**< Info status */ } uct_ugni_job_info_t; +uct_ugni_job_info_t *uct_ugni_get_job_info(); extern uct_ugni_job_info_t job_info; extern uct_md_component_t uct_ugni_md_component; diff --git a/src/uct/ugni/base/ugni_types.h b/src/uct/ugni/base/ugni_types.h index 54c97e6d397..f46ce3d0b7f 100644 --- a/src/uct/ugni/base/ugni_types.h +++ b/src/uct/ugni/base/ugni_types.h @@ -12,22 +12,6 @@ #include #include -/** - * @brief UGNI Memory domain - * - * Ugni does not define MD, instead I use - * device handle that "simulates" the MD. - * Memory that is registered with one device handle - * can be accessed with any other. - */ -typedef struct uct_ugni_md { - uct_md_t super; /**< Domain info */ - gni_cdm_handle_t cdm_handle; /**< Ugni communication domain */ - gni_nic_handle_t nic_handle; /**< Ugni NIC handle */ - uint32_t address; /**< UGNI address */ - int ref_count; /**< UGNI Domain ref count */ -} uct_ugni_md_t; - typedef struct uct_ugni_device { gni_nic_device_t type; /**< Device type */ char type_name[UCT_UGNI_MAX_TYPE_NAME]; /**< Device type name */ @@ -40,9 +24,35 @@ typedef struct uct_ugni_device { to the device */ cpu_set_t cpu_mask; /**< CPU mask */ bool attached; /**< device was attached */ +#if ENABLE_MT + ucs_spinlock_t lock; /**< Device lock */ +#endif /* TBD - reference counter */ } uct_ugni_device_t; +typedef struct uct_ugni_cdm { + gni_cdm_handle_t cdm_handle; /**< Ugni communication domain */ + gni_nic_handle_t nic_handle; /**< Ugni NIC handle */ + uct_ugni_device_t *dev; /**< Ugni device the cdm is connected to */ + ucs_thread_mode_t thread_mode; + uint32_t address; + uint16_t domain_id; +} uct_ugni_cdm_t; + +/** + * @brief UGNI Memory domain + * + * Ugni does not define MD, instead I use + * device handle that "simulates" the MD. + * Memory that is registered with one device handle + * can be accessed with any other. + */ +typedef struct uct_ugni_md { + uct_md_t super; /**< Domain info */ + uct_ugni_cdm_t cdm; /**< Communication domain for memory registration*/ + int ref_count; /**< UGNI Domain ref count */ +} uct_ugni_md_t; + typedef struct uct_devaddr_ugni_t { uint32_t nic_addr; } UCS_S_PACKED uct_devaddr_ugni_t; @@ -70,15 +80,11 @@ typedef struct uct_ugni_ep { typedef struct uct_ugni_iface { uct_base_iface_t super; - uct_ugni_device_t *dev; /**< Ugni device iface is connected to */ - gni_cdm_handle_t cdm_handle; /**< Ugni communication domain */ - gni_nic_handle_t nic_handle; /**< Ugni NIC handle */ + uct_ugni_cdm_t cdm; /**< Ugni communication domain and handles */ gni_cq_handle_t local_cq; /**< Completion queue */ - uint16_t domain_id; /**< Id for UGNI domain creation */ uct_ugni_ep_t *eps[UCT_UGNI_HASH_SIZE]; /**< Array of QPs */ unsigned outstanding; /**< Counter for outstanding packets on the interface */ - bool activated; /**< nic status */ ucs_arbiter_t arbiter; /**< arbiter structure for pending operations */ ucs_mpool_t flush_pool; /**< Memory pool for flush objects */ } uct_ugni_iface_t; diff --git a/src/uct/ugni/rdma/ugni_rdma_ep.c b/src/uct/ugni/rdma/ugni_rdma_ep.c index 6b37bbe3099..7045045dd5e 100644 --- a/src/uct/ugni/rdma/ugni_rdma_ep.c +++ b/src/uct/ugni/rdma/ugni_rdma_ep.c @@ -713,7 +713,7 @@ ucs_status_t uct_ugni_ep_get_zcopy(uct_ep_h tl_ep, const uct_iov_t *iov, size_t iface->config.rdma_max_size, "get_zcopy"); /* Special flow for an unalign data */ - if (ucs_unlikely((GNI_DEVICE_GEMINI == iface->super.dev->type && + if (ucs_unlikely((uct_ugni_check_device_type(&iface->super, GNI_DEVICE_GEMINI) && ucs_check_if_align_pow2((uintptr_t)buffer, UGNI_GET_ALIGN)) || ucs_check_if_align_pow2(remote_addr, UGNI_GET_ALIGN) || ucs_check_if_align_pow2(length, UGNI_GET_ALIGN))) { diff --git a/src/uct/ugni/rdma/ugni_rdma_iface.c b/src/uct/ugni/rdma/ugni_rdma_iface.c index ad36361bdbe..f786b48be46 100644 --- a/src/uct/ugni/rdma/ugni_rdma_iface.c +++ b/src/uct/ugni/rdma/ugni_rdma_iface.c @@ -69,7 +69,7 @@ static ucs_status_t uct_ugni_rdma_iface_query(uct_iface_h tl_iface, uct_iface_at UCT_IFACE_FLAG_CONNECT_TO_IFACE | UCT_IFACE_FLAG_PENDING; - if(GNI_DEVICE_ARIES == iface->super.dev->type) { + if (uct_ugni_check_device_type(&iface->super, GNI_DEVICE_ARIES)) { iface_attr->cap.flags |= UCT_IFACE_FLAG_PUT_SHORT | UCT_IFACE_FLAG_ATOMIC_SWAP64 | UCT_IFACE_FLAG_ATOMIC_SWAP32 | @@ -90,12 +90,6 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_rdma_iface_t) { uct_worker_progress_unregister(self->super.super.worker, uct_ugni_progress, self); - - if (!self->super.activated) { - /* We done with release */ - return; - } - ucs_mpool_cleanup(&self->free_desc_get_buffer, 1); ucs_mpool_cleanup(&self->free_desc_get, 1); ucs_mpool_cleanup(&self->free_desc_famo, 1); @@ -271,20 +265,12 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_md_h md, uct_worker_h work goto clean_famo; } - status = ugni_activate_iface(&self->super); - if (UCS_OK != status) { - ucs_error("Failed to activate the interface"); - goto clean_get_buffer; - } - /* TBD: eventually the uct_ugni_progress has to be moved to * rdma layer so each ugni layer will have own progress */ uct_worker_progress_register(worker, uct_ugni_progress, self); pthread_mutex_unlock(&uct_ugni_global_lock); return UCS_OK; -clean_get_buffer: - ucs_mpool_cleanup(&self->free_desc_get_buffer, 1); clean_famo: ucs_mpool_cleanup(&self->free_desc_famo, 1); clean_buffer: diff --git a/src/uct/ugni/smsg/ugni_smsg_ep.c b/src/uct/ugni/smsg/ugni_smsg_ep.c index 5cf59da5803..f84f6197c8c 100644 --- a/src/uct/ugni/smsg/ugni_smsg_ep.c +++ b/src/uct/ugni/smsg/ugni_smsg_ep.c @@ -53,7 +53,7 @@ static ucs_status_t uct_ugni_smsg_mbox_reg(uct_ugni_smsg_iface_t *iface, uct_ugn } pthread_mutex_lock(&uct_ugni_global_lock); - ugni_rc = GNI_MemRegister(iface->super.nic_handle, (uint64_t)address, + ugni_rc = GNI_MemRegister(uct_ugni_iface_nic_handle(&iface->super), (uint64_t)address, iface->bytes_per_mbox, iface->remote_cq, GNI_MEM_READWRITE, -1, &(mbox->gni_mem)); @@ -75,7 +75,7 @@ static ucs_status_t uct_ugni_smsg_mbox_dereg(uct_ugni_smsg_iface_t *iface, uct_u gni_return_t ugni_rc; pthread_mutex_lock(&uct_ugni_global_lock); - ugni_rc = GNI_MemDeregister(iface->super.nic_handle, &mbox->gni_mem); + ugni_rc = GNI_MemDeregister(uct_ugni_iface_nic_handle(&iface->super), &mbox->gni_mem); pthread_mutex_unlock(&uct_ugni_global_lock); if (GNI_RC_SUCCESS != ugni_rc) { @@ -180,7 +180,7 @@ ucs_status_t uct_ugni_smsg_ep_connect_to_ep(uct_ep_h tl_ep, } ep_hash = (uint32_t)iface_addr->ep_hash; - gni_rc = GNI_EpSetEventData(ep->super.ep, iface->domain_id, ep_hash); + gni_rc = GNI_EpSetEventData(ep->super.ep, iface->cdm.domain_id, ep_hash); if(GNI_RC_SUCCESS != gni_rc){ ucs_error("Could not set GNI_EpSetEventData!"); diff --git a/src/uct/ugni/smsg/ugni_smsg_iface.c b/src/uct/ugni/smsg/ugni_smsg_iface.c index 4035ff94c51..0ec4718307c 100644 --- a/src/uct/ugni/smsg/ugni_smsg_iface.c +++ b/src/uct/ugni/smsg/ugni_smsg_iface.c @@ -217,12 +217,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_smsg_iface_t) { uct_worker_progress_unregister(self->super.super.worker, uct_ugni_smsg_progress, self); - if (!self->super.activated) { - return; - } - ucs_mpool_cleanup(&self->free_desc, 1); ucs_mpool_cleanup(&self->free_mbox, 1); + GNI_CqDestroy(self->remote_cq); } uct_iface_ops_t uct_ugni_smsg_iface_ops = { @@ -243,82 +240,6 @@ uct_iface_ops_t uct_ugni_smsg_iface_ops = { .ep_flush = uct_ugni_ep_flush, }; -static ucs_status_t ugni_smsg_activate_iface(uct_ugni_smsg_iface_t *iface) -{ - ucs_status_t status; - gni_return_t ugni_rc; - uint32_t pe_address; - - if(iface->super.activated) { - return UCS_OK; - } - /*pull out these chunks into common routines */ - status = uct_ugni_init_nic(0, &iface->super.domain_id, - &iface->super.cdm_handle, &iface->super.nic_handle, - &pe_address); - if (UCS_OK != status) { - ucs_error("Failed to UGNI NIC, Error status: %d", status); - return status; - } - - ugni_rc = GNI_CqCreate(iface->super.nic_handle, UCT_UGNI_LOCAL_CQ, 0, - GNI_CQ_NOBLOCK, - NULL, NULL, &iface->super.local_cq); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_error("GNI_CqCreate failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; - } - - ugni_rc = GNI_CqCreate(iface->super.nic_handle, 40000, 0, - GNI_CQ_NOBLOCK, - NULL, NULL, &iface->remote_cq); - - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_error("GNI_CqCreate failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; - } - - iface->super.activated = true; - - /* iface is activated */ - return UCS_OK; -} - -static ucs_status_t ugni_smsg_deactivate_iface(uct_ugni_smsg_iface_t *iface) -{ - gni_return_t ugni_rc; - - if(!iface->super.activated) { - return UCS_OK; - } - - ugni_rc = GNI_CqDestroy(iface->super.local_cq); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_warn("GNI_CqDestroy failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_IO_ERROR; - } - - ugni_rc = GNI_CqDestroy(iface->remote_cq); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_warn("GNI_CqDestroy failed, Error status: %s %d", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_IO_ERROR; - } - - ugni_rc = GNI_CdmDestroy(iface->super.cdm_handle); - if (GNI_RC_SUCCESS != ugni_rc) { - ucs_warn("GNI_CdmDestroy error status: %s (%d)", - gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_IO_ERROR; - } - - iface->super.activated = false ; - return UCS_OK; -} - static ucs_mpool_ops_t uct_ugni_smsg_desc_mpool_ops = { .chunk_alloc = ucs_mpool_hugetlb_malloc, .chunk_release = ucs_mpool_hugetlb_free, @@ -360,6 +281,14 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work smsg_attr.mbox_maxcredit = self->config.smsg_max_credit; smsg_attr.msg_maxsize = self->config.smsg_seg_size; + ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(&self->super), 40000, 0, + GNI_CQ_NOBLOCK, + NULL, NULL, &self->remote_cq); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CqCreate failed, Error status: %s %d", + gni_err_str[ugni_rc], ugni_rc); + return UCS_ERR_NO_DEVICE; + } ugni_rc = GNI_SmsgBufferSizeNeeded(&(smsg_attr), &bytes_per_mbox); self->bytes_per_mbox = ucs_align_up_pow2(bytes_per_mbox, ucs_get_page_size()); @@ -396,21 +325,15 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work if (UCS_OK != status) { ucs_error("Mbox Mpool creation failed"); - goto clean_desc; - } - - status = ugni_smsg_activate_iface(self); - if (UCS_OK != status) { - ucs_error("Failed to activate the interface"); goto clean_mbox; } - ugni_rc = GNI_SmsgSetMaxRetrans(self->super.nic_handle, self->config.smsg_max_retransmit); + ugni_rc = GNI_SmsgSetMaxRetrans(uct_ugni_iface_nic_handle(&self->super), self->config.smsg_max_retransmit); if (ugni_rc != GNI_RC_SUCCESS) { ucs_error("Smsg setting max retransmit count failed."); status = UCS_ERR_INVALID_PARAM; - goto clean_iface; + goto clean_desc; } /* TBD: eventually the uct_ugni_progress has to be moved to @@ -419,8 +342,6 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work pthread_mutex_unlock(&uct_ugni_global_lock); return UCS_OK; - clean_iface: - ugni_smsg_deactivate_iface(self); clean_desc: ucs_mpool_cleanup(&self->free_desc, 1); clean_mbox: diff --git a/src/uct/ugni/udt/ugni_udt_iface.c b/src/uct/ugni/udt/ugni_udt_iface.c index 4424bf5b41e..5bba0947df8 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.c +++ b/src/uct/ugni/udt/ugni_udt_iface.c @@ -113,7 +113,7 @@ static void *uct_ugni_udt_device_thread(void *arg) pthread_cond_wait(&iface->device_condition, &iface->device_lock); } pthread_mutex_unlock(&iface->device_lock); - ugni_rc = GNI_PostdataProbeWaitById(iface->super.nic_handle,-1,&id); + ugni_rc = GNI_PostdataProbeWaitById(uct_ugni_udt_iface_nic_handle(iface),-1,&id); if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) { ucs_error("GNI_PostDataProbeWaitById, Error status: %s %d\n", gni_err_str[ugni_rc], ugni_rc); @@ -202,7 +202,7 @@ void uct_ugni_proccess_datagram_pipe(int event_id, void *arg) { ucs_trace_func(""); pthread_mutex_lock(&uct_ugni_global_lock); - ugni_rc = GNI_PostDataProbeById(iface->super.nic_handle, &id); + ugni_rc = GNI_PostDataProbeById(uct_ugni_udt_iface_nic_handle(iface), &id); pthread_mutex_unlock(&uct_ugni_global_lock); while (GNI_RC_SUCCESS == ugni_rc) { status = recieve_datagram(iface, id, &ep); @@ -240,7 +240,7 @@ void uct_ugni_proccess_datagram_pipe(int event_id, void *arg) { } } pthread_mutex_lock(&uct_ugni_global_lock); - ugni_rc = GNI_PostDataProbeById(iface->super.nic_handle, &id); + ugni_rc = GNI_PostDataProbeById(uct_ugni_udt_iface_nic_handle(iface), &id); pthread_mutex_unlock(&uct_ugni_global_lock); } @@ -298,13 +298,13 @@ static inline void uct_ugni_udt_terminate_thread(uct_ugni_udt_iface_t *iface) gni_return_t ugni_rc; gni_ep_handle_t ep; - ugni_rc = GNI_EpCreate(iface->super.nic_handle, iface->super.local_cq, &ep); + ugni_rc = GNI_EpCreate(uct_ugni_udt_iface_nic_handle(iface), iface->super.local_cq, &ep); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_EpCreate, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); return; } - ugni_rc = GNI_EpBind(ep, iface->super.dev->address, iface->super.domain_id); + ugni_rc = GNI_EpBind(ep, iface->super.cdm.dev->address, iface->super.cdm.domain_id); if (GNI_RC_SUCCESS != ugni_rc) { GNI_EpDestroy(ep); ucs_error("GNI_EpBind failed, Error status: %s %d", @@ -330,11 +330,6 @@ static inline void uct_ugni_udt_terminate_thread(uct_ugni_udt_iface_t *iface) static UCS_CLASS_CLEANUP_FUNC(uct_ugni_udt_iface_t) { void *dummy; - - if (!self->super.activated) { - /* We done with release */ - return; - } uct_ugni_enter_async(&self->super); uct_ugni_udt_clean_wildcard(self); ucs_async_remove_handler(ucs_async_pipe_rfd(&self->event_pipe),1); @@ -415,20 +410,12 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_udt_iface_t, uct_md_h md, uct_worker_h worke goto clean_pipe; } - pthread_mutex_lock(&uct_ugni_global_lock); - status = ugni_activate_iface(&self->super); - pthread_mutex_unlock(&uct_ugni_global_lock); - if (UCS_OK != status) { - ucs_error("Failed to activate the interface"); - goto clean_free_desc; - } - - ugni_rc = GNI_EpCreate(self->super.nic_handle, NULL, &self->ep_any); + ugni_rc = GNI_EpCreate(uct_ugni_udt_iface_nic_handle(self), NULL, &self->ep_any); if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_EpCreate failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); status = UCS_ERR_NO_DEVICE; - goto clean_iface_activate; + goto clean_free_desc; } UCT_TL_IFACE_GET_TX_DESC(&self->super.super, &self->free_desc, @@ -479,8 +466,6 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_udt_iface_t, uct_md_h md, uct_worker_h worke ucs_warn("GNI_EpDestroy failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); } - clean_iface_activate: - ugni_deactivate_iface(&self->super); clean_free_desc: ucs_mpool_cleanup(&self->free_desc, 1); clean_pipe: diff --git a/src/uct/ugni/udt/ugni_udt_iface.h b/src/uct/ugni/udt/ugni_udt_iface.h index 0e1e7c745dc..c9a9d708366 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.h +++ b/src/uct/ugni/udt/ugni_udt_iface.h @@ -101,4 +101,8 @@ static inline int uct_ugni_udt_ep_any_post(uct_ugni_udt_iface_t *iface) return UCS_OK; } +static inline gni_nic_handle_t uct_ugni_udt_iface_nic_handle(uct_ugni_udt_iface_t *iface) +{ + return uct_ugni_iface_nic_handle(&iface->super); +} #endif From 6dc47d9be067433588ad576af2428b4b39a2b70c Mon Sep 17 00:00:00 2001 From: Matt Baker Date: Thu, 11 May 2017 15:39:44 -0400 Subject: [PATCH 2/4] Add back in missing cdm cleanup paths --- src/uct/ugni/base/ugni_iface.c | 20 ++++++++++++++++---- src/uct/ugni/base/ugni_iface.h | 1 + src/uct/ugni/rdma/ugni_rdma_iface.c | 1 + src/uct/ugni/smsg/ugni_smsg_iface.c | 1 + src/uct/ugni/udt/ugni_udt_iface.c | 1 + 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/uct/ugni/base/ugni_iface.c b/src/uct/ugni/base/ugni_iface.c index d4d12acdd33..3affd897014 100644 --- a/src/uct/ugni/base/ugni_iface.c +++ b/src/uct/ugni/base/ugni_iface.c @@ -239,6 +239,14 @@ static ucs_mpool_ops_t uct_ugni_flush_mpool_ops = { .obj_cleanup = NULL }; +void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface) +{ + ucs_arbiter_cleanup(&iface->arbiter); + ucs_mpool_cleanup(&iface->flush_pool, 1); + GNI_CqDestroy(iface->local_cq); + uct_ugni_destroy_cdm(&iface->cdm); +} + UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, uct_iface_ops_t *uct_ugni_iface_ops, @@ -270,7 +278,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker, if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_CqCreate failed, Error status: %s %d", gni_err_str[ugni_rc], ugni_rc); - return UCS_ERR_NO_DEVICE; + goto clean_cdm; } self->outstanding = 0; sglib_hashed_uct_ugni_ep_t_init(self->eps); @@ -286,8 +294,14 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker, "UGNI-DESC-ONLY"); if (UCS_OK != status) { ucs_error("Could not init iface"); + goto clean_cq; } return status; +clean_cq: + GNI_CqDestroy(self->local_cq); +clean_cdm: + uct_ugni_destroy_cdm(&self->cdm); + return status; } UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h, @@ -296,9 +310,7 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h, static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t) { - GNI_CqDestroy(self->local_cq); - uct_ugni_destroy_cdm(&self->cdm); - ucs_arbiter_cleanup(&self->arbiter); + uct_ugni_cleanup_base_iface(self); } UCS_CLASS_DEFINE(uct_ugni_iface_t, uct_base_iface_t); diff --git a/src/uct/ugni/base/ugni_iface.h b/src/uct/ugni/base/ugni_iface.h index d6c14bf2c77..1d4bf92034b 100644 --- a/src/uct/ugni/base/ugni_iface.h +++ b/src/uct/ugni/base/ugni_iface.h @@ -29,6 +29,7 @@ void uct_ugni_base_desc_key_init(uct_iface_h iface, void *obj, uct_mem_h memh); ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, uct_tl_resource_desc_t **resource_p, unsigned *num_resources_p); +void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface); static inline uct_ugni_device_t *uct_ugni_iface_device(uct_ugni_iface_t *iface) { return iface->cdm.dev; diff --git a/src/uct/ugni/rdma/ugni_rdma_iface.c b/src/uct/ugni/rdma/ugni_rdma_iface.c index f786b48be46..964ee328aa6 100644 --- a/src/uct/ugni/rdma/ugni_rdma_iface.c +++ b/src/uct/ugni/rdma/ugni_rdma_iface.c @@ -280,6 +280,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_rdma_iface_t, uct_md_h md, uct_worker_h work clean_desc: ucs_mpool_cleanup(&self->free_desc, 1); exit: + uct_ugni_cleanup_base_iface(&self->super); ucs_error("Failed to activate interface"); pthread_mutex_unlock(&uct_ugni_global_lock); return status; diff --git a/src/uct/ugni/smsg/ugni_smsg_iface.c b/src/uct/ugni/smsg/ugni_smsg_iface.c index 0ec4718307c..a2bb69272ae 100644 --- a/src/uct/ugni/smsg/ugni_smsg_iface.c +++ b/src/uct/ugni/smsg/ugni_smsg_iface.c @@ -347,6 +347,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_smsg_iface_t, uct_md_h md, uct_worker_h work clean_mbox: ucs_mpool_cleanup(&self->free_mbox, 1); exit: + uct_ugni_cleanup_base_iface(&self->super); ucs_error("Failed to activate interface"); pthread_mutex_unlock(&uct_ugni_global_lock); return status; diff --git a/src/uct/ugni/udt/ugni_udt_iface.c b/src/uct/ugni/udt/ugni_udt_iface.c index 5bba0947df8..a13edc9c789 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.c +++ b/src/uct/ugni/udt/ugni_udt_iface.c @@ -471,6 +471,7 @@ static UCS_CLASS_INIT_FUNC(uct_ugni_udt_iface_t, uct_md_h md, uct_worker_h worke clean_pipe: ucs_async_pipe_destroy(&self->event_pipe); exit: + uct_ugni_cleanup_base_iface(&self->super); ucs_error("Failed to activate interface"); return status; } From 6a82d5cafa3e000bb430a545c366c50e9e804627 Mon Sep 17 00:00:00 2001 From: Matt Baker Date: Thu, 11 May 2017 17:02:00 -0400 Subject: [PATCH 3/4] UCT/UGNI: Change small inline functions into macros --- src/uct/ugni/base/ugni_iface.h | 17 +++-------------- src/uct/ugni/udt/ugni_udt_iface.h | 6 ++---- 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/uct/ugni/base/ugni_iface.h b/src/uct/ugni/base/ugni_iface.h index 1d4bf92034b..7afad519b4a 100644 --- a/src/uct/ugni/base/ugni_iface.h +++ b/src/uct/ugni/base/ugni_iface.h @@ -30,18 +30,7 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name, uct_tl_resource_desc_t **resource_p, unsigned *num_resources_p); void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface); -static inline uct_ugni_device_t *uct_ugni_iface_device(uct_ugni_iface_t *iface) -{ - return iface->cdm.dev; -} -static inline gni_nic_handle_t uct_ugni_iface_nic_handle(uct_ugni_iface_t *iface) -{ - return iface->cdm.nic_handle; -} -static inline int uct_ugni_check_device_type(uct_ugni_iface_t *iface, gni_nic_device_t type) -{ - uct_ugni_device_t *dev = uct_ugni_iface_device(iface); - return dev->type == type; -} - +#define uct_ugni_iface_device(_iface) ((_iface)->cdm.dev) +#define uct_ugni_iface_nic_handle(_iface) ((_iface)->cdm.nic_handle) +#define uct_ugni_check_device_type(_iface, _type) ((_iface)->cdm.dev->type == _type) #endif diff --git a/src/uct/ugni/udt/ugni_udt_iface.h b/src/uct/ugni/udt/ugni_udt_iface.h index c9a9d708366..5fe4896903e 100644 --- a/src/uct/ugni/udt/ugni_udt_iface.h +++ b/src/uct/ugni/udt/ugni_udt_iface.h @@ -77,6 +77,8 @@ if (ucs_unlikely(GNI_RC_SUCCESS != rc)) { \ } \ } +#define uct_ugni_udt_iface_nic_handle(_iface) uct_ugni_iface_nic_handle(&(_iface)->super) + static inline void uct_ugni_udt_reset_desc(uct_ugni_udt_desc_t *desc, uct_ugni_udt_iface_t *iface) { uct_ugni_udt_header_t *sheader = uct_ugni_udt_get_sheader(desc, iface); @@ -101,8 +103,4 @@ static inline int uct_ugni_udt_ep_any_post(uct_ugni_udt_iface_t *iface) return UCS_OK; } -static inline gni_nic_handle_t uct_ugni_udt_iface_nic_handle(uct_ugni_udt_iface_t *iface) -{ - return uct_ugni_iface_nic_handle(&iface->super); -} #endif From aa2b76c6b7b23089d6670fe1b3539c8b03a14e25 Mon Sep 17 00:00:00 2001 From: Matt Baker Date: Thu, 18 May 2017 16:37:15 -0400 Subject: [PATCH 4/4] Log an error message of tearing down the CDM fails. --- src/uct/ugni/base/ugni_device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/uct/ugni/base/ugni_device.c b/src/uct/ugni/base/ugni_device.c index 88f46a8c43a..1605d46a663 100644 --- a/src/uct/ugni/base/ugni_device.c +++ b/src/uct/ugni/base/ugni_device.c @@ -194,7 +194,11 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, if (GNI_RC_SUCCESS != ugni_rc) { ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d", cdm->domain_id, ugni_domain_counter, gni_err_str[ugni_rc], ugni_rc); - GNI_CdmDestroy(cdm->cdm_handle); + ugni_rc = GNI_CdmDestroy(cdm->cdm_handle); + if (GNI_RC_SUCCESS != ugni_rc) { + ucs_error("GNI_CdmDestroy error status: %s (%d)", + gni_err_str[ugni_rc], ugni_rc); + } status = UCS_ERR_NO_DEVICE; }