Skip to content

Commit

Permalink
Merge pull request #2436 from hjelmn/v1.3.x_ugni
Browse files Browse the repository at this point in the history
uct/ugni: fix internal locking
  • Loading branch information
shamisp authored Mar 21, 2018
2 parents f99e9e4 + 834e858 commit 8bf5730
Show file tree
Hide file tree
Showing 12 changed files with 91 additions and 97 deletions.
21 changes: 11 additions & 10 deletions src/uct/ugni/base/ugni_def.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* Copyright (c) UT-Battelle, LLC. 2017. ALL RIGHTS RESERVED.
* Copyright (c) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED.
* See file LICENSE for terms.
*/

Expand Down Expand Up @@ -45,23 +46,23 @@ do {\

#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode
#define uct_ugni_device_init_lock(_dev) ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
#define uct_ugni_cdm_init_lock(_cdm) ucs_spinlock_init(&(_cdm)->lock)
#define uct_ugni_cdm_destroy_lock(_cdm) ucs_spinlock_destroy(&(_cdm)->lock)
#define uct_ugni_cdm_lock(_cdm) \
if (uct_ugni_check_lock_needed(_cdm)) { \
ucs_trace_async("Taking lock"); \
ucs_spin_lock(&(_cdm)->dev->lock); \
ucs_spin_lock(&(_cdm)->lock); \
}
#define uct_ugni_device_unlock(_cdm) \
#define uct_ugni_cdm_unlock(_cdm) \
if (uct_ugni_check_lock_needed(_cdm)) { \
ucs_trace_async("Releasing lock"); \
ucs_spin_unlock(&(_cdm)->dev->lock); \
ucs_spin_unlock(&(_cdm)->lock); \
}
#else
#define uct_ugni_device_init_lock(x) UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_cdm_init_lock(x) UCS_OK
#define uct_ugni_cdm_destroy_lock(x) UCS_OK
#define uct_ugni_cdm_lock(x)
#define uct_ugni_cdm_unlock(x)
#define uct_ugni_check_lock_needed(x) 0
#endif

Expand Down
35 changes: 13 additions & 22 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/**
* Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED.
* Copyright (C) Mellanox Technologies Ltd. 2001-2014. ALL RIGHTS RESERVED.
* Copyright (c) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED.
* See file LICENSE for terms.
*/

Expand Down Expand Up @@ -370,22 +371,11 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de
ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d",
dev_p->type_name, index);

status = uct_ugni_device_init_lock(dev_p);
if (UCS_OK != status) {
ucs_error("Couldn't initalize device lock.");
return status;
}
return UCS_OK;
}

void uct_ugni_device_destroy(uct_ugni_device_t *dev)
{
ucs_status_t status;

status = uct_ugni_device_destroy_lock(dev);
if (UCS_OK != status) {
ucs_error("Couldn't destroy device lock.");
}
}

ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr)
Expand Down Expand Up @@ -413,7 +403,6 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,

cdm->thread_mode = thread_mode;
cdm->dev = device;
uct_ugni_device_lock(cdm);
cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ucs_atomic_fadd32(&ugni_domain_counter,1);
ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
cdm->domain_id, job_info->pmi_rank_id,
Expand All @@ -425,8 +414,7 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
status = UCS_ERR_NO_DEVICE;
goto out_unlock;
return UCS_ERR_NO_DEVICE;
}

ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
Expand All @@ -440,8 +428,11 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
status = UCS_ERR_NO_DEVICE;
}

out_unlock:
uct_ugni_device_unlock(cdm);
status = uct_ugni_cdm_init_lock(cdm);
if (UCS_OK != status) {
ucs_error("Couldn't initalize CDM lock.");
}

if (UCS_OK == status) {
ucs_debug("Made ugni cdm. nic_addr = %i domain_id = %i", device->address, cdm->domain_id);
}
Expand All @@ -456,11 +447,15 @@ ucs_status_t uct_ugni_create_md_cdm(uct_ugni_cdm_t *cdm)
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
gni_return_t ugni_rc;
ucs_status_t status;

status = uct_ugni_cdm_destroy_lock(cdm);
if (UCS_OK != status) {
ucs_error("Couldn't destroy cdm lock.");
}

ucs_trace_func("cdm=%p", cdm);
uct_ugni_device_lock(cdm);
ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
uct_ugni_device_unlock(cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmDestroy error status: %s (%d)",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -473,11 +468,9 @@ ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_
{
gni_return_t ugni_rc;

uct_ugni_device_lock(cdm);
ugni_rc = GNI_CqCreate(cdm->nic_handle, UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, cq);
uct_ugni_device_unlock(cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -491,9 +484,7 @@ ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm)
{
gni_return_t ugni_rc;

uct_ugni_device_lock(cdm);
ugni_rc = GNI_CqDestroy(cq);
uct_ugni_device_unlock(cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
16 changes: 8 additions & 8 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,13 +162,13 @@ ucs_status_t ugni_connect_ep(uct_ugni_iface_t *iface,
uct_ugni_ep_t *ep){
gni_return_t ugni_rc;

uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
ugni_rc = GNI_EpBind(ep->ep, dev_addr->nic_addr, iface_addr->domain_id);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
(void)GNI_EpDestroy(ep->ep);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
ucs_error("GNI_EpBind failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_UNREACHABLE;
Expand Down Expand Up @@ -200,9 +200,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->flush_comp.func = NULL;
self->flush_group->parent = NULL;
#endif
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down Expand Up @@ -235,9 +235,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_ep_t)

ucs_arbiter_group_purge(&iface->arbiter, &self->arb_group,
uct_ugni_ep_abriter_purge_cb, NULL);
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
ugni_rc = GNI_EpDestroy(self->ep);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_warn("GNI_EpDestroy failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
8 changes: 4 additions & 4 deletions src/uct/ugni/base/ugni_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ static ucs_status_t uct_ugni_mem_reg(uct_md_h md, void *address, size_t length,
goto mem_err;
}

uct_ugni_device_lock(&ugni_md->cdm);
uct_ugni_cdm_lock(&ugni_md->cdm);
ugni_rc = GNI_MemRegister(ugni_md->cdm.nic_handle, (uint64_t)address,
length, NULL,
GNI_MEM_READWRITE | GNI_MEM_RELAXED_PI_ORDERING,
-1, mem_hndl);
uct_ugni_device_unlock(&ugni_md->cdm);
uct_ugni_cdm_unlock(&ugni_md->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_MemRegister failed (addr %p, size %zu), Error status: %s %d",
address, length, gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -94,9 +94,9 @@ static ucs_status_t uct_ugni_mem_dereg(uct_md_h md, uct_mem_h memh)
gni_return_t ugni_rc;
ucs_status_t status = UCS_OK;

uct_ugni_device_lock(&ugni_md->cdm);
uct_ugni_cdm_lock(&ugni_md->cdm);
ugni_rc = GNI_MemDeregister(ugni_md->cdm.nic_handle, mem_hndl);
uct_ugni_device_unlock(&ugni_md->cdm);
uct_ugni_cdm_unlock(&ugni_md->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_MemDeregister failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
8 changes: 5 additions & 3 deletions src/uct/ugni/base/ugni_types.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/**
* Copyright (c) UT-Battelle, LLC. 2014-2017. ALL RIGHTS RESERVED.
* Copyright (c) Los Alamos National Security, LLC. 2018. ALL RIGHTS RESERVED.
* See file LICENSE for terms.
*/

Expand All @@ -21,9 +22,6 @@ typedef struct uct_ugni_device {
uint32_t cpu_id; /**< CPU attached directly
to the device */
cpu_set_t cpu_mask; /**< CPU mask */
#if ENABLE_MT
ucs_spinlock_t lock; /**< Device lock */
#endif
/* TBD - reference counter */
} uct_ugni_device_t;

Expand All @@ -34,6 +32,10 @@ typedef struct uct_ugni_cdm {
ucs_thread_mode_t thread_mode;
uint32_t address;
uint16_t domain_id;

#if ENABLE_MT
ucs_spinlock_t lock; /**< Device lock */
#endif
} uct_ugni_cdm_t;

/**
Expand Down
8 changes: 4 additions & 4 deletions src/uct/ugni/rdma/ugni_rdma_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ static inline ucs_status_t uct_ugni_post_rdma(uct_ugni_rdma_iface_t *iface,
ucs_mpool_put(rdma);
return UCS_ERR_NO_RESOURCE;
}
uct_ugni_device_lock(&iface->super.cdm);
uct_ugni_cdm_lock(&iface->super.cdm);
ugni_rc = GNI_PostRdma(ep->ep, &rdma->desc);
uct_ugni_device_unlock(&iface->super.cdm);
uct_ugni_cdm_unlock(&iface->super.cdm);
if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
ucs_mpool_put(rdma);
if(GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
Expand Down Expand Up @@ -127,9 +127,9 @@ static inline ssize_t uct_ugni_post_fma(uct_ugni_rdma_iface_t *iface,
ucs_mpool_put(fma);
return UCS_ERR_NO_RESOURCE;
}
uct_ugni_device_lock(&iface->super.cdm);
uct_ugni_cdm_lock(&iface->super.cdm);
ugni_rc = GNI_PostFma(ep->ep, &fma->desc);
uct_ugni_device_unlock(&iface->super.cdm);
uct_ugni_cdm_unlock(&iface->super.cdm);
if (ucs_unlikely(GNI_RC_SUCCESS != ugni_rc)) {
ucs_mpool_put(fma);
if(GNI_RC_ERROR_RESOURCE == ugni_rc || GNI_RC_ERROR_NOMEM == ugni_rc) {
Expand Down
8 changes: 4 additions & 4 deletions src/uct/ugni/rdma/ugni_rdma_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,21 +114,21 @@ unsigned uct_ugni_progress(void *arg)
unsigned count = 0;

while (1) {
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
ugni_rc = GNI_CqGetEvent(iface->local_cq, &event_data);
if (GNI_RC_NOT_DONE == ugni_rc) {
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
break;
}
if ((GNI_RC_SUCCESS != ugni_rc && !event_data) || GNI_CQ_OVERRUN(event_data)) {
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
ucs_error("GNI_CqGetEvent falied. Error status %s %d ",
gni_err_str[ugni_rc], ugni_rc);
return count;
}

ugni_rc = GNI_GetCompleted(iface->local_cq, event_data, &event_post_desc_ptr);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc && GNI_RC_TRANSACTION_ERROR != ugni_rc) {
ucs_error("GNI_GetCompleted falied. Error status %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
20 changes: 10 additions & 10 deletions src/uct/ugni/smsg/ugni_smsg_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ static ucs_status_t uct_ugni_smsg_mbox_reg(uct_ugni_smsg_iface_t *iface, uct_ugn
return UCS_ERR_INVALID_PARAM;
}

uct_ugni_device_lock(&iface->super.cdm);
uct_ugni_cdm_lock(&iface->super.cdm);
ugni_rc = GNI_MemRegister(uct_ugni_iface_nic_handle(&iface->super), (uint64_t)address,
iface->bytes_per_mbox, iface->remote_cq,
GNI_MEM_READWRITE,
-1, &(mbox->gni_mem));
uct_ugni_device_unlock(&iface->super.cdm);
uct_ugni_cdm_unlock(&iface->super.cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_MemRegister failed (addr %p, size %zu), Error status: %s %d",
address, iface->bytes_per_mbox, gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -72,9 +72,9 @@ static ucs_status_t uct_ugni_smsg_mbox_reg(uct_ugni_smsg_iface_t *iface, uct_ugn
static ucs_status_t uct_ugni_smsg_mbox_dereg(uct_ugni_smsg_iface_t *iface, uct_ugni_smsg_mbox_t *mbox){
gni_return_t ugni_rc;

uct_ugni_device_lock(&iface->super.cdm);
uct_ugni_cdm_lock(&iface->super.cdm);
ugni_rc = GNI_MemDeregister(uct_ugni_iface_nic_handle(&iface->super), &mbox->gni_mem);
uct_ugni_device_unlock(&iface->super.cdm);
uct_ugni_cdm_unlock(&iface->super.cdm);

if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_MemDeregister failed Error status: %s %d",
Expand Down Expand Up @@ -162,9 +162,9 @@ ucs_status_t uct_ugni_smsg_ep_connect_to_ep(uct_ep_h tl_ep,
ucs_error("Could not connect ep in smsg");
return rc;
}
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
gni_rc = GNI_SmsgInit(ep->super.ep, local_attr, &remote_attr);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);

if(GNI_RC_SUCCESS != gni_rc){
ucs_error("Failed to initalize smsg. %s [%i]", gni_err_str[gni_rc], gni_rc);
Expand All @@ -176,9 +176,9 @@ ucs_status_t uct_ugni_smsg_ep_connect_to_ep(uct_ep_h tl_ep,
}

ep_hash = (uint32_t)iface_addr->ep_hash;
uct_ugni_device_lock(&iface->cdm);
uct_ugni_cdm_lock(&iface->cdm);
gni_rc = GNI_EpSetEventData(ep->super.ep, iface->cdm.domain_id, ep_hash);
uct_ugni_device_unlock(&iface->cdm);
uct_ugni_cdm_unlock(&iface->cdm);

if(GNI_RC_SUCCESS != gni_rc){
ucs_error("Could not set GNI_EpSetEventData!");
Expand All @@ -199,10 +199,10 @@ uct_ugni_smsg_ep_am_common_send(uct_ugni_smsg_ep_t *ep, uct_ugni_smsg_iface_t *i

desc->msg_id = iface->smsg_id++;
desc->flush_group = ep->super.flush_group;
uct_ugni_device_lock(&iface->super.cdm);
uct_ugni_cdm_lock(&iface->super.cdm);
gni_rc = GNI_SmsgSendWTag(ep->super.ep, header, header_length,
payload, payload_length, desc->msg_id, am_id);
uct_ugni_device_unlock(&iface->super.cdm);
uct_ugni_cdm_unlock(&iface->super.cdm);
if(GNI_RC_SUCCESS != gni_rc){
goto exit_no_res;
}
Expand Down
Loading

0 comments on commit 8bf5730

Please sign in to comment.