Skip to content

Commit

Permalink
Merge pull request #1546 from MattBBaker/topic/ugni-api-spinlocks
Browse files Browse the repository at this point in the history
UCT/UGNI: Spinlocks around all of the GNI_* functions
  • Loading branch information
shamisp authored May 30, 2017
2 parents 06eac29 + 4482606 commit c7b4edd
Show file tree
Hide file tree
Showing 15 changed files with 200 additions and 168 deletions.
26 changes: 26 additions & 0 deletions src/uct/ugni/base/ugni_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#ifndef UCT_UGNI_DEF_H
#define UCT_UGNI_DEF_H

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <ucs/async/async.h>

#define UCT_UGNI_MD_NAME "ugni"
Expand Down Expand Up @@ -38,4 +42,26 @@ do {\
UCS_ASYNC_UNBLOCK((x)->super.worker->async); \
} while(0)

/*
 * Device lock helpers.
 *
 * With ENABLE_MT set, the device spinlock is taken only for command domains
 * whose thread_mode is UCS_THREAD_MODE_MULTI. Without it, lock/unlock expand
 * to nothing, the init/destroy helpers evaluate to UCS_OK and the
 * "lock needed" check evaluates to 0.
 *
 * uct_ugni_device_lock()/uct_ugni_device_unlock() are statement macros and
 * must be followed by a semicolon. They are wrapped in do { } while(0) so
 * they stay a single statement inside an unbraced if/else.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
do { \
    if (uct_ugni_check_lock_needed(_cdm)) { \
        ucs_trace_async("Taking lock"); \
        ucs_spin_lock(&(_cdm)->dev->lock); \
    } \
} while(0)
#define uct_ugni_device_unlock(_cdm) \
do { \
    if (uct_ugni_check_lock_needed(_cdm)) { \
        ucs_trace_async("Releasing lock"); \
        ucs_spin_unlock(&(_cdm)->dev->lock); \
    } \
} while(0)
#else
#define uct_ugni_device_init_lock(x) UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_check_lock_needed(x) 0
#endif

#endif
106 changes: 58 additions & 48 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,14 @@
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/arch/atomic.h>
#include <ucs/sys/string.h>
#include <pmi.h>

/*
 * Device lock helpers.
 *
 * With ENABLE_MT set, the device spinlock is taken only for command domains
 * whose thread_mode is UCS_THREAD_MODE_MULTI. Without it, lock/unlock expand
 * to nothing and the init/destroy helpers evaluate to UCS_OK.
 *
 * The lock/unlock bodies use the macro parameter (_cdm) rather than relying
 * on a variable that happens to be named "cdm" at the call site. They are
 * wrapped in do { } while(0) so they stay a single statement inside an
 * unbraced if/else.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
do { \
    if (uct_ugni_check_lock_needed(_cdm)) { \
        ucs_spin_lock(&(_cdm)->dev->lock); \
    } \
} while(0)
#define uct_ugni_device_unlock(_cdm) \
do { \
    if (uct_ugni_check_lock_needed(_cdm)) { \
        ucs_spin_unlock(&(_cdm)->dev->lock); \
    } \
} while(0)
#else
#define uct_ugni_device_init_lock(x) UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_check_lock_needed(x) 0
#endif

uint16_t ugni_domain_counter = 0;
/**
* @brief Static information about UGNI job
*
Expand All @@ -62,6 +38,8 @@ static uct_ugni_job_info_t job_info = {
.initialized = false,
};

/* Global counter read by uct_ugni_create_cdm() when it computes the domain_id
 * of a new command domain; it is advanced there via ucs_atomic_fadd32(). */
uint32_t ugni_domain_counter = 0;

void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource)
{
Expand All @@ -80,8 +58,6 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
int i;
ucs_status_t status = UCS_OK;

pthread_mutex_lock(&uct_ugni_global_lock);

resources = ucs_calloc(job_info.num_devices, sizeof(uct_tl_resource_desc_t),
"resource desc");
if (NULL == resources) {
Expand All @@ -99,7 +75,6 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
error:
*num_resources_p = num_devices;
*resource_p = resources;
pthread_mutex_unlock(&uct_ugni_global_lock);

return status;
}
Expand Down Expand Up @@ -215,45 +190,46 @@ ucs_status_t init_device_list()
int i, num_active_devices;
int *dev_ids = NULL;
gni_return_t ugni_rc = GNI_RC_SUCCESS;
uct_ugni_job_info_t *inf = uct_ugni_get_job_info();

/* check if devices were already initialized */

if (-1 != job_info.num_devices) {
if (-1 != inf->num_devices) {
ucs_debug("The device list is already initialized");
status = UCS_OK;
goto err_zero;
}

ugni_rc = GNI_GetNumLocalDevices(&job_info.num_devices);
ugni_rc = GNI_GetNumLocalDevices(&inf->num_devices);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_GetNumLocalDevices failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
status = UCS_ERR_NO_DEVICE;
goto err_zero;
}

if (0 == job_info.num_devices) {
if (0 == inf->num_devices) {
ucs_debug("UGNI No device found");
status = UCS_OK;
goto err_zero;
}

if (job_info.num_devices >= UCT_UGNI_MAX_DEVICES) {
if (inf->num_devices >= UCT_UGNI_MAX_DEVICES) {
ucs_error("UGNI, number of discovered devices (%d) " \
"is above the maximum supported devices (%d)",
job_info.num_devices, UCT_UGNI_MAX_DEVICES);
inf->num_devices, UCT_UGNI_MAX_DEVICES);
status = UCS_ERR_UNSUPPORTED;
goto err_zero;
}

dev_ids = ucs_calloc(job_info.num_devices, sizeof(int), "ugni device ids");
dev_ids = ucs_calloc(inf->num_devices, sizeof(int), "ugni device ids");
if (NULL == dev_ids) {
ucs_error("Failed to allocate memory");
status = UCS_ERR_NO_MEMORY;
goto err_zero;
}

ugni_rc = GNI_GetLocalDeviceIds(job_info.num_devices, dev_ids);
ugni_rc = GNI_GetLocalDeviceIds(inf->num_devices, dev_ids);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_GetLocalDeviceIds failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -262,8 +238,8 @@ ucs_status_t init_device_list()
}

num_active_devices = 0;
for (i = 0; i < job_info.num_devices; i++) {
status = uct_ugni_device_create(dev_ids[i], num_active_devices, &job_info.devices[i]);
for (i = 0; i < inf->num_devices; i++) {
status = uct_ugni_device_create(dev_ids[i], num_active_devices, &inf->devices[i]);
if (status != UCS_OK) {
ucs_warn("Failed to initialize ugni device %d (%s), ignoring it",
i, ucs_status_string(status));
Expand All @@ -272,13 +248,13 @@ ucs_status_t init_device_list()
}
}

if (num_active_devices != job_info.num_devices) {
if (num_active_devices != inf->num_devices) {
ucs_warn("Error in detection devices");
status = UCS_ERR_NO_DEVICE;
goto err_dev_id;
}

ucs_debug("Initialized UGNI component with %d devices", job_info.num_devices);
ucs_debug("Initialized UGNI component with %d devices", inf->num_devices);

err_dev_id:
ucs_free(dev_ids);
Expand Down Expand Up @@ -438,7 +414,7 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
cdm->thread_mode = thread_mode;
cdm->dev = device;
uct_ugni_device_lock(cdm);
cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ugni_domain_counter++;
cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ucs_atomic_fadd32(&ugni_domain_counter,1);
ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
cdm->domain_id, job_info->pmi_rank_id,
job_info->pmi_num_of_ranks, ugni_domain_counter);
Expand All @@ -456,13 +432,11 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
&cdm->address, &cdm->nic_handle);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
cdm->domain_id, ugni_domain_counter, gni_err_str[ugni_rc], ugni_rc);
ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmDestroy error status: %s (%d)",
gni_err_str[ugni_rc], ugni_rc);
}
ucs_error("GNI_CdmAttach failed, Error status: %s\n"
"Created domain %d (%d + %d * %d)",
gni_err_str[ugni_rc], cdm->domain_id, job_info->pmi_rank_id,
job_info->pmi_num_of_ranks, ugni_domain_counter);
uct_ugni_destroy_cdm(cdm);
status = UCS_ERR_NO_DEVICE;
}

Expand All @@ -483,12 +457,48 @@ ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
gni_return_t ugni_rc;

ucs_debug("MD GNI_CdmDestroy");
ucs_trace_func("cdm=%p", cdm);
uct_ugni_device_lock(cdm);
ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
uct_ugni_device_unlock(cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmDestroy error status: %s (%d)",
gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_IO_ERROR;
}
return UCS_OK;
}

/**
 * @brief Create a completion queue on the NIC attached to a command domain.
 *
 * The uGNI call is made between uct_ugni_device_lock() and
 * uct_ugni_device_unlock() on @a cdm.
 *
 * @param [out] cq       Receives the handle of the created completion queue.
 * @param [in]  cq_size  Number of entries requested for the queue; forwarded
 *                       to GNI_CqCreate() as its entry count.
 * @param [in]  cdm      Command domain whose nic_handle the queue is created on.
 *
 * @return UCS_OK on success, UCS_ERR_NO_DEVICE if GNI_CqCreate() fails
 *         (the failure is logged).
 */
ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm)
{
    gni_return_t ugni_rc;

    uct_ugni_device_lock(cdm);
    /* Use the caller-supplied size instead of a fixed UCT_UGNI_LOCAL_CQ, so
     * the cq_size argument is not silently ignored. */
    ugni_rc = GNI_CqCreate(cdm->nic_handle, cq_size, 0,
                           GNI_CQ_NOBLOCK,
                           NULL, NULL, cq);
    uct_ugni_device_unlock(cdm);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CqCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    return UCS_OK;
}

/**
 * @brief Destroy a completion queue.
 *
 * GNI_CqDestroy() is called between uct_ugni_device_lock() and
 * uct_ugni_device_unlock() on @a cdm.
 *
 * @param [in] cq   Handle of the completion queue to destroy.
 * @param [in] cdm  Command domain passed to the device lock helpers.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if GNI_CqDestroy() fails
 *         (a warning is logged).
 */
ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm)
{
    gni_return_t rc;

    uct_ugni_device_lock(cdm);
    rc = GNI_CqDestroy(cq);
    uct_ugni_device_unlock(cdm);

    /* Success is the common case; leave early. */
    if (GNI_RC_SUCCESS == rc) {
        return UCS_OK;
    }

    ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
             gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}
3 changes: 3 additions & 0 deletions src/uct/ugni/base/ugni_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
unsigned *num_resources_p);
ucs_status_t init_device_list();
ucs_status_t uct_ugni_create_md_cdm(uct_ugni_cdm_t *cdm);
/* Create a completion queue on cdm->nic_handle and store its handle in *cq.
 * Returns UCS_OK, or UCS_ERR_NO_DEVICE if GNI_CqCreate() fails.
 * NOTE(review): cq_size is the requested queue size - confirm the definition
 * forwards it to GNI_CqCreate(). */
ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm);
/* Destroy the completion queue cq; cdm is passed to the device lock helpers
 * around the call. Returns UCS_OK, or UCS_ERR_IO_ERROR if GNI_CqDestroy()
 * fails. */
ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm);

#endif
10 changes: 8 additions & 2 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,13 @@ ucs_status_t ugni_connect_ep(uct_ugni_iface_t *iface,
uct_ugni_ep_t *ep){
gni_return_t ugni_rc;

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpBind(ep->ep, dev_addr->nic_addr, iface_addr->domain_id);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
uct_ugni_device_lock(&iface->cdm);
(void)GNI_EpDestroy(ep->ep);
uct_ugni_device_unlock(&iface->cdm);
ucs_error("GNI_EpBind failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_UNREACHABLE;
Expand Down Expand Up @@ -196,8 +200,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->flush_comp.func = NULL;
self->flush_group->parent = NULL;
#endif

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down Expand Up @@ -230,8 +235,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_ep_t)

ucs_arbiter_group_purge(&iface->arbiter, &self->arb_group,
uct_ugni_ep_abriter_purge_cb, NULL);

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpDestroy(self->ep);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_warn("GNI_EpDestroy failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
13 changes: 4 additions & 9 deletions src/uct/ugni/base/ugni_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface)
{
ucs_arbiter_cleanup(&iface->arbiter);
ucs_mpool_cleanup(&iface->flush_pool, 1);
GNI_CqDestroy(iface->local_cq);
uct_ugni_destroy_cq(iface->local_cq, &iface->cdm);
uct_ugni_destroy_cdm(&iface->cdm);
}

Expand All @@ -80,7 +80,6 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
uct_ugni_device_t *dev;
gni_return_t ugni_rc;
ucs_status_t status;
uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow;
Expand All @@ -98,12 +97,8 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
ucs_error("Failed to UGNI NIC, Error status: %d", status);
return status;
}
ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, &self->local_cq);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
status = uct_ugni_create_cq(&self->local_cq, UCT_UGNI_LOCAL_CQ, &self->cdm);
if (UCS_OK != status) {
goto clean_cdm;
}
self->outstanding = 0;
Expand All @@ -124,7 +119,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
}
return status;
clean_cq:
GNI_CqDestroy(self->local_cq);
uct_ugni_destroy_cq(self->local_cq, &self->cdm);
clean_cdm:
uct_ugni_destroy_cdm(&self->cdm);
return status;
Expand Down
Loading

0 comments on commit c7b4edd

Please sign in to comment.