Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/UGNI: Spinlocks around all of the GNI_* functions #1546

Merged
merged 1 commit into from
May 30, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/uct/ugni/base/ugni_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#ifndef UCT_UGNI_DEF_H
#define UCT_UGNI_DEF_H

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include <ucs/async/async.h>

#define UCT_UGNI_MD_NAME "ugni"
Expand Down Expand Up @@ -38,4 +42,26 @@ do {\
UCS_ASYNC_UNBLOCK((x)->super.worker->async); \
} while(0)

/*
 * Device-lock helpers used around GNI_* calls.
 *
 * When ENABLE_MT is non-zero, lock/unlock take or release the spinlock stored
 * in (_cdm)->dev->lock, but only if the command domain's thread_mode equals
 * UCS_THREAD_MODE_MULTI. Otherwise lock/unlock are no-ops and the
 * init/destroy helpers evaluate to UCS_OK.
 *
 * The lock/unlock macros are wrapped in do { } while (0) so that each one
 * expands to exactly one statement: a bare "if (...) { ... }" expansion would
 * silently capture a following "else" at the call site. The predicate is
 * fully parenthesized so it can be combined with other operators safely.
 * Call sites must terminate lock/unlock with a semicolon.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_trace_async("Taking lock"); \
            ucs_spin_lock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#define uct_ugni_device_unlock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_trace_async("Releasing lock"); \
            ucs_spin_unlock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#else
#define uct_ugni_device_init_lock(x)    UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)         do { } while (0)
#define uct_ugni_device_unlock(x)       do { } while (0)
#define uct_ugni_check_lock_needed(x)   0
#endif

#endif
106 changes: 58 additions & 48 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,38 +4,14 @@
* See file LICENSE for terms.
*/

#ifdef HAVE_CONFIG_H
# include "config.h"
#endif

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/arch/atomic.h>
#include <ucs/sys/string.h>
#include <pmi.h>

#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) UCS_THREAD_MODE_MULTI == _cdm->thread_mode
#define uct_ugni_device_init_lock(_dev) ucs_spinlock_init(&_dev->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&_dev->lock)
#define uct_ugni_device_lock(_cdm) \
if (uct_ugni_check_lock_needed(_cdm)) { \
ucs_spin_lock(&cdm->dev->lock); \
}
#define uct_ugni_device_unlock(_cdm) \
if (uct_ugni_check_lock_needed(_cdm)) { \
ucs_spin_unlock(&cdm->dev->lock); \
}
#else
#define uct_ugni_device_init_lock(x) UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_check_lock_needed(x) 0
#endif

uint16_t ugni_domain_counter = 0;
/**
* @brief Static information about UGNI job
*
Expand All @@ -62,6 +38,8 @@ static uct_ugni_job_info_t job_info = {
.initialized = false,
};

uint32_t ugni_domain_counter = 0;

void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource)
{
Expand All @@ -80,8 +58,6 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
int i;
ucs_status_t status = UCS_OK;

pthread_mutex_lock(&uct_ugni_global_lock);

resources = ucs_calloc(job_info.num_devices, sizeof(uct_tl_resource_desc_t),
"resource desc");
if (NULL == resources) {
Expand All @@ -99,7 +75,6 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
error:
*num_resources_p = num_devices;
*resource_p = resources;
pthread_mutex_unlock(&uct_ugni_global_lock);

return status;
}
Expand Down Expand Up @@ -215,45 +190,46 @@ ucs_status_t init_device_list()
int i, num_active_devices;
int *dev_ids = NULL;
gni_return_t ugni_rc = GNI_RC_SUCCESS;
uct_ugni_job_info_t *inf = uct_ugni_get_job_info();

/* check if devices were already initialized */

if (-1 != job_info.num_devices) {
if (-1 != inf->num_devices) {
ucs_debug("The device list is already initialized");
status = UCS_OK;
goto err_zero;
}

ugni_rc = GNI_GetNumLocalDevices(&job_info.num_devices);
ugni_rc = GNI_GetNumLocalDevices(&inf->num_devices);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_GetNumLocalDevices failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
status = UCS_ERR_NO_DEVICE;
goto err_zero;
}

if (0 == job_info.num_devices) {
if (0 == inf->num_devices) {
ucs_debug("UGNI No device found");
status = UCS_OK;
goto err_zero;
}

if (job_info.num_devices >= UCT_UGNI_MAX_DEVICES) {
if (inf->num_devices >= UCT_UGNI_MAX_DEVICES) {
ucs_error("UGNI, number of discovered devices (%d) " \
"is above the maximum supported devices (%d)",
job_info.num_devices, UCT_UGNI_MAX_DEVICES);
inf->num_devices, UCT_UGNI_MAX_DEVICES);
status = UCS_ERR_UNSUPPORTED;
goto err_zero;
}

dev_ids = ucs_calloc(job_info.num_devices, sizeof(int), "ugni device ids");
dev_ids = ucs_calloc(inf->num_devices, sizeof(int), "ugni device ids");
if (NULL == dev_ids) {
ucs_error("Failed to allocate memory");
status = UCS_ERR_NO_MEMORY;
goto err_zero;
}

ugni_rc = GNI_GetLocalDeviceIds(job_info.num_devices, dev_ids);
ugni_rc = GNI_GetLocalDeviceIds(inf->num_devices, dev_ids);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_GetLocalDeviceIds failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -262,8 +238,8 @@ ucs_status_t init_device_list()
}

num_active_devices = 0;
for (i = 0; i < job_info.num_devices; i++) {
status = uct_ugni_device_create(dev_ids[i], num_active_devices, &job_info.devices[i]);
for (i = 0; i < inf->num_devices; i++) {
status = uct_ugni_device_create(dev_ids[i], num_active_devices, &inf->devices[i]);
if (status != UCS_OK) {
ucs_warn("Failed to initialize ugni device %d (%s), ignoring it",
i, ucs_status_string(status));
Expand All @@ -272,13 +248,13 @@ ucs_status_t init_device_list()
}
}

if (num_active_devices != job_info.num_devices) {
if (num_active_devices != inf->num_devices) {
ucs_warn("Error in detection devices");
status = UCS_ERR_NO_DEVICE;
goto err_dev_id;
}

ucs_debug("Initialized UGNI component with %d devices", job_info.num_devices);
ucs_debug("Initialized UGNI component with %d devices", inf->num_devices);

err_dev_id:
ucs_free(dev_ids);
Expand Down Expand Up @@ -438,7 +414,7 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
cdm->thread_mode = thread_mode;
cdm->dev = device;
uct_ugni_device_lock(cdm);
cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ugni_domain_counter++;
cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * ucs_atomic_fadd32(&ugni_domain_counter,1);
ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
cdm->domain_id, job_info->pmi_rank_id,
job_info->pmi_num_of_ranks, ugni_domain_counter);
Expand All @@ -456,13 +432,11 @@ ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device,
ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
&cdm->address, &cdm->nic_handle);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
cdm->domain_id, ugni_domain_counter, gni_err_str[ugni_rc], ugni_rc);
ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmDestroy error status: %s (%d)",
gni_err_str[ugni_rc], ugni_rc);
}
ucs_error("GNI_CdmAttach failed, Error status: %s\n"
"Created domain %d (%d + %d * %d)",
gni_err_str[ugni_rc], cdm->domain_id, job_info->pmi_rank_id,
job_info->pmi_num_of_ranks, ugni_domain_counter);
uct_ugni_destroy_cdm(cdm);
status = UCS_ERR_NO_DEVICE;
}

Expand All @@ -483,12 +457,48 @@ ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
gni_return_t ugni_rc;

ucs_debug("MD GNI_CdmDestroy");
ucs_trace_func("cdm=%p", cdm);
uct_ugni_device_lock(cdm);
ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
uct_ugni_device_unlock(cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmDestroy error status: %s (%d)",
gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_IO_ERROR;
}
return UCS_OK;
}

/**
 * Create a GNI completion queue on the NIC attached to a command domain.
 *
 * GNI_CqCreate is invoked between uct_ugni_device_lock() and
 * uct_ugni_device_unlock() on the given command domain.
 *
 * @param [out] cq       Receives the handle of the created completion queue.
 * @param [in]  cq_size  Entry count requested for the queue. It is forwarded
 *                       to GNI_CqCreate instead of being ignored in favour of
 *                       a hard-coded UCT_UGNI_LOCAL_CQ.
 * @param [in]  cdm      Command domain whose nic_handle the queue is created on.
 *
 * @return UCS_OK on success, UCS_ERR_NO_DEVICE if GNI_CqCreate fails
 *         (the GNI error is logged).
 */
ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm)
{
    gni_return_t ugni_rc;

    uct_ugni_device_lock(cdm);
    ugni_rc = GNI_CqCreate(cdm->nic_handle, cq_size, 0,
                           GNI_CQ_NOBLOCK,
                           NULL, NULL, cq);
    uct_ugni_device_unlock(cdm);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CqCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    return UCS_OK;
}

/**
 * Destroy a GNI completion queue.
 *
 * GNI_CqDestroy is invoked between uct_ugni_device_lock() and
 * uct_ugni_device_unlock() on the given command domain.
 *
 * @param [in] cq   Completion queue handle to destroy.
 * @param [in] cdm  Command domain passed to the device lock helpers.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if GNI_CqDestroy fails
 *         (a warning with the GNI error is logged).
 */
ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm)
{
    gni_return_t rc;

    uct_ugni_device_lock(cdm);
    rc = GNI_CqDestroy(cq);
    uct_ugni_device_unlock(cdm);

    if (GNI_RC_SUCCESS == rc) {
        return UCS_OK;
    }

    ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
             gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}
3 changes: 3 additions & 0 deletions src/uct/ugni/base/ugni_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
unsigned *num_resources_p);
ucs_status_t init_device_list();
ucs_status_t uct_ugni_create_md_cdm(uct_ugni_cdm_t *cdm);
ucs_status_t uct_ugni_create_cq(gni_cq_handle_t *cq, unsigned cq_size, uct_ugni_cdm_t *cdm);
ucs_status_t uct_ugni_destroy_cq(gni_cq_handle_t cq, uct_ugni_cdm_t *cdm);

#endif
10 changes: 8 additions & 2 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,13 @@ ucs_status_t ugni_connect_ep(uct_ugni_iface_t *iface,
uct_ugni_ep_t *ep){
gni_return_t ugni_rc;

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpBind(ep->ep, dev_addr->nic_addr, iface_addr->domain_id);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
uct_ugni_device_lock(&iface->cdm);
(void)GNI_EpDestroy(ep->ep);
uct_ugni_device_unlock(&iface->cdm);
ucs_error("GNI_EpBind failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_UNREACHABLE;
Expand Down Expand Up @@ -196,8 +200,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->flush_comp.func = NULL;
self->flush_group->parent = NULL;
#endif

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down Expand Up @@ -230,8 +235,9 @@ static UCS_CLASS_CLEANUP_FUNC(uct_ugni_ep_t)

ucs_arbiter_group_purge(&iface->arbiter, &self->arb_group,
uct_ugni_ep_abriter_purge_cb, NULL);

uct_ugni_device_lock(&iface->cdm);
ugni_rc = GNI_EpDestroy(self->ep);
uct_ugni_device_unlock(&iface->cdm);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_warn("GNI_EpDestroy failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand Down
13 changes: 4 additions & 9 deletions src/uct/ugni/base/ugni_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface)
{
ucs_arbiter_cleanup(&iface->arbiter);
ucs_mpool_cleanup(&iface->flush_pool, 1);
GNI_CqDestroy(iface->local_cq);
uct_ugni_destroy_cq(iface->local_cq, &iface->cdm);
uct_ugni_destroy_cdm(&iface->cdm);
}

Expand All @@ -80,7 +80,6 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
uct_ugni_device_t *dev;
gni_return_t ugni_rc;
ucs_status_t status;
uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow;
Expand All @@ -98,12 +97,8 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
ucs_error("Failed to UGNI NIC, Error status: %d", status);
return status;
}
ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, &self->local_cq);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
status = uct_ugni_create_cq(&self->local_cq, UCT_UGNI_LOCAL_CQ, &self->cdm);
if (UCS_OK != status) {
goto clean_cdm;
}
self->outstanding = 0;
Expand All @@ -124,7 +119,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
}
return status;
clean_cq:
GNI_CqDestroy(self->local_cq);
uct_ugni_destroy_cq(self->local_cq, &self->cdm);
clean_cdm:
uct_ugni_destroy_cdm(&self->cdm);
return status;
Expand Down
Loading