Skip to content

Commit

Permalink
Merge pull request #1494 from MattBBaker/topic/uct-ugni-thread-safe-cdm
Browse files Browse the repository at this point in the history
UCT/UGNI: Start using spinlocks to protect critical structures
  • Loading branch information
shamisp authored May 19, 2017
2 parents d67863f + aa2b76c commit 6b29f42
Show file tree
Hide file tree
Showing 14 changed files with 219 additions and 293 deletions.
103 changes: 101 additions & 2 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,33 @@
#endif

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/sys/string.h>

/*
 * Device lock helpers.
 *
 * With ENABLE_MT the device spinlock is taken only when the command domain
 * was created with UCS_THREAD_MODE_MULTI; otherwise every helper compiles
 * down to a no-op (or to UCS_OK for the init/destroy helpers, whose result
 * is checked by the callers).
 *
 * The lock/unlock helpers are wrapped in do { } while (0) so that they behave
 * as a single statement and can be safely followed by ';' inside if/else
 * bodies. Every macro argument is parenthesized and referenced through the
 * macro parameter, so the caller's variable may have any name.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_lock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#define uct_ugni_device_unlock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_unlock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#else
#define uct_ugni_device_init_lock(x)    UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)
#define uct_ugni_device_unlock(x)
#define uct_ugni_check_lock_needed(x)   0
#endif

/* Counter of command domains created so far; uct_ugni_create_cdm() reads and
 * post-increments it to derive each new domain id. */
uint16_t ugni_domain_counter = 0;

void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource)
{
Expand Down Expand Up @@ -106,21 +129,97 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de
ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d",
dev_p->type_name, dev_p->device_index);

status = uct_ugni_device_init_lock(dev_p);
if (UCS_OK != status) {
ucs_error("Couldn't initalize device lock.");
return status;
}
dev_p->attached = false;
return UCS_OK;
}

/**
 * Release the resources owned by a uGNI device descriptor.
 *
 * The only resource handled here is the device lock; a failure to destroy it
 * is reported through the error log and otherwise ignored.
 *
 * @param dev  Device whose lock is destroyed.
 */
void uct_ugni_device_destroy(uct_ugni_device_t *dev)
{
    ucs_status_t lock_status = uct_ugni_device_destroy_lock(dev);

    if (lock_status != UCS_OK) {
        ucs_error("Couldn't destroy device lock.");
    }
}

/**
 * Report the device address of a uGNI interface.
 *
 * @param tl_iface  Interface handle; must be a uct_ugni_iface_t.
 * @param addr      Output buffer, written as a uct_devaddr_ugni_t whose
 *                  nic_addr receives the address of the interface's device.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr)
{
    uct_ugni_iface_t *ugni_iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
    uct_ugni_device_t *device    = uct_ugni_iface_device(ugni_iface);
    uct_devaddr_ugni_t *out_addr = (uct_devaddr_ugni_t *)addr;

    out_addr->nic_addr = device->address;
    return UCS_OK;
}

/**
 * Create a uGNI communication domain (CDM) and attach it to a device.
 *
 * The domain id is derived from the PMI rank, the number of ranks and the
 * global domain counter, which is advanced on every call. If attaching to the
 * device fails, the freshly created domain is destroyed again.
 *
 * @param cdm          CDM object to fill in (thread mode, device, domain id,
 *                     CDM handle, NIC handle and address).
 * @param device       Device the domain is attached to.
 * @param thread_mode  Thread mode stored in the CDM; it decides whether the
 *                     device lock is taken when multi-threading is enabled.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR when the job information is
 *         unavailable, UCS_ERR_NO_DEVICE when GNI_CdmCreate or GNI_CdmAttach
 *         fails.
 */
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode)
{
    uct_ugni_job_info_t *job_info;
    uint16_t domain_index;
    int modes;
    gni_return_t ugni_rc;
    ucs_status_t status = UCS_OK;

    job_info = uct_ugni_get_job_info();
    if (NULL == job_info) {
        return UCS_ERR_IO_ERROR;
    }

    /* thread_mode and dev must be set before locking: the lock helpers read
     * both fields from the cdm. */
    cdm->thread_mode = thread_mode;
    cdm->dev = device;
    uct_ugni_device_lock(cdm);
    /* NOTE(review): the counter is a process-wide global while the lock is
     * per device - confirm this is sufficient when several devices exist. */

    /* Snapshot the counter so the logged formula matches the id computed. */
    domain_index = ugni_domain_counter++;
    cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * domain_index;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              cdm->domain_id, job_info->pmi_rank_id,
              job_info->pmi_num_of_ranks, domain_index);
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ugni_rc = GNI_CdmCreate(cdm->domain_id, job_info->ptag, job_info->cookie,
                            modes, &cdm->cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        status = UCS_ERR_NO_DEVICE;
        goto out_unlock;
    }

    ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
                            &cdm->address, &cdm->nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  cdm->domain_id, domain_index, gni_err_str[ugni_rc], ugni_rc);
        /* Do not leak the domain that was created above. */
        ugni_rc = GNI_CdmDestroy(cdm->cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        status = UCS_ERR_NO_DEVICE;
    }

out_unlock:
    uct_ugni_device_unlock(cdm);
    if (UCS_OK == status) {
        ucs_debug("Made ugni cdm. nic_addr = %i domain_id = %i", device->address, cdm->domain_id);
    }
    return status;
}

/**
 * Destroy the uGNI communication domain held by @a cdm.
 *
 * @param cdm  CDM object whose cdm_handle is passed to GNI_CdmDestroy.
 *
 * @return UCS_OK when the domain was destroyed, UCS_ERR_IO_ERROR (after
 *         logging the uGNI error) otherwise.
 */
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
    gni_return_t destroy_rc;

    ucs_debug("MD GNI_CdmDestroy");
    destroy_rc = GNI_CdmDestroy(cdm->cdm_handle);
    if (destroy_rc == GNI_RC_SUCCESS) {
        return UCS_OK;
    }

    ucs_error("GNI_CdmDestroy error status: %s (%d)",
              gni_err_str[destroy_rc], destroy_rc);
    return UCS_ERR_IO_ERROR;
}
2 changes: 2 additions & 0 deletions src/uct/ugni/base/ugni_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ void uct_ugni_device_destroy(uct_ugni_device_t *dev);
void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource);
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr);
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode);
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm);
#endif
7 changes: 3 additions & 4 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)dev_addr;
ucs_status_t rc = UCS_OK;
gni_return_t ugni_rc;
uint32_t *big_hash;

self->arb_sched = 0;
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);
Expand All @@ -196,7 +197,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->parent = NULL;
#endif

ugni_rc = GNI_EpCreate(iface->nic_handle, iface->local_cq, &self->ep);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -208,11 +209,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
}

ucs_arbiter_group_init(&self->arb_group);

uint32_t *big_hash;
big_hash = (void *)&self->ep;
self->hash_key = big_hash[0];
if (GNI_DEVICE_ARIES == iface->dev->type) {
if (uct_ugni_check_device_type(iface, GNI_DEVICE_ARIES)) {
self->hash_key &= 0x00FFFFFF;
}
ucs_debug("Adding ep hash %x to iface %p", self->hash_key, iface);
Expand Down
151 changes: 36 additions & 115 deletions src/uct/ugni/base/ugni_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
#include "ugni_iface.h"
#include <pmi.h>

static uint16_t ugni_domain_global_counter = 0;

void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk)
{
uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj;
Expand Down Expand Up @@ -132,7 +130,7 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface,
uct_ugni_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
uct_sockaddr_ugni_t *iface_addr = (uct_sockaddr_ugni_t*)addr;

iface_addr->domain_id = iface->domain_id;
iface_addr->domain_id = iface->cdm.domain_id;
return UCS_OK;
}

Expand Down Expand Up @@ -183,7 +181,7 @@ static ucs_status_t get_ptag(uint8_t *ptag)
return UCS_OK;
}

static ucs_status_t uct_ugni_fetch_pmi()
ucs_status_t uct_ugni_fetch_pmi()
{
int spawned = 0,
rc;
Expand Down Expand Up @@ -234,136 +232,54 @@ static ucs_status_t uct_ugni_fetch_pmi()
return UCS_OK;
}

/**
 * Create a uGNI communication domain and attach it to one of the job's devices.
 *
 * @param device_index  Index into job_info.devices of the device to attach to.
 * @param domain_id     Output: id chosen for the new domain (PMI rank plus
 *                      number of ranks times the global domain counter).
 * @param cdm_handle    Output: handle of the created communication domain.
 * @param nic_handle    Output: NIC handle returned by GNI_CdmAttach.
 * @param address       Output: address returned by GNI_CdmAttach.
 *
 * @return UCS_OK on success; the status of uct_ugni_fetch_pmi() if it fails;
 *         UCS_ERR_NO_DEVICE if GNI_CdmCreate or GNI_CdmAttach fails. The
 *         global domain counter is advanced only on success.
 */
ucs_status_t uct_ugni_init_nic(int device_index,
                               uint16_t *domain_id,
                               gni_cdm_handle_t *cdm_handle,
                               gni_nic_handle_t *nic_handle,
                               uint32_t *address)
{
    int modes;
    ucs_status_t status;
    gni_return_t ugni_rc = GNI_RC_SUCCESS;

    status = uct_ugni_fetch_pmi();
    if (UCS_OK != status) {
        ucs_error("Failed to activate context, Error status: %d", status);
        return status;
    }

    *domain_id = job_info.pmi_rank_id + job_info.pmi_num_of_ranks * ugni_domain_global_counter;
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              *domain_id, job_info.pmi_rank_id,
              job_info.pmi_num_of_ranks, ugni_domain_global_counter);
    ugni_rc = GNI_CdmCreate(*domain_id, job_info.ptag, job_info.cookie,
                            modes, cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    /* Attach the domain to the device selected by device_index */
    ugni_rc = GNI_CdmAttach(*cdm_handle, job_info.devices[device_index].device_id,
                            address, nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  *domain_id, ugni_domain_global_counter, gni_err_str[ugni_rc], ugni_rc);
        /* Do not leak the domain that was created above. */
        ugni_rc = GNI_CdmDestroy(*cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }

    ++ugni_domain_global_counter;
    return UCS_OK;
}

/**
 * Activate a uGNI interface: create its communication domain and local
 * completion queue. Calling it on an already activated interface is a no-op.
 *
 * @param iface  Interface to activate; on success its domain id, CDM/NIC
 *               handles and local_cq are set and iface->activated is true.
 *
 * @return UCS_OK on success (or when already activated), the status of
 *         uct_ugni_init_nic() if it fails, UCS_ERR_NO_DEVICE if the
 *         completion queue cannot be created.
 */
ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface)
{
    ucs_status_t status;
    gni_return_t ugni_rc;
    uint32_t pe_address;

    if (iface->activated) {
        return UCS_OK;
    }

    status = uct_ugni_init_nic(0, &iface->domain_id,
                               &iface->cdm_handle, &iface->nic_handle,
                               &pe_address);
    if (UCS_OK != status) {
        ucs_error("Failed to UGNI NIC, Error status: %d", status);
        return status;
    }

    ucs_debug("Made ugni interface. iface->dev->nic_addr = %i iface->domain_id = %i", iface->dev->address, iface->domain_id);

    ugni_rc = GNI_CqCreate(iface->nic_handle, UCT_UGNI_LOCAL_CQ, 0,
                           GNI_CQ_NOBLOCK,
                           NULL, NULL, &iface->local_cq);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CqCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        /* The interface stays inactive, so ugni_deactivate_iface() will never
         * release the domain created above - destroy it here. */
        ugni_rc = GNI_CdmDestroy(iface->cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_warn("GNI_CdmDestroy error status: %s (%d)",
                     gni_err_str[ugni_rc], ugni_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }
    iface->activated = true;

    /* iface is activated */
    return UCS_OK;
}

/**
 * Deactivate a uGNI interface by destroying its local completion queue and
 * its communication domain. An interface that is not activated is left
 * untouched.
 *
 * @param iface  Interface to deactivate.
 *
 * @return UCS_OK on success or when nothing had to be done; UCS_ERR_IO_ERROR
 *         as soon as one of the destroy calls fails, in which case the
 *         interface remains marked as activated.
 */
ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface)
{
    gni_return_t rc;

    if (iface->activated) {
        rc = GNI_CqDestroy(iface->local_cq);
        if (rc != GNI_RC_SUCCESS) {
            ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
                     gni_err_str[rc], rc);
            return UCS_ERR_IO_ERROR;
        }

        rc = GNI_CdmDestroy(iface->cdm_handle);
        if (rc != GNI_RC_SUCCESS) {
            ucs_warn("GNI_CdmDestroy error status: %s (%d)",
                     gni_err_str[rc], rc);
            return UCS_ERR_IO_ERROR;
        }

        iface->activated = false;
    }

    return UCS_OK;
}

/* Memory pool callbacks: chunks are obtained and released through
 * ucs_mpool_chunk_malloc / ucs_mpool_chunk_free, and no per-object init or
 * cleanup hook is installed.
 * NOTE(review): presumably used for the interface's flush_pool - confirm
 * against the pool initialization. */
static ucs_mpool_ops_t uct_ugni_flush_mpool_ops = {
.chunk_alloc = ucs_mpool_chunk_malloc,
.chunk_release = ucs_mpool_chunk_free,
.obj_init = NULL,
.obj_cleanup = NULL
};

/**
 * Release the resources held by a base uGNI interface: the arbiter, the flush
 * memory pool, the local completion queue and the communication domain, in
 * that order.
 *
 * Failures are logged and cleanup continues, since this runs on teardown and
 * has no way to report an error to the caller.
 *
 * @param iface  Interface to clean up.
 */
void uct_ugni_cleanup_base_iface(uct_ugni_iface_t *iface)
{
    gni_return_t ugni_rc;

    ucs_arbiter_cleanup(&iface->arbiter);
    ucs_mpool_cleanup(&iface->flush_pool, 1);

    ugni_rc = GNI_CqDestroy(iface->local_cq);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
                 gni_err_str[ugni_rc], ugni_rc);
    }

    /* uct_ugni_destroy_cdm() logs its own failure; nothing more to do here. */
    (void)uct_ugni_destroy_cdm(&iface->cdm);
}

UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
const uct_iface_params_t *params,
uct_iface_ops_t *uct_ugni_iface_ops,
const uct_iface_config_t *tl_config
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
uct_ugni_device_t *dev;
gni_return_t ugni_rc;
ucs_status_t status;
uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow;

UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
dev = uct_ugni_device_by_name(params->dev_name);
if (NULL == dev) {
ucs_error("No device was found: %s", params->dev_name);
return UCS_ERR_NO_DEVICE;
}
UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
self->dev = dev;
self->activated = false;
status = uct_ugni_create_cdm(&self->cdm, dev, worker->thread_mode);
if (UCS_OK != status) {
ucs_error("Failed to UGNI NIC, Error status: %d", status);
return status;
}
ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, &self->local_cq);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
goto clean_cdm;
}
self->outstanding = 0;
sglib_hashed_uct_ugni_ep_t_init(self->eps);
ucs_arbiter_init(&self->arbiter);
Expand All @@ -378,18 +294,23 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
"UGNI-DESC-ONLY");
if (UCS_OK != status) {
ucs_error("Could not init iface");
goto clean_cq;
}
return status;
clean_cq:
GNI_CqDestroy(self->local_cq);
clean_cdm:
uct_ugni_destroy_cdm(&self->cdm);
return status;
}

UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h,
const uct_iface_params_t*, uct_iface_ops_t *,
const uct_iface_config_t * UCS_STATS_ARG(ucs_stats_node_t *));

/* Class destructor of uct_ugni_iface_t: releases the arbiter, flush pool,
 * local completion queue and communication domain through the shared base
 * cleanup helper. */
static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t)
{
    uct_ugni_cleanup_base_iface(self);
}

UCS_CLASS_DEFINE(uct_ugni_iface_t, uct_base_iface_t);
Loading

0 comments on commit 6b29f42

Please sign in to comment.