Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/UGNI: Start using spinlocks to protect critical structures #1494

Merged
merged 4 commits into from
May 19, 2017
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 97 additions & 2 deletions src/uct/ugni/base/ugni_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,33 @@
#endif

#include "ugni_device.h"
#include "ugni_md.h"
#include "ugni_iface.h"
#include <uct/base/uct_md.h>
#include <ucs/sys/string.h>

/*
 * Device lock helpers.
 *
 * With ENABLE_MT the device spinlock is taken only when the command domain
 * was created with UCS_THREAD_MODE_MULTI; without ENABLE_MT every helper
 * compiles to a no-op (the init/destroy helpers evaluate to UCS_OK).
 *
 * All macro arguments are parenthesized and the statement-like helpers are
 * wrapped in do { } while (0), so they behave as a single statement inside
 * un-braced if/else bodies. The lock/unlock helpers use their own argument
 * rather than a variable that happens to be named "cdm" at the call site.
 */
#if ENABLE_MT
#define uct_ugni_check_lock_needed(_cdm) \
    (UCS_THREAD_MODE_MULTI == (_cdm)->thread_mode)
#define uct_ugni_device_init_lock(_dev)    ucs_spinlock_init(&(_dev)->lock)
#define uct_ugni_device_destroy_lock(_dev) ucs_spinlock_destroy(&(_dev)->lock)
#define uct_ugni_device_lock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_lock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#define uct_ugni_device_unlock(_cdm) \
    do { \
        if (uct_ugni_check_lock_needed(_cdm)) { \
            ucs_spin_unlock(&(_cdm)->dev->lock); \
        } \
    } while (0)
#else
#define uct_ugni_device_init_lock(x)    UCS_OK
#define uct_ugni_device_destroy_lock(x) UCS_OK
#define uct_ugni_device_lock(x)         do { } while (0)
#define uct_ugni_device_unlock(x)       do { } while (0)
#define uct_ugni_check_lock_needed(x)   0
#endif

/* Counter used by uct_ugni_create_cdm() to derive each new command domain
 * id; it is post-incremented every time a domain id is computed. */
uint16_t ugni_domain_counter = 0;

void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource)
{
Expand Down Expand Up @@ -106,21 +129,93 @@ ucs_status_t uct_ugni_device_create(int dev_id, int index, uct_ugni_device_t *de
ucs_snprintf_zero(dev_p->fname, sizeof(dev_p->fname), "%s:%d",
dev_p->type_name, dev_p->device_index);

status = uct_ugni_device_init_lock(dev_p);
if (UCS_OK != status) {
ucs_error("Couldn't initalize device lock.");
return status;
}
dev_p->attached = false;
return UCS_OK;
}

/**
 * Release the resources owned by a uGNI device descriptor.
 *
 * Currently this only destroys the device lock; a failure is logged and
 * otherwise ignored because the function has no way to report it.
 *
 * @param dev  Device whose lock is destroyed.
 */
void uct_ugni_device_destroy(uct_ugni_device_t *dev)
{
    ucs_status_t rc = uct_ugni_device_destroy_lock(dev);

    if (rc != UCS_OK) {
        ucs_error("Couldn't destroy device lock.");
    }
}

/**
 * Report the device address of a uGNI interface.
 *
 * @param tl_iface  Interface handle; must be a uct_ugni_iface_t.
 * @param addr      Output buffer, written as a uct_devaddr_ugni_t.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr)
{
    uct_ugni_iface_t *ugni_iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
    uct_ugni_device_t *device = uct_ugni_iface_device(ugni_iface);

    /* The generic address buffer carries the NIC address of the device. */
    ((uct_devaddr_ugni_t *)addr)->nic_addr = device->address;
    return UCS_OK;
}

/**
 * Create a uGNI communication domain (CDM) and attach it to a device.
 *
 * The domain id is derived from the PMI rank, the number of ranks and the
 * global ugni_domain_counter, which is consumed (incremented) on every call
 * that reaches this point, whether or not domain creation succeeds.
 * Counter update, GNI_CdmCreate and GNI_CdmAttach run between
 * uct_ugni_device_lock() and uct_ugni_device_unlock().
 *
 * @param cdm          Domain descriptor to fill in (thread mode, device,
 *                     domain id, CDM handle, NIC handle and address).
 * @param device       Device the domain is attached to.
 * @param thread_mode  Thread mode stored in the domain.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if job information is not
 *         available, UCS_ERR_NO_DEVICE if the domain could not be created
 *         or attached. On attach failure the created CDM is destroyed.
 */
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode)
{
    uct_ugni_job_info_t *job_info;
    uint16_t domain_index;
    int modes;
    gni_return_t ugni_rc;
    gni_return_t destroy_rc;
    ucs_status_t status = UCS_OK;

    job_info = uct_ugni_get_job_info();
    if (NULL == job_info) {
        return UCS_ERR_IO_ERROR;
    }

    cdm->thread_mode = thread_mode;
    cdm->dev = device;
    uct_ugni_device_lock(cdm);
    /* Remember the counter value that actually went into the id, so the log
     * messages below print the same number that was used in the formula. */
    domain_index = ugni_domain_counter++;
    cdm->domain_id = job_info->pmi_rank_id + job_info->pmi_num_of_ranks * domain_index;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              cdm->domain_id, job_info->pmi_rank_id,
              job_info->pmi_num_of_ranks, domain_index);
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ugni_rc = GNI_CdmCreate(cdm->domain_id, job_info->ptag, job_info->cookie,
                            modes, &cdm->cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        status = UCS_ERR_NO_DEVICE;
        goto out_unlock;
    }

    ugni_rc = GNI_CdmAttach(cdm->cdm_handle, device->device_id,
                            &cdm->address, &cdm->nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  cdm->domain_id, domain_index, gni_err_str[ugni_rc], ugni_rc);
        /* Roll back the domain created above; report a failed rollback
         * instead of silently dropping its return code. */
        destroy_rc = GNI_CdmDestroy(cdm->cdm_handle);
        if (GNI_RC_SUCCESS != destroy_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[destroy_rc], destroy_rc);
        }
        status = UCS_ERR_NO_DEVICE;
    }

out_unlock:
    uct_ugni_device_unlock(cdm);
    if (UCS_OK == status) {
        ucs_debug("Made ugni cdm. nic_addr = %i domain_id = %i", device->address, cdm->domain_id);
    }
    return status;
}

/**
 * Destroy the uGNI communication domain held by @a cdm.
 *
 * @param cdm  Domain descriptor whose cdm_handle is destroyed.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if GNI_CdmDestroy fails
 *         (the GNI error is logged).
 */
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm)
{
    gni_return_t rc;

    ucs_debug("MD GNI_CdmDestroy");
    rc = GNI_CdmDestroy(cdm->cdm_handle);
    if (GNI_RC_SUCCESS == rc) {
        return UCS_OK;
    }

    ucs_error("GNI_CdmDestroy error status: %s (%d)",
              gni_err_str[rc], rc);
    return UCS_ERR_IO_ERROR;
}
2 changes: 2 additions & 0 deletions src/uct/ugni/base/ugni_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ void uct_ugni_device_destroy(uct_ugni_device_t *dev);
void uct_ugni_device_get_resource(const char *tl_name, uct_ugni_device_t *dev,
uct_tl_resource_desc_t *resource);
ucs_status_t uct_ugni_iface_get_dev_address(uct_iface_t *tl_iface, uct_device_addr_t *addr);
ucs_status_t uct_ugni_create_cdm(uct_ugni_cdm_t *cdm, uct_ugni_device_t *device, ucs_thread_mode_t thread_mode);
ucs_status_t uct_ugni_destroy_cdm(uct_ugni_cdm_t *cdm);
#endif
7 changes: 3 additions & 4 deletions src/uct/ugni/base/ugni_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
const uct_devaddr_ugni_t *ugni_dev_addr = (const uct_devaddr_ugni_t *)dev_addr;
ucs_status_t rc = UCS_OK;
gni_return_t ugni_rc;
uint32_t *big_hash;

self->arb_sched = 0;
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);
Expand All @@ -196,7 +197,7 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
self->flush_group->parent = NULL;
#endif

ugni_rc = GNI_EpCreate(iface->nic_handle, iface->local_cq, &self->ep);
ugni_rc = GNI_EpCreate(uct_ugni_iface_nic_handle(iface), iface->local_cq, &self->ep);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CdmCreate failed, Error status: %s %d",
gni_err_str[ugni_rc], ugni_rc);
Expand All @@ -208,11 +209,9 @@ UCS_CLASS_INIT_FUNC(uct_ugni_ep_t, uct_iface_t *tl_iface,
}

ucs_arbiter_group_init(&self->arb_group);

uint32_t *big_hash;
big_hash = (void *)&self->ep;
self->hash_key = big_hash[0];
if (GNI_DEVICE_ARIES == iface->dev->type) {
if (uct_ugni_check_device_type(iface, GNI_DEVICE_ARIES)) {
self->hash_key &= 0x00FFFFFF;
}
ucs_debug("Adding ep hash %x to iface %p", self->hash_key, iface);
Expand Down
137 changes: 23 additions & 114 deletions src/uct/ugni/base/ugni_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
#include "ugni_iface.h"
#include <pmi.h>

static uint16_t ugni_domain_global_counter = 0;

void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk)
{
uct_ugni_base_desc_t *base = (uct_ugni_base_desc_t *) obj;
Expand Down Expand Up @@ -132,7 +130,7 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface,
uct_ugni_iface_t *iface = ucs_derived_of(tl_iface, uct_ugni_iface_t);
uct_sockaddr_ugni_t *iface_addr = (uct_sockaddr_ugni_t*)addr;

iface_addr->domain_id = iface->domain_id;
iface_addr->domain_id = iface->cdm.domain_id;
return UCS_OK;
}

Expand Down Expand Up @@ -183,7 +181,7 @@ static ucs_status_t get_ptag(uint8_t *ptag)
return UCS_OK;
}

static ucs_status_t uct_ugni_fetch_pmi()
ucs_status_t uct_ugni_fetch_pmi()
{
int spawned = 0,
rc;
Expand Down Expand Up @@ -234,108 +232,6 @@ static ucs_status_t uct_ugni_fetch_pmi()
return UCS_OK;
}

/**
 * Create a uGNI communication domain and attach it to a device.
 *
 * PMI job information is fetched first. The domain id is derived from the
 * PMI rank, the number of ranks and ugni_domain_global_counter; the counter
 * is advanced only after both GNI_CdmCreate and GNI_CdmAttach succeeded.
 *
 * @param device_index  Index into job_info.devices of the device to attach.
 * @param domain_id     Output: id of the created domain.
 * @param cdm_handle    Output: handle of the created domain.
 * @param nic_handle    Output: NIC handle returned by GNI_CdmAttach.
 * @param address       Output: address returned by GNI_CdmAttach.
 *
 * @return UCS_OK on success, the uct_ugni_fetch_pmi() status if it fails,
 *         or UCS_ERR_NO_DEVICE if the domain cannot be created or attached.
 *         When the attach fails the domain created here is destroyed again.
 */
ucs_status_t uct_ugni_init_nic(int device_index,
                               uint16_t *domain_id,
                               gni_cdm_handle_t *cdm_handle,
                               gni_nic_handle_t *nic_handle,
                               uint32_t *address)
{
    int modes;
    ucs_status_t status;
    gni_return_t ugni_rc = GNI_RC_SUCCESS;
    gni_return_t destroy_rc;

    status = uct_ugni_fetch_pmi();
    if (UCS_OK != status) {
        ucs_error("Failed to activate context, Error status: %d", status);
        return status;
    }

    *domain_id = job_info.pmi_rank_id + job_info.pmi_num_of_ranks * ugni_domain_global_counter;
    modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED |
            GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL;
    ucs_debug("Creating new command domain with id %d (%d + %d * %d)",
              *domain_id, job_info.pmi_rank_id,
              job_info.pmi_num_of_ranks, ugni_domain_global_counter);
    ugni_rc = GNI_CdmCreate(*domain_id, job_info.ptag, job_info.cookie,
                            modes, cdm_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        return UCS_ERR_NO_DEVICE;
    }

    /* Attach the domain to the device selected by the caller */
    ugni_rc = GNI_CdmAttach(*cdm_handle, job_info.devices[device_index].device_id,
                            address, nic_handle);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CdmAttach failed (domain id %d, %d), Error status: %s %d",
                  *domain_id, ugni_domain_global_counter, gni_err_str[ugni_rc], ugni_rc);
        /* Do not leak the domain that was just created. */
        destroy_rc = GNI_CdmDestroy(*cdm_handle);
        if (GNI_RC_SUCCESS != destroy_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[destroy_rc], destroy_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }

    ++ugni_domain_global_counter;
    return UCS_OK;
}

/**
 * Activate a uGNI interface: create its communication domain via
 * uct_ugni_init_nic() (device index 0) and its local completion queue.
 *
 * Calling it on an already activated interface is a no-op.
 *
 * @param iface  Interface to activate.
 *
 * @return UCS_OK on success or if already activated, the
 *         uct_ugni_init_nic() status if it fails, or UCS_ERR_NO_DEVICE if
 *         the completion queue cannot be created. In the latter case the
 *         communication domain created here is destroyed again.
 */
ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface)
{
    ucs_status_t status;
    gni_return_t ugni_rc;
    uint32_t pe_address;

    if (iface->activated) {
        return UCS_OK;
    }

    status = uct_ugni_init_nic(0, &iface->domain_id,
                               &iface->cdm_handle, &iface->nic_handle,
                               &pe_address);
    if (UCS_OK != status) {
        ucs_error("Failed to UGNI NIC, Error status: %d", status);
        return status;
    }

    ucs_debug("Made ugni interface. iface->dev->nic_addr = %i iface->domain_id = %i", iface->dev->address, iface->domain_id);

    ugni_rc = GNI_CqCreate(iface->nic_handle, UCT_UGNI_LOCAL_CQ, 0,
                           GNI_CQ_NOBLOCK,
                           NULL, NULL, &iface->local_cq);
    if (GNI_RC_SUCCESS != ugni_rc) {
        ucs_error("GNI_CqCreate failed, Error status: %s %d",
                  gni_err_str[ugni_rc], ugni_rc);
        /* The interface stays inactive, so ugni_deactivate_iface() will not
         * release the domain; destroy it here instead of leaking it. */
        ugni_rc = GNI_CdmDestroy(iface->cdm_handle);
        if (GNI_RC_SUCCESS != ugni_rc) {
            ucs_error("GNI_CdmDestroy error status: %s (%d)",
                      gni_err_str[ugni_rc], ugni_rc);
        }
        return UCS_ERR_NO_DEVICE;
    }
    iface->activated = true;

    /* iface is activated */
    return UCS_OK;
}

/**
 * Deactivate a uGNI interface: destroy its local completion queue and then
 * its communication domain.
 *
 * An interface that is not activated is left untouched. If either destroy
 * call fails, a warning is logged, UCS_ERR_IO_ERROR is returned and the
 * interface remains marked as activated.
 *
 * @param iface  Interface to deactivate.
 *
 * @return UCS_OK on success or if the interface was not active,
 *         UCS_ERR_IO_ERROR otherwise.
 */
ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface)
{
    gni_return_t rc;

    if (iface->activated) {
        rc = GNI_CqDestroy(iface->local_cq);
        if (rc != GNI_RC_SUCCESS) {
            ucs_warn("GNI_CqDestroy failed, Error status: %s %d",
                     gni_err_str[rc], rc);
            return UCS_ERR_IO_ERROR;
        }

        rc = GNI_CdmDestroy(iface->cdm_handle);
        if (rc != GNI_RC_SUCCESS) {
            ucs_warn("GNI_CdmDestroy error status: %s (%d)",
                     gni_err_str[rc], rc);
            return UCS_ERR_IO_ERROR;
        }

        iface->activated = false;
    }

    return UCS_OK;
}

static ucs_mpool_ops_t uct_ugni_flush_mpool_ops = {
.chunk_alloc = ucs_mpool_chunk_malloc,
.chunk_release = ucs_mpool_chunk_free,
Expand All @@ -350,20 +246,32 @@ UCS_CLASS_INIT_FUNC(uct_ugni_iface_t, uct_md_h md, uct_worker_h worker,
UCS_STATS_ARG(ucs_stats_node_t *stats_parent))
{
uct_ugni_device_t *dev;
gni_return_t ugni_rc;
ucs_status_t status;
uct_ugni_iface_config_t *config = ucs_derived_of(tl_config, uct_ugni_iface_config_t);
unsigned grow = (config->mpool.bufs_grow == 0) ? 128 : config->mpool.bufs_grow;

UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
dev = uct_ugni_device_by_name(params->dev_name);
if (NULL == dev) {
ucs_error("No device was found: %s", params->dev_name);
return UCS_ERR_NO_DEVICE;
}
UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, uct_ugni_iface_ops, md, worker,
params, tl_config UCS_STATS_ARG(params->stats_root)
UCS_STATS_ARG(UCT_UGNI_MD_NAME));
self->dev = dev;
self->activated = false;
status = uct_ugni_create_cdm(&self->cdm, dev, worker->thread_mode);
if (UCS_OK != status) {
ucs_error("Failed to UGNI NIC, Error status: %d", status);
return status;
}
ugni_rc = GNI_CqCreate(uct_ugni_iface_nic_handle(self), UCT_UGNI_LOCAL_CQ, 0,
GNI_CQ_NOBLOCK,
NULL, NULL, &self->local_cq);
if (GNI_RC_SUCCESS != ugni_rc) {
ucs_error("GNI_CqCreate failed, Error status: %s %d",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somebody has to clean up the CDM here, right?

gni_err_str[ugni_rc], ugni_rc);
return UCS_ERR_NO_DEVICE;
}
self->outstanding = 0;
sglib_hashed_uct_ugni_ep_t_init(self->eps);
ucs_arbiter_init(&self->arbiter);
Expand All @@ -386,9 +294,10 @@ UCS_CLASS_DEFINE_NEW_FUNC(uct_ugni_iface_t, uct_iface_t, uct_md_h, uct_worker_h,
const uct_iface_params_t*, uct_iface_ops_t *,
const uct_iface_config_t * UCS_STATS_ARG(ucs_stats_node_t *));

/*
 * Class cleanup for uct_ugni_iface_t: destroys the local completion queue,
 * then the communication domain, then the arbiter.
 */
static UCS_CLASS_CLEANUP_FUNC(uct_ugni_iface_t)
{
    GNI_CqDestroy(self->local_cq);
    uct_ugni_destroy_cdm(&self->cdm);
    ucs_arbiter_cleanup(&self->arbiter);
}

Expand Down
20 changes: 11 additions & 9 deletions src/uct/ugni/base/ugni_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,22 +23,24 @@ ucs_status_t uct_ugni_iface_get_address(uct_iface_h tl_iface, uct_iface_addr_t *
int uct_ugni_iface_is_reachable(uct_iface_h tl_iface, const uct_device_addr_t *dev_addr,
const uct_iface_addr_t *iface_addr);
void uct_ugni_progress(void *arg);
ucs_status_t ugni_activate_iface(uct_ugni_iface_t *iface);
ucs_status_t ugni_deactivate_iface(uct_ugni_iface_t *iface);
ucs_status_t uct_ugni_init_nic(int device_index,
uint16_t *domain_id,
gni_cdm_handle_t *cdm_handle,
gni_nic_handle_t *nic_handle,
uint32_t *address);
ucs_status_t uct_ugni_fetch_pmi();
void uct_ugni_base_desc_init(ucs_mpool_t *mp, void *obj, void *chunk);
void uct_ugni_base_desc_key_init(uct_iface_h iface, void *obj, uct_mem_h memh);
ucs_status_t uct_ugni_query_tl_resources(uct_md_h md, const char *tl_name,
uct_tl_resource_desc_t **resource_p,
unsigned *num_resources_p);

/* Return the device that the interface's communication domain points to. */
static inline uct_ugni_device_t *uct_ugni_iface_device(uct_ugni_iface_t *iface)
{
    return iface->cdm.dev;
}
/* Return the NIC handle stored in the interface's communication domain. */
static inline gni_nic_handle_t uct_ugni_iface_nic_handle(uct_ugni_iface_t *iface)
{
    gni_nic_handle_t handle = iface->cdm.nic_handle;

    return handle;
}
/* Return non-zero if the interface's device is of the given NIC type. */
static inline int uct_ugni_check_device_type(uct_ugni_iface_t *iface, gni_nic_device_t type)
{
    return uct_ugni_iface_device(iface)->type == type;
}

#endif
Loading