Skip to content

Commit

Permalink
Merge pull request #7 from miked-mellanox/topic/oshmem_add_two_new_mc…
Browse files Browse the repository at this point in the history
…a_variables

Add-two-new-mca-variables, fix opal_help failure
  • Loading branch information
rhc54 committed Oct 28, 2014
2 parents da7c460 + 38ae556 commit da1cdd7
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 16 deletions.
25 changes: 25 additions & 0 deletions oshmem/mca/sshmem/base/help-oshmem-sshmem.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- text -*-
#
# Copyright (c) 2013 Mellanox Technologies, Inc.
# All rights reserved.
# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for OpenSHMEM MCA error messages.
#
[create segment failure]
The OpenSHMEM "(%s)" plugin in the "sshmem" framework failed to
allocate a shared memory segment via the system call. This
usually means that there are not enough resources available to the memory subsystem on your server.

Your OpenSHMEM job will now abort.

Server: %s
Requested shared
memory segment size: %llu
Specific error: %s (%d)

2 changes: 1 addition & 1 deletion oshmem/mca/sshmem/mmap/sshmem_mmap_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ segment_create(map_segment_t *ds_buf,
if (MAP_FAILED == addr) {
opal_show_help("help-oshmem-sshmem.txt",
"create segment failure",
"mmap",
true,
"mmap",
orte_process_info.nodename, (unsigned long long) size,
strerror(errno), errno);
opal_show_help("help-oshmem-sshmem-mmap.txt",
Expand Down
4 changes: 4 additions & 0 deletions oshmem/mca/sshmem/sysv/sshmem_sysv.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

#include "oshmem_config.h"

#include "opal/util/sys_limits.h"

#include "oshmem/mca/sshmem/sshmem.h"

BEGIN_C_DECLS
Expand All @@ -36,6 +38,8 @@ typedef struct mca_sshmem_sysv_module_t {
} mca_sshmem_sysv_module_t;
extern mca_sshmem_sysv_module_t mca_sshmem_sysv_module;

/**
 * Return the huge page size in bytes.
 *
 * The implementation looks for the "Hugepagesize:" entry (given in kB) in
 * /proc/meminfo and caches the result after the first call.  If the entry
 * cannot be read, 2 MiB is returned.
 *
 * @return huge page size in bytes (never 0)
 */
OSHMEM_MODULE_DECLSPEC extern size_t sshmem_sysv_gethugepagesize(void);

END_C_DECLS

#endif /* MCA_SSHMEM_SYSV_EXPORT_H */
48 changes: 39 additions & 9 deletions oshmem/mca/sshmem/sysv/sshmem_sysv_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,31 +104,54 @@ sysv_runtime_query(mca_base_module_t **module,
char *addr = NULL;
struct shmid_ds tmp_buff;
int flags;
int ret;

ret = OSHMEM_SUCCESS;

*priority = 0;
*module = NULL;

/* if we are here, then let the run-time test games begin */

#if defined (SHM_HUGETLB)
mca_sshmem_sysv_component.use_hp = 1;
flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR | SHM_HUGETLB;
if (-1 == (shmid = shmget(IPC_PRIVATE, (size_t)(opal_getpagesize()), flags))) {
mca_sshmem_sysv_component.use_hp = 0;
if (mca_sshmem_sysv_component.use_hp != 0) {
flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR | SHM_HUGETLB;
if (-1 == (shmid = shmget(IPC_PRIVATE, sshmem_sysv_gethugepagesize(), flags))) {
if (mca_sshmem_sysv_component.use_hp == 1) {
mca_sshmem_sysv_component.use_hp = 0;
ret = OSHMEM_ERR_NOT_AVAILABLE;
goto out;
}
mca_sshmem_sysv_component.use_hp = 0;
}
else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) {
shmctl(shmid, IPC_RMID, NULL);
if (mca_sshmem_sysv_component.use_hp == 1) {
mca_sshmem_sysv_component.use_hp = 0;
ret = OSHMEM_ERR_NOT_AVAILABLE;
goto out;
}
mca_sshmem_sysv_component.use_hp = 0;
}
}
else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) {
shmctl(shmid, IPC_RMID, NULL );
#else
if (mca_sshmem_sysv_component.use_hp == 1) {
mca_sshmem_sysv_component.use_hp = 0;
ret = OSHMEM_ERR_NOT_AVAILABLE;
goto out;
}
mca_sshmem_sysv_component.use_hp = 0;
#endif

if (0 == mca_sshmem_sysv_component.use_hp) {
flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR;
if (-1 == (shmid = shmget(IPC_PRIVATE, (size_t)(opal_getpagesize()), flags))) {
ret = OSHMEM_ERR_NOT_AVAILABLE;
goto out;
}
else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) {
shmctl(shmid, IPC_RMID, NULL );
shmctl(shmid, IPC_RMID, NULL);
ret = OSHMEM_ERR_NOT_AVAILABLE;
goto out;
}
}
Expand All @@ -153,7 +176,7 @@ sysv_runtime_query(mca_base_module_t **module,
if ((char *)-1 != addr) {
shmdt(addr);
}
return OSHMEM_SUCCESS;
return ret;
}

/* ////////////////////////////////////////////////////////////////////////// */
Expand All @@ -171,7 +194,14 @@ sysv_register(void)
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sshmem_sysv_component.priority);

mca_sshmem_sysv_component.use_hp = 0;
mca_sshmem_sysv_component.use_hp = -1;
mca_base_component_var_register (&mca_sshmem_sysv_component.super.base_version,
"use_hp", "Huge pages usage "
"[0 - off, 1 - on, -1 - auto] (default: -1)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sshmem_sysv_component.use_hp);

return OSHMEM_SUCCESS;
}
Expand Down
40 changes: 37 additions & 3 deletions oshmem/mca/sshmem/sysv/sshmem_sysv_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ segment_create(map_segment_t *ds_buf,
*/
flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR;
#if defined (SHM_HUGETLB)
flags |= (mca_sshmem_sysv_component.use_hp ? SHM_HUGETLB : 0);
flags |= ((0 != mca_sshmem_sysv_component.use_hp) ? SHM_HUGETLB : 0);
size = ((size + sshmem_sysv_gethugepagesize() - 1) / sshmem_sysv_gethugepagesize()) * sshmem_sysv_gethugepagesize();
#endif

/* Create a new shared memory segment and save the shmid. */
Expand All @@ -206,7 +207,7 @@ segment_create(map_segment_t *ds_buf,
return OSHMEM_ERROR;
}

/* Attach to the sement */
/* Attach to the segment */
addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0);
if (addr == (void *) -1L) {
opal_show_help("help-oshmem-sshmem.txt",
Expand Down Expand Up @@ -294,7 +295,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
shmctl(ds_buf->seg_id, IPC_RMID, NULL );
}

if (mca_sshmem_sysv_component.use_hp > 0) {
if (mca_sshmem_sysv_component.use_hp != 0) {
/**
* Workaround for a kernel panic when detaching huge pages from user space simultaneously from several processes:
* don't detach here; instead let the kernel do it during process cleanup
Expand Down Expand Up @@ -336,3 +337,36 @@ segment_unlink(map_segment_t *ds_buf)
return OSHMEM_SUCCESS;
}

/*
 * Get current huge page size, in bytes.
 *
 * The size is read from the "Hugepagesize:" line of /proc/meminfo, which
 * reports the value in kB.  The result is computed on the first call and
 * kept in a function-local static, so later calls do not touch the file.
 *
 * If /proc/meminfo cannot be opened, has no "Hugepagesize:" line, or the
 * reported value is zero or too large to be expressed in bytes as a size_t,
 * the function falls back to 2 MiB.
 *
 * Returns: huge page size in bytes; never 0, so callers may safely divide
 *          by it when rounding segment sizes.
 *
 * NOTE(review): the cached static is not synchronized; confirm that the
 * first call cannot race between threads.
 */
size_t sshmem_sysv_gethugepagesize(void)
{
    static size_t huge_page_size = 0;
    char buf[256];
    unsigned long size_kb;
    FILE *f;

    /* Cache the huge page size value */
    if (huge_page_size == 0) {
        f = fopen("/proc/meminfo", "r");
        if (f != NULL) {
            while (fgets(buf, sizeof(buf), f)) {
                /* Parse as unsigned: a page size can never be negative. */
                if (sscanf(buf, "Hugepagesize: %lu kB", &size_kb) == 1) {
                    /*
                     * Reject 0 and anything whose byte count would not
                     * fit in size_t; the default below is used instead.
                     */
                    if (size_kb > 0 && size_kb <= ((size_t) -1) / 1024) {
                        huge_page_size = (size_t) size_kb * 1024;
                    }
                    break;
                }
            }
            fclose(f);
        }

        /* Nothing usable found: assume the common 2 MiB huge page. */
        if (huge_page_size == 0) {
            huge_page_size = (size_t) 2 * 1024 * 1024;
        }
    }

    return huge_page_size;
}


18 changes: 15 additions & 3 deletions oshmem/mca/sshmem/verbs/sshmem_verbs_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ verbs_runtime_query(mca_base_module_t **module,
}

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
if (!rc) {
if (!rc && (0 != mca_sshmem_verbs_component.has_shared_mr)) {
struct ibv_exp_reg_shared_mr_in in_smr;

access_flag = IBV_ACCESS_LOCAL_WRITE |
Expand All @@ -190,14 +190,17 @@ verbs_runtime_query(mca_base_module_t **module,
if (NULL == ib_mr) {
if (mca_sshmem_verbs_component.has_shared_mr == 1)
rc = OSHMEM_ERR_OUT_OF_RESOURCE;

mca_sshmem_verbs_component.has_shared_mr = 0;
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
} else {
opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
mca_sshmem_verbs_component.has_shared_mr = 1;
}
}
#else
if (!rc && mca_sshmem_verbs_component.has_shared_mr == 1) {
rc = OSHMEM_ERR_OUT_OF_RESOURCE;
}
mca_sshmem_verbs_component.has_shared_mr = 0;
#endif /* MPAGE_ENABLE */
}

Expand Down Expand Up @@ -308,6 +311,15 @@ verbs_register(void)
MCA_BASE_VAR_SYN_FLAG_DEPRECATED);
}

mca_sshmem_verbs_component.has_shared_mr = -1;
index = mca_base_component_var_register (&mca_sshmem_verbs_component.super.base_version,
"shared_mr", "Shared memory region usage "
"[0 - off, 1 - on, -1 - auto] (default: -1)", MCA_BASE_VAR_TYPE_INT,
NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
OPAL_INFO_LVL_4,
MCA_BASE_VAR_SCOPE_ALL_EQ,
&mca_sshmem_verbs_component.has_shared_mr);

return OSHMEM_SUCCESS;
}

Expand Down

0 comments on commit da1cdd7

Please sign in to comment.