From 38ae556ccfb34307fcbb625235b89411b56ac531 Mon Sep 17 00:00:00 2001 From: Igor Ivanov Date: Wed, 17 Sep 2014 18:42:47 +0300 Subject: [PATCH] OSHMEM: Add two new mca variables Added the use_hp variable in sshmem/sysv to control huge page usage; Added the shared_mr variable in sshmem/verbs to control shared memory region usage; Both parameters default to auto (-1). Fixed help messages. fixed by Igor, reviewed by @miked-mellanox and @alex-mikheev (cherry picked from commit d82dc7f67f47b326888ddb0dc846536f99b40d28) --- oshmem/mca/sshmem/base/help-oshmem-sshmem.txt | 25 ++++++++++ oshmem/mca/sshmem/mmap/sshmem_mmap_module.c | 2 +- oshmem/mca/sshmem/sysv/sshmem_sysv.h | 4 ++ .../mca/sshmem/sysv/sshmem_sysv_component.c | 48 +++++++++++++++---- oshmem/mca/sshmem/sysv/sshmem_sysv_module.c | 40 ++++++++++++++-- .../mca/sshmem/verbs/sshmem_verbs_component.c | 18 +++++-- 6 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 oshmem/mca/sshmem/base/help-oshmem-sshmem.txt diff --git a/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt b/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt new file mode 100644 index 00000000000..165dafda915 --- /dev/null +++ b/oshmem/mca/sshmem/base/help-oshmem-sshmem.txt @@ -0,0 +1,25 @@ +# -*- text -*- +# +# Copyright (c) 2013 Mellanox Technologies, Inc. +# All rights reserved. +# Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is the US/English help file for Open SHMEM MCA error messages. +# +[create segment failure] +The OpenSHMEM "(%s)" plugin in the "sshmem" framework failed to +allocate a shared memory segment via the system call. This +usually means that there are not enough resources available to memory subsystem on your server. + +Your OpenSHMEM job will now abort. 
+ + Server: %s + Requested shared + memory segment size: %llu + Specific error: %s (%d) + diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index 29dd004b565..6281cc90d4f 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -198,8 +198,8 @@ segment_create(map_segment_t *ds_buf, if (MAP_FAILED == addr) { opal_show_help("help-oshmem-sshmem.txt", "create segment failure", - "mmap", true, + "mmap", orte_process_info.nodename, (unsigned long long) size, strerror(errno), errno); opal_show_help("help-oshmem-sshmem-mmap.txt", diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv.h b/oshmem/mca/sshmem/sysv/sshmem_sysv.h index f9d02fda392..7242b5a608f 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv.h +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv.h @@ -13,6 +13,8 @@ #include "oshmem_config.h" +#include "opal/util/sys_limits.h" + #include "oshmem/mca/sshmem/sshmem.h" BEGIN_C_DECLS @@ -36,6 +38,8 @@ typedef struct mca_sshmem_sysv_module_t { } mca_sshmem_sysv_module_t; extern mca_sshmem_sysv_module_t mca_sshmem_sysv_module; +OSHMEM_MODULE_DECLSPEC extern size_t sshmem_sysv_gethugepagesize(void); + END_C_DECLS #endif /* MCA_SSHMEM_SYSV_EXPORT_H */ diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_component.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_component.c index 83ac8084991..e12c016fff7 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_component.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_component.c @@ -104,6 +104,9 @@ sysv_runtime_query(mca_base_module_t **module, char *addr = NULL; struct shmid_ds tmp_buff; int flags; + int ret; + + ret = OSHMEM_SUCCESS; *priority = 0; *module = NULL; @@ -111,24 +114,44 @@ sysv_runtime_query(mca_base_module_t **module, /* if we are here, then let the run-time test games begin */ #if defined (SHM_HUGETLB) - mca_sshmem_sysv_component.use_hp = 1; - flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR | SHM_HUGETLB; - if (-1 == (shmid = shmget(IPC_PRIVATE, 
(size_t)(opal_getpagesize()), flags))) { - mca_sshmem_sysv_component.use_hp = 0; + if (mca_sshmem_sysv_component.use_hp != 0) { + flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR | SHM_HUGETLB; + if (-1 == (shmid = shmget(IPC_PRIVATE, sshmem_sysv_gethugepagesize(), flags))) { + if (mca_sshmem_sysv_component.use_hp == 1) { + mca_sshmem_sysv_component.use_hp = 0; + ret = OSHMEM_ERR_NOT_AVAILABLE; + goto out; + } + mca_sshmem_sysv_component.use_hp = 0; + } + else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) { + shmctl(shmid, IPC_RMID, NULL); + if (mca_sshmem_sysv_component.use_hp == 1) { + mca_sshmem_sysv_component.use_hp = 0; + ret = OSHMEM_ERR_NOT_AVAILABLE; + goto out; + } + mca_sshmem_sysv_component.use_hp = 0; + } } - else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) { - shmctl(shmid, IPC_RMID, NULL ); +#else + if (mca_sshmem_sysv_component.use_hp == 1) { mca_sshmem_sysv_component.use_hp = 0; + ret = OSHMEM_ERR_NOT_AVAILABLE; + goto out; } + mca_sshmem_sysv_component.use_hp = 0; #endif if (0 == mca_sshmem_sysv_component.use_hp) { flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR; if (-1 == (shmid = shmget(IPC_PRIVATE, (size_t)(opal_getpagesize()), flags))) { + ret = OSHMEM_ERR_NOT_AVAILABLE; goto out; } else if ((void *)-1 == (addr = shmat(shmid, NULL, 0))) { - shmctl(shmid, IPC_RMID, NULL ); + shmctl(shmid, IPC_RMID, NULL); + ret = OSHMEM_ERR_NOT_AVAILABLE; goto out; } } @@ -153,7 +176,7 @@ sysv_runtime_query(mca_base_module_t **module, if ((char *)-1 != addr) { shmdt(addr); } - return OSHMEM_SUCCESS; + return ret; } /* ////////////////////////////////////////////////////////////////////////// */ @@ -171,7 +194,14 @@ sysv_register(void) MCA_BASE_VAR_SCOPE_ALL_EQ, &mca_sshmem_sysv_component.priority); - mca_sshmem_sysv_component.use_hp = 0; + mca_sshmem_sysv_component.use_hp = -1; + mca_base_component_var_register (&mca_sshmem_sysv_component.super.base_version, + "use_hp", "Huge pages usage " + "[0 - off, 1 - on, -1 - auto] (default: -1)", 
MCA_BASE_VAR_TYPE_INT, + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_ALL_EQ, + &mca_sshmem_sysv_component.use_hp); return OSHMEM_SUCCESS; } diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index 674ade450eb..73f41c21162 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -188,7 +188,8 @@ segment_create(map_segment_t *ds_buf, */ flags = IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR; #if defined (SHM_HUGETLB) - flags |= (mca_sshmem_sysv_component.use_hp ? SHM_HUGETLB : 0); + flags |= ((0 != mca_sshmem_sysv_component.use_hp) ? SHM_HUGETLB : 0); + size = ((size + sshmem_sysv_gethugepagesize() - 1) / sshmem_sysv_gethugepagesize()) * sshmem_sysv_gethugepagesize(); #endif /* Create a new shared memory segment and save the shmid. */ @@ -206,7 +207,7 @@ segment_create(map_segment_t *ds_buf, return OSHMEM_ERROR; } - /* Attach to the sement */ + /* Attach to the segment */ addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0); if (addr == (void *) -1L) { opal_show_help("help-oshmem-sshmem.txt", @@ -294,7 +295,7 @@ segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey) shmctl(ds_buf->seg_id, IPC_RMID, NULL ); } - if (mca_sshmem_sysv_component.use_hp > 0) { + if (mca_sshmem_sysv_component.use_hp != 0) { /** * Workaround kernel panic when detaching huge pages from user space simultanously from several processes * dont detach here instead let kernel do it during process cleanup @@ -336,3 +337,36 @@ segment_unlink(map_segment_t *ds_buf) return OSHMEM_SUCCESS; } +/* + * Get current huge page size + * + */ +size_t sshmem_sysv_gethugepagesize(void) +{ + static size_t huge_page_size = 0; + char buf[256]; + int size_kb; + FILE *f; + + /* Cache the huge page size value */ + if (huge_page_size == 0) { + f = fopen("/proc/meminfo", "r"); + if (f != NULL) { + while (fgets(buf, sizeof(buf), f)) { + if (sscanf(buf, "Hugepagesize: %d kB", 
&size_kb) == 1) { + huge_page_size = size_kb * 1024L; + break; + } + } + fclose(f); + } + + if (huge_page_size == 0) { + huge_page_size = 2 * 1024L *1024L; + } + } + + return huge_page_size; +} + + diff --git a/oshmem/mca/sshmem/verbs/sshmem_verbs_component.c b/oshmem/mca/sshmem/verbs/sshmem_verbs_component.c index fbb6dc64539..2e44f5d03b1 100644 --- a/oshmem/mca/sshmem/verbs/sshmem_verbs_component.c +++ b/oshmem/mca/sshmem/verbs/sshmem_verbs_component.c @@ -176,7 +176,7 @@ verbs_runtime_query(mca_base_module_t **module, } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) - if (!rc) { + if (!rc && (0 != mca_sshmem_verbs_component.has_shared_mr)) { struct ibv_exp_reg_shared_mr_in in_smr; access_flag = IBV_ACCESS_LOCAL_WRITE | @@ -190,14 +190,17 @@ verbs_runtime_query(mca_base_module_t **module, if (NULL == ib_mr) { if (mca_sshmem_verbs_component.has_shared_mr == 1) rc = OSHMEM_ERR_OUT_OF_RESOURCE; - mca_sshmem_verbs_component.has_shared_mr = 0; - rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); mca_sshmem_verbs_component.has_shared_mr = 1; } } +#else + if (!rc && mca_sshmem_verbs_component.has_shared_mr == 1) { + rc = OSHMEM_ERR_OUT_OF_RESOURCE; + } + mca_sshmem_verbs_component.has_shared_mr = 0; #endif /* MPAGE_ENABLE */ } @@ -308,6 +311,15 @@ verbs_register(void) MCA_BASE_VAR_SYN_FLAG_DEPRECATED); } + mca_sshmem_verbs_component.has_shared_mr = -1; + index = mca_base_component_var_register (&mca_sshmem_verbs_component.super.base_version, + "shared_mr", "Shared memory region usage " + "[0 - off, 1 - on, -1 - auto] (default: -1)", MCA_BASE_VAR_TYPE_INT, + NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE, + OPAL_INFO_LVL_4, + MCA_BASE_VAR_SCOPE_ALL_EQ, + &mca_sshmem_verbs_component.has_shared_mr); + return OSHMEM_SUCCESS; }