Skip to content

Commit

Permalink
Merge pull request #64 from yosefe/topic/rcache-limit
Browse files Browse the repository at this point in the history
UCS/UCT/RCACHE: Add memory usage limits to registration cache
  • Loading branch information
yosefe authored Oct 22, 2020
2 parents 3cbfc93 + 5284d61 commit fc48291
Show file tree
Hide file tree
Showing 10 changed files with 300 additions and 30 deletions.
137 changes: 132 additions & 5 deletions src/ucs/memory/rcache.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ static void ucs_rcache_region_collect_callback(const ucs_pgtable_t *pgtable,
{
ucs_rcache_region_t *region = ucs_derived_of(pgt_region, ucs_rcache_region_t);
ucs_list_link_t *list = arg;
ucs_list_add_tail(list, &region->list);
ucs_list_add_tail(list, &region->tmp_list);
}

/* Lock must be held */
Expand All @@ -200,6 +200,52 @@ static void ucs_rcache_find_regions(ucs_rcache_t *rcache, ucs_pgt_addr_t from,
ucs_rcache_region_collect_callback, list);
}

/**
 * Link a region at the tail of the rcache LRU list, unless its lru_flag shows
 * that it is already on the list. The list counter and the region's lru_flag
 * are updated together with the list itself.
 *
 * The LRU spinlock must be held by the caller.
 */
static inline void
ucs_rcache_region_lru_add(ucs_rcache_t *rcache, ucs_rcache_region_t *region)
{
    if (!region->lru_flag) {
        ucs_rcache_region_trace(rcache, region, "lru add");
        ucs_list_add_tail(&rcache->lru.list, &region->lru_list);
        region->lru_flag = 1;
        ++rcache->lru.count;
    }
}

/**
 * Unlink a region from the rcache LRU list, if its lru_flag shows that it is
 * currently on the list. The list counter and the region's lru_flag are
 * updated together with the list itself.
 *
 * The LRU spinlock must be held by the caller.
 */
static inline void
ucs_rcache_region_lru_remove(ucs_rcache_t *rcache, ucs_rcache_region_t *region)
{
    if (region->lru_flag) {
        ucs_rcache_region_trace(rcache, region, "lru remove");
        ucs_list_del(&region->lru_list);
        region->lru_flag = 0;
        --rcache->lru.count;
    }
}

/**
 * Take a region off the LRU list under the LRU spinlock.
 *
 * A region which is being used must not be evicted, so it stops being an LRU
 * candidate here.
 */
static void ucs_rcache_region_lru_get(ucs_rcache_t *rcache,
                                      ucs_rcache_region_t *region)
{
    ucs_spin_lock(&rcache->lru.lock);
    /* does nothing if the region is not on the list */
    ucs_rcache_region_lru_remove(rcache, region);
    ucs_spin_unlock(&rcache->lru.lock);
}

/**
 * Place a region on the LRU list under the LRU spinlock.
 *
 * Once the user is done with a region, it becomes a candidate for LRU
 * eviction.
 */
static void ucs_rcache_region_lru_put(ucs_rcache_t *rcache,
                                      ucs_rcache_region_t *region)
{
    ucs_spin_lock(&rcache->lru.lock);
    /* does nothing if the region is already on the list */
    ucs_rcache_region_lru_add(rcache, region);
    ucs_spin_unlock(&rcache->lru.lock);
}

/* Lock must be held in write mode */
static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
ucs_rcache_region_t *region)
Expand All @@ -216,6 +262,13 @@ static void ucs_mem_region_destroy_internal(ucs_rcache_t *rcache,
}
}

ucs_spin_lock(&rcache->lru.lock);
ucs_rcache_region_lru_remove(rcache, region);
ucs_spin_unlock(&rcache->lru.lock);

--rcache->num_regions;
rcache->total_size -= region->super.end - region->super.start;

ucs_free(region);
}

Expand Down Expand Up @@ -275,7 +328,7 @@ static void ucs_rcache_invalidate_range(ucs_rcache_t *rcache, ucs_pgt_addr_t sta
ucs_trace_func("rcache=%s, start=0x%lx, end=0x%lx", rcache->name, start, end);

ucs_rcache_find_regions(rcache, start, end - 1, &region_list);
ucs_list_for_each_safe(region, tmp, &region_list, list) {
ucs_list_for_each_safe(region, tmp, &region_list, tmp_list) {
/* all regions on the list are in the page table */
ucs_rcache_region_invalidate(rcache, region, 1, 0);
UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_UNMAP_INVALIDATES, 1);
Expand Down Expand Up @@ -360,7 +413,7 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache)
ucs_list_head_init(&region_list);
ucs_pgtable_purge(&rcache->pgtable, ucs_rcache_region_collect_callback,
&region_list);
ucs_list_for_each_safe(region, tmp, &region_list, list) {
ucs_list_for_each_safe(region, tmp, &region_list, tmp_list) {
if (region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) {
region->flags &= ~UCS_RCACHE_REGION_FLAG_PGTABLE;
ucs_atomic_add32(&region->refcount, (uint32_t)-1);
Expand All @@ -372,6 +425,51 @@ static void ucs_rcache_purge(ucs_rcache_t *rcache)
}
}

/**
 * Evict regions from the head of the LRU list for as long as the list is not
 * empty and the cache exceeds params.max_regions or params.max_size.
 *
 * The rcache lock must be held in write mode by the caller. The LRU spinlock
 * is taken here, and is released around every region invalidation.
 */
static void ucs_rcache_lru_evict(ucs_rcache_t *rcache)
{
    ucs_rcache_region_t *region;
    int num_skipped = 0;
    int num_evicted = 0;

    ucs_spin_lock(&rcache->lru.lock);
    for (;;) {
        if (ucs_list_is_empty(&rcache->lru.list)) {
            /* no more eviction candidates */
            break;
        }

        if ((rcache->num_regions <= rcache->params.max_regions) &&
            (rcache->total_size <= rcache->params.max_size)) {
            /* usage is within both limits */
            break;
        }

        region = ucs_list_head(&rcache->lru.list, ucs_rcache_region_t, lru_list);
        ucs_assert(region->lru_flag);

        if ((region->flags & UCS_RCACHE_REGION_FLAG_PGTABLE) &&
            (region->refcount <= 1)) {
            /* The region is in the page table and holds only one reference,
             * so the invalidation below is expected to destroy it
             * immediately. The LRU spinlock is not held during that call.
             */
            ucs_spin_unlock(&rcache->lru.lock);

            ucs_rcache_region_trace(rcache, region, "evict");
            ucs_rcache_region_invalidate(rcache, region, 1, 1);
            ++num_evicted;

            ucs_spin_lock(&rcache->lru.lock);
        } else {
            /* The region is in use, or is not in the page table: only take
             * it off the LRU list
             */
            ucs_rcache_region_lru_remove(rcache, region);
            ++num_skipped;
        }
    }
    ucs_spin_unlock(&rcache->lru.lock);

    if (num_evicted > 0) {
        ucs_debug("evicted %d regions, skipped %d regions, usage: %lu (%lu)",
                  num_evicted, num_skipped, rcache->num_regions,
                  rcache->params.max_regions);
    }
}

static inline int ucs_rcache_region_test(ucs_rcache_region_t *region, int prot)
{
return (region->flags & UCS_RCACHE_REGION_FLAG_REGISTERED) &&
Expand All @@ -397,7 +495,7 @@ ucs_rcache_check_overlap(ucs_rcache_t *rcache, ucs_pgt_addr_t *start,

/* TODO check if any of the regions is locked */

ucs_list_for_each_safe(region, tmp, &region_list, list) {
ucs_list_for_each_safe(region, tmp, &region_list, tmp_list) {

if ((*start >= region->super.start) && (*end <= region->super.end) &&
ucs_rcache_region_test(region, *prot))
Expand Down Expand Up @@ -535,12 +633,19 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length,

region->prot = prot;
region->flags = UCS_RCACHE_REGION_FLAG_PGTABLE;
region->lru_flag = 0;
region->refcount = 1;
region->status = UCS_INPROGRESS;

++rcache->num_regions;
rcache->total_size += region->super.end - region->super.start;

region->status = status =
UCS_PROFILE_NAMED_CALL("mem_reg", rcache->params.ops->mem_reg,
rcache->params.context, rcache, arg, region,
merged ? UCS_RCACHE_MEM_REG_HIDE_ERRORS : 0);
if (status != UCS_OK) {
ucs_rcache_region_invalidate(rcache, region, 1, 1);
if (merged) {
/* failure may be due to merge, because memory of the merged
* regions has different access permission.
Expand All @@ -550,7 +655,6 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length,
*/
ucs_debug("failed to register merged region " UCS_PGT_REGION_FMT ": %s, retrying",
UCS_PGT_REGION_ARG(&region->super), ucs_status_string(status));
ucs_rcache_region_invalidate(rcache, region, 1, 1);
goto retry;
} else {
ucs_debug("failed to register region " UCS_PGT_REGION_FMT ": %s",
Expand All @@ -562,6 +666,8 @@ ucs_rcache_create_region(ucs_rcache_t *rcache, void *address, size_t length,
region->flags |= UCS_RCACHE_REGION_FLAG_REGISTERED;
region->refcount = 2; /* Page-table + user */

ucs_rcache_lru_evict(rcache);

if (ucs_global_opts.rcache_check_pfn) {
ucs_rcache_region_pfn(region) = ucs_sys_get_pfn(region->super.start);
} else {
Expand Down Expand Up @@ -607,6 +713,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,
{
ucs_rcache_region_hold(rcache, region);
ucs_rcache_region_validate_pfn(rcache, region);
ucs_rcache_region_lru_get(rcache, region);
*region_p = region;
UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_HITS_FAST, 1);
pthread_rwlock_unlock(&rcache->lock);
Expand All @@ -627,6 +734,7 @@ ucs_status_t ucs_rcache_get(ucs_rcache_t *rcache, void *address, size_t length,

/**
 * Put a region: make it a candidate for LRU eviction, hand it over to
 * ucs_rcache_region_put_internal(), and count the operation in the
 * UCS_RCACHE_PUTS statistics counter.
 */
void ucs_rcache_region_put(ucs_rcache_t *rcache, ucs_rcache_region_t *region)
{
    /* The LRU update comes first.
     * NOTE(review): presumably the region may no longer be valid after
     * ucs_rcache_region_put_internal() returns - confirm before reordering.
     */
    ucs_rcache_region_lru_put(rcache, region);

    ucs_rcache_region_put_internal(rcache, region, 1, 0);

    UCS_STATS_UPDATE_COUNTER(rcache->stats, UCS_RCACHE_PUTS, 1);
}
Expand Down Expand Up @@ -692,6 +800,11 @@ static UCS_CLASS_INIT_FUNC(ucs_rcache_t, const ucs_rcache_params_t *params,
}

ucs_queue_head_init(&self->inv_q);
self->lru.count = 0;
self->num_regions = 0;
self->total_size = 0;
ucs_list_head_init(&self->lru.list);
ucs_spinlock_init(&self->lru.lock, 0);

status = ucm_set_event_handler(params->ucm_events, params->ucm_event_priority,
ucs_rcache_unmapped_callback, self);
Expand Down Expand Up @@ -729,6 +842,20 @@ static UCS_CLASS_CLEANUP_FUNC(ucs_rcache_t)
ucs_rcache_check_inv_queue(self);
ucs_rcache_purge(self);

if (self->lru.count > 0) {
ucs_assert(!ucs_list_is_empty(&self->lru.list));
ucs_warn("%lu regions remained on lru list, first region: %p",
self->lru.count,
ucs_list_head(&self->lru.list, ucs_rcache_region_t, lru_list));
} else {
ucs_assert(ucs_list_is_empty(&self->lru.list));
}

status = ucs_spinlock_destroy(&self->lru.lock);
if (status != UCS_OK) {
ucs_warn("ucs_spinlock_destroy() failed (%d)", status);
}

ucs_mpool_cleanup(&self->inv_mp, 1);
ucs_pgtable_cleanup(&self->pgtable);
status = ucs_recursive_spinlock_destroy(&self->inv_lock);
Expand Down
8 changes: 6 additions & 2 deletions src/ucs/memory/rcache.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,17 +113,21 @@ struct ucs_rcache_params {
const ucs_rcache_ops_t *ops; /**< Memory operations functions */
void *context; /**< User-defined context that will
be passed to mem_reg/mem_dereg */
unsigned long max_regions; /**< Maximal number of regions */
size_t max_size; /**< Maximal total size of regions */
};


struct ucs_rcache_region {
ucs_pgt_region_t super; /**< Base class - page table region */
ucs_list_link_t list; /**< List element */
ucs_list_link_t lru_list; /**< LRU list element */
ucs_list_link_t tmp_list; /**< Temp list element */
volatile uint32_t refcount; /**< Reference count, including +1 if it's
in the page table */
ucs_status_t status; /**< Current status code */
uint8_t prot; /**< Protection bits */
uint16_t flags; /**< Status flags. Protected by page table lock. */
uint8_t flags; /**< Status flags. Protected by page table lock. */
uint8_t lru_flag; /**< Non-zero while the region is linked on the
rcache LRU list. Modified with the LRU
spinlock held. */
uint64_t priv; /**< Used internally */
};

Expand Down
17 changes: 16 additions & 1 deletion src/ucs/memory/rcache_int.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
#ifndef UCS_REG_CACHE_INT_H_
#define UCS_REG_CACHE_INT_H_

#include <ucs/datastruct/list.h>
#include <ucs/type/spinlock.h>


/* Names of rcache stats counters */
enum {
UCS_RCACHE_GETS, /* number of get operations */
Expand Down Expand Up @@ -42,7 +44,20 @@ struct ucs_rcache {
since we cannot use regular malloc().
The backing storage is original mmap()
which does not generate memory events */
char *name;
unsigned long num_regions;/**< Total number of managed regions */
size_t total_size; /**< Total size of registered memory */

struct {
ucs_spinlock_t lock; /**< Lock for this structure */
ucs_list_link_t list; /**< List of regions, sorted by usage:
The head of the list is the least
recently used region, and the tail
is the most recently used region. */
unsigned long count; /**< Number of regions on list */
} lru;

char *name; /**< Name for debug purposes */

UCS_STATS_NODE_DECLARE(stats)
};

Expand Down
18 changes: 18 additions & 0 deletions src/uct/base/uct_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <uct/api/uct.h>
#include <ucs/debug/log.h>
#include <ucs/debug/memtrack.h>
#include <ucs/memory/rcache.h>
#include <ucs/type/class.h>
#include <ucs/sys/module.h>
#include <ucs/sys/string.h>
Expand All @@ -39,6 +40,14 @@ ucs_config_field_t uct_md_config_rcache_table[] = {
"between "UCS_PP_MAKE_STRING(UCS_PGT_ADDR_ALIGN)"and system page size",
ucs_offsetof(uct_md_rcache_config_t, alignment), UCS_CONFIG_TYPE_UINT},

{"RCACHE_MAX_REGIONS", "inf",
"Maximal number of regions in the registration cache",
ucs_offsetof(uct_md_rcache_config_t, max_regions), UCS_CONFIG_TYPE_ULUNITS},

{"RCACHE_MAX_SIZE", "inf",
"Maximal total size of registration cache regions",
ucs_offsetof(uct_md_rcache_config_t, max_size), UCS_CONFIG_TYPE_MEMUNITS},

{NULL}
};

Expand Down Expand Up @@ -435,3 +444,12 @@ ucs_status_t uct_md_detect_memory_type(uct_md_h md, const void *addr, size_t len
{
return md->ops->detect_memory_type(md, addr, length, mem_type_p);
}

/**
 * Copy the settings which are taken from the MD rcache configuration into
 * the registration cache parameters: the region limits (max_regions and
 * max_size), the alignment, and the UCM event priority.
 *
 * Other fields of @a rcache_params are not modified.
 *
 * @param [out] rcache_params  Registration cache parameters to fill.
 * @param [in]  rcache_config  MD rcache configuration to read from.
 */
void uct_md_set_rcache_params(ucs_rcache_params_t *rcache_params,
                              const uct_md_rcache_config_t *rcache_config)
{
    /* cache usage limits */
    rcache_params->max_size           = rcache_config->max_size;
    rcache_params->max_regions        = rcache_config->max_regions;

    /* alignment and memory events priority */
    rcache_params->ucm_event_priority = rcache_config->event_prio;
    rcache_params->alignment          = rcache_config->alignment;
}
4 changes: 4 additions & 0 deletions src/uct/base/uct_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ typedef struct uct_md_rcache_config {
size_t alignment; /**< Force address alignment */
unsigned event_prio; /**< Memory events priority */
double overhead; /**< Lookup overhead estimation */
unsigned long max_regions; /**< Maximal number of rcache regions */
size_t max_size; /**< Maximal size of mapped memory */
} uct_md_rcache_config_t;


Expand Down Expand Up @@ -151,6 +153,8 @@ ucs_status_t uct_md_stub_rkey_unpack(uct_component_t *component,
const void *rkey_buffer, uct_rkey_t *rkey_p,
void **handle_p);

/* Forward declarations, so that the prototype below does not depend on the
 * order in which the headers are included.
 */
struct ucs_rcache_params;
struct uct_md_rcache_config;

/**
 * Set the registration cache parameters which are taken from the MD rcache
 * configuration: alignment, UCM event priority, max_regions and max_size.
 *
 * Declared with a full prototype (an empty parameter list is not a prototype
 * in C), so the compiler checks the arguments passed by callers.
 *
 * @param [out] rcache_params  Registration cache parameters to update.
 * @param [in]  rcache_config  MD rcache configuration to read from.
 */
void uct_md_set_rcache_params(struct ucs_rcache_params *rcache_params,
                              const struct uct_md_rcache_config *rcache_config);

extern ucs_config_field_t uct_md_config_table[];

#endif
3 changes: 1 addition & 2 deletions src/uct/cuda/gdr_copy/gdr_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,11 +385,10 @@ uct_gdr_copy_md_open(uct_component_t *component, const char *md_name,
}

if (md_config->enable_rcache != UCS_NO) {
uct_md_set_rcache_params(&rcache_params, &md_config->rcache);
rcache_params.region_struct_size = sizeof(uct_gdr_copy_rcache_region_t);
rcache_params.alignment = md_config->rcache.alignment;
rcache_params.max_alignment = UCT_GDR_COPY_MD_RCACHE_DEFAULT_ALIGN;
rcache_params.ucm_events = UCM_EVENT_MEM_TYPE_FREE;
rcache_params.ucm_event_priority = md_config->rcache.event_prio;
rcache_params.context = md;
rcache_params.ops = &uct_gdr_copy_rcache_ops;
status = ucs_rcache_create(&rcache_params, "gdr_copy", NULL, &md->rcache);
Expand Down
3 changes: 1 addition & 2 deletions src/uct/ib/base/ib_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -1138,15 +1138,14 @@ uct_ib_md_parse_reg_methods(uct_ib_md_t *md, uct_md_attr_t *md_attr,

for (i = 0; i < md_config->reg_methods.count; ++i) {
if (!strcasecmp(md_config->reg_methods.rmtd[i], "rcache")) {
uct_md_set_rcache_params(&rcache_params, &md_config->rcache);
rcache_params.region_struct_size = sizeof(ucs_rcache_region_t) +
md->ops->memh_struct_size;
rcache_params.alignment = md_config->rcache.alignment;
rcache_params.max_alignment = ucs_get_page_size();
rcache_params.ucm_events = UCM_EVENT_VM_UNMAPPED;
if (md_attr->cap.reg_mem_types & ~UCS_BIT(UCS_MEMORY_TYPE_HOST)) {
rcache_params.ucm_events |= UCM_EVENT_MEM_TYPE_FREE;
}
rcache_params.ucm_event_priority = md_config->rcache.event_prio;
rcache_params.context = md;
rcache_params.ops = &uct_ib_rcache_ops;

Expand Down
2 changes: 2 additions & 0 deletions src/uct/sm/mm/xpmem/mm_xpmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,8 @@ uct_xpmem_rmem_add(xpmem_segid_t xsegid, uct_xpmem_remote_mem_t **rmem_p)
rcache_params.ucm_event_priority = 0;
rcache_params.ops = &uct_xpmem_rcache_ops;
rcache_params.context = rmem;
rcache_params.max_regions = ULONG_MAX;
rcache_params.max_size = SIZE_MAX;

status = ucs_rcache_create(&rcache_params, "xpmem_remote_mem",
ucs_stats_get_root(), &rmem->rcache);
Expand Down
Loading

0 comments on commit fc48291

Please sign in to comment.