diff --git a/src/tools/info/tl_info.c b/src/tools/info/tl_info.c index 18e09a86af8..ba4c6f384fd 100644 --- a/src/tools/info/tl_info.c +++ b/src/tools/info/tl_info.c @@ -472,7 +472,7 @@ static void print_md_info(uct_component_h component, printf("# memory invalidation is supported\n"); } - if (md_attr.reg_alignment != 0) { + if (md_attr.reg_alignment != 1) { printf("# alignment: %zx\n", md_attr.reg_alignment); } diff --git a/src/tools/perf/cuda/cuda_alloc.c b/src/tools/perf/cuda/cuda_alloc.c index c3e854ed7ee..ad5f6082dad 100644 --- a/src/tools/perf/cuda/cuda_alloc.c +++ b/src/tools/perf/cuda/cuda_alloc.c @@ -12,6 +12,8 @@ #include #include +#include +#include static ucs_status_t ucx_perf_cuda_init(ucx_perf_context_t *perf) @@ -69,15 +71,27 @@ uct_perf_cuda_alloc_reg_mem(const ucx_perf_context_t *perf, unsigned flags, uct_allocated_memory_t *alloc_mem) { + uct_md_attr_v2_t md_attr = {.field_mask = UCT_MD_ATTR_FIELD_REG_ALIGNMENT}; + void *reg_address; ucs_status_t status; + status = uct_md_query_v2(perf->uct.md, &md_attr); + if (status != UCS_OK) { + ucs_error("uct_md_query_v2() returned %d", status); + return status; + } + status = ucx_perf_cuda_alloc(length, mem_type, &alloc_mem->address); if (status != UCS_OK) { return status; } - status = uct_md_mem_reg(perf->uct.md, alloc_mem->address, - length, flags, &alloc_mem->memh); + /* Register memory respecting MD reg_alignment */ + reg_address = alloc_mem->address; + ucs_align_ptr_range(&reg_address, &length, md_attr.reg_alignment); + + status = uct_md_mem_reg(perf->uct.md, reg_address, length, flags, + &alloc_mem->memh); if (status != UCS_OK) { cudaFree(alloc_mem->address); ucs_error("failed to register memory"); diff --git a/src/ucp/core/ucp_mm.c b/src/ucp/core/ucp_mm.c index 3cd327de587..7a9492f8f65 100644 --- a/src/ucp/core/ucp_mm.c +++ b/src/ucp/core/ucp_mm.c @@ -522,7 +522,7 @@ ucp_memh_register_internal(ucp_context_h context, ucp_mem_h memh, reg_length = length; if (context->rcache == NULL) { - reg_align = 
ucs_max(context->tl_mds[md_index].attr.reg_alignment, 1); + reg_align = context->tl_mds[md_index].attr.reg_alignment; ucs_align_ptr_range(&reg_address, &reg_length, reg_align); } diff --git a/src/uct/base/uct_md.c b/src/uct/base/uct_md.c index 0a8e1d20af1..a9d499ab0bc 100644 --- a/src/uct/base/uct_md.c +++ b/src/uct/base/uct_md.c @@ -486,6 +486,24 @@ ucs_status_t uct_md_query_v2(uct_md_h md, uct_md_attr_v2_t *md_attr) return UCS_OK; } +void uct_md_base_md_query(uct_md_attr_v2_t *md_attr) +{ + md_attr->reg_mem_types = 0; + md_attr->reg_nonblock_mem_types = 0; + md_attr->cache_mem_types = 0; + md_attr->detect_mem_types = 0; + md_attr->alloc_mem_types = 0; + md_attr->access_mem_types = 0; + md_attr->dmabuf_mem_types = 0; + md_attr->max_alloc = 0; + md_attr->max_reg = ULONG_MAX; + md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; + md_attr->rkey_packed_size = 0; + md_attr->exported_mkey_packed_size = 0; + md_attr->reg_alignment = 1; + memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); +} + ucs_status_t uct_mem_alloc_check_params(size_t length, const uct_alloc_method_t *methods, unsigned num_methods, diff --git a/src/uct/base/uct_md.h b/src/uct/base/uct_md.h index 718ff265b50..7742d9e7b1e 100644 --- a/src/uct/base/uct_md.h +++ b/src/uct/base/uct_md.h @@ -258,4 +258,6 @@ static UCS_F_ALWAYS_INLINE ucs_log_level_t uct_md_attach_log_lvl(uint64_t flags) void uct_md_vfs_init(uct_component_h component, uct_md_h md, const char *md_name); +void uct_md_base_md_query(uct_md_attr_v2_t *md_attr); + #endif diff --git a/src/uct/cuda/cuda_copy/cuda_copy_md.c b/src/uct/cuda/cuda_copy/cuda_copy_md.c index c288e955fbc..6a82d79cd29 100644 --- a/src/uct/cuda/cuda_copy/cuda_copy_md.c +++ b/src/uct/cuda/cuda_copy/cuda_copy_md.c @@ -100,25 +100,21 @@ uct_cuda_copy_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) { uct_cuda_copy_md_t *md = ucs_derived_of(uct_md, uct_cuda_copy_md_t); - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC; - md_attr->reg_mem_types = 
UCS_BIT(UCS_MEMORY_TYPE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_CUDA) | - UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | - UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | - UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | - UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); - md_attr->dmabuf_mem_types = md->config.dmabuf_supported ? - UCS_BIT(UCS_MEMORY_TYPE_CUDA) : 0; + uct_md_base_md_query(md_attr); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA) | + UCS_BIT(UCS_MEMORY_TYPE_CUDA_MANAGED); + md_attr->dmabuf_mem_types = md->config.dmabuf_supported ? 
+ UCS_BIT(UCS_MEMORY_TYPE_CUDA) : 0; md_attr->max_alloc = SIZE_MAX; - md_attr->max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c index fba10e6d817..8ebe3f04fd0 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_md.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_md.c @@ -31,23 +31,16 @@ static ucs_config_field_t uct_cuda_ipc_md_config_table[] = { static ucs_status_t uct_cuda_ipc_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr) { - md_attr->flags = UCT_MD_FLAG_REG | - UCT_MD_FLAG_NEED_RKEY | - UCT_MD_FLAG_INVALIDATE | - UCT_MD_FLAG_INVALIDATE_RMA | - UCT_MD_FLAG_INVALIDATE_AMO; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->alloc_mem_types = 0; - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; - md_attr->rkey_packed_size = sizeof(uct_cuda_ipc_rkey_t); - md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + uct_md_base_md_query(md_attr); + md_attr->flags = UCT_MD_FLAG_REG | + UCT_MD_FLAG_NEED_RKEY | + UCT_MD_FLAG_INVALIDATE | + UCT_MD_FLAG_INVALIDATE_RMA | + UCT_MD_FLAG_INVALIDATE_AMO; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->rkey_packed_size = sizeof(uct_cuda_ipc_rkey_t); return UCS_OK; } diff --git a/src/uct/cuda/gdr_copy/gdr_copy_md.c b/src/uct/cuda/gdr_copy/gdr_copy_md.c index bee9da8992a..203062b2b9f 100644 --- a/src/uct/cuda/gdr_copy/gdr_copy_md.c +++ b/src/uct/cuda/gdr_copy/gdr_copy_md.c @@ -37,20 +37,13 @@ 
static ucs_config_field_t uct_gdr_copy_md_config_table[] = { static ucs_status_t uct_gdr_copy_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr) { - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->alloc_mem_types = 0; - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; - md_attr->rkey_packed_size = sizeof(uct_gdr_copy_key_t); - md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - md_attr->reg_alignment = GPU_PAGE_SIZE; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + uct_md_base_md_query(md_attr); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_CUDA); + md_attr->rkey_packed_size = sizeof(uct_gdr_copy_key_t); + md_attr->reg_alignment = GPU_PAGE_SIZE; return UCS_OK; } diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index a22024ac63c..888d61b2f33 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -242,13 +242,11 @@ ucs_status_t uct_ib_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) size_t component_name_length = strlen(md->super.component->name); uint64_t guid = IBV_DEV_ATTR(&md->dev, sys_image_guid); + uct_md_base_md_query(md_attr); md_attr->max_alloc = ULONG_MAX; /* TODO query device */ md_attr->max_reg = ULONG_MAX; /* TODO query device */ md_attr->flags = md->cap_flags; - md_attr->alloc_mem_types = 0; md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; md_attr->reg_mem_types = md->reg_mem_types; md_attr->reg_nonblock_mem_types = md->reg_nonblock_mem_types; 
md_attr->cache_mem_types = UCS_MASK(UCS_MEMORY_TYPE_LAST); diff --git a/src/uct/rocm/copy/rocm_copy_md.c b/src/uct/rocm/copy/rocm_copy_md.c index f33d45d068b..79cf8335d7d 100644 --- a/src/uct/rocm/copy/rocm_copy_md.c +++ b/src/uct/rocm/copy/rocm_copy_md.c @@ -49,25 +49,21 @@ uct_rocm_copy_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) { uct_rocm_copy_md_t *md = ucs_derived_of(uct_md, uct_rocm_copy_md_t); - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY | - UCT_MD_FLAG_ALLOC; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->dmabuf_mem_types = 0; + uct_md_base_md_query(md_attr); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY | + UCT_MD_FLAG_ALLOC; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); if (md->have_dmabuf) { md_attr->dmabuf_mem_types |= UCS_BIT(UCS_MEMORY_TYPE_ROCM); } - md_attr->max_alloc = SIZE_MAX; - md_attr->max_reg = ULONG_MAX; - md_attr->rkey_packed_size = sizeof(uct_rocm_copy_key_t); - md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + md_attr->max_alloc = SIZE_MAX; + md_attr->rkey_packed_size = sizeof(uct_rocm_copy_key_t); return UCS_OK; } diff --git a/src/uct/rocm/ipc/rocm_ipc_md.c b/src/uct/rocm/ipc/rocm_ipc_md.c index 8761f1cfd55..8d475557d8b 100644 --- a/src/uct/rocm/ipc/rocm_ipc_md.c 
+++ b/src/uct/rocm/ipc/rocm_ipc_md.c @@ -24,22 +24,15 @@ static ucs_config_field_t uct_rocm_ipc_md_config_table[] = { static ucs_status_t uct_rocm_ipc_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr) { - md_attr->rkey_packed_size = sizeof(uct_rocm_ipc_key_t); - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->alloc_mem_types = 0; - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; + uct_md_base_md_query(md_attr); + md_attr->rkey_packed_size = sizeof(uct_rocm_ipc_key_t); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ROCM); /* TODO: get accurate number */ - md_attr->reg_cost = ucs_linear_func_make(9e-9, 0); - - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + md_attr->reg_cost = ucs_linear_func_make(9e-9, 0); return UCS_OK; } diff --git a/src/uct/sm/mm/base/mm_md.c b/src/uct/sm/mm/base/mm_md.c index a30e8397599..fe4a01f13bd 100644 --- a/src/uct/sm/mm/base/mm_md.c +++ b/src/uct/sm/mm/base/mm_md.c @@ -69,21 +69,17 @@ ucs_status_t uct_mm_seg_new(void *address, size_t length, uct_mm_seg_t **seg_p) void uct_mm_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr, uint64_t max_alloc) { + uct_md_base_md_query(md_attr); md_attr->flags = UCT_MD_FLAG_RKEY_PTR | UCT_MD_FLAG_NEED_RKEY; md_attr->max_reg = 0; - md_attr->max_alloc = 0; md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; if 
(max_alloc > 0) { md_attr->flags |= UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_FIXED; md_attr->max_alloc = max_alloc; } - - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); } ucs_status_t uct_mm_md_open(uct_component_t *component, const char *md_name, diff --git a/src/uct/sm/scopy/cma/cma_md.c b/src/uct/sm/scopy/cma/cma_md.c index c2311c893c9..340fbac2aaf 100644 --- a/src/uct/sm/scopy/cma/cma_md.c +++ b/src/uct/sm/scopy/cma/cma_md.c @@ -195,20 +195,13 @@ ucs_status_t uct_cma_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) { uct_cma_md_t *md = ucs_derived_of(uct_md, uct_cma_md_t); - md_attr->rkey_packed_size = 0; + uct_md_base_md_query(md_attr); md_attr->flags = UCT_MD_FLAG_REG | md->extra_caps; md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->reg_nonblock_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->alloc_mem_types = 0; md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; md_attr->reg_cost = ucs_linear_func_make(9e-9, 0); - - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } diff --git a/src/uct/sm/scopy/knem/knem_md.c b/src/uct/sm/scopy/knem/knem_md.c index 73d3627f689..9dbb38b9c19 100644 --- a/src/uct/sm/scopy/knem/knem_md.c +++ b/src/uct/sm/scopy/knem/knem_md.c @@ -185,9 +185,8 @@ ucs_status_t uct_knem_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) { uct_knem_md_t *md = ucs_derived_of(uct_md, uct_knem_md_t); + uct_md_base_md_query(md_attr); md_attr->flags = UCT_MD_FLAG_NEED_RKEY; - md_attr->reg_mem_types = 0; - md_attr->reg_nonblock_mem_types = 0; if (uct_knem_md_check_mem_reg(uct_md)) { md_attr->flags |= UCT_MD_FLAG_REG; md_attr->reg_mem_types |= UCS_BIT(UCS_MEMORY_TYPE_HOST); @@ -195,14 +194,8 @@ ucs_status_t uct_knem_md_query(uct_md_h uct_md, uct_md_attr_v2_t *md_attr) md_attr->rkey_packed_size = 
sizeof(uct_knem_key_t); md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->alloc_mem_types = 0; md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; md_attr->reg_cost = md->reg_cost; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); return UCS_OK; } diff --git a/src/uct/sm/self/self.c b/src/uct/sm/self/self.c index 9819b5a8d3e..82e2c32b149 100644 --- a/src/uct/sm/self/self.c +++ b/src/uct/sm/self/self.c @@ -409,21 +409,14 @@ static uct_iface_ops_t uct_self_iface_ops = { static ucs_status_t uct_self_md_query(uct_md_h md, uct_md_attr_v2_t *attr) { + uct_md_base_md_query(attr); /* Dummy memory registration provided. No real memory handling exists */ attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->reg_nonblock_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - attr->alloc_mem_types = 0; - attr->detect_mem_types = 0; attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - attr->dmabuf_mem_types = 0; - attr->max_alloc = 0; - attr->max_reg = ULONG_MAX; - attr->rkey_packed_size = 0; - attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&attr->local_cpus, 0xff, sizeof(attr->local_cpus)); return UCS_OK; } diff --git a/src/uct/tcp/tcp_md.c b/src/uct/tcp/tcp_md.c index 6d4004d28c7..a515d08525f 100644 --- a/src/uct/tcp/tcp_md.c +++ b/src/uct/tcp/tcp_md.c @@ -29,21 +29,14 @@ static ucs_config_field_t uct_tcp_md_config_table[] = { static ucs_status_t uct_tcp_md_query(uct_md_h md, uct_md_attr_v2_t *attr) { + uct_md_base_md_query(attr); /* Dummy memory registration provided. 
No real memory handling exists */ attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_RKEY; /* TODO ignore rkey in rma/amo ops */ - attr->max_alloc = 0; attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->reg_nonblock_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - attr->alloc_mem_types = 0; attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - attr->detect_mem_types = 0; - attr->dmabuf_mem_types = 0; - attr->max_reg = ULONG_MAX; - attr->rkey_packed_size = 0; - attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&attr->local_cpus, 0xff, sizeof(attr->local_cpus)); return UCS_OK; } diff --git a/src/uct/ugni/base/ugni_md.c b/src/uct/ugni/base/ugni_md.c index 25597d3baa4..175549c597d 100644 --- a/src/uct/ugni/base/ugni_md.c +++ b/src/uct/ugni/base/ugni_md.c @@ -36,20 +36,14 @@ uct_ugni_query_md_resources(uct_component_h component, static ucs_status_t uct_ugni_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr) { - md_attr->rkey_packed_size = 3 * sizeof(uint64_t); - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_MEMH | - UCT_MD_FLAG_NEED_RKEY; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->alloc_mem_types = 0; - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); - md_attr->detect_mem_types = 0; - md_attr->dmabuf_mem_types = 0; - md_attr->max_alloc = 0; - md_attr->max_reg = ULONG_MAX; - md_attr->reg_cost = ucs_linear_func_make(1000.0e-9, 0.007e-9); - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + uct_md_base_md_query(md_attr); + md_attr->rkey_packed_size = 3 * sizeof(uint64_t); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_NEED_MEMH | + UCT_MD_FLAG_NEED_RKEY; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST); + md_attr->reg_cost = 
ucs_linear_func_make(1000.0e-9, 0.007e-9); return UCS_OK; } diff --git a/src/uct/ze/copy/ze_copy_md.c b/src/uct/ze/copy/ze_copy_md.c index e97f5c4ad03..f4599d727e1 100644 --- a/src/uct/ze/copy/ze_copy_md.c +++ b/src/uct/ze/copy/ze_copy_md.c @@ -36,31 +36,27 @@ static ucs_config_field_t uct_ze_copy_md_config_table[] = { static ucs_status_t uct_ze_copy_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr) { - md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC; - md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); - md_attr->reg_nonblock_mem_types = 0; - md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); - md_attr->alloc_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); - md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); - md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); - md_attr->dmabuf_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | - UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE); - md_attr->max_alloc = SIZE_MAX; - md_attr->max_reg = ULONG_MAX; - md_attr->rkey_packed_size = 0; - md_attr->reg_cost = UCS_LINEAR_FUNC_ZERO; - memset(&md_attr->local_cpus, 0xff, sizeof(md_attr->local_cpus)); + uct_md_base_md_query(md_attr); + md_attr->flags = UCT_MD_FLAG_REG | UCT_MD_FLAG_ALLOC; + md_attr->reg_mem_types = UCS_BIT(UCS_MEMORY_TYPE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); + md_attr->cache_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); + md_attr->alloc_mem_types = 
UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); + md_attr->access_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); + md_attr->detect_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_MANAGED); + md_attr->dmabuf_mem_types = UCS_BIT(UCS_MEMORY_TYPE_ZE_HOST) | + UCS_BIT(UCS_MEMORY_TYPE_ZE_DEVICE); + md_attr->max_alloc = SIZE_MAX; return UCS_OK; } diff --git a/test/gtest/uct/test_md.cc b/test/gtest/uct/test_md.cc index 8833da28e88..cfcddc9a05b 100644 --- a/test/gtest/uct/test_md.cc +++ b/test/gtest/uct/test_md.cc @@ -66,6 +66,9 @@ void* test_md::alloc_thread(void *arg) ucs_status_t test_md::reg_mem(unsigned flags, void *address, size_t length, uct_mem_h *memh_p) { + /* Register memory respecting MD reg_alignment */ + ucs_align_ptr_range(&address, &length, md_attr().reg_alignment); + uct_md_mem_reg_params_t reg_params; reg_params.field_mask = UCT_MD_MEM_REG_FIELD_FLAGS; @@ -273,12 +276,6 @@ void test_md::dereg_cb(uct_completion_t *comp) md_comp->self->m_comp_count++; } -bool test_md::is_gpu_ipc() const -{ - return (GetParam().md_name == "cuda_ipc") || - (GetParam().md_name == "rocm_ipc"); -} - UCS_TEST_SKIP_COND_P(test_md, rkey_ptr, !check_caps(UCT_MD_FLAG_ALLOC | UCT_MD_FLAG_RKEY_PTR)) { @@ -531,6 +528,8 @@ UCS_TEST_P(test_md, mem_type_detect_mds) { } UCS_TEST_P(test_md, mem_query) { + ASSERT_GT(md_attr().reg_alignment, 0); + for (auto mem_type : mem_buffer::supported_mem_types()) { if (!(md_attr().detect_mem_types & UCS_BIT(mem_type))) { continue; @@ -611,7 +610,7 @@ UCS_TEST_SKIP_COND_P(test_md, reg, alloc_memory(&address, size, &fill_buffer[0], mem_type); - status = uct_md_mem_reg(md(), address, size, UCT_MD_MEM_ACCESS_ALL, &memh); + status = reg_mem(UCT_MD_MEM_ACCESS_ALL, address, size, &memh); ASSERT_UCS_OK(status); ASSERT_TRUE(memh != UCT_MEM_HANDLE_NULL); @@ 
-649,8 +648,7 @@ UCS_TEST_SKIP_COND_P(test_md, reg_perf, unsigned n = 0; while (n < count) { uct_mem_h memh; - status = uct_md_mem_reg(md(), ptr, size, UCT_MD_MEM_ACCESS_ALL, - &memh); + status = reg_mem(UCT_MD_MEM_ACCESS_ALL, ptr, size, &memh); ASSERT_UCS_OK(status); ASSERT_TRUE(memh != UCT_MEM_HANDLE_NULL); @@ -838,8 +836,9 @@ UCS_TEST_P(test_md, sockaddr_accessibility) { /* This test registers region N times and later deregs it N/2 times and * invalidates N/2 times - mix multiple dereg and invalidate calls. * Guarantee that all packed keys are unique. */ -UCS_TEST_SKIP_COND_P(test_md, invalidate, !check_caps(UCT_MD_FLAG_INVALIDATE) || - is_gpu_ipc()) +UCS_TEST_SKIP_COND_P(test_md, invalidate, + !check_caps(UCT_MD_FLAG_INVALIDATE) || + !check_reg_mem_type(UCS_MEMORY_TYPE_HOST)) { static const size_t size = 1 * UCS_MBYTE; const int limit = 64; @@ -991,8 +990,8 @@ UCS_TEST_P(test_md, rkey_compare_params_check) } // SM case is covered by XPMEM which has registration capability -UCS_TEST_SKIP_COND_P(test_md, rkey_compare, !check_caps(UCT_MD_FLAG_REG) || - is_gpu_ipc()) +UCS_TEST_SKIP_COND_P(test_md, rkey_compare, + !check_reg_mem_type(UCS_MEMORY_TYPE_HOST)) { size_t size = 4096; void *address = NULL; diff --git a/test/gtest/uct/test_md.h b/test/gtest/uct/test_md.h index 19a554a2385..88308b41ce6 100644 --- a/test/gtest/uct/test_md.h +++ b/test/gtest/uct/test_md.h @@ -75,8 +75,6 @@ class test_md : public testing::TestWithParam, static void dereg_cb(uct_completion_t *comp); - bool is_gpu_ipc() const; - const unsigned md_flags_remote_rma = UCT_MD_MEM_ACCESS_REMOTE_PUT | UCT_MD_MEM_ACCESS_REMOTE_GET; diff --git a/test/gtest/uct/uct_test.cc b/test/gtest/uct/uct_test.cc index 116fe8b6a35..df29d75c9ae 100644 --- a/test/gtest/uct/uct_test.cc +++ b/test/gtest/uct/uct_test.cc @@ -8,6 +8,7 @@ #include "uct/api/uct_def.h" #include "uct/api/v2/uct_v2.h" +#include #include #include #include @@ -967,8 +968,17 @@ void uct_test::entity::mem_type_reg(uct_allocated_memory_t *mem, 
unsigned mem_flags) const { if (md_attr().reg_mem_types & UCS_BIT(mem->mem_type)) { - ucs_status_t status = uct_md_mem_reg(m_md, mem->address, mem->length, - mem_flags, &mem->memh); + /* Register memory respecting MD reg_alignment */ + void *reg_address = mem->address; + size_t reg_length = mem->length; + ucs_align_ptr_range(&reg_address, &reg_length, md_attr().reg_alignment); + + uct_md_mem_reg_params_t reg_params; + reg_params.field_mask = UCT_MD_MEM_REG_FIELD_FLAGS; + reg_params.flags = mem_flags; + + ucs_status_t status = uct_md_mem_reg_v2(m_md, reg_address, reg_length, + &reg_params, &mem->memh); ASSERT_UCS_OK(status); mem->md = m_md; }