Skip to content

Commit

Permalink
Merge pull request #8235 from edgargabriel/pr/rocm-v1.13-fixes
Browse files Browse the repository at this point in the history
UCT/ROCM: bring ROCm fixes over to v1.13 branch
  • Loading branch information
yosefe authored May 18, 2022
2 parents 4c9e32a + d13b60d commit 5879c44
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 17 deletions.
19 changes: 13 additions & 6 deletions src/uct/rocm/base/rocm_base.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
#include <pthread.h>


#define MAX_AGENTS 16
#define MAX_AGENTS 63
static struct agents {
hsa_agent_t agents[MAX_AGENTS];
int num;
hsa_agent_t gpu_agents[MAX_AGENTS];
hsa_agent_t agents[MAX_AGENTS];
int num_gpu;
hsa_agent_t gpu_agents[MAX_AGENTS];
} uct_rocm_base_agents;

int uct_rocm_base_get_gpu_agents(hsa_agent_t **agents)
Expand Down Expand Up @@ -176,17 +176,24 @@ ucs_status_t uct_rocm_base_detect_memory_type(uct_md_h md, const void *addr,
{
hsa_status_t status;
hsa_amd_pointer_info_t info;
hsa_device_type_t dev_type;

*mem_type_p = UCS_MEMORY_TYPE_HOST;
if (addr == NULL) {
return UCS_OK;
}

info.size = sizeof(hsa_amd_pointer_info_t);
status = hsa_amd_pointer_info((void*)addr, &info, NULL, NULL, NULL);
status = hsa_amd_pointer_info((void*)addr, &info, NULL, NULL, NULL);
if ((status == HSA_STATUS_SUCCESS) &&
(info.type != HSA_EXT_POINTER_TYPE_UNKNOWN)) {
*mem_type_p = UCS_MEMORY_TYPE_ROCM;
(info.type == HSA_EXT_POINTER_TYPE_HSA)) {
status = hsa_agent_get_info(info.agentOwner, HSA_AGENT_INFO_DEVICE,
&dev_type);
if ((status == HSA_STATUS_SUCCESS) &&
(dev_type == HSA_DEVICE_TYPE_GPU)) {
*mem_type_p = UCS_MEMORY_TYPE_ROCM;
return UCS_OK;
}
}

return UCS_OK;
Expand Down
51 changes: 40 additions & 11 deletions src/uct/rocm/ipc/rocm_ipc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,19 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep,
{
uct_rocm_ipc_ep_t *ep = ucs_derived_of(tl_ep, uct_rocm_ipc_ep_t);
hsa_status_t status;
hsa_agent_t local_agent;
hsa_agent_t local_agent, remote_agent;
hsa_agent_t dst_agent, src_agent;
size_t size = uct_iov_get_length(iov);
ucs_status_t ret = UCS_OK;
void *base_addr, *local_addr = iov->buffer;
uct_rocm_ipc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_rocm_ipc_iface_t);
void *remote_base_addr, *remote_copy_addr;
void *dst_addr, *src_addr;
uct_rocm_ipc_signal_desc_t *rocm_ipc_signal;
void *tmp_base_ptr;
size_t tmp_base_size;
hsa_agent_t *gpu_agents;
int num_gpu;

/* no data to deliver */
if (!size)
Expand Down Expand Up @@ -94,22 +99,46 @@ ucs_status_t uct_rocm_ipc_ep_zcopy(uct_ep_h tl_ep,

remote_copy_addr = UCS_PTR_BYTE_OFFSET(remote_base_addr,
remote_addr - key->address);
if (is_put) {
dst_addr = remote_copy_addr;
src_addr = local_addr;

memset(&remote_agent, 0, sizeof(hsa_agent_t));
status = uct_rocm_base_get_ptr_info(remote_copy_addr, size, &tmp_base_ptr,
&tmp_base_size, &remote_agent);
if (status != HSA_STATUS_SUCCESS) {
return UCS_ERR_INVALID_ADDR;
}
else {
dst_addr = local_addr;
src_addr = remote_copy_addr;

if (remote_agent.handle == 0) {
/* No access to remote agent, e.g. because of limited visibility of devices to
* this process. Using local_agent as a backup plan. */
remote_agent = local_agent;
} else {
num_gpu = uct_rocm_base_get_gpu_agents(&gpu_agents);
status = hsa_amd_agents_allow_access(num_gpu, gpu_agents, NULL,
base_addr);
if (status != HSA_STATUS_SUCCESS) {
ucs_error("failed to enable direct access for mem addr %p agent "
"%lu\n",
(void*)remote_addr, remote_agent.handle);
return UCS_ERR_INVALID_ADDR;
}
}
if (is_put) {
dst_addr = remote_copy_addr;
dst_agent = remote_agent;
src_addr = local_addr;
src_agent = local_agent;
} else {
dst_addr = local_addr;
dst_agent = local_agent;
src_addr = remote_copy_addr;
src_agent = remote_agent;
}

rocm_ipc_signal = ucs_mpool_get(&iface->signal_pool);
hsa_signal_store_screlease(rocm_ipc_signal->signal, 1);

status = hsa_amd_memory_async_copy(dst_addr, local_agent,
src_addr, local_agent,
size, 0, NULL,
rocm_ipc_signal->signal);
status = hsa_amd_memory_async_copy(dst_addr, dst_agent, src_addr, src_agent,
size, 0, NULL, rocm_ipc_signal->signal);

if (status != HSA_STATUS_SUCCESS) {
ucs_error("copy error");
Expand Down

0 comments on commit 5879c44

Please sign in to comment.