Skip to content

Commit

Permalink
UCT/SM/CUDA: CR Comments p2
Browse files Browse the repository at this point in the history
  • Loading branch information
brminich committed Dec 13, 2021
1 parent 2ab66ef commit 558b0c9
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 36 deletions.
5 changes: 3 additions & 2 deletions src/ucs/sys/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#define UCS_PROCESS_NS_DIR "/proc/self/ns"
#define UCS_PROCESS_BOOTID_FILE "/proc/sys/kernel/random/boot_id"
#define UCS_PROCESS_BOOTID_FMT "%x-%4hx-%4hx-%4hx-%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx"
#define UCS_PROCCESS_STAT_FMT "/proc/%d/stat"
#define UCS_PROCESS_NS_FIRST 0xF0000000U
#define UCS_PROCESS_NS_NET_DFLT 0xF0000080U

Expand Down Expand Up @@ -1524,7 +1525,7 @@ unsigned long ucs_sys_get_proc_create_time(pid_t pid)
unsigned long stime;
int res;

size = ucs_read_file_str(stat, sizeof(stat), 1, "/proc/%d/stat", pid);
size = ucs_read_file_str(stat, sizeof(stat), 1, UCS_PROCCESS_STAT_FMT, pid);
if (size < 0) {
goto err;
}
Expand All @@ -1543,7 +1544,7 @@ unsigned long ucs_sys_get_proc_create_time(pid_t pid)
}

scan_err:
ucs_error("failed to scan stat for pid %d", pid);
ucs_error("failed to scan "UCS_PROCCESS_STAT_FMT, pid);
err:
return 0ul;
}
Expand Down
30 changes: 17 additions & 13 deletions src/uct/base/uct_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -810,28 +810,32 @@ static ucs_status_t uct_iface_schedule_ep_err(uct_ep_h ep, ucs_status_t status)
return UCS_OK;
}

ucs_status_t uct_ep_keepalive_init(uct_keepalive_info_t *ka, pid_t pid)
{
ka->start_time = ucs_sys_get_proc_create_time(pid);
if (ka->start_time == 0) {
ucs_diag("failed to get start time for pid %d", pid);
return UCS_ERR_ENDPOINT_TIMEOUT;
}

return UCS_OK;
}

ucs_status_t uct_ep_keepalive_check(uct_ep_h ep, uct_keepalive_info_t *ka,
pid_t pid, unsigned flags,
uct_completion_t *comp)
{
ucs_status_t status = UCS_OK;
unsigned long start_time;

UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp);

if (ka->start_time == 0) {
ka->start_time = ucs_sys_get_proc_create_time(pid);
} else {
start_time = ucs_sys_get_proc_create_time(pid);
if (ka->start_time != start_time) {
ucs_diag("ka failed for %d start time %lu != %lu", pid,
ka->start_time, start_time);
status = UCS_ERR_ENDPOINT_TIMEOUT;
}
}
ucs_assert(ka->start_time != 0);

if (status != UCS_OK) {
return uct_iface_schedule_ep_err(ep, status);
start_time = ucs_sys_get_proc_create_time(pid);
if (ka->start_time != start_time) {
ucs_diag("ka failed for pid %d start time %lu != %lu", pid,
ka->start_time, start_time);
return uct_iface_schedule_ep_err(ep, UCS_ERR_ENDPOINT_TIMEOUT);
}

return UCS_OK;
Expand Down
2 changes: 2 additions & 0 deletions src/uct/base/uct_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,8 @@ ucs_status_t uct_base_ep_am_short_iov(uct_ep_h ep, uint8_t id, const uct_iov_t *

int uct_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid);

ucs_status_t uct_ep_keepalive_init(uct_keepalive_info_t *ka, pid_t pid);

ucs_status_t uct_ep_keepalive_check(uct_ep_h ep, uct_keepalive_info_t *ka,
pid_t pid, unsigned flags,
uct_completion_t *comp);
Expand Down
5 changes: 2 additions & 3 deletions src/uct/cuda/cuda_ipc/cuda_ipc_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@ static UCS_CLASS_INIT_FUNC(uct_cuda_ipc_ep_t, const uct_ep_params_t *params)
UCT_EP_PARAMS_CHECK_DEV_IFACE_ADDRS(params);
UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super);

self->remote_pid = *(const pid_t*)params->iface_addr;
self->keepalive.start_time = 0;
self->remote_pid = *(const pid_t*)params->iface_addr;

return UCS_OK;
return uct_ep_keepalive_init(&self->keepalive, self->remote_pid);
}

static UCS_CLASS_CLEANUP_FUNC(uct_cuda_ipc_ep_t)
Expand Down
33 changes: 22 additions & 11 deletions src/uct/sm/mm/base/mm_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,19 @@ static void uct_mm_ep_signal_remote(uct_mm_ep_t *ep)
}
}

void uct_mm_ep_cleanup_remote_segs(uct_mm_ep_t *ep)
{
uct_mm_iface_t *iface = ucs_derived_of(ep->super.super.iface,
uct_mm_iface_t);
uct_mm_remote_seg_t remote_seg;

kh_foreach_value(&ep->remote_segs, remote_seg, {
uct_mm_iface_mapper_call(iface, mem_detach, &remote_seg);
})

kh_destroy_inplace(uct_mm_remote_seg, &ep->remote_segs);
}

static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, const uct_ep_params_t *params)
{
uct_mm_iface_t *iface = ucs_derived_of(params->iface, uct_mm_iface_t);
Expand Down Expand Up @@ -162,15 +175,21 @@ static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, const uct_ep_params_t *params)

/* Initialize remote FIFO control structure */
uct_mm_iface_set_fifo_ptrs(fifo_ptr, &self->fifo_ctl, &self->fifo_elems);
self->cached_tail = self->fifo_ctl->tail;
self->keepalive.start_time = 0;
self->cached_tail = self->fifo_ctl->tail;
ucs_arbiter_elem_init(&self->arb_elem);

status = uct_ep_keepalive_init(&self->keepalive, self->fifo_ctl->pid);
if (status != UCS_OK) {
goto err_free_segs;
}

ucs_debug("created mm ep %p, connected to remote FIFO id 0x%"PRIx64,
self, addr->fifo_seg_id);

return UCS_OK;

err_free_segs:
uct_mm_ep_cleanup_remote_segs(self);
err_free_md_addr:
ucs_free(self->remote_iface_addr);
err:
Expand All @@ -179,17 +198,9 @@ static UCS_CLASS_INIT_FUNC(uct_mm_ep_t, const uct_ep_params_t *params)

static UCS_CLASS_CLEANUP_FUNC(uct_mm_ep_t)
{
uct_mm_iface_t *iface = ucs_derived_of(self->super.super.iface, uct_mm_iface_t);
uct_mm_remote_seg_t remote_seg;

uct_mm_ep_pending_purge(&self->super.super, NULL, NULL);

kh_foreach_value(&self->remote_segs, remote_seg, {
uct_mm_iface_mapper_call(iface, mem_detach, &remote_seg);
})

uct_mm_ep_cleanup_remote_segs(self);
ucs_free(self->remote_iface_addr);
kh_destroy_inplace(uct_mm_remote_seg, &self->remote_segs);
}

UCS_CLASS_DEFINE(uct_mm_ep_t, uct_base_ep_t)
Expand Down
3 changes: 1 addition & 2 deletions src/uct/sm/scopy/cma/cma_ep.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@ static UCS_CLASS_INIT_FUNC(uct_cma_ep_t, const uct_ep_params_t *params)

self->remote_pid = *(const pid_t*)params->iface_addr &
~UCT_CMA_IFACE_ADDR_FLAG_PID_NS;
self->keepalive.start_time = 0;

return UCS_OK;
return uct_ep_keepalive_init(&self->keepalive, self->remote_pid);
}

static UCS_CLASS_CLEANUP_FUNC(uct_cma_ep_t)
Expand Down
13 changes: 8 additions & 5 deletions test/gtest/uct/test_peer_failure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -461,16 +461,19 @@ UCT_INSTANTIATE_TEST_CASE(test_uct_peer_failure_multiple)
class test_uct_keepalive : public uct_test {
public:
test_uct_keepalive() :
m_pid(getpid()), m_entity(NULL), m_err_handler_count(0)
m_ka(NULL), m_pid(getpid()), m_entity(NULL), m_err_handler_count(0)
{
m_ka.start_time = 0;
}

void init()
{
m_entity = create_entity(0, err_handler_cb);
m_entity->connect(0, *m_entity, 0);
m_entities.push_back(m_entity);

ASSERT_TRUE(has_mm());
uct_mm_ep_t *ep = ucs_derived_of(m_entity->ep(0), uct_mm_ep_t);
m_ka = &ep->keepalive;
}

void cleanup()
Expand All @@ -489,12 +492,12 @@ class test_uct_keepalive : public uct_test {
protected:
void do_keepalive()
{
ucs_status_t status = uct_ep_keepalive_check(m_entity->ep(0), &m_ka,
ucs_status_t status = uct_ep_keepalive_check(m_entity->ep(0), m_ka,
m_pid, 0, NULL);
EXPECT_UCS_OK(status);
}

uct_keepalive_info_t m_ka;
uct_keepalive_info_t *m_ka;
pid_t m_pid;
entity *m_entity;
unsigned m_err_handler_count;
Expand All @@ -508,7 +511,7 @@ UCS_TEST_P(test_uct_keepalive, ep_check)
EXPECT_EQ(0u, m_err_handler_count);

/* change start time saved in KA to force an error from EP check */
m_ka.start_time--;
m_ka->start_time--;

do_keepalive();
EXPECT_EQ(0u, m_err_handler_count);
Expand Down

0 comments on commit 558b0c9

Please sign in to comment.