Skip to content

Commit

Permalink
UCT/SM/CUDA: Fix common intra-node keepalive protocol
Browse files Browse the repository at this point in the history
  • Loading branch information
brminich committed Dec 8, 2021
1 parent af8bde5 commit 8a986cc
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 66 deletions.
60 changes: 34 additions & 26 deletions src/ucs/sys/sys.c
Original file line number Diff line number Diff line change
Expand Up @@ -1462,32 +1462,6 @@ ucs_status_t ucs_sys_enum_threads(ucs_sys_enum_threads_cb_t cb, void *ctx)
return ucs_sys_readdir(task_dir, &ucs_sys_enum_threads_cb, &param);
}

/*
 * Copy out one of the timestamps that stat() reports for a file.
 *
 * name  Path handed to stat().
 * type  Selects which timestamp is returned.
 * ts    Receives the selected timestamp; written only when UCS_OK is returned.
 *
 * Returns UCS_OK on success, UCS_ERR_IO_ERROR when stat() fails and
 * UCS_ERR_INVALID_PARAM when 'type' is not one of the known selectors.
 * The file is queried before 'type' is validated, so a stat() failure takes
 * precedence over an invalid selector.
 */
ucs_status_t ucs_sys_get_file_time(const char *name, ucs_sys_file_time_t type,
                                   struct timespec *ts)
{
    const struct timespec *selected;
    struct stat file_info;

    if (stat(name, &file_info) != 0) {
        return UCS_ERR_IO_ERROR; /* failed to get file info */
    }

    if (type == UCS_SYS_FILE_TIME_CTIME) {
        selected = &file_info.st_ctim;
    } else if (type == UCS_SYS_FILE_TIME_ATIME) {
        selected = &file_info.st_atim;
    } else if (type == UCS_SYS_FILE_TIME_MTIME) {
        selected = &file_info.st_mtim;
    } else {
        return UCS_ERR_INVALID_PARAM;
    }

    *ts = *selected;
    return UCS_OK;
}

ucs_status_t ucs_sys_check_fd_limit_per_process()
{
int fd;
Expand Down Expand Up @@ -1541,3 +1515,37 @@ long ucs_sys_get_num_cpus()

return num_cpus;
}

/*
 * Read the start time of a process from its procfs stat file.
 *
 * proc_stat_path  Path of the stat file to parse (callers in this change pass
 *                 "/proc/<pid>/stat").
 *
 * Returns the value of the field that follows the 19 skipped fields after the
 * closing ')' of the executable name (field 22, "starttime", per proc(5)), as
 * read from the file without any unit conversion. Returns 0 if the file cannot
 * be read or parsed.
 */
unsigned long ucs_sys_get_proc_create_time(const char *proc_stat_path)
{
    char stat_buf[1024];
    const char *after_comm;
    unsigned long start_time;
    ssize_t size;
    int res;

    /* The path is data, not a format string: route it through "%s" so that a
     * '%' inside the path is never interpreted as a conversion by the
     * printf-style file name argument */
    size = ucs_read_file_str(stat_buf, sizeof(stat_buf), 1, "%s",
                             proc_stat_path);
    if (size < 0) {
        /* Expected when the process has already exited, hence only a
         * diagnostic message */
        ucs_diag("failed to read %s %m", proc_stat_path);
        return 0ul;
    }

    /* Start sscanf right after the executable name which may contain spaces or
     * brackets itself: the last ')' in the buffer terminates that name */
    after_comm = strrchr(stat_buf, ')');
    if (after_comm == NULL) {
        goto scan_err;
    }

    /* Skip state, 5 signed and 7 unsigned fields, then 6 signed fields; the
     * next value is the start time */
    res = sscanf(after_comm, ") %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %*u"
                 "%*u %*d %*d %*d %*d %*d %*d %lu", &start_time);
    if (res == 1) {
        return start_time;
    }

scan_err:
    ucs_error("failed to scan %s", proc_stat_path);
    return 0ul;
}


29 changes: 8 additions & 21 deletions src/ucs/sys/sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,6 @@ typedef enum {
} ucs_sys_vma_info_flags_t;


/* Selects which stat() timestamp ucs_sys_get_file_time() returns */
typedef enum {
UCS_SYS_FILE_TIME_CTIME, /**< last status change time (st_ctim), not the creation time */
UCS_SYS_FILE_TIME_ATIME, /**< last access time (st_atim) */
UCS_SYS_FILE_TIME_MTIME /**< last modification time (st_mtim) */
} ucs_sys_file_time_t;


/* information about virtual memory area */
typedef struct {
unsigned long start;
Expand Down Expand Up @@ -596,19 +588,6 @@ ucs_status_t ucs_sys_readdir(const char *path, ucs_sys_readdir_cb_t cb, void *ct
ucs_status_t ucs_sys_enum_threads(ucs_sys_enum_threads_cb_t cb, void *ctx);


/**
* Get one of the timestamps of a file, as reported by stat().
*
* @param [in] name File name
* @param [in] type Type of file time information
* @param [out] ts File time information; written only when UCS_OK is returned
*
* @return UCS_OK if the file is found and the requested time was stored in
*         @a ts, UCS_ERR_IO_ERROR if stat() failed on @a name,
*         UCS_ERR_INVALID_PARAM if @a type is not a known time type.
*/
ucs_status_t ucs_sys_get_file_time(const char *name, ucs_sys_file_time_t type,
struct timespec *ts);


/**
* Check the per-process limit on the number of open file descriptors.
*
Expand Down Expand Up @@ -639,6 +618,14 @@ ucs_status_t ucs_pthread_create(pthread_t *thread_id_p,
*/
long ucs_sys_get_num_cpus();


/**
* Get process creation time.
*
* @param [in] proc_stat_path Path of the process stat file to read and parse,
*                            e.g. "/proc/<pid>/stat".
*
* @return The time the process started after system boot, as stored in the
*         stat file (no unit conversion is applied), or 0 in case of error.
*/
unsigned long ucs_sys_get_proc_create_time(const char *proc_stat_path);

END_C_DECLS

#endif
31 changes: 15 additions & 16 deletions src/uct/base/uct_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -787,12 +787,13 @@ ucs_status_t uct_base_ep_am_short_iov(uct_ep_h ep, uint8_t id, const uct_iov_t *
return status;
}

int uct_ep_get_process_proc_dir(char *buffer, size_t max_len, pid_t pid)
static int
uct_ep_get_process_stat_proc_dir(char *buffer, size_t max_len, pid_t pid)
{
/* Formats the procfs path of 'pid' into 'buffer' and returns snprintf()'s
 * result. A NULL buffer is accepted only together with max_len == 0, which
 * the keepalive creation code uses to query the required length first. */
ucs_assert((buffer != NULL) || (max_len == 0));
/* cppcheck-suppress nullPointer */
/* cppcheck-suppress ctunullpointer */
return snprintf(buffer, max_len, "/proc/%d", (int)pid);
return snprintf(buffer, max_len, "/proc/%d/stat", (int)pid);
}

ucs_status_t uct_ep_keepalive_create(pid_t pid, uct_keepalive_info_t **ka_p)
Expand All @@ -801,7 +802,7 @@ ucs_status_t uct_ep_keepalive_create(pid_t pid, uct_keepalive_info_t **ka_p)
ucs_status_t status;
int proc_len;

proc_len = uct_ep_get_process_proc_dir(NULL, 0, pid);
proc_len = uct_ep_get_process_stat_proc_dir(NULL, 0, pid);
if (proc_len <= 0) {
ucs_error("failed to get length to hold path to a process directory");
status = UCS_ERR_NO_MEMORY;
Expand All @@ -815,11 +816,10 @@ ucs_status_t uct_ep_keepalive_create(pid_t pid, uct_keepalive_info_t **ka_p)
goto err;
}

uct_ep_get_process_proc_dir(ka->proc, proc_len + 1, pid);
uct_ep_get_process_stat_proc_dir(ka->proc, proc_len + 1, pid);

status = ucs_sys_get_file_time(ka->proc, UCS_SYS_FILE_TIME_CTIME,
&ka->start_time);
if (status != UCS_OK) {
ka->start_time = ucs_sys_get_proc_create_time(ka->proc);
if (ka->start_time == 0ul) {
ucs_error("failed to get process start time");
goto err_free_ka;
}
Expand Down Expand Up @@ -860,21 +860,20 @@ ucs_status_t uct_ep_keepalive_check(uct_ep_h ep, uct_keepalive_info_t **ka_p,
pid_t pid, unsigned flags,
uct_completion_t *comp)
{
struct timespec create_time;
ucs_status_t status = UCS_OK;
unsigned long start_time;
uct_keepalive_info_t *ka;
ucs_status_t status;

UCT_EP_KEEPALIVE_CHECK_PARAM(flags, comp);

if (*ka_p == NULL) {
status = uct_ep_keepalive_create(pid, ka_p);
status = uct_ep_keepalive_create(pid, ka_p);
} else {
ka = *ka_p;
status = ucs_sys_get_file_time(ka->proc, UCS_SYS_FILE_TIME_CTIME,
&create_time);
if ((status != UCS_OK) ||
(ka->start_time.tv_sec != create_time.tv_sec) ||
(ka->start_time.tv_nsec != create_time.tv_nsec)) {
ka = *ka_p;
start_time = ucs_sys_get_proc_create_time(ka->proc);
if (ka->start_time != start_time) {
ucs_diag("ka failed: %s start time %lu != %lu", ka->proc,
ka->start_time, start_time);
status = UCS_ERR_ENDPOINT_TIMEOUT;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/uct/base/uct_iface.h
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ typedef struct uct_failed_iface {
* Keepalive info used by EP
*/
typedef struct uct_keepalive_info {
struct timespec start_time; /* Process start time */
unsigned long start_time; /* Process start time, as returned by ucs_sys_get_proc_create_time() */
char proc[]; /* Path used to query the process start time ("/proc/<pid>/stat") */
} uct_keepalive_info_t;

Expand Down
4 changes: 2 additions & 2 deletions test/gtest/uct/test_peer_failure.cc
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ UCS_TEST_P(test_uct_keepalive, ep_check)
EXPECT_EQ(0u, m_err_handler_count);

/* change start time saved in KA to force an error from EP check */
m_ka->start_time.tv_sec--;
m_ka->start_time--;

do_keepalive();
EXPECT_EQ(0u, m_err_handler_count);
Expand Down Expand Up @@ -557,7 +557,7 @@ class test_uct_peer_failure_keepalive : public test_uct_peer_failure
}

if (ka_info != NULL) {
ka_info->start_time.tv_sec--;
ka_info->start_time--;
}

test_uct_peer_failure::kill_receiver();
Expand Down

0 comments on commit 8a986cc

Please sign in to comment.