Skip to content

Commit

Permalink
Merge pull request #1724 from rhc54/topic/timeout
Browse files Browse the repository at this point in the history
Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests
  • Loading branch information
rhc54 committed May 28, 2016
2 parents 59f4a76 + ebe159a commit a93c01d
Show file tree
Hide file tree
Showing 7 changed files with 278 additions and 15 deletions.
5 changes: 4 additions & 1 deletion orte/mca/odls/odls_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2012 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2014-2016 Intel, Inc. All rights reserved.
Expand Down Expand Up @@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t;
/* add procs for the DVM */
#define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30

/* for debug purposes, get stack traces from all application procs */
#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31

/*
* Struct written up the pipe from the child to the parent.
*/
Expand Down
3 changes: 3 additions & 0 deletions orte/mca/rml/rml_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ BEGIN_C_DECLS
/* error notifications */
#define ORTE_RML_TAG_NOTIFICATION 59

/* stacktrace for debug */
#define ORTE_RML_TAG_STACK_TRACE 60

#define ORTE_RML_TAG_MAX 100

#define ORTE_RML_TAG_NTOH(t) ntohl(t)
Expand Down
14 changes: 13 additions & 1 deletion orte/mca/schizo/ompi/schizo_ompi.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Research Organization for Information Science
Expand Down Expand Up @@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = {
&orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING,
"Printout URI on stdout [-], stderr [+], or a file [anything else]" },

/* testing options */
{ NULL, '\0', "timeout", "timeout", 1,
&orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT,
"Timeout the job after the specified number of seconds" },
{ NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0,
&orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL,
"Report all job and process states upon timeout" },
{ NULL, '\0', "get-stack-traces", "get-stack-traces", 0,
&orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL,
"Get stack traces of all application procs on timeout" },


/* exit status reporting */
{ "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0,
NULL, OPAL_CMD_LINE_TYPE_BOOL,
Expand Down
85 changes: 84 additions & 1 deletion orte/orted/orted_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
Expand Down Expand Up @@ -47,6 +47,7 @@
#include "opal/mca/base/base.h"
#include "opal/util/output.h"
#include "opal/util/opal_environ.h"
#include "opal/util/path.h"
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_progress.h"
#include "opal/dss/dss.h"
Expand Down Expand Up @@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
bool found = false;
orte_node_t *node;
orte_grpcomm_signature_t *sig;
FILE *fp;
char gscmd[256], path[1035], *pathptr;
char string[256], *string_ptr = string;

/* unpack the command */
n = 1;
Expand Down Expand Up @@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender,
}
break;

case ORTE_DAEMON_GET_STACK_TRACES:
/* prep the response */
answer = OBJ_NEW(opal_buffer_t);
pathptr = path;

// Try to find the "gstack" executable. Failure to find the
// executable will be handled below, because the receiver
// expects to have the process name, hostname, and PID in the
// buffer before finding an error message.
char *gstack_exec;
gstack_exec = opal_find_absolute_path("gstack");

/* hit each local process with a gstack command */
for (i=0; i < orte_local_children->size; i++) {
if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
relay_msg = OBJ_NEW(opal_buffer_t);
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) ||
OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) {
OBJ_RELEASE(relay_msg);
break;
}

// If we were able to find the gstack executable,
// above, then run the command here.
fp = NULL;
if (NULL != gstack_exec) {
(void) snprintf(gscmd, sizeof(gscmd), "%s %lu",
gstack_exec, (unsigned long) proct->pid);
fp = popen(gscmd, "r");
}

// If either we weren't able to find or run the gstack
// exectuable, send back a nice error message here.
if (NULL == gstack_exec || NULL == fp) {
(void) snprintf(string, sizeof(string),
"Failed to %s \"%s\" on %s to obtain stack traces",
(NULL == gstack_exec) ? "find" : "run",
(NULL == gstack_exec) ? "gstack" : gstack_exec,
proct->node->name);
if (OPAL_SUCCESS ==
opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) {
opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER);
}
OBJ_RELEASE(relay_msg);
break;
}
/* Read the output a line at a time and pack it for transmission */
memset(path, 0, sizeof(path));
while (fgets(path, sizeof(path)-1, fp) != NULL) {
if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) {
OBJ_RELEASE(relay_msg);
break;
}
memset(path, 0, sizeof(path));
}
/* close */
pclose(fp);
/* transfer this load */
if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) {
OBJ_RELEASE(relay_msg);
break;
}
OBJ_RELEASE(relay_msg);
}
}
/* always send our response */
if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer,
ORTE_RML_TAG_STACK_TRACE,
orte_rml_send_callback, NULL))) {
ORTE_ERROR_LOG(ret);
OBJ_RELEASE(answer);
}
break;

default:
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
}
Expand Down Expand Up @@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command)
case ORTE_DAEMON_DVM_ADD_PROCS:
return strdup("ORTE_DAEMON_DVM_ADD_PROCS");

case ORTE_DAEMON_GET_STACK_TRACES:
return strdup("ORTE_DAEMON_GET_STACK_TRACES");

default:
return strdup("Unknown Command!");
}
Expand Down
Loading

0 comments on commit a93c01d

Please sign in to comment.