Index: orte/mca/odls/mosix/configure.m4 =================================================================== --- orte/mca/odls/mosix/configure.m4 (revision 0) +++ orte/mca/odls/mosix/configure.m4 (revision 0) @@ -0,0 +1,30 @@ +# -*- shell-script -*- +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# MCA_orte_odls_mosix_CONFIG([action-if-found], [action-if-not-found]) +# ----------------------------------------------------------- +AC_DEFUN([MCA_orte_odls_mosix_CONFIG],[ + AC_CONFIG_FILES([orte/mca/odls/mosix/Makefile]) + + # check for mosix presence + AC_CHECK_FILE([/proc/mosix/mosip], + [$1], + [$2]) +])dnl Index: orte/mca/odls/mosix/odls_mosix_component.c =================================================================== --- orte/mca/odls/mosix/odls_mosix_component.c (revision 0) +++ orte/mca/odls/mosix/odls_mosix_component.c (revision 0) @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * These symbols are in a file by themselves to provide nice linker + * semantics. Since linkers generally pull in symbols by object + * files, keeping these symbols as the only symbols in this file + * prevents utility programs such as "ompi_info" from having to import + * entire components just to query their version and parameters. + */ + +#include "orte_config.h" +#include "orte/constants.h" + +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include + +#include "opal/mca/mca.h" +#include "opal/mca/base/base.h" +#include "opal/mca/base/mca_base_param.h" + +#include "orte/mca/odls/odls.h" +#include "orte/mca/odls/base/odls_private.h" +#include "orte/mca/odls/mosix/odls_mosix.h" + +/* + * Instantiate the public struct with all of our public information + * and pointers to our public functions in it + */ + +orte_odls_base_component_t mca_odls_mosix_component = { + /* First, the mca_component_t struct containing meta information + about the component itself */ + { + ORTE_ODLS_BASE_VERSION_2_0_0, + /* Component name and version */ + "mosix", + ORTE_MAJOR_VERSION, + ORTE_MINOR_VERSION, + ORTE_RELEASE_VERSION, + + /* Component open, close, and query functions */ + orte_odls_mosix_component_open, + orte_odls_mosix_component_close, + orte_odls_mosix_component_query, + }, + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + } +}; + + + +int orte_odls_mosix_component_open(void) +{ + return ORTE_SUCCESS; +} + +int orte_odls_mosix_component_query(mca_base_module_t **module, int *priority) +{ + /* the base open/select logic protects us against operation when + * we are NOT in a daemon, so we don't have to check that here + */ + + /* we have built some logic into the configure.m4 file that checks + * for the MOSIX entry /proc/mosix/mosip and only builds this component + * if it is found. 
Hence, we only get here if we CAN build - in which + * case, we definitely should be considered for selection + */ + *priority = 2; /* default priority + 1 */ + *module = (mca_base_module_t *) &orte_odls_mosix_module; + return ORTE_SUCCESS; +} + + +int orte_odls_mosix_component_close(void) +{ + return ORTE_SUCCESS; +} Index: orte/mca/odls/mosix/odls_mosix.h =================================================================== --- orte/mca/odls/mosix/odls_mosix.h (revision 0) +++ orte/mca/odls/mosix/odls_mosix.h (revision 0) @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2004-2008 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2006 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ +/** + * @file: + */ + +#ifndef ORTE_ODLS_MOSIX_H +#define ORTE_ODLS_MOSIX_H + +#include "orte_config.h" + +#include "opal/mca/mca.h" + +#include "orte/mca/odls/odls.h" + +BEGIN_C_DECLS + +/* + * Component open / close / query + */ +int orte_odls_mosix_component_open(void); +int orte_odls_mosix_component_close(void); +int orte_odls_mosix_component_query(mca_base_module_t **module, int *priority); + +/* + * ODLS MOSIX module and component + */ +extern orte_odls_base_module_t orte_odls_mosix_module; +ORTE_MODULE_DECLSPEC extern orte_odls_base_component_t mca_odls_mosix_component; + +END_C_DECLS + +#endif /* ORTE_ODLS_MOSIX_H */ Index: orte/mca/odls/mosix/odls_mosix_module.c =================================================================== --- orte/mca/odls/mosix/odls_mosix_module.c (revision 0) +++ orte/mca/odls/mosix/odls_mosix_module.c (revision 0) @@ -0,0 +1,834 @@ +/* + * Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2008 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2010 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007 Evergrid, Inc. All rights reserved. + * Copyright (c) 2008-2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* + * There is a complicated sequence of events that occurs when the + * parent forks a child process that is intended to launch the target + * executable. 
+ * + * Before the child process exec's the target executable, it might try + * to set the affinity of that new child process according to a + * complex series of rules. This binding may fail in a myriad of + * different ways. A lot of this code deals with reporting that error + * accurately to the end user. This is a complex task in itself + * because the child process is not "really" an ORTE process -- all + * error reporting must be proxied up to the parent who can use normal + * ORTE error reporting mechanisms. + * + * Here's a high-level description of what is occurring in this file: + * + * - parent opens a pipe + * - parent forks a child + * - parent blocks reading on the pipe: the pipe will either close + * (indicating that the child successfully exec'ed) or the child will + * write some proxied error data up the pipe + * + * - the child tries to set affinity and do other housekeeping in + * preparation of exec'ing the target executable + * - if the child fails anywhere along the way, it sends a message up + * the pipe to the parent indicating what happened -- including a + * rendered error message detailing the problem (i.e., human-readable). + * - it is important that the child renders the error message: there + * are so many errors that are possible that the child is really the + * only entity that has enough information to make an accurate error string + * to report back to the user. + * - the parent reads this message + rendered string in and uses ORTE + * reporting mechanisms to display it to the user + * - if the problem was only a warning, the child continues processing + * (potentially eventually exec'ing the target executable). + * - if the problem was an error, the child exits and the parent + * handles the death of the child as appropriate (i.e., this ODLS + * simply reports the error -- other things decide what to do). 
+ */ + +#include "orte_config.h" +#include "orte/constants.h" +#include "orte/types.h" + +#ifdef HAVE_STRING_H +#include +#endif +#include +#ifdef HAVE_UNISTD_H +#include +#endif +#include +#ifdef HAVE_SYS_TYPES_H +#include +#endif +#ifdef HAVE_SYS_WAIT_H +#include +#endif +#include +#ifdef HAVE_FCNTL_H +#include +#endif +#ifdef HAVE_SYS_TIME_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include +#endif +#ifdef HAVE_NETDB_H +#include +#endif +#ifdef HAVE_STDLIB_H +#include +#endif +#ifdef HAVE_SYS_STAT_H +#include +#endif /* HAVE_SYS_STAT_H */ +#ifdef HAVE_STDARG_H +#include +#endif +#ifdef HAVE_SYS_SELECT_H +#include +#endif + +#include "opal/mca/hwloc/hwloc.h" +#include "opal/mca/hwloc/base/base.h" +#include "opal/mca/maffinity/base/base.h" +#include "opal/class/opal_pointer_array.h" +#include "opal/util/opal_environ.h" +#include "opal/util/show_help.h" +#include "opal/util/fd.h" + +#include "orte/util/show_help.h" +#include "orte/runtime/orte_wait.h" +#include "orte/runtime/orte_globals.h" +#include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/ess/ess.h" +#include "orte/mca/iof/base/iof_base_setup.h" +#include "orte/mca/plm/plm.h" +#include "orte/util/name_fns.h" + +#include "orte/mca/odls/base/base.h" +#include "orte/mca/odls/base/odls_private.h" +#include "orte/mca/odls/mosix/odls_mosix.h" + +/* + * Struct written up the pipe from the child to the parent. + */ +typedef struct { + /* True if the child has died; false if this is just a warning to + be printed. 
*/ + bool fatal; + /* Relevant only if fatal==true */ + int exit_status; + + /* Length of the strings that are written up the pipe after this + struct */ + int file_str_len; + int topic_str_len; + int msg_str_len; +} pipe_err_msg_t; + +/* + * Max length of strings from the pipe_err_msg_t + */ +#define MAX_FILE_LEN 511 +#define MAX_TOPIC_LEN MAX_FILE_LEN + +/* + * Module functions (function pointers used in a struct) + */ +static int orte_odls_mosix_launch_local_procs(opal_buffer_t *data); +static int orte_odls_mosix_kill_local_procs(opal_pointer_array_t *procs); +static int orte_odls_mosix_signal_local_procs(const orte_process_name_t *proc, int32_t signal); +static int orte_odls_mosix_restart_proc(orte_odls_child_t *child); + +/* + * Explicitly declared functions so that we can get the noreturn + * attribute registered with the compiler. + */ +static void send_error_show_help(int fd, int exit_status, + const char *file, const char *topic, ...) + __opal_attribute_noreturn__; +static int do_child(orte_app_context_t* context, + orte_odls_child_t *child, + char **environ_copy, + orte_odls_job_t *jobdat, int write_fd, + orte_iof_base_io_conf_t opts) + __opal_attribute_noreturn__; + + +/* + * MOSIX requires all processes to be launched with "/bin/mosrun -w procname arg1 arg2..." 
+ */ +#define ORTE_ODLS_MOSIX_ARGV_COUNT 2 + +#define ORTE_ODLS_MOSIX_MOSRUN_LOCATION "/bin/mosrun" + +#define ORTE_ODLS_MOSIX_MOSRUN_PARAM_IGNORE_UNSUPPORTED "-w" + +char* orte_odls_mosix_argv[] = { + ORTE_ODLS_MOSIX_MOSRUN_LOCATION, + ORTE_ODLS_MOSIX_MOSRUN_PARAM_IGNORE_UNSUPPORTED, + NULL, /* Place-holder for the application name */ + NULL /* Must truncate the ARGV array */ +}; + +/* + * Module + */ +orte_odls_base_module_t orte_odls_mosix_module = { + orte_odls_base_default_get_add_procs_data, + orte_odls_mosix_launch_local_procs, + orte_odls_mosix_kill_local_procs, + orte_odls_mosix_signal_local_procs, + orte_odls_base_default_deliver_message, + orte_odls_base_default_require_sync, + orte_odls_mosix_restart_proc +}; + + +static bool odls_mosix_child_died(orte_odls_child_t *child) +{ + time_t end; + pid_t ret; + + /* Because of rounding in time (which returns whole seconds) we + * have to add 1 to our wait number: this means that we wait + * somewhere between (target) and (target)+1 seconds. Otherwise, + * the default 1s actually means 'somwhere between 0 and 1s'. */ + end = time(NULL) + orte_odls_globals.timeout_before_sigkill + 1; + do { + ret = waitpid(child->pid, &child->exit_code, WNOHANG); + if (child->pid == ret) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:WAITPID INDICATES PROC %d IS DEAD", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); + /* It died -- return success */ + return true; + } else if (0 == ret) { + /* with NOHANG specified, if a process has already exited + * while waitpid was registered, then waitpid returns 0 + * as there is no error - this is a race condition problem + * that occasionally causes us to incorrectly report a proc + * as refusing to die. Unfortunately, errno may not be reset + * by waitpid in this case, so we cannot check it. 
+ * + * (note the previous fix to this, to return 'process dead' + * here, fixes the race condition at the cost of reporting + * all live processes have immediately died! Better to + * occasionally report a dead process as still living - + * which will occasionally trip the timeout for cases that + * are right on the edge.) + */ + + /* Do nothing, process still alive */ + } else if (-1 == ret && ECHILD == errno) { + /* The pid no longer exists, so we'll call this "good + enough for government work" */ + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:WAITPID INDICATES PID %d NO LONGER EXISTS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid))); + return true; + } + + /* Bogus delay for 1 msec - let's actually give the CPU some time + * to quit the other process (sched_yield() -- even if we have it + * -- changed behavior in 2.6.3x Linux flavors to be undesirable) + * Don't use select on a bogus file descriptor here as it has proven + * unreliable and sometimes immediately returns - we really, really + * -do- want to wait a bit! 
+ */ + usleep(1000); + } while (time(NULL) < end); + + /* The child didn't die, so return false */ + return false; +} + +static int odls_mosix_kill_local(pid_t pid, int signum) +{ + if (orte_forward_job_control) { + pid = -pid; + } + if (0 != kill(pid, signum)) { + if (ESRCH != errno) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno)); + return errno; + } + } + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:SENT KILL %d TO PID %d SUCCESS", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid)); + return 0; +} + +int orte_odls_mosix_kill_local_procs(opal_pointer_array_t *procs) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs, + odls_mosix_kill_local, odls_mosix_child_died))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return ORTE_SUCCESS; +} + + +static void set_handler_default(int sig) +{ + struct sigaction act; + + act.sa_handler = SIG_DFL; + act.sa_flags = 0; + sigemptyset(&act.sa_mask); + + sigaction(sig, &act, (struct sigaction *)0); +} + +/* + * Internal function to write a rendered show_help message back up the + * pipe to the waiting parent. 
+ */ +static int write_help_msg(int fd, pipe_err_msg_t *msg, const char *file, + const char *topic, va_list ap) +{ + int ret; + char *str; + + if (NULL == file || NULL == topic) { + return OPAL_ERR_BAD_PARAM; + } + + str = opal_show_help_vstring(file, topic, true, ap); + + msg->file_str_len = (int) strlen(file); + if (msg->file_str_len > MAX_FILE_LEN) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + msg->topic_str_len = (int) strlen(topic); + if (msg->topic_str_len > MAX_TOPIC_LEN) { + ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); + return ORTE_ERR_BAD_PARAM; + } + msg->msg_str_len = (int) strlen(str); + + /* Only keep writing if each write() succeeds */ + if (OPAL_SUCCESS != (ret = opal_fd_write(fd, sizeof(*msg), msg))) { + goto out; + } + if (msg->file_str_len > 0 && + OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->file_str_len, file))) { + goto out; + } + if (msg->topic_str_len > 0 && + OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->topic_str_len, topic))) { + goto out; + } + if (msg->msg_str_len > 0 && + OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->msg_str_len, str))) { + goto out; + } + + out: + free(str); + return ret; +} + +/* Called from the child to send an error message up the pipe to the + waiting parent. */ +static void send_error_show_help(int fd, int exit_status, + const char *file, const char *topic, ...) +{ + va_list ap; + pipe_err_msg_t msg; + + msg.fatal = true; + msg.exit_status = exit_status; + + /* Send it */ + va_start(ap, topic); + (void) write_help_msg(fd, &msg, file, topic, ap); + va_end(ap); + + exit(exit_status); +} + +static int do_child(orte_app_context_t* context, + orte_odls_child_t *child, + char **environ_copy, + orte_odls_job_t *jobdat, int write_fd, + orte_iof_base_io_conf_t opts) +{ + int i; + sigset_t sigs; + long fd, fdmax = sysconf(_SC_OPEN_MAX); + + if (orte_forward_job_control) { + /* Set a new process group for this child, so that a + SIGSTOP can be sent to it without being sent to the + orted. 
*/ + setpgid(0, 0); + } + + /* Setup the pipe to be close-on-exec */ + fcntl(write_fd, F_SETFD, FD_CLOEXEC); + + if (NULL != child) { + /* setup stdout/stderr so that any error messages that we + may print out will get displayed back at orterun. + + NOTE: Definitely do this AFTER we check contexts so + that any error message from those two functions doesn't + come out to the user. IF we didn't do it in this order, + THEN a user who gives us a bad executable name or + working directory would get N error messages, where + N=num_procs. This would be very annoying for large + jobs, so instead we set things up so that orterun + always outputs a nice, single message indicating what + happened + */ + if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&opts, + &environ_copy))) { + ORTE_ERROR_LOG(i); + send_error_show_help(write_fd, 1, + "help-orte-odls-mosix.txt", + "iof setup failed", + orte_process_info.nodename, context->app); + /* Does not return */ + } + + } else if (!(ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + /* tie stdin/out/err/internal to /dev/null */ + int fdnull; + for (i=0; i < 3; i++) { + fdnull = open("/dev/null", O_RDONLY, 0); + if (fdnull > i && i != write_fd) { + dup2(fdnull, i); + } + close(fdnull); + } + fdnull = open("/dev/null", O_RDONLY, 0); + if (fdnull > opts.p_internal[1]) { + dup2(fdnull, opts.p_internal[1]); + } + close(fdnull); + } + + /* close all file descriptors w/ exception of stdin/stdout/stderr, + the pipe used for the IOF INTERNAL messages, and the pipe up to + the parent. 
*/ + for(fd=3; fdapp); + for (jout=0; NULL != context->argv[jout]; jout++) { + opal_output(0, "%s\tARGV1[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]); + } + } + + /* + * Prepend the "/bin/mosrun -w" to the launched application + */ + if (context->argv == NULL) { + context->argv = malloc(sizeof(orte_odls_mosix_argv)); + memcpy(context->argv, orte_odls_mosix_argv, sizeof(orte_odls_mosix_argv)); + context->argv[ORTE_ODLS_MOSIX_ARGV_COUNT] = strdup(context->app); + } + else { + for (i=0; context->argv[i] != NULL; i++); /* count the arguments */ + context->argv = realloc(context->argv, + sizeof(char*) * (ORTE_ODLS_MOSIX_ARGV_COUNT + i + 1)); + context->argv[ORTE_ODLS_MOSIX_ARGV_COUNT + i] = NULL; /* Truncate argument list */ + for (i += ORTE_ODLS_MOSIX_ARGV_COUNT - 1; i >= ORTE_ODLS_MOSIX_ARGV_COUNT; i--) { + context->argv[i] = context->argv[i - ORTE_ODLS_MOSIX_ARGV_COUNT]; + } + memcpy(context->argv, orte_odls_mosix_argv, sizeof(char*) * ORTE_ODLS_MOSIX_ARGV_COUNT); + } + context->app = context->argv[0]; + + /* Set signal handlers back to the default. Do this close to + the exev() because the event library may (and likely will) + reset them. If we don't do this, the event library may + have left some set that, at least on some OS's, don't get + reset via fork() or exec(). Hence, the launched process + could be unkillable (for example). */ + + set_handler_default(SIGTERM); + set_handler_default(SIGINT); + set_handler_default(SIGHUP); + set_handler_default(SIGPIPE); + set_handler_default(SIGCHLD); + + /* Unblock all signals, for many of the same reasons that we + set the default handlers, above. This is noticable on + Linux where the event library blocks SIGTERM, but we don't + want that blocked by the launched process. 
*/ + sigprocmask(0, 0, &sigs); + sigprocmask(SIG_UNBLOCK, &sigs, 0); + + /* Exec the new executable */ + + if (10 < opal_output_get_verbosity(orte_odls_globals.output)) { + int jout; + opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), context->app); + for (jout=0; NULL != context->argv[jout]; jout++) { + opal_output(0, "%s\tARGV2[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, context->argv[jout]); + } + for (jout=0; NULL != environ_copy[jout]; jout++) { + opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, environ_copy[jout]); + } + } + + execve(context->app, context->argv, environ_copy); + send_error_show_help(write_fd, 1, + "help-orte-odls-mosix.txt", "execve error", + context->app, strerror(errno)); + /* Does not return */ +} + + +static int do_parent(orte_app_context_t* context, + orte_odls_child_t *child, + char **environ_copy, + orte_odls_job_t *jobdat, int read_fd, + orte_iof_base_io_conf_t opts) +{ + int rc; + pipe_err_msg_t msg; + char file[MAX_FILE_LEN + 1], topic[MAX_TOPIC_LEN + 1], *str = NULL; + + if (NULL != child && (ORTE_JOB_CONTROL_FORWARD_OUTPUT & jobdat->controls)) { + /* connect endpoints IOF */ + rc = orte_iof_base_setup_parent(child->name, &opts); + if (ORTE_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + close(read_fd); + + if (NULL != child) { + child->state = ORTE_PROC_STATE_UNDEF; + } + return rc; + } + } + + /* Block reading a message from the pipe */ + while (1) { + rc = opal_fd_read(read_fd, sizeof(msg), &msg); + + /* If the pipe closed, then the child successfully launched */ + if (OPAL_ERR_TIMEOUT == rc) { + break; + } + + /* If Something Bad happened in the read, error out */ + if (OPAL_SUCCESS != rc) { + ORTE_ERROR_LOG(rc); + close(read_fd); + + if (NULL != child) { + child->state = ORTE_PROC_STATE_UNDEF; + } + return rc; + } + + /* Otherwise, we got a warning or error message from the child */ + if (NULL != child) { + child->alive = msg.fatal ? 
0 : 1; + } + + /* Read in the strings; ensure to terminate them with \0 */ + if (msg.file_str_len > 0) { + rc = opal_fd_read(read_fd, msg.file_str_len, file); + if (OPAL_SUCCESS != rc) { + orte_show_help("help-orte-odls-mosix.txt", "syscall fail", + true, + orte_process_info.nodename, context->app, + "opal_fd_read", __FILE__, __LINE__); + if (NULL != child) { + child->state = ORTE_PROC_STATE_UNDEF; + } + return rc; + } + file[msg.file_str_len] = '\0'; + } + if (msg.topic_str_len > 0) { + rc = opal_fd_read(read_fd, msg.topic_str_len, topic); + if (OPAL_SUCCESS != rc) { + orte_show_help("help-orte-odls-mosix.txt", "syscall fail", + true, + orte_process_info.nodename, context->app, + "opal_fd_read", __FILE__, __LINE__); + if (NULL != child) { + child->state = ORTE_PROC_STATE_UNDEF; + } + return rc; + } + topic[msg.topic_str_len] = '\0'; + } + if (msg.msg_str_len > 0) { + str = calloc(1, msg.msg_str_len + 1); /* zero-filled, so str is always \0-terminated */ + if (NULL == str) { + orte_show_help("help-orte-odls-mosix.txt", "syscall fail", + true, + orte_process_info.nodename, context->app, + "calloc", __FILE__, __LINE__); + if (NULL != child) { + child->state = ORTE_PROC_STATE_UNDEF; + } + return ORTE_ERR_OUT_OF_RESOURCE; /* rc still holds OPAL_SUCCESS from the previous read, so it must not be returned here */ + } + rc = opal_fd_read(read_fd, msg.msg_str_len, str); /* NOTE(review): rc is not checked; after a failed read the zero-filled buffer is still printed below */ + } + + /* Print out what we got. We already have a rendered string, + so use orte_show_help_norender(). */ + if (msg.msg_str_len > 0) { + orte_show_help_norender(file, topic, false, str); + free(str); + str = NULL; + } + + /* If msg.fatal is true, then the child exited with an error. + Otherwise, whatever we just printed was a warning, so loop + around and see what else is on the pipe (or if the pipe + closed, indicating that the child launched + successfully). 
*/ + if (msg.fatal) { + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->alive = false; + } + close(read_fd); + return ORTE_ERR_FAILED_TO_START; + } + } + + /* If we got here, it means that the pipe closed without + indication of a fatal error, meaning that the child process + launched successfully. */ + if (NULL != child) { + child->state = ORTE_PROC_STATE_LAUNCHED; + child->alive = true; + } + close(read_fd); + + return ORTE_SUCCESS; +} + + +/** + * Fork/exec the specified processes + */ +static int odls_mosix_fork_local_proc(orte_app_context_t* context, + orte_odls_child_t *child, + char **environ_copy, + orte_odls_job_t *jobdat) +{ + orte_iof_base_io_conf_t opts; + int rc, p[2]; + pid_t pid; + + if (NULL != child) { + /* should pull this information from MPIRUN instead of going with + default */ + opts.usepty = OPAL_ENABLE_PTY_SUPPORT; + + /* do we want to setup stdin? */ + if (NULL != child && + (jobdat->stdin_target == ORTE_VPID_WILDCARD || child->name->vpid == jobdat->stdin_target)) { + opts.connect_stdin = true; + } else { + opts.connect_stdin = false; + } + + if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&opts))) { + ORTE_ERROR_LOG(rc); + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = rc; + } + return rc; + } + } + + /* A pipe is used to communicate between the parent and child to + indicate whether the exec ultimately succeeded or failed. The + child sets the pipe to be close-on-exec; the child only ever + writes anything to the pipe if there is an error (e.g., + executable not found, exec() fails, etc.). The parent does a + blocking read on the pipe; if the pipe closed with no data, + then the exec() succeeded. If the parent reads something from + the pipe, then the child was letting us know why it failed. 
*/ + if (pipe(p) < 0) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES); + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES; + } + return ORTE_ERR_SYS_LIMITS_PIPES; + } + + /* Fork off the child */ + pid = fork(); + if (NULL != child) { + child->pid = pid; + } + + if (pid < 0) { + ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN); + if (NULL != child) { + child->state = ORTE_PROC_STATE_FAILED_TO_START; + child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN; + } + return ORTE_ERR_SYS_LIMITS_CHILDREN; + } + + if (pid == 0) { + close(p[0]); + do_child(context, child, environ_copy, jobdat, p[1], opts); + /* Does not return */ + } + + close(p[1]); + return do_parent(context, child, environ_copy, jobdat, p[0], opts); +} + + +/** + * Launch all processes allocated to the current node. + */ + +int orte_odls_mosix_launch_local_procs(opal_buffer_t *data) +{ + int rc; + orte_jobid_t job; + orte_job_t *jdata; + + /* construct the list of children we are to launch */ + if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:launch:local failed to construct child list on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + goto CLEANUP; + } + + /* launch the local procs */ + if (ORTE_SUCCESS != (rc = orte_odls_base_default_launch_local(job, odls_mosix_fork_local_proc))) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:launch:local failed to launch on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + goto CLEANUP; + } + + /* look up job data object */ + if (NULL != (jdata = orte_get_job_data_object(job))) { + if (jdata->state & ORTE_JOB_STATE_SUSPENDED) { + if (ORTE_PROC_IS_HNP) { + /* Have the plm send the signal to all the nodes. + If the signal arrived before the orteds started, + then they won't know to suspend their procs. 
+ The plm also arranges for any local procs to + be signaled. + */ + orte_plm.signal_job(jdata->jobid, SIGTSTP); + } else { + orte_odls_mosix_signal_local_procs(NULL, SIGTSTP); + } + } + } + +CLEANUP: + + return rc; +} + + +/** + * Send a signal to a pid. Note that if we get an error, we set the + * return value and let the upper layer print out the message. + */ +static int send_signal(pid_t pid, int signal) +{ + int rc = ORTE_SUCCESS; + + OPAL_OUTPUT_VERBOSE((1, orte_odls_globals.output, + "%s sending signal %d to pid %ld", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + signal, (long)pid)); + + if (orte_forward_job_control) { + /* Send the signal to the process group rather than the + process. The child is the leader of its process group. */ + pid = -pid; + } + if (kill(pid, signal) != 0) { + switch(errno) { + case EINVAL: + rc = ORTE_ERR_BAD_PARAM; + break; + case ESRCH: + /* This case can occur when we deliver a signal to a + process that is no longer there. This can happen if + we deliver a signal while the job is shutting down. + This does not indicate a real problem, so just + ignore the error. 
*/ + break; + case EPERM: + rc = ORTE_ERR_PERM; + break; + default: + rc = ORTE_ERROR; + } + } + + return rc; +} + +static int orte_odls_mosix_signal_local_procs(const orte_process_name_t *proc, int32_t signal) +{ + int rc; + + if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) { + ORTE_ERROR_LOG(rc); + return rc; + } + return ORTE_SUCCESS; +} + +static int orte_odls_mosix_restart_proc(orte_odls_child_t *child) +{ + int rc; + + /* restart the local proc */ + if (ORTE_SUCCESS != (rc = orte_odls_base_default_restart_proc(child, odls_mosix_fork_local_proc))) { + OPAL_OUTPUT_VERBOSE((2, orte_odls_globals.output, + "%s odls:default:restart_proc failed to launch on error %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc))); + } + return rc; +} + Index: orte/mca/odls/mosix/help-orte-odls-mosix.txt =================================================================== --- orte/mca/odls/mosix/help-orte-odls-mosix.txt (revision 0) +++ orte/mca/odls/mosix/help-orte-odls-mosix.txt (revision 0) @@ -0,0 +1,117 @@ +# -*- text -*- +# +# Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright (c) 2010-2011 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# +# This is a US/English help file. +# +[execve error] +Open MPI tried to fork a new process via the "execve" system call but +failed. 
This is an unusual error because Open MPI checks many things +before attempting to launch a child process. This error may be +indicative of another problem on the target host. Your job will now +abort. + + Local host: %s + Application name: %s +# +[binding not supported] +Open MPI tried to bind a new process, but process binding is not +supported on the host where it was launched. The process was killed +without launching the target application. Your job will now abort. + + Local host: %s + Application name: %s +# +[binding generic error] +Open MPI tried to bind a new process, but something went wrong. The +process was killed without launching the target application. Your job +will now abort. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d +# +[bound to everything] +Open MPI tried to bind a new process to a specific set of processors, +but ended up binding it to *all* processors. This means that the new +process is effectively unbound. + +This is only a warning -- your job will continue. You can suppress +this warning in the future by setting the odls_warn_if_not_bound MCA +parameter to 0. + + Local host: %s + Application name: %s + Location: %s:%d +# +[slot list and paffinity_alone] +Open MPI detected that both a slot list was specified and the MCA +parameter "paffinity_alone" was set to true. Only one of these can be +used at a time. Your job will now abort. + + Local host: %s + Application name: %s +# +[iof setup failed] +Open MPI tried to launch a child process but the "IOF child setup" +failed. This should not happen. Your job will now abort. + + Local host: %s + Application name: %s +# +[not bound] +WARNING: Open MPI tried to bind a process but failed. This is a +warning only; your job will continue. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d +# +[syscall fail] +A system call failed that should not have. 
In this particular case, +a warning or error message was not displayed that should have been. +Your job may behave unpredictably after this, or abort. + + Local host: %s + Application name: %s + Function: %s + Location: %s:%d +# +[memory not bound] +WARNING: Open MPI tried to bind a process but failed. This is a +warning only; your job will continue, though performance may +be degraded. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d + +# +[memory binding error] +Open MPI tried to bind memory for a new process but something went +wrong. The process was killed without launching the target +application. Your job will now abort. + + Local host: %s + Application name: %s + Error message: %s + Location: %s:%d Index: orte/mca/odls/mosix/Makefile.am =================================================================== --- orte/mca/odls/mosix/Makefile.am (revision 0) +++ orte/mca/odls/mosix/Makefile.am (revision 0) @@ -0,0 +1,46 @@ +# +# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana +# University Research and Technology +# Corporation. All rights reserved. +# Copyright (c) 2004-2005 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, +# University of Stuttgart. All rights reserved. +# Copyright (c) 2004-2005 The Regents of the University of California. +# All rights reserved. +# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +dist_pkgdata_DATA = help-orte-odls-mosix.txt + +sources = \ + odls_mosix.h \ + odls_mosix_component.c \ + odls_mosix_module.c + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+ +if MCA_BUILD_orte_odls_mosix_DSO +component_noinst = +component_install = mca_odls_mosix.la +else +component_noinst = libmca_odls_mosix.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_odls_mosix_la_SOURCES = $(sources) +mca_odls_mosix_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_odls_mosix_la_SOURCES =$(sources) +libmca_odls_mosix_la_LDFLAGS = -module -avoid-version