Index: ompi/tools/ompi-server/ompi-server.c =================================================================== --- ompi/tools/ompi-server/ompi-server.c (revision 23032) +++ ompi/tools/ompi-server/ompi-server.c (working copy) @@ -57,6 +57,9 @@ #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_data_server.h" +/* ensure I can behave like a daemon */ +#include "orte/orted/orted.h" + /* * Globals */ @@ -179,11 +182,14 @@ #endif tmp_env_var = NULL; /* Silence compiler warning */ + /* flag that I am the HNP */ + orte_process_info.hnp = true; + /* Perform the standard init, but flag that we are a tool * so that we only open up the communications infrastructure. No * session directories will be created. */ - if (ORTE_SUCCESS != (ret = orte_init(ORTE_TOOL))) { + if (ORTE_SUCCESS != (ret = orte_init(ORTE_NON_TOOL))) { fprintf(stderr, "ompi-server: failed to initialize -- aborting\n"); exit(1); } @@ -214,6 +220,15 @@ free(rml_uri); } + /* setup to listen for commands sent specifically to me */ + ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, + ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); + if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { + ORTE_ERROR_LOG(ret); + orte_finalize(); + exit(1); + } + /* setup the data server to listen for commands */ if (ORTE_SUCCESS != (ret = orte_data_server_init())) { fprintf(stderr, "ompi-server: failed to start data server -- aborting\n"); Index: orte/mca/odls/base/odls_base_default_fns.c =================================================================== --- orte/mca/odls/base/odls_base_default_fns.c (revision 23032) +++ orte/mca/odls/base/odls_base_default_fns.c (working copy) @@ -1612,6 +1612,10 @@ * come from a singleton */ if (!found) { + OPAL_OUTPUT_VERBOSE((5, orte_odls_globals.output, + "%s odls: registering sync on singleton %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(proc))); child = OBJ_NEW(orte_odls_child_t); if (ORTE_SUCCESS != (rc = opal_dss.copy((void**)&child->name, proc, ORTE_NAME))) { ORTE_ERROR_LOG(rc); @@ -1684,7 +1688,8 @@ OBJ_DESTRUCT(&buffer); /* now check to see if everyone in this job has registered */ - if (all_children_registered(proc->jobid)) { + if (ORTE_JOB_FAMILY(proc->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) && + all_children_registered(proc->jobid)) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it @@ -2111,7 +2116,9 @@ child->waitpid_recvd = true; /* check for everything complete */ - check_proc_complete(child); + if (ORTE_JOB_FAMILY(child->name->jobid) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { + check_proc_complete(child); + } /* done */ opal_condition_signal(&orte_odls_globals.cond); Index: orte/mca/routed/base/routed_base_register_sync.c =================================================================== --- orte/mca/routed/base/routed_base_register_sync.c (revision 23032) +++ orte/mca/routed/base/routed_base_register_sync.c (working copy) @@ -27,6 +27,7 @@ #include "orte/mca/rml/rml.h" #include "orte/runtime/orte_globals.h" #include "orte/runtime/orte_wait.h" +#include "orte/util/name_fns.h" #include "orte/mca/routed/base/base.h" @@ -49,6 +50,11 @@ orte_daemon_cmd_flag_t command=ORTE_DAEMON_SYNC_BY_PROC; char *rml_uri; + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync to daemon %s", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON))); + /* we need to get the oob to establish * the connection - the oob will leave the connection "alive" * thereafter so we can communicate readily @@ -93,6 +99,9 @@ * gets serviced by the event library on the orted prior to the * process exiting */ + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync waiting for ack", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); sync_recvd = false; rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC, ORTE_RML_NON_PERSISTENT, report_sync, NULL); @@ -103,5 +112,8 @@ ORTE_PROGRESSED_WAIT(sync_recvd, 0, 1); + OPAL_OUTPUT_VERBOSE((5, orte_routed_base_output, + "%s registering sync ack recvd", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; } Index: orte/mca/routed/binomial/routed_binomial.c =================================================================== --- orte/mca/routed/binomial/routed_binomial.c (revision 23032) +++ orte/mca/routed/binomial/routed_binomial.c (working copy) @@ -509,13 +509,21 @@ char *rml_uri; orte_process_name_t name; int rc; + bool singleton=false; /* lookup the job object for this process */ if (NULL == (jdata = orte_get_job_data_object(job))) { - ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); - return ORTE_ERR_NOT_FOUND; + if (ORTE_JOB_FAMILY(job) == ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid)) { + /* came from my job family - this is an error */ + ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); + return ORTE_ERR_NOT_FOUND; + } + /* must be from a singleton that is connecting to us - just add the contact info */ + singleton = true; + } else { + procs = (orte_proc_t**)jdata->procs->addr; + } - procs = (orte_proc_t**)jdata->procs->addr; /* unpack the data for each entry */ cnt = 1; @@ -538,16 +546,20 @@ free(rml_uri); continue; } - /* the procs are stored in vpid order, so update the record */ - procs[name.vpid]->rml_uri = strdup(rml_uri); - free(rml_uri); - /* update the proc state */ - if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) { - procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING; + if (!singleton) { + /* the procs are stored in vpid order, so update the record */ + procs[name.vpid]->rml_uri = strdup(rml_uri); + + /* update the proc state */ + if (procs[name.vpid]->state < ORTE_PROC_STATE_RUNNING) { + procs[name.vpid]->state = ORTE_PROC_STATE_RUNNING; + } + + ++jdata->num_reported; } - - ++jdata->num_reported; + free(rml_uri); + cnt = 1; } if (ORTE_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) { @@ -555,11 +567,13 @@ return rc; } - /* if all procs have reported, update our job state */ - if (jdata->num_reported == jdata->num_procs) { - /* update the job state */ - if (jdata->state < ORTE_JOB_STATE_RUNNING) { - jdata->state = ORTE_JOB_STATE_RUNNING; + if (!singleton) { + /* if all procs have reported, update our job state */ + if (jdata->num_reported == jdata->num_procs) { + /* update the job state */ + if (jdata->state < ORTE_JOB_STATE_RUNNING) { + jdata->state = ORTE_JOB_STATE_RUNNING; + } } } @@ -872,6 +886,7 @@ * is attempted until the overall ORTE system knows how to talk to everyone - * otherwise, the system can just hang. */ + if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) { ORTE_ERROR_LOG(rc); return rc; Index: orte/mca/ess/singleton/ess_singleton_module.c =================================================================== --- orte/mca/ess/singleton/ess_singleton_module.c (revision 23032) +++ orte/mca/ess/singleton/ess_singleton_module.c (working copy) @@ -35,6 +35,7 @@ #include "opal/mca/installdirs/installdirs.h" #include "opal/class/opal_pointer_array.h" #include "opal/class/opal_value_array.h" +#include "opal/hash_string.h" #include "orte/util/show_help.h" #include "orte/util/proc_info.h" @@ -97,6 +98,10 @@ orte_nid_t *node; orte_jmap_t *jmap; orte_pmap_t pmap; + char *server_uri, *param; + uint16_t jobfam; + uint32_t hash32; + uint32_t bias; /* run the prolog */ if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) { @@ -104,44 +109,127 @@ return rc; } - /* - * If we are the selected module, then we must be a singleton - * as it means that no other method for discovering a name - * could be found. In this case, we need to start a daemon that - * can support our operation. We must do this for two reasons: - * - * (1) if we try to play the role of the HNP, then any child processes - * we might start via comm_spawn will rely on us for all ORTE-level - * support. However, we can only progress those requests when the - * the application calls into the OMPI/ORTE library! Thus, if this - * singleton just does computation, the other processes will "hang" - * in any calls into the ORTE layer that communicate with the HNP - - * and most calls on application procs *do*. - * - * (2) daemons are used to communicate messages for administrative - * purposes in a broadcast-like manner. Thus, daemons are expected - * to be able to interpret specific commands. Our application process - * doesn't have any idea how to handle those commands, thus causing - * the entire ORTE administrative system to break down. - * - * For those reasons, we choose to fork/exec a daemon at this time - * and then reconnect ourselves to it. We could just "fork" and declare - * the child to be a daemon, but that would require we place *all* of the - * daemon command processing code in the ORTE library, do some strange - * mojo in a few places, etc. This doesn't seem worth it, so we'll just - * do the old fork/exec here - * - * Note that Windows-based systems have to do their own special trick as - * they don't support fork/exec. So we have to use a giant "if" here to - * protect the Windows world. To make the results more readable, we put - * the whole mess in a separate function below - */ - if (ORTE_SUCCESS != (rc= fork_hnp())) { - /* if this didn't work, then we cannot support operation any further. - * Abort the system and tell orte_init to exit - */ - ORTE_ERROR_LOG(rc); - return rc; + /* look for the ompi-server MCA param */ + mca_base_param_reg_string_name("orte", "server", + "Server to be used as HNP - [file|FILE]: or just uri", + false, false, NULL, &server_uri); + + if (NULL != server_uri) { + /* we are going to connect to a server HNP */ + if (0 == strncmp(server_uri, "file", strlen("file")) || + 0 == strncmp(server_uri, "FILE", strlen("FILE"))) { + char input[1024], *filename; + FILE *fp; + + /* it is a file - get the filename */ + filename = strchr(server_uri, ':'); + if (NULL == filename) { + /* filename is not correctly formatted */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, + "singleton", server_uri); + return ORTE_ERROR; + } + ++filename; /* space past the : */ + + if (0 >= strlen(filename)) { + /* they forgot to give us the name! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, + "singleton", server_uri); + return ORTE_ERROR; + } + + /* open the file and extract the uri */ + fp = fopen(filename, "r"); + if (NULL == fp) { /* can't find or read file! */ + orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, + "singleton", server_uri); + return ORTE_ERROR; + } + if (NULL == fgets(input, 1024, fp)) { + /* something malformed about file */ + fclose(fp); + orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, + "singleton", server_uri, "singleton"); + return ORTE_ERROR; + } + fclose(fp); + input[strlen(input)-1] = '\0'; /* remove newline */ + orte_process_info.my_hnp_uri = strdup(input); + } else { + orte_process_info.my_hnp_uri = strdup(server_uri); + } + /* save the daemon uri - we will process it later */ + orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri); + /* indicate we are a singleton so orte_init knows what to do */ + orte_process_info.singleton = true; + /* for convenience, push the pubsub version of this param into the environ */ + asprintf(¶m, "OMPI_MCA_dpm_orte_server=%s", orte_process_info.my_hnp_uri); + putenv(param); + free(param); + /* now define my own name */ + /* hash the nodename */ + OPAL_HASH_STR(orte_process_info.nodename, hash32); + + bias = (uint32_t)orte_process_info.pid; + + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "ess:singleton: initial bias %ld nodename hash %lu", + (long)bias, (unsigned long)hash32)); + + /* fold in the bias */ + hash32 = hash32 ^ bias; + + /* now compress to 16-bits */ + jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32)); + + OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, + "ess:singleton:: final jobfam %lu", + (unsigned long)jobfam)); + + /* set the name */ + ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); + ORTE_PROC_MY_NAME->vpid = 0; + + } else { + /* + * If we are the selected module, then we must be a singleton + * as it means that no other method for discovering a name + * could be found. In this case, we need to start a daemon that + * can support our operation. We must do this for two reasons: + * + * (1) if we try to play the role of the HNP, then any child processes + * we might start via comm_spawn will rely on us for all ORTE-level + * support. However, we can only progress those requests when the + * the application calls into the OMPI/ORTE library! Thus, if this + * singleton just does computation, the other processes will "hang" + * in any calls into the ORTE layer that communicate with the HNP - + * and most calls on application procs *do*. + * + * (2) daemons are used to communicate messages for administrative + * purposes in a broadcast-like manner. Thus, daemons are expected + * to be able to interpret specific commands. Our application process + * doesn't have any idea how to handle those commands, thus causing + * the entire ORTE administrative system to break down. + * + * For those reasons, we choose to fork/exec a daemon at this time + * and then reconnect ourselves to it. We could just "fork" and declare + * the child to be a daemon, but that would require we place *all* of the + * daemon command processing code in the ORTE library, do some strange + * mojo in a few places, etc. This doesn't seem worth it, so we'll just + * do the old fork/exec here + * + * Note that Windows-based systems have to do their own special trick as + * they don't support fork/exec. So we have to use a giant "if" here to + * protect the Windows world. To make the results more readable, we put + * the whole mess in a separate function below + */ + if (ORTE_SUCCESS != (rc= fork_hnp())) { + /* if this didn't work, then we cannot support operation any further. + * Abort the system and tell orte_init to exit + */ + ORTE_ERROR_LOG(rc); + return rc; + } } orte_process_info.num_procs = 1; @@ -432,14 +520,14 @@ if (nid->daemon == ORTE_PROC_MY_DAEMON->vpid) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:env: proc %s is LOCAL", + "%s ess:singleton: proc %s is LOCAL", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc))); return true; } OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, - "%s ess:env: proc %s is REMOTE", + "%s ess:singleton: proc %s is REMOTE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));