Index: orte/mca/ess/slurmd/ess_slurmd_module.c =================================================================== --- orte/mca/ess/slurmd/ess_slurmd_module.c (revision 24646) +++ orte/mca/ess/slurmd/ess_slurmd_module.c (working copy) @@ -50,6 +50,7 @@ #include "orte/util/name_fns.h" #include "orte/runtime/orte_globals.h" #include "orte/util/nidmap.h" +#include "orte/util/pre_condition_transports.h" #include "orte/util/regex.h" #include "orte/runtime/orte_wait.h" @@ -107,7 +108,9 @@ char *regexp, *tasks_per_node; int *ppn; bool block=false, cyclic=false; - + uint64_t unique_key[2]; + char *cs_env, *string_key; + /* init flag */ app_init_complete = false; slurm20 = false; @@ -147,6 +150,26 @@ /* now build the jobid */ ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid); + /* setup transport keys in case the MPI layer needs them - + * we can use the SLURM jobid and stepid as unique keys + * because they are unique values assigned by the RM + */ + unique_key[0] = (uint64_t)jobfam; + unique_key[1] = (uint64_t)stepid; + if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + asprintf(&envar, "%s=%s", cs_env, string_key); + putenv(envar); + /* cannot free the envar as that messes up the environ */ + free(cs_env); + free(string_key); + /* get the slurm procid - this will be our vpid */ if (NULL == (envar = getenv("SLURM_PROCID"))) { error = "could not get SLURM_PROCID"; @@ -388,6 +411,8 @@ } } + unsetenv("OMPI_MCA_orte_precondition_transports"); + /* deconstruct my nidmap and jobmap arrays - this * function protects itself from being called * before things were initialized Index: orte/util/pre_condition_transports.h =================================================================== --- orte/util/pre_condition_transports.h (revision 24646) +++ orte/util/pre_condition_transports.h (working copy) @@ -34,6 +34,8 @@ ORTE_DECLSPEC int orte_pre_condition_transports(orte_job_t *jdata); +ORTE_DECLSPEC char* orte_pre_condition_transports_print(uint64_t *unique_key); + END_C_DECLS #endif Index: orte/util/pre_condition_transports.c =================================================================== --- orte/util/pre_condition_transports.c (revision 24646) +++ orte/util/pre_condition_transports.c (working copy) @@ -62,55 +62,19 @@ unique_key[1] = rand(); } -int orte_pre_condition_transports(orte_job_t *jdata) +char* orte_pre_condition_transports_print(uint64_t *unique_key) { + unsigned int *int_ptr; size_t i, string_key_len, written_len; - char *cs_env, *string_key = NULL, *format = NULL; - uint64_t unique_key[2]; - unsigned int *int_ptr; - orte_std_cntr_t n; - orte_app_context_t **apps; + char *string_key = NULL, *format = NULL; -#if !defined(__WINDOWS__) - int fd_rand; - size_t bytes_read; - struct stat buf; - - /* put the number here - or else create an appropriate string. this just needs to - * eventually be a string variable - */ - if(0 != stat("/dev/urandom", &buf)) { - /* file doesn't exist! */ - orte_pre_condition_transports_use_rand(unique_key); - } - - if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) { - orte_pre_condition_transports_use_rand(unique_key); - } else { - bytes_read = read(fd_rand, (char *) unique_key, 16); - if(bytes_read != 16) { - orte_pre_condition_transports_use_rand(unique_key); - } else { - close(fd_rand); - } - } -#else - { - unsigned int random_value; - rand_s( &random_value ); - unique_key[0] = (uint64_t)random_value; - rand_s( &random_value ); - unique_key[1] = (uint64_t)random_value; - } -#endif /* !defined(__WINDOWS__) */ - /* string is two 64 bit numbers printed in hex with a dash between * and zero padding. */ string_key_len = (sizeof(uint64_t) * 2) * 2 + strlen("-") + 1; string_key = (char*) malloc(string_key_len); if (NULL == string_key) { - return ORTE_ERR_OUT_OF_RESOURCE; + return NULL; } string_key[0] = '\0'; @@ -147,19 +111,70 @@ format, int_ptr[i]); written_len = strlen(string_key); } - + free(format); + + return string_key; +} + + +int orte_pre_condition_transports(orte_job_t *jdata) +{ + uint64_t unique_key[2]; + int n; + orte_app_context_t *app; + char *string_key, *cs_env; + +#if !defined(__WINDOWS__) + int fd_rand; + size_t bytes_read; + struct stat buf; + + /* put the number here - or else create an appropriate string. this just needs to + * eventually be a string variable + */ + if(0 != stat("/dev/urandom", &buf)) { + /* file doesn't exist! */ + orte_pre_condition_transports_use_rand(unique_key); + } + + if(-1 == (fd_rand = open("/dev/urandom", O_RDONLY))) { + orte_pre_condition_transports_use_rand(unique_key); + } else { + bytes_read = read(fd_rand, (char *) unique_key, 16); + if(bytes_read != 16) { + orte_pre_condition_transports_use_rand(unique_key); + } else { + close(fd_rand); + } + } +#else + { + unsigned int random_value; + rand_s( &random_value ); + unique_key[0] = (uint64_t)random_value; + rand_s( &random_value ); + unique_key[1] = (uint64_t)random_value; + } +#endif /* !defined(__WINDOWS__) */ + + if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { + ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); + return ORTE_ERR_OUT_OF_RESOURCE; + } + if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } - apps = (orte_app_context_t**)jdata->apps->addr; for (n=0; n < jdata->num_apps; n++) { - opal_setenv(cs_env, string_key, true, &apps[n]->env); + if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, n))) { + continue; + } + opal_setenv(cs_env, string_key, true, &app->env); } free(cs_env); - free(format); free(string_key); return ORTE_SUCCESS;