Add debug output and handle the use case where someone (a) uses a hostfile while in a managed allocation to sub-allocate runs, and (b) includes the HNP's node in one of those hostfiles.
cmr:v1.7
1.1 --- a/orte/mca/plm/base/plm_base_launch_support.c Thu Mar 21 23:05:54 2013 +0000
1.2 +++ b/orte/mca/plm/base/plm_base_launch_support.c Fri Mar 22 00:53:33 2013 +0000
1.3 @@ -1246,14 +1246,20 @@
1.4 }
1.5 /* ignore nodes that are marked as do-not-use for this mapping */
1.6 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
1.7 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.8 + "NODE %s IS MARKED NO_USE", node->name));
1.9 /* reset the state so it can be used another time */
1.10 node->state = ORTE_NODE_STATE_UP;
1.11 continue;
1.12 }
1.13 if (ORTE_NODE_STATE_DOWN == node->state) {
1.14 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.15 + "NODE %s IS MARKED DOWN", node->name));
1.16 continue;
1.17 }
1.18 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
1.19 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.20 + "NODE %s IS MARKED NO_INCLUDE", node->name));
1.21 /* not to be used */
1.22 continue;
1.23 }
1.24 @@ -1388,14 +1394,20 @@
1.25 /* have a match - now see if we want this node */
1.26 /* ignore nodes that are marked as do-not-use for this mapping */
1.27 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
1.28 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.29 + "NODE %s IS MARKED NO_USE", node->name));
1.30 /* reset the state so it can be used another time */
1.31 node->state = ORTE_NODE_STATE_UP;
1.32 break;
1.33 }
1.34 if (ORTE_NODE_STATE_DOWN == node->state) {
1.35 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.36 + "NODE %s IS MARKED DOWN", node->name));
1.37 break;
1.38 }
1.39 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
1.40 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.41 + "NODE %s IS MARKED NO_INCLUDE", node->name));
1.42 break;
1.43 }
1.44 /* if this node is us, ignore it */
1.45 @@ -1430,21 +1442,25 @@
1.46 goto process;
1.47 }
1.48
1.49 - /* construct a list of available nodes - don't need ours as
1.50 - * we already exist
1.51 - */
1.52 + /* construct a list of available nodes */
1.53 for (i=1; i < orte_node_pool->size; i++) {
1.54 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
1.55 /* ignore nodes that are marked as do-not-use for this mapping */
1.56 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
1.57 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.58 + "NODE %s IS MARKED NO_USE", node->name));
1.59 /* reset the state so it can be used another time */
1.60 node->state = ORTE_NODE_STATE_UP;
1.61 continue;
1.62 }
1.63 if (ORTE_NODE_STATE_DOWN == node->state) {
1.64 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.65 + "NODE %s IS MARKED DOWN", node->name));
1.66 continue;
1.67 }
1.68 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
1.69 + OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
1.70 + "NODE %s IS MARKED NO_INCLUDE", node->name));
1.71 /* not to be used */
1.72 continue;
1.73 }
1.74 @@ -1461,13 +1477,14 @@
1.75 }
1.76
1.77 /* if we didn't get anything, then we are the only node in the
1.78 - * allocation - so there is nothing else to do as no other
1.79 + * system - so there is nothing else to do as no other
1.80 * daemons are to be launched
1.81 */
1.82 if (0 == opal_list_get_size(&nodes)) {
1.83 OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
1.84 "%s plm:base:setup_vm only HNP in allocation",
1.85 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1.86 + /* cleanup */
1.87 OBJ_DESTRUCT(&nodes);
1.88 /* mark that the daemons have reported so we can proceed */
1.89 daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
1.90 @@ -1475,7 +1492,17 @@
1.91 return ORTE_SUCCESS;
1.92 }
1.93
1.94 - /* filter across the union of all app_context specs */
1.95 + /* filter across the union of all app_context specs - if the HNP
1.96 + * was allocated, then we have to include
1.97 + * ourselves in case someone has specified a -host or hostfile
1.98 + * that includes the head node. We will remove ourselves later
1.99 + * as we clearly already exist
1.100 + */
1.101 + if (orte_hnp_is_allocated) {
1.102 + node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
1.103 + OBJ_RETAIN(node);
1.104 + opal_list_append(&nodes, &node->super);
1.105 + }
1.106 for (i=0; i < jdata->apps->size; i++) {
1.107 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
1.108 continue;
1.109 @@ -1507,6 +1534,18 @@
1.110 }
1.111 }
1.112
1.113 + /* ensure we are not on the list */
1.114 + for (item = opal_list_get_first(&nodes);
1.115 + item != opal_list_get_end(&nodes);
1.116 + item = opal_list_get_next(item)) {
1.117 + node = (orte_node_t*)item;
1.118 + if (0 == node->index) {
1.119 + opal_list_remove_item(&nodes, item);
1.120 + OBJ_RELEASE(item);
1.121 + break;
1.122 + }
1.123 + }
1.124 +
1.125 /* if we didn't get anything, then we are the only node in the
1.126 * allocation - so there is nothing else to do as no other
1.127 * daemons are to be launched
2.1 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c Thu Mar 21 23:05:54 2013 +0000
2.2 +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c Fri Mar 22 00:53:33 2013 +0000
2.3 @@ -255,18 +255,27 @@
2.4 continue;
2.5 }
2.6 if (0 != strcmp(node->name, nptr->name)) {
2.7 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.8 + "NODE %s DOESNT MATCH NODE %s",
2.9 + node->name, nptr->name));
2.10 continue;
2.11 }
2.12 /* ignore nodes that are marked as do-not-use for this mapping */
2.13 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2.14 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.15 + "NODE %s IS MARKED NO_USE", node->name));
2.16 /* reset the state so it can be used another time */
2.17 node->state = ORTE_NODE_STATE_UP;
2.18 continue;
2.19 }
2.20 if (ORTE_NODE_STATE_DOWN == node->state) {
2.21 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.22 + "NODE %s IS DOWN", node->name));
2.23 continue;
2.24 }
2.25 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2.26 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.27 + "NODE %s IS MARKED NO_INCLUDE", node->name));
2.28 /* not to be used */
2.29 continue;
2.30 }
2.31 @@ -274,6 +283,8 @@
2.32 * unless we are mapping prior to launching the vm
2.33 */
2.34 if (NULL == node->daemon && !novm) {
2.35 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.36 + "NODE %s HAS NO DAEMON", node->name));
2.37 continue;
2.38 }
2.39 /* retain a copy for our use in case the item gets
2.40 @@ -327,6 +338,8 @@
2.41 if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
2.42 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
2.43 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2.44 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.45 + "HNP IS MARKED NO_USE"));
2.46 /* clear this for future use, but don't include it */
2.47 node->state = ORTE_NODE_STATE_UP;
2.48 } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
2.49 @@ -359,14 +372,20 @@
2.50 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
2.51 /* ignore nodes that are marked as do-not-use for this mapping */
2.52 if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
2.53 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.54 + "NODE %s IS MARKED NO_USE", node->name));
2.55 /* reset the state so it can be used another time */
2.56 node->state = ORTE_NODE_STATE_UP;
2.57 continue;
2.58 }
2.59 if (ORTE_NODE_STATE_DOWN == node->state) {
2.60 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.61 + "NODE %s IS MARKED DOWN", node->name));
2.62 continue;
2.63 }
2.64 if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
2.65 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.66 + "NODE %s IS MARKED NO_INCLUDE", node->name));
2.67 /* not to be used */
2.68 continue;
2.69 }
2.70 @@ -374,6 +393,8 @@
2.71 * unless we are mapping prior to launching the vm
2.72 */
2.73 if (NULL == node->daemon && !novm) {
2.74 + OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
2.75 + "NODE %s HAS NO DAEMON", node->name));
2.76 continue;
2.77 }
2.78 /* retain a copy for our use in case the item gets