Add debug and handle the use-case where someone (a) uses a hostfile while in a managed allocation to sub-allocate runs, and (b) includes the HNP's node in one of those hostfiles.
author rhc
Fri Mar 22 00:53:33 2013 +0000 (2 months ago)
changeset 21519 97a763a7cc19
parent 21518 89648481c1c4
child 21521 741e869129d3
Add debug and handle the use-case where someone (a) uses a hostfile while in a managed allocation to sub-allocate runs, and (b) includes the HNP's node in one of those hostfiles.

cmr:v1.7
orte/mca/plm/base/plm_base_launch_support.c
orte/mca/rmaps/base/rmaps_base_support_fns.c
     1.1 --- a/orte/mca/plm/base/plm_base_launch_support.c	Thu Mar 21 23:05:54 2013 +0000
     1.2 +++ b/orte/mca/plm/base/plm_base_launch_support.c	Fri Mar 22 00:53:33 2013 +0000
     1.3 @@ -1246,14 +1246,20 @@
     1.4              }
     1.5              /* ignore nodes that are marked as do-not-use for this mapping */
     1.6              if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
     1.7 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
     1.8 +                                     "NODE %s IS MARKED NO_USE", node->name));
     1.9                  /* reset the state so it can be used another time */
    1.10                  node->state = ORTE_NODE_STATE_UP;
    1.11                  continue;
    1.12              }
    1.13              if (ORTE_NODE_STATE_DOWN == node->state) {
    1.14 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.15 +                                     "NODE %s IS MARKED DOWN", node->name));
    1.16                  continue;
    1.17              }
    1.18              if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
    1.19 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.20 +                                     "NODE %s IS MARKED NO_INCLUDE", node->name));
    1.21                  /* not to be used */
    1.22                  continue;
    1.23              }
    1.24 @@ -1388,14 +1394,20 @@
    1.25                  /* have a match - now see if we want this node */
    1.26                  /* ignore nodes that are marked as do-not-use for this mapping */
    1.27                  if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
    1.28 +                    OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.29 +                                         "NODE %s IS MARKED NO_USE", node->name));
    1.30                      /* reset the state so it can be used another time */
    1.31                      node->state = ORTE_NODE_STATE_UP;
    1.32                      break;
    1.33                  }
    1.34                  if (ORTE_NODE_STATE_DOWN == node->state) {
    1.35 +                    OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.36 +                                         "NODE %s IS MARKED DOWN", node->name));
    1.37                      break;
    1.38                  }
    1.39                  if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
    1.40 +                    OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.41 +                                         "NODE %s IS MARKED NO_INCLUDE", node->name));
    1.42                      break;
    1.43                  }
    1.44                  /* if this node is us, ignore it */
    1.45 @@ -1430,21 +1442,25 @@
    1.46          goto process;
    1.47      }
    1.48  
    1.49 -    /* construct a list of available nodes - don't need ours as
    1.50 -     * we already exist
    1.51 -     */
    1.52 +    /* construct a list of available nodes */
    1.53      for (i=1; i < orte_node_pool->size; i++) {
    1.54          if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
    1.55              /* ignore nodes that are marked as do-not-use for this mapping */
    1.56              if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
    1.57 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.58 +                                     "NODE %s IS MARKED NO_USE", node->name));
    1.59                  /* reset the state so it can be used another time */
    1.60                  node->state = ORTE_NODE_STATE_UP;
    1.61                  continue;
    1.62              }
    1.63              if (ORTE_NODE_STATE_DOWN == node->state) {
    1.64 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.65 +                                     "NODE %s IS MARKED DOWN", node->name));
    1.66                  continue;
    1.67              }
    1.68              if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
    1.69 +                OPAL_OUTPUT_VERBOSE((10, orte_plm_globals.output,
    1.70 +                                     "NODE %s IS MARKED NO_INCLUDE", node->name));
    1.71                  /* not to be used */
    1.72                  continue;
    1.73              }
    1.74 @@ -1461,13 +1477,14 @@
    1.75      }
    1.76  
    1.77      /* if we didn't get anything, then we are the only node in the
    1.78 -     * allocation - so there is nothing else to do as no other
    1.79 +     * system - so there is nothing else to do as no other
    1.80       * daemons are to be launched
    1.81       */
    1.82      if (0 == opal_list_get_size(&nodes)) {
    1.83          OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
    1.84                               "%s plm:base:setup_vm only HNP in allocation",
    1.85                               ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    1.86 +        /* cleanup */
    1.87          OBJ_DESTRUCT(&nodes);
    1.88          /* mark that the daemons have reported so we can proceed */
    1.89          daemons->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
    1.90 @@ -1475,7 +1492,17 @@
    1.91          return ORTE_SUCCESS;
    1.92      }
    1.93  
    1.94 -    /* filter across the union of all app_context specs */
    1.95 +    /* filter across the union of all app_context specs - if the HNP
    1.96 +     * was allocated, then we have to include
    1.97 +     * ourselves in case someone has specified a -host or hostfile
    1.98 +     * that includes the head node. We will remove ourselves later
    1.99 +     * as we clearly already exist
   1.100 +     */
   1.101 +    if (orte_hnp_is_allocated) {
   1.102 +        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
   1.103 +        OBJ_RETAIN(node);
   1.104 +        opal_list_append(&nodes, &node->super);
   1.105 +    }
   1.106      for (i=0; i < jdata->apps->size; i++) {
   1.107          if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
   1.108              continue;
   1.109 @@ -1507,6 +1534,18 @@
   1.110          }
   1.111      }
   1.112  
   1.113 +    /* ensure we are not on the list */
   1.114 +    for (item = opal_list_get_first(&nodes);
   1.115 +         item != opal_list_get_end(&nodes);
   1.116 +         item = opal_list_get_next(item)) {
   1.117 +        node = (orte_node_t*)item;
   1.118 +        if (0 == node->index) {
   1.119 +            opal_list_remove_item(&nodes, item);
   1.120 +            OBJ_RELEASE(item);
   1.121 +            break;
   1.122 +        }
   1.123 +    }
   1.124 +
   1.125      /* if we didn't get anything, then we are the only node in the
   1.126       * allocation - so there is nothing else to do as no other
   1.127       * daemons are to be launched
     2.1 --- a/orte/mca/rmaps/base/rmaps_base_support_fns.c	Thu Mar 21 23:05:54 2013 +0000
     2.2 +++ b/orte/mca/rmaps/base/rmaps_base_support_fns.c	Fri Mar 22 00:53:33 2013 +0000
     2.3 @@ -255,18 +255,27 @@
     2.4                      continue;
     2.5                  }
     2.6                  if (0 != strcmp(node->name, nptr->name)) {
     2.7 +                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
     2.8 +                                         "NODE %s DOESNT MATCH NODE %s",
     2.9 +                                         node->name, nptr->name));
    2.10                      continue;
    2.11                  }
    2.12                  /* ignore nodes that are marked as do-not-use for this mapping */
    2.13                  if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
    2.14 +                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.15 +                                         "NODE %s IS MARKED NO_USE", node->name));
    2.16                      /* reset the state so it can be used another time */
    2.17                      node->state = ORTE_NODE_STATE_UP;
    2.18                      continue;
    2.19                  }
    2.20                  if (ORTE_NODE_STATE_DOWN == node->state) {
    2.21 +                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.22 +                                         "NODE %s IS DOWN", node->name));
    2.23                      continue;
    2.24                  }
    2.25                  if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
    2.26 +                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.27 +                                         "NODE %s IS MARKED NO_INCLUDE", node->name));
    2.28                      /* not to be used */
    2.29                      continue;
    2.30                  }
    2.31 @@ -274,6 +283,8 @@
    2.32                   * unless we are mapping prior to launching the vm
    2.33                   */
    2.34                  if (NULL == node->daemon && !novm) {
    2.35 +                    OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.36 +                                         "NODE %s HAS NO DAEMON", node->name));
    2.37                      continue;
    2.38                  }
    2.39                  /* retain a copy for our use in case the item gets
    2.40 @@ -327,6 +338,8 @@
    2.41      if (orte_hnp_is_allocated && !(ORTE_GET_MAPPING_DIRECTIVE(policy) & ORTE_MAPPING_NO_USE_LOCAL)) {
    2.42          if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0))) {
    2.43              if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
    2.44 +                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.45 +                                     "HNP IS MARKED NO_USE"));
    2.46                  /* clear this for future use, but don't include it */
    2.47                  node->state = ORTE_NODE_STATE_UP;
    2.48              } else if (ORTE_NODE_STATE_NOT_INCLUDED != node->state) {
    2.49 @@ -359,14 +372,20 @@
    2.50          if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
    2.51              /* ignore nodes that are marked as do-not-use for this mapping */
    2.52              if (ORTE_NODE_STATE_DO_NOT_USE == node->state) {
    2.53 +                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.54 +                                     "NODE %s IS MARKED NO_USE", node->name));
    2.55                  /* reset the state so it can be used another time */
    2.56                  node->state = ORTE_NODE_STATE_UP;
    2.57                  continue;
    2.58              }
    2.59              if (ORTE_NODE_STATE_DOWN == node->state) {
    2.60 +                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.61 +                                     "NODE %s IS MARKED DOWN", node->name));
    2.62                  continue;
    2.63              }
    2.64              if (ORTE_NODE_STATE_NOT_INCLUDED == node->state) {
    2.65 +                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.66 +                                     "NODE %s IS MARKED NO_INCLUDE", node->name));
    2.67                  /* not to be used */
    2.68                  continue;
    2.69              }
    2.70 @@ -374,6 +393,8 @@
    2.71               * unless we are mapping prior to launching the vm
    2.72               */
    2.73              if (NULL == node->daemon && !novm) {
    2.74 +                OPAL_OUTPUT_VERBOSE((10, orte_rmaps_base.rmaps_output,
    2.75 +                                     "NODE %s HAS NO DAEMON", node->name));
    2.76                  continue;
    2.77              }
    2.78              /* retain a copy for our use in case the item gets