Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn] svn:open-mpi r25302
From: George Bosilca (bosilca_at_[hidden])
Date: 2011-10-17 22:51:20


This commit puts the mpirun process into an infinite loop for the simple case:
mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*

  george.

On Oct 17, 2011, at 15:49 , rhc_at_[hidden] wrote:

> Author: rhc
> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
> New Revision: 25302
> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302
>
> Log:
> Fix the mapping algo for computing vpids - it was borked for bynode operations when using nperxxx directives
>
> Text files modified:
> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 ++++++++++++++++++++-------------------
> 1 files changed, 34 insertions(+), 33 deletions(-)
>
> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
> ==============================================================================
> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original)
> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
> @@ -527,7 +527,7 @@
> int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
> {
> orte_job_map_t *map;
> - orte_vpid_t vpid;
> + orte_vpid_t vpid, cnt;
> int i, j;
> orte_node_t *node;
> orte_proc_t *proc;
> @@ -539,6 +539,7 @@
> ORTE_MAPPING_BYSOCKET & map->policy ||
> ORTE_MAPPING_BYBOARD & map->policy) {
> /* assign the ranks sequentially */
> + vpid = 0;
> for (i=0; i < map->nodes->size; i++) {
> if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
> continue;
> @@ -553,12 +554,10 @@
> }
> if (ORTE_VPID_INVALID == proc->name.vpid) {
> /* find the next available vpid */
> - for (vpid=0; vpid < jdata->num_procs; vpid++) {
> - if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
> - break;
> - }
> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
> + vpid++;
> }
> - proc->name.vpid = vpid;
> + proc->name.vpid = vpid++;
> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>
> @@ -580,39 +579,41 @@
>
> if (ORTE_MAPPING_BYNODE & map->policy) {
> /* assign the ranks round-robin across nodes */
> - for (i=0; i < map->nodes->size; i++) {
> - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
> - continue;
> - }
> - for (j=0; j < node->procs->size; j++) {
> - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
> + cnt = 0;
> + vpid = 0;
> + do {
> + for (i=0; i < map->nodes->size; i++) {
> + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
> continue;
> }
> - /* ignore procs from other jobs */
> - if (proc->name.jobid != jdata->jobid) {
> - continue;
> - }
> - if (ORTE_VPID_INVALID == proc->name.vpid) {
> - /* find the next available vpid */
> - vpid = i;
> - while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
> - vpid += map->num_nodes;
> - if (jdata->num_procs <= vpid) {
> - vpid = vpid - jdata->num_procs;
> + for (j=0; j < node->procs->size; j++) {
> + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
> + continue;
> + }
> + /* ignore procs from other jobs */
> + if (proc->name.jobid != jdata->jobid) {
> + continue;
> + }
> + if (ORTE_VPID_INVALID == proc->name.vpid) {
> + /* find next available vpid */
> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
> + vpid++;
> + }
> + proc->name.vpid = vpid++;
> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
> + ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
> + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
> + proc->name.vpid, proc))) {
> + ORTE_ERROR_LOG(rc);
> + return rc;
> }
> + cnt++;
> + break; /* move to next node */
> }
> - proc->name.vpid = vpid;
> - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
> - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
> - }
> - if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
> - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
> - ORTE_ERROR_LOG(rc);
> - return rc;
> - }
> }
> }
> - }
> + } while (cnt < jdata->num_procs);
> +
> return ORTE_SUCCESS;
> }
>
> _______________________________________________
> svn mailing list
> svn_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/svn