On Oct 18, 2011, at 7:35 AM, TERRY DONTJE wrote:

Strange - it ran fine for me on multiple tests. I'll check to see if something strange got into the mix and recommit.

Not sure whether it is the same issue, but it looks like all of my MTT tests on trunk r25308 are timing out.

Okay - sorry about that. I'm looking into it now. I tested it with a multi-node setup, but it's always possible that something got in there after the tests (and it sounds like it did).

--td

On Oct 17, 2011, at 8:51 PM, George Bosilca wrote:

This commit put the mpirun process in an infinite loop for the simple case:
mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*

 george.

On Oct 17, 2011, at 15:49 , rhc@osl.iu.edu wrote:

Author: rhc
Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
New Revision: 25302
URL: https://svn.open-mpi.org/trac/ompi/changeset/25302

Log:
Fix the mapping algo for computing vpids - it was borked for bynode operations when using nperxxx directives

Text files modified: 
 trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c |    67 ++++++++++++++++++++------------------- 
 1 files changed, 34 insertions(+), 33 deletions(-)

Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
==============================================================================
--- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c	(original)
+++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c	2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
@@ -527,7 +527,7 @@
int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
{
   orte_job_map_t *map;
-    orte_vpid_t vpid;
+    orte_vpid_t vpid, cnt;
   int i, j;
   orte_node_t *node;
   orte_proc_t *proc;
@@ -539,6 +539,7 @@
       ORTE_MAPPING_BYSOCKET & map->policy ||
       ORTE_MAPPING_BYBOARD & map->policy) {
       /* assign the ranks sequentially */
+        vpid = 0;
       for (i=0; i < map->nodes->size; i++) {
           if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
               continue;
@@ -553,12 +554,10 @@
               }
               if (ORTE_VPID_INVALID == proc->name.vpid) {
                   /* find the next available vpid */
-                    for (vpid=0; vpid < jdata->num_procs; vpid++) {
-                        if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
-                            break;
-                        }
+                    while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
+                        vpid++;
                   }
-                    proc->name.vpid = vpid;
+                    proc->name.vpid = vpid++;
                   ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
                   ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));

@@ -580,39 +579,41 @@

   if (ORTE_MAPPING_BYNODE & map->policy) {
       /* assign the ranks round-robin across nodes */
-        for (i=0; i < map->nodes->size; i++) {
-            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
-                continue;
-            }
-            for (j=0; j < node->procs->size; j++) {
-                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+        cnt = 0;
+        vpid = 0;
+        do {
+            for (i=0; i < map->nodes->size; i++) {
+                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
                   continue;
               }
-                /* ignore procs from other jobs */
-                if (proc->name.jobid != jdata->jobid) {
-                    continue;
-                }
-                if (ORTE_VPID_INVALID == proc->name.vpid) {
-                    /* find the next available vpid */
-                    vpid = i;
-                    while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
-                        vpid += map->num_nodes;
-                        if (jdata->num_procs <= vpid) {
-                            vpid = vpid - jdata->num_procs;
+                for (j=0; j < node->procs->size; j++) {
+                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
+                        continue;
+                    }
+                    /* ignore procs from other jobs */
+                    if (proc->name.jobid != jdata->jobid) {
+                        continue;
+                    }
+                    if (ORTE_VPID_INVALID == proc->name.vpid) {
+                        /* find next available vpid */
+                        while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
+                            vpid++;
+                        }
+                        proc->name.vpid = vpid++;
+                        ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
+                        ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
+                        if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
+                                                                              proc->name.vpid, proc))) {
+                            ORTE_ERROR_LOG(rc);
+                            return rc;
                       }
+                        cnt++;
+                        break;  /* move to next node */
                   }
-                    proc->name.vpid = vpid;
-                    ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
-                    ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
-                }
-                if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
-                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
-                        ORTE_ERROR_LOG(rc);
-                        return rc;
-                    }                    
               }
           }
-        }
+        } while (cnt < jdata->num_procs);
+
       return ORTE_SUCCESS;
   }

_______________________________________________
svn mailing list
svn@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/svn
_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel
_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

--
<Mail Attachment.gif>
Terry D. Dontje | Principal Software Engineer
Developer Tools Engineering | +1.781.442.2631
Oracle - Performance Technologies
95 Network Drive, Burlington, MA 01803
Email terry.dontje@oracle.com



_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel