Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn] svn:open-mpi r25302
From: Ralph Castain (rhc_at_[hidden])
Date: 2011-10-18 09:19:11


Strange - it ran fine for me on multiple tests. I'll check to see if something strange got into the mix and recommit.

On Oct 17, 2011, at 8:51 PM, George Bosilca wrote:

> This commit put the mpirun process into an infinite loop for the simple case:
> mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*
>
> george.
>
> On Oct 17, 2011, at 15:49 , rhc_at_[hidden] wrote:
>
>> Author: rhc
>> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>> New Revision: 25302
>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302
>>
>> Log:
>> Fix the mapping algo for computing vpids - it was borked for bynode operations when using nperxxx directives
>>
>> Text files modified:
>> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 ++++++++++++++++++++-------------------
>> 1 files changed, 34 insertions(+), 33 deletions(-)
>>
>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
>> ==============================================================================
>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original)
>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>> @@ -527,7 +527,7 @@
>> int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
>> {
>> orte_job_map_t *map;
>> - orte_vpid_t vpid;
>> + orte_vpid_t vpid, cnt;
>> int i, j;
>> orte_node_t *node;
>> orte_proc_t *proc;
>> @@ -539,6 +539,7 @@
>> ORTE_MAPPING_BYSOCKET & map->policy ||
>> ORTE_MAPPING_BYBOARD & map->policy) {
>> /* assign the ranks sequentially */
>> + vpid = 0;
>> for (i=0; i < map->nodes->size; i++) {
>> if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>> continue;
>> @@ -553,12 +554,10 @@
>> }
>> if (ORTE_VPID_INVALID == proc->name.vpid) {
>> /* find the next available vpid */
>> - for (vpid=0; vpid < jdata->num_procs; vpid++) {
>> - if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
>> - break;
>> - }
>> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>> + vpid++;
>> }
>> - proc->name.vpid = vpid;
>> + proc->name.vpid = vpid++;
>> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>
>> @@ -580,39 +579,41 @@
>>
>> if (ORTE_MAPPING_BYNODE & map->policy) {
>> /* assign the ranks round-robin across nodes */
>> - for (i=0; i < map->nodes->size; i++) {
>> - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>> - continue;
>> - }
>> - for (j=0; j < node->procs->size; j++) {
>> - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>> + cnt = 0;
>> + vpid = 0;
>> + do {
>> + for (i=0; i < map->nodes->size; i++) {
>> + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>> continue;
>> }
>> - /* ignore procs from other jobs */
>> - if (proc->name.jobid != jdata->jobid) {
>> - continue;
>> - }
>> - if (ORTE_VPID_INVALID == proc->name.vpid) {
>> - /* find the next available vpid */
>> - vpid = i;
>> - while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>> - vpid += map->num_nodes;
>> - if (jdata->num_procs <= vpid) {
>> - vpid = vpid - jdata->num_procs;
>> + for (j=0; j < node->procs->size; j++) {
>> + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>> + continue;
>> + }
>> + /* ignore procs from other jobs */
>> + if (proc->name.jobid != jdata->jobid) {
>> + continue;
>> + }
>> + if (ORTE_VPID_INVALID == proc->name.vpid) {
>> + /* find next available vpid */
>> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>> + vpid++;
>> + }
>> + proc->name.vpid = vpid++;
>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>> + ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>> + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
>> + proc->name.vpid, proc))) {
>> + ORTE_ERROR_LOG(rc);
>> + return rc;
>> }
>> + cnt++;
>> + break; /* move to next node */
>> }
>> - proc->name.vpid = vpid;
>> - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>> - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>> - }
>> - if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
>> - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
>> - ORTE_ERROR_LOG(rc);
>> - return rc;
>> - }
>> }
>> }
>> - }
>> + } while (cnt < jdata->num_procs);
>> +
>> return ORTE_SUCCESS;
>> }
>>
>> _______________________________________________
>> svn mailing list
>> svn_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>
>
> _______________________________________________
> devel mailing list
> devel_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel