Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] [OMPI svn] svn:open-mpi r25302
From: TERRY DONTJE (terry.dontje_at_[hidden])
Date: 2011-10-18 09:35:42


> Strange - it ran fine for me on multiple tests. I'll check to see if something strange got into the mix and recommit.
>
Not sure if it is the same issue, but it looks like all my MTT tests on
the trunk (r25308) are timing out.
--td

> On Oct 17, 2011, at 8:51 PM, George Bosilca wrote:
>
>> This commit puts the mpirun process into an infinite loop for the simple case:
>> mpirun -np 2 --mca orte_default_hostfile machinefile --bynode *my_app*
>>
>> george.
>>
>> On Oct 17, 2011, at 15:49 , rhc_at_[hidden] wrote:
>>
>>> Author: rhc
>>> Date: 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>>> New Revision: 25302
>>> URL: https://svn.open-mpi.org/trac/ompi/changeset/25302
>>>
>>> Log:
>>> Fix the mapping algo for computing vpids - it was borked for bynode operations when using nperxxx directives
>>>
>>> Text files modified:
>>> trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c | 67 ++++++++++++++++++++-------------------
>>> 1 files changed, 34 insertions(+), 33 deletions(-)
>>>
>>> Modified: trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c
>>> ==============================================================================
>>> --- trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c (original)
>>> +++ trunk/orte/mca/rmaps/base/rmaps_base_support_fns.c 2011-10-17 15:49:04 EDT (Mon, 17 Oct 2011)
>>> @@ -527,7 +527,7 @@
>>> int orte_rmaps_base_compute_vpids(orte_job_t *jdata)
>>> {
>>> orte_job_map_t *map;
>>> - orte_vpid_t vpid;
>>> + orte_vpid_t vpid, cnt;
>>> int i, j;
>>> orte_node_t *node;
>>> orte_proc_t *proc;
>>> @@ -539,6 +539,7 @@
>>> ORTE_MAPPING_BYSOCKET& map->policy ||
>>> ORTE_MAPPING_BYBOARD& map->policy) {
>>> /* assign the ranks sequentially */
>>> + vpid = 0;
>>> for (i=0; i< map->nodes->size; i++) {
>>> if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>> continue;
>>> @@ -553,12 +554,10 @@
>>> }
>>> if (ORTE_VPID_INVALID == proc->name.vpid) {
>>> /* find the next available vpid */
>>> - for (vpid=0; vpid< jdata->num_procs; vpid++) {
>>> - if (NULL == opal_pointer_array_get_item(jdata->procs, vpid)) {
>>> - break;
>>> - }
>>> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>>> + vpid++;
>>> }
>>> - proc->name.vpid = vpid;
>>> + proc->name.vpid = vpid++;
>>> ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>> ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>>
>>> @@ -580,39 +579,41 @@
>>>
>>> if (ORTE_MAPPING_BYNODE& map->policy) {
>>> /* assign the ranks round-robin across nodes */
>>> - for (i=0; i< map->nodes->size; i++) {
>>> - if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>> - continue;
>>> - }
>>> - for (j=0; j< node->procs->size; j++) {
>>> - if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>>> + cnt = 0;
>>> + vpid = 0;
>>> + do {
>>> + for (i=0; i< map->nodes->size; i++) {
>>> + if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
>>> continue;
>>> }
>>> - /* ignore procs from other jobs */
>>> - if (proc->name.jobid != jdata->jobid) {
>>> - continue;
>>> - }
>>> - if (ORTE_VPID_INVALID == proc->name.vpid) {
>>> - /* find the next available vpid */
>>> - vpid = i;
>>> - while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>>> - vpid += map->num_nodes;
>>> - if (jdata->num_procs<= vpid) {
>>> - vpid = vpid - jdata->num_procs;
>>> + for (j=0; j< node->procs->size; j++) {
>>> + if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
>>> + continue;
>>> + }
>>> + /* ignore procs from other jobs */
>>> + if (proc->name.jobid != jdata->jobid) {
>>> + continue;
>>> + }
>>> + if (ORTE_VPID_INVALID == proc->name.vpid) {
>>> + /* find next available vpid */
>>> + while (NULL != opal_pointer_array_get_item(jdata->procs, vpid)) {
>>> + vpid++;
>>> + }
>>> + proc->name.vpid = vpid++;
>>> + ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>> + ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>> + if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs,
>>> + proc->name.vpid, proc))) {
>>> + ORTE_ERROR_LOG(rc);
>>> + return rc;
>>> }
>>> + cnt++;
>>> + break; /* move to next node */
>>> }
>>> - proc->name.vpid = vpid;
>>> - ORTE_EPOCH_SET(proc->name.epoch,ORTE_EPOCH_INVALID);
>>> - ORTE_EPOCH_SET(proc->name.epoch,orte_ess.proc_get_epoch(&proc->name));
>>> - }
>>> - if (NULL == opal_pointer_array_get_item(jdata->procs, proc->name.vpid)) {
>>> - if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
>>> - ORTE_ERROR_LOG(rc);
>>> - return rc;
>>> - }
>>> }
>>> }
>>> - }
>>> + } while (cnt< jdata->num_procs);
>>> +
>>> return ORTE_SUCCESS;
>>> }
>>>
>>> _______________________________________________
>>> svn mailing list
>>> svn_at_[hidden]
>>> http://www.open-mpi.org/mailman/listinfo.cgi/svn
>>
>> _______________________________________________
>> devel mailing list
>> devel_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>
> _______________________________________________
> devel mailing list
> devel_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel

-- 
Oracle
Terry D. Dontje | Principal Software Engineer
Developer Tools Engineering | +1.781.442.2631
Oracle *- Performance Technologies*
95 Network Drive, Burlington, MA 01803
Email terry.dontje_at_[hidden] <mailto:terry.dontje_at_[hidden]>



picture