Your code works for me on two platforms. Thus, I guess the problem is with the communication layer (BTL) in Open MPI. What network do you use? If Ethernet, how many interfaces?

  Thanks,
    george.

On Oct 10, 2012, at 09:30 , Santhosh Kokala <Santhosh.Kokala@riverbed.com> wrote:

I have a problem with my MPI code: it hangs when the code is run on multiple nodes. It successfully completes when run on a single node. I am not sure how to debug this. Can someone help me debug this issue?

Program Usage:

mpicc -o string string.cpp
mpirun -np 4 -npernode 2 -hostfile hosts ./string 12 0.1 0.9 10 2
 
MPI_Reduce Hangs in 2nd iteration: (Output cout statements from my program)
 
1st Iteration (Timestep 1)
-----------------------------------------------------
0 Waiting for MPI_Reduce()
0 Done Waiting for MPI_Reduce()
 
1 Waiting for MPI_Reduce()
1 Done Waiting for MPI_Reduce()
 
2 Waiting for MPI_Reduce()
2 Done Waiting for MPI_Reduce()
 
3 Waiting for MPI_Reduce()
3 Done Waiting for MPI_Reduce()
 
0 Sending to right  task      = 1
0 Receiving from right task   = 1
 
1 Receiving from left task   = 0
1 Sending to left task       = 0
 
1 Sending to right  task      = 2
1 Receiving from right task   = 2
 
 
2 Receiving from left task   = 1
2 Sending to left task       = 1
 
2 Sending to right  task      = 3
2 Receiving from right task   = 3
 
3 Receiving from left task   = 2
3 Sending to left task       = 2
 
 
 
2nd Iteration (Timestep 2)
-----------------------------------------------------
0 Waiting for MPI_Reduce()
 
1 Waiting for MPI_Reduce()
1 Done Waiting for MPI_Reduce()
 
2 Waiting for MPI_Reduce()
 
3 Waiting for MPI_Reduce()
 
 
 
My Code:
 
#include <iostream>
#include <vector>
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"
 
// Rank used as the root of MPI_Reduce, i.e. the task that receives the
// global sum.
const int MASTER = 0;

// Message tags for the neighbour exchange. RtoL tags a value sent to the
// left neighbour (and received from the right one); LtoR tags a value sent
// to the right neighbour (and received from the left one).
const int RtoL = 10;
const int LtoR = 20;
 
int main ( int argc, char **argv )
{
    int nprocs, taskid;
    FILE *f = NULL;
    int left, right, i_start, i_end;
    float sum = 0;
    MPI_Status status;
    float *y, *yold;
    float *v, *vold;
 
    //  const int NUM_MASSES = 1000;
    //  const float Ktension = 0.1;
    //  const float Kdamping = 0.9;
    //  const float duration = 10.0;
 
#if 0
    if ( argc != 5 ) {
        std::cout << "usage: " << argv[0] << " NUM_MASSES durationInSecs Ktension Kdamping\n";
        return 2;
    }
#endif
 
    int NUM_MASSES  = atoi ( argv[1] );
    float duration = atof ( argv[2] );
    float Ktension = atof ( argv[3] );
    float Kdamping = atof ( argv[4] );
    const int PICKUP_POS = NUM_MASSES / 7;
    const int OVERSAMPLING = 16;
 
    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
 
    if (taskid  == 0) {
        f = fopen ( "rstring.raw", "wb" );
        if (!f) {
            std::cout << "can't open output file\n";
            return 1;
        }
    }
 
    y = new float[NUM_MASSES];
    yold = new float[NUM_MASSES];
    v = new float[NUM_MASSES];
 
    for (int i = 0; i < NUM_MASSES; i++ ) {
        v[i]  = 0.0f;
        yold[i] = y[i] = 0.0f;
        if (i == NUM_MASSES/2 )
            yold[i] = 1.0;
    }
 
    if (taskid == 0) {
        left = -1;
        right = 1;
    } else if (taskid == nprocs - 1) {
        left = taskid - 1;
        right = -1;
    } else {
        left = taskid - 1;
        right = taskid + 1;
    }
 
    i_start = taskid * (NUM_MASSES/nprocs);
    i_end = i_start + (NUM_MASSES/nprocs);
 
    int numIters = duration * 44100 * OVERSAMPLING;;
    if (argc == 6) {
        numIters = atoi(argv[5]);
    }
 
    for ( int t = 0; t < numIters; t++ ) {
        float sum = 0;
        float gsum = 0;
 
        for ( int i = i_start; i < i_end; i++ ) {
            if ( i == 0 || i == NUM_MASSES-1 ) {
            } else {
                float accel = Ktension * (yold[i+1] + yold[i-1] - 2*yold[i]);
                v[i] += accel;
                v[i] *= Kdamping;
                y[i] = yold[i] + v[i];
                sum += y[i];
            }
        }
 
        std::cout << taskid << " Waiting for MPI_Reduce()" << std::endl;
        MPI_Reduce(&sum, &gsum, 1, MPI_FLOAT, MPI_SUM, MASTER, MPI_COMM_WORLD);
        std::cout << taskid << " Done Waiting for MPI_Reduce()" << std::endl;
 
        if (taskid != 0) {
            MPI_Recv(&y[i_start-1], 1, MPI_FLOAT, left, LtoR, MPI_COMM_WORLD, &status);
            std::cout << taskid << " Receiving from left task   = " << left << std::endl;
            MPI_Send(&y[i_start],   1, MPI_FLOAT, left, RtoL, MPI_COMM_WORLD);
            std::cout << taskid << " Sending to left task       = " << left << std::endl;
        }
        if (taskid != nprocs - 1) {
            MPI_Send(&y[i_end-1],1, MPI_FLOAT, right, LtoR, MPI_COMM_WORLD);
            std::cout << taskid <<" Sending to right  task      = " << right << std::endl;
            MPI_Recv(&y[i_end],  1, MPI_FLOAT, right, RtoL, MPI_COMM_WORLD, &status);
            std::cout << taskid <<" Receiving from right task   = " << right << std::endl;
        }
 
        //printf("After Reduce task = %d yold = %f %f %f %f\n", taskid,yold[0], yold[1], yold[2], yold[3]);
        //printf("After Reduce task = %d y = %f %f %f %f\n", taskid, y[0], y[1], y[2], y[3]);
        //printf("After Reduce task = %d v = %f %f %f %f\n", taskid, v[0], v[1], v[2], v[3]);
 
        float *tmp = y;
        y = yold;
        yold = tmp;
 
        if (taskid == 0) {
            //std::cout<< "sum = " << gsum << std::endl;
            if ( t % OVERSAMPLING == 0 ) {
                fwrite ( &gsum, sizeof(float), 1, f );
            }
        }
    }
    if (taskid  == 0) {
        fclose ( f );
    }
    MPI_Finalize();
}
 
_______________________________________________
devel mailing list
devel@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel