Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: [OMPI users] GM + OpenMPI bug ...
From: José Ignacio Aliaga Estellés (aliaga_at_[hidden])
Date: 2010-05-12 16:57:25


Hi,

I think that I have found a bug in the implementation of the GM
collective routines included in OpenMPI. The version of the GM
software is 2.0.30 for the PCI64 cards.
Sometimes, when I broadcast a vector of 1024 integers by using the
MPI_Bcast call, some processor receives a bad packet. Usually, the
difference from the original packet is only 1 bit, but it is enough
to break the communication.
I see the same problem with both the 1.4.1 and the 1.4.2 versions.
Could you help me? Thanks.

Best regards,

   José I. Aliaga

=================================================
   COMPILATION COMMAND
=================================================
   mpicc test_comm.c -o test_comm

=================================================
   EXECUTION COMMAND
=================================================
   mpirun --mca btl gm,sm,self -np 8 -machinefile mach_file test_comm 1 10 1000

=================================================
   SOME EXECUTION ERRORS
=================================================
## EXECUTION 1 ##
[2] receives 3039 when it must receive 7135. Its subtraction is 4096.
[2] receives 7142 when it must receive 7143. Its subtraction is 1.
[2,411th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 2 ##
[5] receives 7142 when it must receive 7136. Its subtraction is 6.
[5,277th] (Bcast of 0 en 8) 1024 integers with 1 errors.
[1] receives 7138 when it must receive 7140. Its subtraction is 2.
[1,385th] (Bcast of 4 en 8) 1024 integers with 1 errors.

## EXECUTION 3 ##
[5] receives 3038 when it must receive 7134. Its subtraction is 4096.
[5] receives 7141 when it must receive 7142. Its subtraction is 1.
[5,479th] (Bcast of 6 en 8) 1024 integers with 2 errors.

## EXECUTION 4 ##
[3] receives 3034 when it must receive 7130. Its subtraction is 4096.
[3] receives 7140 when it must receive 7138. Its subtraction is 2.
[3,539th] (Bcast of 2 en 8) 1024 integers with 2 errors.

## EXECUTION 5 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,135th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 6 ##
[5] receives 7135 when it must receive 3039. Its subtraction is 4096.
[5] receives 3046 when it must receive 3047. Its subtraction is 1.
[5,246th] (Bcast of 7 en 8) 1024 integers with 2 errors.

## EXECUTION 7 ##
[2] receives 7128 when it must receive 3032. Its subtraction is 4096.
[2] receives 3047 when it must receive 3040. Its subtraction is 7.
[2,232th] (Bcast of 0 en 8) 1024 integers with 2 errors.

## EXECUTION 8 ##
[3] receives 3036 when it must receive 7132. Its subtraction is 4096.
[3] receives 7139 when it must receive 7140. Its subtraction is 1.
[3,344th] (Bcast of 4 en 8) 1024 integers with 2 errors.

=================================================
   SOURCE CODE --> test_comm.c
=================================================
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <mpi.h>

/*
 * Allocate an uninitialized array of `tam` ints.
 *
 * If the allocation fails, "ERROR MEMORIA (<rank>)" is printed to stdout,
 * where <rank> is the caller's rank in MPI_COMM_WORLD, and the process
 * terminates through exit(-1).
 *
 * Returns the new buffer; the caller releases it with free().
 */
int *CreateVector (int tam) {
   int rank;
   int *buffer;

   /* The rank is only used to tag the out-of-memory message. */
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   buffer = malloc(sizeof(int) * tam);
   if (!buffer) {
     printf ("ERROR MEMORIA (%d)\n", rank);
     exit(-1);
   }

   return buffer;
}

/*
 * Fill the first `tam` elements of `vec`.
 *
 * inic >= 0: vec[k] = inic + k * P, where P is the size of MPI_COMM_WORLD
 *            (built by repeated addition).
 * inic <  0: every element is set to the sentinel value -1.
 */
void InitVector (int *vec, int tam, int inic) {
   int idx, nprocs, next;

   MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

   if (inic < 0) {
     /* Sentinel fill. */
     for (idx = 0; idx < tam; idx++) {
       vec[idx] = -1;
     }
     return;
   }

   /* Arithmetic progression starting at inic with stride nprocs. */
   next = inic;
   for (idx = 0; idx < tam; idx++) {
     vec[idx] = next;
     next += nprocs;
   }
}

/*
 * Check that the first `tam` elements of `vec` hold the sequence
 * inic, inic + P, inic + 2P, ... where P is the size of MPI_COMM_WORLD
 * (the pattern InitVector writes when inic >= 0).
 *
 * For every mismatching element one line is printed to stdout with the
 * caller's rank, the value found, the value expected and the absolute
 * difference between them.
 *
 * Returns the number of mismatching elements (0 if the vector is intact).
 */
int CompareVector (int *vec, int tam, int inic) {
   int i, expected = inic, numprocs, my_id, nerrors = 0;

   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &my_id);

   for (i = 0; i < tam; i++) {
     if (vec[i] != expected) {
       int diff = (expected > vec[i]) ? (expected - vec[i])
                                      : (vec[i] - expected);

       /* Keep the format string on a single source line: a string
          literal may not contain a raw newline. */
       printf ("[%d] receives %d when it must receive %d. Its subtraction is %d.\n",
               my_id, vec[i], expected, diff);
       nerrors++;
     }
     expected += numprocs;
   }

   return nerrors;
}

int main (int argc, char **argv) {
   int i, j, k, tam, num, bool;
   int pos1, pos2, dim1, dim2, nexecs;
   int my_id, numprocs, prc_src, prc_dst;
   int *mess = NULL;
   int *mess1 = NULL, *mess2 = NULL;;
   char name[MPI_MAX_PROCESSOR_NAME];
   MPI_Request req;
   MPI_Status sta;

   MPI_Init (&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank
(MPI_COMM_WORLD, &my_id);

   pos1 = atoi(argv[1]); dim1 = (1 << pos1);
   pos2 = atoi(argv[2]); dim2 = (1 << pos2);
   nexecs = atoi(argv[3]);

   MPI_Barrier(MPI_COMM_WORLD);

   mess = CreateVector(dim2); InitVector(mess, dim2, -1);
   for (tam=dim1; tam<=dim2; tam <<= 1) {
     for (k=0; k<nexecs; k++)
       for (i=0; i<numprocs; i++) {
         InitVector(mess, tam, ((my_id==i)?i:-1));
         MPI_Bcast(mess, tam, MPI_INT, i, MPI_COMM_WORLD);
         bool = CompareVector (mess, tam, i);
         if (bool > 0)
           printf ("[%d,%dth] (Bcast of %d in %d) %d integers with %d
errors\n",
                     my_id, k, i, numprocs, tam, bool);
       }
   }
   free (mess); mess = NULL;

   MPI_Barrier(MPI_COMM_WORLD);

   MPI_Finalize ();

   return 0;
}