Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: [OMPI users] bug in CUDA support for dual-processor systems?
From: Zbigniew Koza (zzkoza_at_[hidden])
Date: 2012-07-31 12:38:14


Hi,

I wrote a simple program to see whether Open MPI can really handle CUDA
device pointers as promised in the FAQ, and how efficiently.
The program (see below) breaks if MPI communication is to be performed
between two devices that are on the same node but under different IOHs
in a dual-processor Intel machine.
Note that cudaMemcpy works for such devices, although not as
efficiently as for devices on the same IOH with GPUDirect enabled.

Here's the output from my program:

===============================

> mpirun -n 6 ./a.out
Init
Init
Init
Init
Init
Init
rank: 1, size: 6
rank: 2, size: 6
rank: 3, size: 6
rank: 4, size: 6
rank: 5, size: 6
rank: 0, size: 6
device 3 is set
Process 3 is on typhoon1
Using regular memory
device 0 is set
Process 0 is on typhoon1
Using regular memory
device 4 is set
Process 4 is on typhoon1
Using regular memory
device 1 is set
Process 1 is on typhoon1
Using regular memory
device 5 is set
Process 5 is on typhoon1
Using regular memory
device 2 is set
Process 2 is on typhoon1
Using regular memory
^C^[[A^C
zkoza_at_typhoon1:~/multigpu$
zkoza_at_typhoon1:~/multigpu$ vim cudamussings.c
zkoza_at_typhoon1:~/multigpu$ mpicc cudamussings.c -lcuda -lcudart
-L/usr/local/cuda/lib64 -I/usr/local/cuda/include
zkoza_at_typhoon1:~/multigpu$ vim cudamussings.c
zkoza_at_typhoon1:~/multigpu$ mpicc cudamussings.c -lcuda -lcudart
-L/usr/local/cuda/lib64 -I/usr/local/cuda/include
zkoza_at_typhoon1:~/multigpu$ mpirun -n 6 ./a.out
Process 1 of 6 is on typhoon1
Process 2 of 6 is on typhoon1
Process 0 of 6 is on typhoon1
Process 4 of 6 is on typhoon1
Process 5 of 6 is on typhoon1
Process 3 of 6 is on typhoon1
device 2 is set
device 1 is set
device 0 is set
Using regular memory
device 5 is set
device 3 is set
device 4 is set
Host->device bandwidth for processor 1: 1587.993499 MB/sec
Host->device bandwidth for processor 2: 1570.275316 MB/sec
Host->device bandwidth for processor 3: 1569.890751 MB/sec
Host->device bandwidth for processor 5: 1483.637702 MB/sec
Host->device bandwidth for processor 0: 1480.888029 MB/sec
Host->device bandwidth for processor 4: 1476.241371 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [1] bandwidth: 3338.57 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [1] bandwidth: 420.85 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[1] bandwidth: 362.13 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[1] bandwidth: 6552.35 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [2] bandwidth: 3238.88 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [2] bandwidth: 418.18 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[2] bandwidth: 362.06 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[2] bandwidth: 5022.82 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [3] bandwidth: 3295.32 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [3] bandwidth: 418.90 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[3] bandwidth: 359.16 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Device[3] bandwidth: 5019.89 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Host [4] bandwidth: 4619.55 MB/sec
MPI_Send/MPI_Receive, Device[0] -> Host [4] bandwidth: 419.24 MB/sec
MPI_Send/MPI_Receive, Host [0] -> Device[4] bandwidth: 364.52 MB/sec
--------------------------------------------------------------------------
The call to cuIpcOpenMemHandle failed. This is an unrecoverable error
and will cause the program to abort.
   cuIpcOpenMemHandle return value: 205
   address: 0x200200000
Check the cuda.h file for what the return value means. Perhaps a reboot
of the node will clear the problem.
--------------------------------------------------------------------------
[typhoon1:06098] Failed to register remote memory, rc=-1
[typhoon1:06098] [[33788,1],4] ORTE_ERROR_LOG: Error in file
pml_ob1_recvreq.c at line 465

========================================================

Comment:
In my machine there are 2 six-core intel processors with HT on, yielding
24 virtual processors, and 6 Tesla C2070s.
The devices are grouped in two groups, one with 4 and the other with 2
devices.
Devices in the same group can talk to each other via GPUDirect at approx
6 GB/s; devices in different groups can use
cudaMemcpy and UVA, at somewhat lower transfer rates.

my OpenMPI is openmpi-1.9a1r26904 compiled from sources

./configure -prefix=/home/zkoza/openmpi.1.9.cuda
--with-cuda=/usr/local/cuda --with-cuda-libdir=/usr/lib

> nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2012 NVIDIA Corporation
Built on Thu_Apr__5_00:24:31_PDT_2012
Cuda compilation tools, release 4.2, V0.2.1221

gcc version 4.6.3 (Ubuntu/Linaro 4.6.3-1ubuntu5)

Ubuntu 12.04 64-bit

Nvidia Driver Version: 295.41 |

The program was compiled with:
> mpicc prog.c -lcuda -lcudart -L/usr/local/cuda/lib64
-I/usr/local/cuda/include

================================================
SOURCE CODE:
================================================

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <sys/time.h>
#include <mpi.h>

#define NREPEAT 20
#define NBYTES 100000000

/*
 * CALL(x): run a CUDA runtime call and report failures without aborting.
 * On error the message and source line are written to stderr, and
 * cudaGetLastError() is invoked to clear the sticky error state so later
 * calls are not poisoned by it (the original best-effort behavior).
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement:
 * the bare { } form breaks "if (c) CALL(x); else ..." because of the
 * trailing semicolon.  The argument is parenthesized and the local uses a
 * trailing underscore to avoid shadowing a caller's "err".
 */
#define CALL(x)                                                           \
do {                                                                      \
   cudaError_t err_ = (x);                                                \
   if (cudaSuccess != err_)                                               \
   {                                                                      \
     fprintf(stderr, "CUDA ERROR %s at %d\n",                             \
             cudaGetErrorString(err_), __LINE__);                         \
     cudaGetLastError(); /* clear the sticky error */                     \
   }                                                                      \
} while (0)

int main (int argc, char *argv[])
{
         int rank, size, n, len, numbytes;
         void *a_h, *a_d;
         struct timeval time[2];
         double bandwidth;
         char name[MPI_MAX_PROCESSOR_NAME];
         MPI_Status status;

         MPI_Init (&argc, &argv);
         MPI_Comm_rank (MPI_COMM_WORLD, &rank);
         MPI_Comm_size (MPI_COMM_WORLD, &size);
         MPI_Get_processor_name(name, &len);

         printf("Process %d of %d is on %s\n", rank, size, name);
         fflush(stdout);

         CALL( cudaSetDevice(rank) );
         printf("device %d is set\n", rank);
         fflush(stdout);

#ifdef PINNED
         if (rank == 0)
                 printf("Using pinned memory \n");
         CALL( cudaMallocHost( (void **) &a_h, NBYTES) );
#else
         if (rank == 0)
                 printf("Using regular memory \n");
         a_h = malloc(NBYTES);
#endif
         CALL( cudaMalloc( (void **) &a_d, NBYTES) );

         MPI_Barrier(MPI_COMM_WORLD);

         gettimeofday(&time[0], NULL);
         for (n=0; n<NREPEAT; n++ )
         {
                 CALL( cudaMemcpy(a_d, a_h, NBYTES,
cudaMemcpyHostToDevice) );
         }
         gettimeofday(&time[1], NULL);

         bandwidth = time[1].tv_sec - time[0].tv_sec;
         bandwidth += 1.e-6*(time[1].tv_usec - time[0].tv_usec);
         bandwidth = (double)NBYTES*NREPEAT/1.e6/bandwidth;

         printf("Host->device bandwidth for processor %d: %f MB/sec\n",
rank, bandwidth);

         /* Test MPI send/recv bandwidth. */

         MPI_Barrier(MPI_COMM_WORLD);

         int i, proc;
         for (proc = 1; proc < size; proc++)
         {
                 for (i = 0; i < 4; i++)
                 {
                          const int from_host = (i & 1) == 0;
                         const int to_host = (i & 2) == 0;
                         const char* tab[2] = {"Device", "Host "};
                         void * ptr[2] = {a_d, a_h};;

                         MPI_Barrier(MPI_COMM_WORLD);
                         gettimeofday(&time[0], NULL);
                         for (n=0; n<NREPEAT; n++)
                         {
                                 if (rank == 0)
                                         MPI_Send(ptr[from_host],
NBYTES/sizeof(int), MPI_INT, proc, 0, MPI_COMM_WORLD);
                                 else if (rank == proc)
                                         MPI_Recv(ptr[to_host],
NBYTES/sizeof(int), MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
                         }

                         gettimeofday(&time[1], NULL);
                         // printf("MPI status: %d\n", status);

                         bandwidth = time[1].tv_sec - time[0].tv_sec;
                         bandwidth += 1.e-6*(time[1].tv_usec -
time[0].tv_usec);
                         bandwidth = NBYTES*NREPEAT/1.e6/bandwidth;
                         if (rank == 0)
                         {
                                 printf("MPI_Send/MPI_Receive, %s[%d]
-> %s[%d] bandwidth: %4.2f MB/sec\n",
                                                       tab[from_host],
0, tab[to_host], proc, bandwidth);
                                 fflush(stdout);
                         }
         }
}
#ifdef PINNED
         CALL( cudaFreeHost(a_h) );
#else
         free(a_h);
#endif
         CALL( cudaFree(a_d) ) ;

         MPI_Finalize();
         return 0;
}