Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: Re: [OMPI users] GPU and CPU timing - OpenMPI and Thrust
From: Rolf vandeVaart (rvandevaart_at_[hidden])
Date: 2012-05-08 08:21:05


You should be running with one GPU per MPI process. If I understand correctly, you have a 3 node cluster and each node has a GPU so you should run with np=3.
Maybe you can try that and see if your numbers come out better.

From: users-bounces_at_[hidden] [mailto:users-bounces_at_[hidden]] On Behalf Of Rohan Deshpande
Sent: Monday, May 07, 2012 9:38 PM
To: Open MPI Users
Subject: [OMPI users] GPU and CPU timing - OpenMPI and Thrust

 I am running MPI and Thrust code on a cluster and measuring time for calculations.

My MPI code -

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define MASTER 0
#define ARRAYSIZE 20000000

/*
 * Per-rank work buffers, one named slot for each of the ten supported ranks.
 * They stay at file scope under their original names; main() allocates only
 * the slot that belongs to the calling rank.
 */
int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
    *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;

/* GPU reduction entry point; it is not defined in this translation unit. */
int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

/*
 * MPI driver.
 *
 * Every rank from 0 (MASTER) to 9 fills its own ARRAYSIZE-element buffer with
 * ones and passes it to run_kernel0(). Ranks above 9 only initialise and
 * finalise MPI, exactly as before.
 *
 * Compared with the earlier version, a rank no longer allocates all ten
 * buffers (ten times the memory it needs), the allocation is checked before
 * it is written to, and the buffer is released before MPI_Finalize().
 *
 * Returns 0 on success; a failed allocation aborts the whole MPI job.
 */
int main(int argc, char* argv[])
{
    /* Indexed by rank: slot 0 is the master's buffer, slots 1-9 the workers'. */
    int **const slots[] = {
        &masterarray, &onearray, &twoarray, &threearray, &fourarray,
        &fivearray, &sixarray, &sevenarray, &eightarray, &ninearray
    };
    const int slot_count = (int)(sizeof slots / sizeof slots[0]);

    int numtasks, taskid, namelen;
    char myname[MPI_MAX_PROCESSOR_NAME];

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf ("MPI task %d has started on host %s...\n", taskid, myname);

    if (taskid < slot_count) {
        int i;
        int *work = malloc(ARRAYSIZE * sizeof(int));

        if (work == NULL) {
            fprintf(stderr, "MPI task %d: cannot allocate %d ints\n",
                    taskid, ARRAYSIZE);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            return EXIT_FAILURE;
        }
        *slots[taskid] = work;

        for (i = 0; i < ARRAYSIZE; i++) {
            work[i] = 1;
        }

        /* The sum is reported by run_kernel0 itself; nothing here consumes it. */
        (void)run_kernel0(work, ARRAYSIZE, taskid, myname);

        free(work);
        *slots[taskid] = NULL;
    }

    MPI_Finalize();
    return 0;
}

All the tasks just initialize their own array and then calculate the sum using CUDA Thrust.
My CUDA Thrust code -

 #include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

  extern "C"
 int run_kernel0( int array[], int nelements, int taskid, char hostname[])
 {

       float elapsedTime;
        int result = 0;
int threshold = 25000000;
        cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_vector<int> gpuarray;
int *begin = array;
int *end = array + nelements;
while(begin != end)
{
   int chunk_size = thrust::min(threshold,end - begin);
   gpuarray.assign(begin, begin + chunk_size);
 result += thrust::reduce(gpuarray.begin(), gpuarray.end());
   begin += chunk_size;
}
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);

        printf(" Task %d on has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, result, elapsedTime);

return result;
    }

I also calculate the sum using CPU and the code is as below -

  /*
   * CPU reference: adds up array[0..nelements-1] and prints the sum with the
   * time spent, in milliseconds.
   * `array`, `nelements` and `taskid` come from the enclosing function, which
   * is not part of this fragment.
   */
  struct timespec time1, time2, temp_time;

  /*
   * NOTE(review): CLOCK_PROCESS_CPUTIME_ID is the CPU time consumed by this
   * process, not wall-clock time; the GPU path is timed with CUDA events, so
   * the two figures may not be directly comparable - confirm.
   */
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
  int i;
  int cpu_sum = 0;
  long diff = 0;

  for (i = 0; i < nelements; i++) {
    cpu_sum += array[i];
  }
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
  /*
   * The nanosecond difference may be negative when the second reading wrapped
   * into a new second; the combined total below is still correct because the
   * seconds difference is added back in.
   */
  temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
  temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
  /* Total elapsed nanoseconds. NOTE(review): may overflow where long is 32 bits - confirm target. */
  diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
  /* Nanoseconds are divided by 1000000 to report milliseconds. */
  printf("Task %d calculated sum: %d using CPU in %lf ms \n", taskid, cpu_sum, (double) diff/1000000);
  return cpu_sum;

Now when I run the job on the cluster with 10 MPI tasks and compare the CPU and GPU timings, I get weird results where the GPU time is much, much higher than the CPU time.
But it should be the opposite, shouldn't it?

The CPU time is almost same for all the task but GPU time increases.

Just wondering what might be the cause of this or are these results correct? Anything wrong with MPI code?

My cluster has 3 machines. Four MPI tasks run on each of two machines, and two tasks run on the third.
Each machine has 1 GPU - a GeForce 9500 GT with 512 MB of memory.

Can anyone please help me with this?

Thanks

--
-----------------------------------------------------------------------------------
This email message is for the sole use of the intended recipient(s) and may contain
confidential information.  Any unauthorized review, use, disclosure or distribution
is prohibited.  If you are not the intended recipient, please contact the sender by
reply email and destroy all copies of the original message.
-----------------------------------------------------------------------------------