Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: [OMPI users] GPU and CPU timing - OpenMPI and Thrust
From: Rohan Deshpande (rohand87_at_[hidden])
Date: 2012-05-07 21:38:20


 I am running MPI and Thrust code on a cluster and measuring time for
calculations.

My MPI code -

#include "mpi.h"
 #include <stdio.h>
#include <stdlib.h>
#include <string.h>
 #include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

#define MASTER 0
#define ARRAYSIZE 20000000

/* Legacy per-rank buffers, kept with external linkage for compatibility.
 * The rewritten main() below allocates a single per-rank buffer instead
 * of filling all ten. */
int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
    *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;

/* Defined in the CUDA/Thrust translation unit: sums `nelements` ints on
 * the GPU and returns the total.  Declared here so the call below is not
 * an implicit declaration. */
extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

/*
 * Each MPI rank initializes its own ARRAYSIZE array of ones and sums it
 * on the GPU via run_kernel0().  The original version duplicated this
 * logic ten times (one branch per rank) and allocated all ten arrays on
 * every rank (~800 MB each); every rank now allocates and fills only the
 * one array it actually uses.
 */
int main(int argc, char *argv[])
{
    int numtasks, taskid, namelen;
    int mysum;                       /* GPU-computed sum for this rank */
    int k;
    char myname[MPI_MAX_PROCESSOR_NAME];

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf("MPI task %d has started on host %s...\n", taskid, myname);

    /* One buffer per rank; check the allocation (ARRAYSIZE ints = 80 MB). */
    int *myarray = malloc(ARRAYSIZE * sizeof *myarray);
    if (myarray == NULL) {
        fprintf(stderr, "Task %d: failed to allocate %d ints\n",
                taskid, ARRAYSIZE);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* Every rank (master included) fills its array with ones and reduces
     * it on the GPU — identical to the original ten per-rank branches. */
    for (k = 0; k < ARRAYSIZE; k++)
        myarray[k] = 1;
    mysum = run_kernel0(myarray, ARRAYSIZE, taskid, myname);
    (void)mysum;                     /* result is printed inside run_kernel0 */

    free(myarray);
    MPI_Finalize();
    return 0;
}

All the tasks just initialize their own array and then calculate the sum
using CUDA Thrust.
My CUDA Thrust code -

 #include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
 #include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
 #include <time.h>
#include <sys/time.h>
#include <sys/resource.h>

  extern "C"
 int run_kernel0( int array[], int nelements, int taskid, char hostname[])
 {

       float elapsedTime;
        int result = 0;
int threshold = 25000000;
        cudaEvent_t start, stop;
cudaEventCreate(&start);
 cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_vector<int> gpuarray;
 int *begin = array;
int *end = array + nelements;
while(begin != end)
 {
   int chunk_size = thrust::min(threshold,end - begin);
   gpuarray.assign(begin, begin + chunk_size);
  result += thrust::reduce(gpuarray.begin(), gpuarray.end());
   begin += chunk_size;
}
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
 cudaEventDestroy(start);
cudaEventDestroy(stop);

        printf(" Task %d on has sum (on GPU): %ld Time for the kernel: %f
ms \n", taskid, result, elapsedTime);

return result;
    }

I also calculate the sum using CPU and the code is as below -

  /* CPU reference path: sum the same array serially and report elapsed
   * process-CPU time in milliseconds.
   * NOTE(review): this is a fragment of a larger function — `array`,
   * `nelements`, and `taskid` come from the enclosing scope (presumably
   * the same parameters as run_kernel0); confirm against the full source. */
  struct timespec time1, time2, temp_time;

  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
  int i;
  int cpu_sum = 0;
  long diff = 0;

  /* Straight serial sum over the input — the CPU counterpart of the
   * GPU reduction. */
  for (i = 0; i < nelements; i++) {
    cpu_sum += array[i];
  }
  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
  /* tv_nsec may come out negative when time2.tv_nsec < time1.tv_nsec;
   * the combined sec*1e9 + nsec expression below still yields the
   * correct signed nanosecond difference. */
  temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
  temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
  diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
  printf("Task %d calculated sum: %d using CPU in %lf ms \n", taskid,
cpu_sum, (double) diff/1000000);
  return cpu_sum;

Now when I run the job on cluster with 10 MPI tasks and compare the timings
of CPU and GPU, I get weird results where GPU time is much much higher than
CPU time.
But shouldn't it be the opposite?

The CPU time is almost same for all the task but GPU time increases.

Just wondering what might be the cause of this or are these results
correct? Anything wrong with MPI code?

My cluster has 3 machines. 4 MPI tasks run on each of two machines, and 2
tasks run on the third machine.
Each machine has 1 GPU - GeForce 9500 GT with 512 MB memory.

Can anyone please help me with this?

Thanks

--