
Open MPI User's Mailing List Archives


Subject: Re: [OMPI users] GPU and CPU timing - OpenMPI and Thrust
From: Rohan Deshpande (rohand87_at_[hidden])
Date: 2012-05-08 09:59:32


Yep, you are correct. I did the same and it worked. When I have more than 3
MPI tasks there is a lot of overhead on the GPU.

But for the CPU there is no overhead. All three machines have 4 quad-core
processors with 3.8 GB RAM.

Just wondering why there is no degradation of performance on the CPU?
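My guess at the reason (just reasoning, I have not profiled it): every MPI
task gets its own CPU core, so the plain summation loop never has to share
anything, but all the tasks that land on one node are driving that node's
single 9500 GT, so their copies and kernels queue up behind each other and
each extra task also pays its own CUDA context setup on the same device.

A tiny check along these lines makes the sharing visible (only a sketch, not
part of the code I posted):

#include <mpi.h>
#include <stdio.h>
#include <cuda_runtime.h>

int main(int argc, char *argv[])
{
    int rank, namelen, ndev = 0;
    char host[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(host, &namelen);

    /* How many CUDA devices can this rank see?  On these nodes it is 1,
       so every rank that lands on the same node drives the same GPU. */
    cudaGetDeviceCount(&ndev);
    printf("rank %d on %s sees %d CUDA device(s)\n", rank, host, ndev);

    MPI_Finalize();
    return 0;
}

With 10 tasks spread over 3 single-GPU machines, up to four tasks per node
end up sharing one device, which would explain the GPU times growing while
the CPU times stay flat.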

On Tue, May 8, 2012 at 8:21 PM, Rolf vandeVaart <rvandevaart_at_[hidden]> wrote:

> You should be running with one GPU per MPI process. If I understand
> correctly, you have a 3-node cluster and each node has a GPU, so you should
> run with np=3.
>
> Maybe you can try that and see if your numbers come out better.
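>
> For example (just to illustrate; adjust the hostfile and program names to
> your own setup), with a hostfile that lists the three nodes, something like
>
>   mpirun -np 3 -npernode 1 -hostfile myhosts ./mpi_thrust_program
>
> starts exactly one MPI process on each node, so each process has that
> node's GPU to itself.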
>
> From: users-bounces_at_[hidden] [mailto:users-bounces_at_[hidden]] On Behalf Of Rohan Deshpande
> Sent: Monday, May 07, 2012 9:38 PM
> To: Open MPI Users
> Subject: [OMPI users] GPU and CPU timing - OpenMPI and Thrust
>
> I am running MPI and Thrust code on a cluster and measuring time for
> calculations.
>
> My MPI code:
>
> #include "mpi.h"
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> #define MASTER    0
> #define ARRAYSIZE 20000000
>
> /* run_kernel0() is defined in the CUDA/Thrust file shown below. */
> extern int run_kernel0(int array[], int nelements, int taskid, char hostname[]);
>
> int *masterarray, *onearray, *twoarray, *threearray, *fourarray,
>     *fivearray, *sixarray, *sevenarray, *eightarray, *ninearray;
>
> int main(int argc, char* argv[])
> {
>     int numtasks, taskid, chunksize, namelen;
>     int mysum, one, two, three, four, five, six, seven, eight, nine;
>     char myname[MPI_MAX_PROCESSOR_NAME];
>     MPI_Status status;
>     int a, b, c, d, e, f, g, h, i, j;
>
>     /***** Initializations *****/
>     MPI_Init(&argc, &argv);
>     MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
>     MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
>     MPI_Get_processor_name(myname, &namelen);
>     printf("MPI task %d has started on host %s...\n", taskid, myname);
>
>     masterarray = malloc(ARRAYSIZE * sizeof(int));
>     onearray    = malloc(ARRAYSIZE * sizeof(int));
>     twoarray    = malloc(ARRAYSIZE * sizeof(int));
>     threearray  = malloc(ARRAYSIZE * sizeof(int));
>     fourarray   = malloc(ARRAYSIZE * sizeof(int));
>     fivearray   = malloc(ARRAYSIZE * sizeof(int));
>     sixarray    = malloc(ARRAYSIZE * sizeof(int));
>     sevenarray  = malloc(ARRAYSIZE * sizeof(int));
>     eightarray  = malloc(ARRAYSIZE * sizeof(int));
>     ninearray   = malloc(ARRAYSIZE * sizeof(int));
>
>     /***** Master task only *****/
>     if (taskid == MASTER) {
>         for (a = 0; a < ARRAYSIZE; a++) {
>             masterarray[a] = 1;
>         }
>         mysum = run_kernel0(masterarray, ARRAYSIZE, taskid, myname);
>     } /* end of master section */
>
>     if (taskid > MASTER) {
>         if (taskid == 1) {
>             for (b = 0; b < ARRAYSIZE; b++) {
>                 onearray[b] = 1;
>             }
>             one = run_kernel0(onearray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 2) {
>             for (c = 0; c < ARRAYSIZE; c++) {
>                 twoarray[c] = 1;
>             }
>             two = run_kernel0(twoarray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 3) {
>             for (d = 0; d < ARRAYSIZE; d++) {
>                 threearray[d] = 1;
>             }
>             three = run_kernel0(threearray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 4) {
>             for (e = 0; e < ARRAYSIZE; e++) {
>                 fourarray[e] = 1;
>             }
>             four = run_kernel0(fourarray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 5) {
>             for (f = 0; f < ARRAYSIZE; f++) {
>                 fivearray[f] = 1;
>             }
>             five = run_kernel0(fivearray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 6) {
>             for (g = 0; g < ARRAYSIZE; g++) {
>                 sixarray[g] = 1;
>             }
>             six = run_kernel0(sixarray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 7) {
>             for (h = 0; h < ARRAYSIZE; h++) {
>                 sevenarray[h] = 1;
>             }
>             seven = run_kernel0(sevenarray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 8) {
>             for (i = 0; i < ARRAYSIZE; i++) {
>                 eightarray[i] = 1;
>             }
>             eight = run_kernel0(eightarray, ARRAYSIZE, taskid, myname);
>         }
>         if (taskid == 9) {
>             for (j = 0; j < ARRAYSIZE; j++) {
>                 ninearray[j] = 1;
>             }
>             nine = run_kernel0(ninearray, ARRAYSIZE, taskid, myname);
>         }
>     }
>
>     MPI_Finalize();
>     return 0;
> }
>
> All the tasks just initialize their own array and then calculate the sum
> using CUDA Thrust.
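>
> (Since every task only ever touches its own array, the ten arrays and ten
> if-branches could equally be collapsed to one array per rank. This is just
> an equivalent sketch of the structure, not the code I actually ran:)
>
> int *myarray = malloc(ARRAYSIZE * sizeof(int));
> for (a = 0; a < ARRAYSIZE; a++)
>     myarray[a] = 1;                 /* every rank sums 20,000,000 ones */
> mysum = run_kernel0(myarray, ARRAYSIZE, taskid, myname);
> free(myarray);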
>
> My CUDA Thrust code:
>
> #include <stdio.h>
> #include <cutil_inline.h>
> #include <cutil.h>
> #include <thrust/version.h>
> #include <thrust/generate.h>
> #include <thrust/host_vector.h>
> #include <thrust/device_vector.h>
> #include <thrust/functional.h>
> #include <thrust/transform_reduce.h>
> #include <time.h>
> #include <sys/time.h>
> #include <sys/resource.h>
>
> extern "C"
> int run_kernel0(int array[], int nelements, int taskid, char hostname[])
> {
>     float elapsedTime;
>     int result = 0;
>     int threshold = 25000000;
>     cudaEvent_t start, stop;
>
>     cudaEventCreate(&start);
>     cudaEventCreate(&stop);
>     cudaEventRecord(start, 0);
>
>     /* Copy the host array to the device in chunks and reduce each chunk;
>        note that the timed region includes these host-to-device copies. */
>     thrust::device_vector<int> gpuarray;
>     int *begin = array;
>     int *end   = array + nelements;
>     while (begin != end)
>     {
>         int chunk_size = thrust::min(threshold, (int)(end - begin));
>         gpuarray.assign(begin, begin + chunk_size);
>         result += thrust::reduce(gpuarray.begin(), gpuarray.end());
>         begin += chunk_size;
>     }
>
>     cudaEventRecord(stop, 0);
>     cudaEventSynchronize(stop);
>     cudaEventElapsedTime(&elapsedTime, start, stop);
>     cudaEventDestroy(start);
>     cudaEventDestroy(stop);
>
>     printf(" Task %d on %s has sum (on GPU): %d  Time for the kernel: %f ms\n",
>            taskid, hostname, result, elapsedTime);
>
>     return result;
> }
>
> I also calculate the sum using the CPU, and that code is as below:
>
> struct timespec time1, time2, temp_time;
>
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
> int i;
> int cpu_sum = 0;
> long diff = 0;
>
> for (i = 0; i < nelements; i++) {
>     cpu_sum += array[i];
> }
> clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
>
> temp_time.tv_sec  = time2.tv_sec  - time1.tv_sec;
> temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
> diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
> printf("Task %d calculated sum: %d using CPU in %lf ms\n",
>        taskid, cpu_sum, (double) diff / 1000000);
> return cpu_sum;
>
> Now when I run the job on the cluster with 10 MPI tasks and compare the
> timings of the CPU and the GPU, I get weird results where the GPU time is
> much, much higher than the CPU time.
>
> But shouldn't it be the opposite?
>
> The CPU time is almost the same for all the tasks, but the GPU time
> increases.
>
> Just wondering what might be the cause of this, or whether these results
> are correct? Is anything wrong with the MPI code?
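>
> One thing I am not sure about in my own measurement: the cudaEvent pair in
> run_kernel0 brackets the gpuarray.assign() calls as well, so the "GPU time"
> includes the host-to-device copies over PCIe (and, on the first call in each
> process, the CUDA setup cost), not just the reduction itself. A variant that
> times only the reduce, again only a sketch and not what I actually ran,
> would look like:
>
> /* Sketch: accumulate the time of the reductions only; the assign()
>    (host-to-device copy) is deliberately left outside the timed region. */
> float reduceTime = 0.0f, t;
> while (begin != end)
> {
>     int chunk_size = thrust::min(threshold, (int)(end - begin));
>     gpuarray.assign(begin, begin + chunk_size);   /* copy: not timed */
>
>     cudaEventRecord(start, 0);
>     result += thrust::reduce(gpuarray.begin(), gpuarray.end());
>     cudaEventRecord(stop, 0);
>     cudaEventSynchronize(stop);
>     cudaEventElapsedTime(&t, start, stop);
>     reduceTime += t;
>
>     begin += chunk_size;
> }
>
> If most of the difference turns out to be in the copy, then comparing it
> against the in-memory CPU loop is not really apples to apples.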
>
> My cluster has 3 machines: 4 MPI tasks run on each of 2 machines and 2
> tasks run on the third.
>
> Each machine has 1 GPU, a GeForce 9500 GT with 512 MB of memory.
>
> Can anyone please help me with this?
>
> Thanks

-- 
Best Regards,
ROHAN DESHPANDE