I am running MPI and CUDA Thrust code on a cluster and measuring the time taken by the calculations.
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
#define MASTER 0
#define ARRAYSIZE 20000000
/* Per-rank input buffers. Rank 0 owns masterarray, rank 1 owns onearray, and
 * so on up to rank 9 (ninearray). A rank only ever allocates its own buffer;
 * the others stay NULL in that process. */
int *masterarray,*onearray,*twoarray,*threearray,*fourarray,*fivearray,*sixarray,*sevenarray,*eightarray,*ninearray;

/* Defined in the CUDA/Thrust translation unit and exported with C linkage.
 * Declared here so the calls below are not implicit declarations. */
int run_kernel0(int array[], int nelements, int taskid, char hostname[]);

/*
 * Entry point for every MPI rank.
 *
 * Each rank in [0, 9] allocates one ARRAYSIZE-element buffer, fills it with
 * ones and passes it to run_kernel0(), which prints the sum and its timing.
 * Ranks above 9 only print the start-up message and then finalize.
 *
 * Returns 0 on success. If the buffer cannot be allocated the whole job is
 * aborted through MPI_Abort().
 */
int main(int argc, char* argv[])
{
    /* Rank -> buffer lookup; replaces one hand-written branch per rank. */
    int **slots[] = {
        &masterarray, &onearray, &twoarray, &threearray, &fourarray,
        &fivearray, &sixarray, &sevenarray, &eightarray, &ninearray
    };
    const int nslots = (int)(sizeof slots / sizeof slots[0]);
    int taskid, namelen;
    char myname[MPI_MAX_PROCESSOR_NAME];

    /***** Initializations *****/
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(myname, &namelen);
    printf ("MPI task %d has started on host %s...\n", taskid, myname);

    if (taskid < nslots) {
        int idx;
        /* Allocate only this rank's buffer: one array per process instead
         * of ten, so a node hosting several ranks is not exhausted. */
        int *data = malloc((size_t)ARRAYSIZE * sizeof *data);

        if (data == NULL) {
            fprintf(stderr, "MPI task %d: cannot allocate %d ints\n", taskid, ARRAYSIZE);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            return EXIT_FAILURE; /* not reached if MPI_Abort terminates the job */
        }
        *slots[taskid] = data;

        for (idx = 0; idx < ARRAYSIZE; idx++) {
            data[idx] = 1;
        }

        /* The sum is printed inside run_kernel0; nothing here consumes it. */
        (void) run_kernel0(data, ARRAYSIZE, taskid, myname);

        /* Clear the global before freeing so no dangling pointer remains. */
        *slots[taskid] = NULL;
        free(data);
    }

    MPI_Finalize();
    return 0;
}
Each task just initializes its own array and then calculates the sum using CUDA Thrust.
My CUDA Thrust code:
#include <stdio.h>
#include <cutil_inline.h>
#include <cutil.h>
#include <thrust/version.h>
#include <thrust/generate.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <time.h>
#include <sys/time.h>
#include <sys/resource.h>
extern "C"
int run_kernel0( int array[], int nelements, int taskid, char hostname[])
{
float elapsedTime;
int result = 0;
int threshold = 25000000;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
thrust::device_vector<int> gpuarray;
int *begin = array;
int *end = array + nelements;
while(begin != end)
{
int chunk_size = thrust::min(threshold,end - begin);
gpuarray.assign(begin, begin + chunk_size);
result += thrust::reduce(gpuarray.begin(), gpuarray.end());
begin += chunk_size;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf(" Task %d on has sum (on GPU): %ld Time for the kernel: %f ms \n", taskid, result, elapsedTime);
return result;
}
I also calculate the sum on the CPU; the code is below:
struct timespec time1, time2, temp_time;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
int i;
int cpu_sum = 0;
long diff = 0;
for (i = 0; i < nelements; i++) {
cpu_sum += array[i];
}
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
temp_time.tv_sec = time2.tv_sec - time1.tv_sec;
temp_time.tv_nsec = time2.tv_nsec - time1.tv_nsec;
diff = temp_time.tv_sec * 1000000000 + temp_time.tv_nsec;
printf("Task %d calculated sum: %d using CPU in %lf ms \n", taskid, cpu_sum, (double) diff/1000000);
return cpu_sum;
Now, when I run the job on the cluster with 10 MPI tasks and compare the CPU and GPU timings, I get weird results: the GPU time is much, much higher than the CPU time.
But it should be the opposite, shouldn't it?