I am combining mpi and cuda. Trying to find out sum of array elements using cuda and using mpi to distribute the array.
 
my cuda code
#include <stdio.h>
  
__global__ void add(int *devarray, int *devsum)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
*devsum = *devsum + devarray[index];
}
 
extern "C"
int * run_kernel(int array[],int nelements)
{
int *devarray, *sum, *devsum;
sum =(int *) malloc(1 * sizeof(int));

printf("\nrun_kernel called..............");

cudaMalloc((void**) &devarray, sizeof(int)*nelements);
cudaMalloc((void**) &devsum, sizeof(int));
cudaMemcpy(devarray, array, sizeof(int)*nelements, cudaMemcpyHostToDevice);
//cudaMemcpy(devsum, sum, sizeof(int), cudaMemcpyHostToDevice);
add<<<2, 3>>>(devarray, devsum);
// printf("\ndevsum is %d", devsum);
cudaMemcpy(sum, devsum, sizeof(int), cudaMemcpyDeviceToHost);
 
printf(" \nthe sum is %d\n", *sum);
cudaFree(devarray);
cudaFree(devsum);
return sum;

}
 

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 
#define ARRAYSIZE 2000
#define MASTER 0 
int data[ARRAYSIZE]; 

int main(int argc, char* argv[])
{
 
int numtasks, taskid, rc, dest, offset, i, j, tag1, tag2, source, chunksize, namelen;  
int mysum;
long sum;
int update(int myoffset, int chunk, int myid);
char myname[MPI_MAX_PROCESSOR_NAME]; 
MPI_Status status;
double start = 0.0, stop = 0.0, time = 0.0;
double totaltime;
FILE *fp;
char line[128];
char element;
int n;
int k=0;
 
/***** Initializations *****/
 
MPI_Init(&argc, &argv); 
MPI_Comm_size(MPI_COMM_WORLD, &numtasks); 
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Get_processor_name(myname,
&namelen);
printf ("MPI task %d has started on host %s...\n", taskid, myname); 
chunksize = (ARRAYSIZE / numtasks);
tag2 = 1;
tag1 = 2;

/***** Master task only ******/
 
if (taskid == MASTER){
 
fp=fopen("integers.txt", "r");
if(fp != NULL){
sum = 0;
while(fgets(line, sizeof line, fp)!= NULL){
fscanf(fp,"%d",&data[k]);
sum = sum + data[k]; // calculate sum to verify later on
k++;
}
}
 

printf("Initialized array sum %d\n", sum);
 
/* Send each task its portion of the array - master keeps 1st part */
offset = chunksize;
for (dest=1; dest<numtasks; dest++) { 
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD); 
MPI_Send(&data[offset], chunksize, MPI_INT, dest, tag2, MPI_COMM_WORLD); 
printf("Sent %d elements to task %d offset= %d\n",chunksize,dest,offset); 
offset = offset + chunksize;
}
 

 
/* Master does its part of the work */
 
offset = 0;
mysum = run_kernel(&data[offset], chunksize);
printf("Kernel returns sum %d", mysum);
//mysum = update(offset, chunksize, taskid);
 
/* Wait to receive results from each task */
 
for (i=1; i<numtasks; i++) { 
source = i;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status); 
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status); 
}
  
/* Get final sum and print sample results */
 
MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD); 
printf("\n*** Final sum= %d ***\n",sum); 
} /* end of master section */
 
/***** Non-master tasks only *****/

 
if (taskid > MASTER) {
 
/* Receive my portion of array from the master task */
start= MPI_Wtime();
source = MASTER;
MPI_Recv(&offset, 1, MPI_INT, source, tag1, MPI_COMM_WORLD, &status); 
MPI_Recv(&data[offset], chunksize, MPI_INT, source, tag2,MPI_COMM_WORLD, &status); 
mysum = run_kernel(&data[offset], chunksize);
printf("\nKernel returns sum %d ", mysum);
 
// mysum = update(offset, chunksize, taskid);
stop = MPI_Wtime();
time = stop -start;
printf("time taken by process %d to recieve elements and caluclate own sum is = %lf seconds \n", taskid, time);
// totaltime = totaltime + time;
 

 
/* Send my results back to the master task */
dest = MASTER;
MPI_Send(&offset, 1, MPI_INT, dest, tag1, MPI_COMM_WORLD); 
MPI_Send(&data[offset], chunksize, MPI_INT, MASTER, tag2, MPI_COMM_WORLD); 
MPI_Reduce(&mysum, &sum, 1, MPI_INT, MPI_SUM, MASTER, MPI_COMM_WORLD);
 
} /* end of non-master */
 
MPI_Finalize();
}

here is the output of above code -
 
MPI task 2 has started on host 4
MPI task 3 has started on host 4
MPI task 0 has started on host 4
MPI task 1 has started on host 4
 
Initialized array sum 9061
Sent 500 elements to task 1 offset= 500
Sent 500 elements to task 2 offset= 1000
Sent 500 elements to task 3 offset= 1500
 

 
run_kernel called..............
the sum is 10
 
Kernel returns sum 159300360 time taken by process 2 to recieve elements and caluclate own sum is = 0.290016 seconds
run_kernel called..............
the sum is 268452367
run_kernel called..............
the sum is 10
 
Kernel returns sum 145185544 time taken by process 3 to recieve elements and caluclate own sum is = 0.293579 seconds
run_kernel called..............
the sum is 1048
 
Kernel returns sum 156969736 time taken by process 1 to recieve elements and caluclate own sum is = 0.297599 seconds
Kernel returns sum 152148496
*** Final sum= 613604136 ***
 
The final sum and initialized sum is not matching. I am guessing its a pointer issue. mysum should be pointer? but then MPI_REDUCE does not execute properly and segmentation fault occurs.
 
Any idea what is going wrong?
Thanks