Hi Jeff,
I have a minimal MPI program to test the TM interface, and strangely I get errors during the tm_init call. Could you explain what could be wrong? Have you seen anything similar? Here is the MPI code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <tm.h>
#include <mpi.h>
/* Process environment vector; declared here but not referenced by the code shown below. */
extern char **environ;
/*
 * Verify the return code of a TM call.
 *
 * val: return value to compare against TM_SUCCESS.
 * msg: description printed when the check fails; never modified, so it is
 *      const-qualified and string literals can be passed safely.
 *
 * On mismatch, prints the actual and expected codes together with msg and
 * terminates the process with exit status 1. Returns normally otherwise.
 */
void do_check(int val, const char *msg) {
    if (TM_SUCCESS != val) {
        printf("ret is %d instead of %d: %s\n", val, TM_SUCCESS, msg);
        exit(1);
    }
}
/*
 * Minimal MPI program exercising the TM interface.
 *
 * Every rank prints its hostname, then even/odd rank pairs exchange one
 * message (with an odd communicator size the last rank sits out). Finally
 * rank 1 calls tm_init() followed by tm_finalize().
 *
 * Returns 0 on success and 1 if MPI_Init, MPI_Comm_size or MPI_Comm_rank
 * fails. A failing tm_init() terminates the process through do_check().
 */
int main(int argc, char *argv[]) {
    int size, rank, ret;
    MPI_Status status;
    /* Real array storage: the previous code wrote through an uninitialized
     * char ** pointer, which is undefined behaviour. */
    const char *input[] = { "/bin/echo", "Hello There" };
    struct tm_roots task_root;
    char hostname[64];
    char buf[]="11000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000";

    if (gethostname(hostname, sizeof hostname) != 0) {
        hostname[0] = '\0';
    }
    /* gethostname() need not NUL-terminate when the name is truncated. */
    hostname[sizeof hostname - 1] = '\0';

    ret = MPI_Init(&argc, &argv);
    if (ret) {
        printf("Error: %d\n", ret);
        return 1;
    }
    ret = MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (ret) {
        printf("Error: %d\n", ret);
        return 1;
    }
    ret = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (ret) {
        printf("Error: %d\n", ret);
        return 1;
    }

    printf("First Hostname: %s node %d out of %d\n", hostname, rank, size);

    if (size % 2 && rank == size - 1) {
        /* Odd number of ranks: the last one has no partner. */
        printf("Sitting out\n");
    } else if (rank % 2 == 0) {
        /* Even ranks send to the next rank ... */
        MPI_Send(buf, (int)strlen(buf), MPI_BYTE, rank + 1, 11, MPI_COMM_WORLD);
    } else {
        /* ... and odd ranks receive from the previous one. */
        MPI_Recv(buf, (int)sizeof(buf), MPI_BYTE, rank - 1, 11, MPI_COMM_WORLD, &status);
    }

    printf("Second Hostname: %s node %d out of %d\n", hostname, rank, size);

    if (rank == 1) {
        ret = tm_init(NULL, &task_root);
        do_check(ret, "tm_init failed");
        printf("Special Hostname: %s node %d out of %d\n", hostname, rank, size);
        printf("%s\t%s", input[0], input[1]);
        tm_finalize();
    }

    MPI_Finalize();
    return 0;
}
The error I am getting is:
First Hostname: wins05 node 0 out of 4
First Hostname: wins03 node 1 out of 4
First Hostname: wins02 node 2 out of 4
First Hostname: wins01 node 3 out of 4
Second Hostname: wins05 node 0 out of 4
Second Hostname: wins02 node 2 out of 4
Second Hostname: wins03 node 1 out of 4
Second Hostname: wins01 node 3 out of 4
tm_poll: protocol number dis error 11
ret is 17002 instead of 0: tm_init failed
3 processes killed (possibly by Open MPI)
I am using Torque-2.0.0p7 and Open MPI-1.0.1.
Thanks,
Prakash
|