Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Some questions about checkpoint/restart (9)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-04-02 00:32:34


9th question is as follows:

(9) Communication in which the element count/size differs between sender and
    receiver deadlocks after a checkpoint is taken.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : drain_message_find

Here's the code that causes the problem:

#define WORKBUFSIZE 4
#define SLPTIME 60

  int rbuf[WORKBUFSIZE];
  int j;

  MPI_Barrier(MPI_COMM_WORLD);
  if (rank == 1) {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Irecv(&rbuf[0],WORKBUFSIZE,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    j=rbuf[0];
  }
  else { /* rank 0 */
    j=100;
    MPI_Isend(&j,1,MPI_INT,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&req,&sts);
  }
  printf(" rank=%d pass-2 %d %d \n",rank,j,sts._count); fflush(stdout);

* If a checkpoint is taken while Process 0 and Process 1 are inside the
  sleep function, the MPI program deadlocks.

* element size does not match in drain_message_find.
 drain_message_find:My=1 drain_msg=e6fc80 [peer=0/0 count=4/1 comm=6014e0 ID 0/0/0 R=1/1 tag=1000/1000 ddt_size=4/4 [datatype->size=1]] [done=1
active=0 already_posted=0]

        /* Check the datatype size, if specified for a match */
        if( ddt_size != PROBE_ANY_SIZE &&
            count != PROBE_ANY_COUNT) {
            /* Check the datatype size and count to make sure it matches */
            if((drain_msg->count ) != count ||
               (drain_msg->ddt_size) != ddt_size) {
                continue;
            }
        }

  drain_msg->count is 1.
  count is 4.
  drain_msg->ddt_size is 4.
  ddt_size is 4.

* If Open MPI is built with --enable-debug configure option,
  and openib btl is selected on running MPI job,
  the following message is printed in mca_btl_openib_ft_event.

  t_mpi_question-9.out: ../../../../../ompi/mca/btl/openib/btl_openib.c:1433:
  mca_btl_openib_ft_event: Assertion `((0xdeafbeedULL << 32) + 0xdeafbeedULL) == ((opal_object_t *)
(&mca_btl_openib_component.ib_procs))->obj_magic_id' failed.

* The following programs behave in the same way.

  1) t_mpi_question-9-packunpack.c

    Sender : MPI_Isend(&workbuf[0],j,MPI_PACKED,1,1000,MPI_COMM_WORLD,&req);
    Receiver: #define WORKBUFSIZ 64
              char workbuf[WORKBUFSIZ];
              MPI_Irecv(&workbuf[0],WORKBUFSIZ,MPI_PACKED,0,1000,MPI_COMM_WORLD,&req);

    drain_message_find:My=1 drain_msg=794200 [peer=0/0 count=64/20 comm=601ba0 ID 0/0/0 R=1/1 tag=1000/1000 ddt_size=1/1 [datatype->size=1]] [done=1
active=0 already_posted=0]

    drain_msg->count is 20.
    count is 64.

  2) t_mpi_question-9-contiguous.c

    Sender : cc=MPI_Type_contiguous(50,MPI_INT,&newdt);
              cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    Receiver: cc=MPI_Irecv(&buf[0][0],50,MPI_INT,0,1000,MPI_COMM_WORLD,&req);

    drain_message_find:My=1 drain_msg=1658200 [peer=0/0 count=50/1 comm=601840 ID 0/0/0 R=1/1 tag=1000/1000 ddt_size=4/200 [datatype->size=1]] [done=1
active=0 already_posted=0]

    drain_msg->count is 1.
    count is 50.
    drain_msg->ddt_size is 200.
    ddt_size is 4.

  3) t_mpi_question-9-vector.c

    Sender : cc=MPI_Type_vector(10,1,10,MPI_INT,&newdt);
               cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    Receiver: cc=MPI_Irecv(&buf[0][0],10,MPI_INT,0,1000,MPI_COMM_WORLD,&req);

    drain_message_find:My=1 drain_msg=20ad900 [peer=0/0 count=10/1 comm=601840 ID 0/0/0 R=1/1 tag=1000/1000 ddt_size=4/40 [datatype->size=1]] [done=1
active=0 already_posted=0]

    drain_msg->count is 1.
    count is 10.
    drain_msg->ddt_size is 40.
    ddt_size is 4.

-bash-3.2$ cat t_mpi_question-9.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include "mpi.h"

#define WORKBUFSIZE 4
#define SLPTIME 60

/*
 * Reproducer for question (9): the sender posts MPI_Isend with count=1
 * while the receiver posts MPI_Irecv with count=WORKBUFSIZE (4), both
 * MPI_INT on tag 1000.  A checkpoint taken while both ranks are inside
 * sleep() leads to a deadlock: drain_message_find in
 * ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c rejects the drained message
 * because its count (1) does not equal the receive count (4).
 * The sleep()/printf choreography is the essence of the reproduction —
 * the checkpoint must land inside the sleep window on both ranks.
 */
int main(int ac,char **av)
{
  int rank,size,cc,i,j;      /* cc and i are declared but unused here */
  MPI_Request req;
  MPI_Status sts;
  int rbuf[WORKBUFSIZE];

  /* Pre-init defaults so the variables are never read uninitialized. */
  rank=0;
  j=0;
  memset((void *)rbuf,0,sizeof(int)*WORKBUFSIZE);

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 1) {
    /* Receiver: the checkpoint is taken during this sleep, i.e. BEFORE
     * the receive is posted, so the message must be drained and matched
     * later by the crcp/bkmrk component. */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    /* Receive count is 4, while the matching send below uses count 1. */
    MPI_Irecv(&rbuf[0],WORKBUFSIZE,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    j=rbuf[0];
  }
  else {
    /* Sender (rank 0): the one-int send is already in flight while the
     * checkpoint is taken during the sleep below. */
    j=100;
    MPI_Isend(&j,1,MPI_INT,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&req,&sts);
  }
  /* NOTE(review): sts._count is an implementation-internal field of
   * MPI_Status; the portable accessor would be MPI_Get_count().  Kept
   * as-is because the quoted output above corresponds to this line. */
  printf(" rank=%d pass-2 %d %d \n",rank,j,sts._count); fflush(stdout);

  MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-9-contiguous.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60
#define ITEMNUM 10

int buf[ITEMNUM][ITEMNUM];
/*
 * Reproducer 2) (t_mpi_question-9-contiguous.c): rank 0 sends ONE
 * element of a committed contiguous derived type (50 x MPI_INT), while
 * rank 1 receives 50 MPI_INT — the same 200 bytes, but as seen by
 * drain_message_find the drained message has count=1 / ddt_size=200
 * versus the posted receive's count=50 / ddt_size=4, so the match is
 * rejected and the job deadlocks after a checkpoint.
 * The checkpoint must be taken while both ranks are inside sleep().
 */
int main(int ac,char **av)
{
  int rank,size,cc,i,j;
  MPI_Request req;
  MPI_Status sts;
  MPI_Datatype newdt;

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  /* Rank 0 fills buf with distinguishable values (i*100+j); rank 1
   * fills with -1 so received data is visible in the printout below. */
  for (i=0;i<ITEMNUM;i++) {
    for (j=0;j<ITEMNUM;j++) {
      if (rank == 0) { buf[i][j] = (i*100)+j; }
      else { buf[i][j] = -1; }
    }
  }

  /* Both ranks build and commit the derived type; only rank 0 sends
   * with it, but both free it at the end of their branch. */
  cc=MPI_Type_contiguous(50,MPI_INT,&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc=MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    /* Sender: 1 element of newdt == 50 ints.  Checkpoint is taken
     * during the sleep, with this send in flight. */
    cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }
  else {
    /* Receiver: sleeps through the checkpoint first, then posts a
     * receive of 50 plain MPI_INTs (count/type differ from sender's). */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    cc=MPI_Irecv(&buf[0][0],50,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }

  /* Dump the full 10x10 buffer so transferred rows are visible. */
  for (i=0;i<ITEMNUM;i++) {
    printf(" rank=%d size=%d i=%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

  MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-9-vector.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60
#define ITEMNUM 10

int buf[ITEMNUM][ITEMNUM];
/*
 * Reproducer 3) (t_mpi_question-9-vector.c): rank 0 sends ONE element
 * of a committed vector type (10 blocks of 1 MPI_INT, stride 10 — i.e.
 * the first column of buf), while rank 1 receives 10 contiguous
 * MPI_INT.  The type signatures match (10 ints), but drain_message_find
 * sees count=1 / ddt_size=40 for the drained message versus count=10 /
 * ddt_size=4 for the posted receive, rejects the match, and the job
 * deadlocks after a checkpoint taken while both ranks are in sleep().
 */
int main(int ac,char **av)
{
  int rank,size,cc,i,j;
  MPI_Request req;
  MPI_Status sts;
  MPI_Datatype newdt;

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  /* Rank 0 fills buf with distinguishable values (i*100+j); rank 1
   * fills with -1 so received data is visible in the printout below. */
  for (i=0;i<ITEMNUM;i++) {
    for (j=0;j<ITEMNUM;j++) {
      if (rank == 0) { buf[i][j] = (i*100)+j; }
      else { buf[i][j] = -1; }
    }
  }

  /* Both ranks build and commit the strided type; only rank 0 sends
   * with it, but both free it at the end of their branch. */
  cc=MPI_Type_vector(10,1,10,MPI_INT,&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc=MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    /* Sender: 1 element of newdt == 10 strided ints.  Checkpoint is
     * taken during the sleep, with this send in flight. */
    cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }
  else {
    /* Receiver: sleeps through the checkpoint first, then posts a
     * receive of 10 plain MPI_INTs (count/type differ from sender's). */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    cc=MPI_Irecv(&buf[0][0],10,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }

  /* Dump the full 10x10 buffer so transferred elements are visible. */
  for (i=0;i<ITEMNUM;i++) {
    printf(" rank=%d size=%d i=%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

  MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}