Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Some questions about checkpoint/restart (7)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-03-18 05:23:52


The 7th question is as follows:

(7) The result of a communication that uses a derived datatype is incorrect after taking a checkpoint.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : traffic_message_append

Framework : datatype
The source file : ompi/datatype/datatype.h
The function name : ompi_ddt_type_size

Here's the code that causes the problem:

/*
 * Payload exchanged between rank 0 and rank 1 and described to MPI by a
 * derived datatype.  Mixing char and float members leaves gaps between
 * the members; the report gives the layout as offsets 0, 4, 8, 12 and 16
 * (20 bytes in memory), while the member sizes alone add up to 14 bytes.
 */
struct dd {
  char x;
  float a;
  char y;
  float b;
  int c;
};
/* buf is the send/receive buffer; ans_dd_buf holds the expected values. */
struct dd buf,ans_dd_buf;

  if (rank == 0) {
    buf.x = (char)1;
    buf.a = (float)4329.1003;
    buf.y = (char)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (char)0;
    buf.a = (float)0;
    buf.y = (char)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  ans_dd_buf.x = (char)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (char)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;
  /* datatype per a block */
  dt[0] = dt[2] = MPI_BYTE;
  dt[1] = dt[3] = MPI_FLOAT;
  dt[4] = MPI_INT;
  /* disp per a block */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);
  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);
  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);
  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);
  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);
  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);
  printf(" rank=%d pass-1 x->x =%d[%d] x->a=%d[%d] x->y=%d[%d] x->b=%d[%d] x->c=%d[%d]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else { /* rank 1 */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); } /* The error occurs at this point */
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

* Take the checkpoint while rank 0 and rank 1 are both inside the sleep() call.

* Construct derived datatype from the structure dd.

* I think the memory layout of the derived datatype is as follows:

              1111111111
    01234567890123456789
    --------------------
    X###AAAAY###BBBBCCCC
    --------------------

'#' marks a padding byte (unused space between members).

* The ddt_size field (commented "Quick reference to the size of the datatype")
  of the ompi_crcp_bkmrk_pml_traffic_message_ref_t structure is obtained by
  calling the ompi_ddt_type_size function in the traffic_message_append function.

    if( NULL != datatype ) {
        ompi_ddt_type_size(datatype,
                           &ddt_size);

* I think that the returned value of ddt_size is wrong.
  The obtained value is 14 (= 1 + 4 + 1 + 4 + 4, the sum of the member sizes without padding).
  Does it mean that the total size in memory is 14 bytes?

  struct dd {
    char x; -> char is 1 byte.
    float a; -> float is 4 bytes.
    char y; -> char is 1 byte.
    float b; -> float is 4 bytes.
    int c; -> int is 4 bytes.
  };

* But the returned value of ddt_size should be 20 bytes, considering the memory layout.

* Rank 1 receives only 14 bytes of the message in bkmrk,
  so the wrong result is obtained.

* t_mpi_question-7-ng.c : the error occurs.
  Here's my debugging output.

  ft_event_post_drain_message:Irecv drain_msg_ref=c89200 rank=0 tag=1000 cnt=1 ddt=14 to=c929b0 [datatype->size=1]
  wait_quiesce_drained: x=1 a=142658605493679655240073216.000000 y=4 b=0.000000 c=32
    /* Only 14 bytes of data are received, which is incorrect; the values are wrong. */
  drain_message_check_recv:datatype->size=1 14 count=1 1
  ompi_ddt_copy_content_same_ddt:Start size=14 flag=102/4 count=1
    /* DT_FLAG_CONTIGUOUS is false. */

* t_mpi_question-7-ok.c : the error does not occur.
  Here's my debugging output.

  ft_event_post_drain_message:Irecv drain_msg_ref=a51280 rank=0 tag=1000 cnt=1 ddt=20 to=a5b6b0 [datatype->size=1]
  wait_quiesce_drained: x=1 a=4329.100098 y=2 b=8474.730469 c=48
    /* 20 bytes of data are received correctly. */
  drain_message_check_recv:datatype->size=1 20 count=1 1
  ompi_ddt_copy_content_same_ddt:Start size=20 flag=186/4 count=1
    /* DT_FLAG_CONTIGUOUS is true. */

* difference list

-bash-3.2$ diff -c t_mpi_question-7-ng.c t_mpi_question-7-ok.c
*** t_mpi_question-7-ng.c Fri Feb 26 13:07:05 2010
--- t_mpi_question-7-ok.c Fri Feb 26 13:20:25 2010
***************
*** 8,16 ****
  #define ITEMNUM 5

  struct dd {
! char x;
    float a;
! char y;
    float b;
    int c;
  };
--- 8,16 ----
  #define ITEMNUM 5

  struct dd {
! int x;
    float a;
! int y;
    float b;
    int c;
  };
***************
*** 31,52 ****
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    if (rank == 0) {
! buf.x = (char)1;
      buf.a = (float)4329.1003;
! buf.y = (char)2;
      buf.b = (float)8474.73;
      buf.c = (int)48;
    }
    else {
! buf.x = (char)0;
      buf.a = (float)0;
! buf.y = (char)0;
      buf.b = (float)0;
      buf.c = (int)0;
    }
! ans_dd_buf.x = (char)1;
    ans_dd_buf.a = (float)4329.1003;
! ans_dd_buf.y = (char)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

--- 31,52 ----
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    if (rank == 0) {
! buf.x = (int)1;
      buf.a = (float)4329.1003;
! buf.y = (int)2;
      buf.b = (float)8474.73;
      buf.c = (int)48;
    }
    else {
! buf.x = (int)0;
      buf.a = (float)0;
! buf.y = (int)0;
      buf.b = (float)0;
      buf.c = (int)0;
    }
! ans_dd_buf.x = (int)1;
    ans_dd_buf.a = (float)4329.1003;
! ans_dd_buf.y = (int)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

***************
*** 54,60 ****
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype per a block */
! dt[0] = dt[2] = MPI_BYTE;
    dt[1] = dt[3] = MPI_FLOAT;
    dt[4] = MPI_INT;

--- 54,60 ----
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype per a block */
! dt[0] = dt[2] = MPI_INT;
    dt[1] = dt[3] = MPI_FLOAT;
    dt[4] = MPI_INT;

-bash-3.2$ cat t_mpi_question-7-ng.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Payload for the failing test case.  Mixing char and float members leaves
 * gaps between the members; the report gives the layout as offsets 0, 4, 8,
 * 12 and 16 (20 bytes in memory), while the member sizes alone add up to
 * 14 bytes.
 */
struct dd {
  char x;
  float a;
  char y;
  float b;
  int c;
};

int main(int ac,char **av)
{
  int rank,size,cc;
  MPI_Request req;
  MPI_Status sts;
  struct dd buf,ans_dd_buf;
  int b_l[ITEMNUM];
  MPI_Aint dp[ITEMNUM],st,cr;
  MPI_Datatype dt[ITEMNUM],newdt;

  MPI_Init(&ac,&av);

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  if (rank == 0) {
    buf.x = (char)1;
    buf.a = (float)4329.1003;
    buf.y = (char)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (char)0;
    buf.a = (float)0;
    buf.y = (char)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  ans_dd_buf.x = (char)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (char)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

  /* datatype per a block */
  dt[0] = dt[2] = MPI_BYTE;
  dt[1] = dt[3] = MPI_FLOAT;
  dt[4] = MPI_INT;

  /* disp per a block */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);

  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);

  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);

  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);

  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);

  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

  MPI_Barrier(MPI_COMM_WORLD);

  printf(" rank=%d pass-1 x->x =%d[%d] x->a=%d[%d] x->y=%d[%d] x->b=%d[%d] x->c=%d[%d]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  printf(" rank=%d pass-2 %d %f %d %f %d \n"
    ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

  cc = MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-7-ok.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Payload for the passing test case: the two char members of the failing
 * variant are replaced by int, so the struct consists of int and float
 * members only.  The report's debug output shows a datatype size of 20 for
 * this variant.
 */
struct dd {
  int x;
  float a;
  int y;
  float b;
  int c;
};

int main(int ac,char **av)
{
  int rank,size,cc;
  MPI_Request req;
  MPI_Status sts;
  struct dd buf,ans_dd_buf;
  int b_l[ITEMNUM];
  MPI_Aint dp[ITEMNUM],st,cr;
  MPI_Datatype dt[ITEMNUM],newdt;

  MPI_Init(&ac,&av);

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  if (rank == 0) {
    buf.x = (int)1;
    buf.a = (float)4329.1003;
    buf.y = (int)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (int)0;
    buf.a = (float)0;
    buf.y = (int)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  ans_dd_buf.x = (int)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (int)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

  /* datatype per a block */
  dt[0] = dt[2] = MPI_INT;
  dt[1] = dt[3] = MPI_FLOAT;
  dt[4] = MPI_INT;

  /* disp per a block */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);

  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);

  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);

  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);

  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);

  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

  MPI_Barrier(MPI_COMM_WORLD);

  printf(" rank=%d pass-1 x->x =%d[%d] x->a=%d[%d] x->y=%d[%d] x->b=%d[%d] x->c=%d[%d]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  printf(" rank=%d pass-2 %d %f %d %f %d \n"
    ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

  cc = MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}