Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Some questions about checkpoint/restart (8)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-03-30 03:07:37


The 8th question is as follows:

(8) Communication that uses derived datatypes constructed with MPI_Type_vector or
    MPI_Type_indexed produces incorrect results after a checkpoint is taken.

Framework : datatype
Component : datatype
The source file : ompi/datatype/dt_copy.c
The function name : ompi_ddt_copy_content_same_ddt

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : ?

Here's the code that causes the problem:

#define SLPTIME 60
#define ITEMNUM 10

int buf[ITEMNUM][ITEMNUM];

  MPI_Type_vector(10,1,10,MPI_INT,&newdt);
  MPI_Type_commit(&newdt);
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Irecv(&buf[0][0],1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  for (i=0;i<ITEMNUM;i++) { /* showing the result */
    printf(" rank=%d size=%d i=%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

* Take checkpoint while Process 0 and Process 1 are in sleep function

* After receiving data(by MPI_Irecv/Wait), incorrect results are printed.
  Incorrect results are as follows:
    rank=1 size=2 i=0 [ 0 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=1 [113 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=2 [1734306160 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=3 [1953458291 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=4 [1836017711 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=5 [1853054828 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=6 [7630955 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=7 [1919907629 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=8 [1600938352 -1 -1 -1 -1 -1 -1 -1 -1 -1]
    rank=1 size=2 i=9 [23761296 -1 -1 -1 -1 -1 -1 -1 -1 -1]
  It seems that ten elements are copied, but their values are incorrect.

* Here's my debugging output.

  ft_event_post_drain_message:Irecv drain_msg_ref=e84200 rank=0 tag=1000 cnt=1 ddt=40 to=e8d3a0 [datatype->size=1]
  wait_quiesce_drained:xx=0 0
  wait_quiesce_drained:xx=1 100
  wait_quiesce_drained:xx=2 200
  wait_quiesce_drained:xx=3 300
  wait_quiesce_drained:xx=4 400
  wait_quiesce_drained:xx=5 500
  wait_quiesce_drained:xx=6 600
  wait_quiesce_drained:xx=7 700
  wait_quiesce_drained:xx=8 800
  wait_quiesce_drained:xx=9 900
  ompi_ddt_copy_content_same_ddt:Start size=40 flag=102/4 count=1

* I think that the receiver received the message correctly in bkmrk.
  The received message data is contiguous.

* I think that the problem is in the copy performed by ompi_ddt_copy_content_same_ddt.
  Or is it wrong to use the ompi_ddt_copy_content_same_ddt function here?

* The first argument (datatype) passed to the ompi_ddt_copy_content_same_ddt function in
  drain_message_copy_remove is the datatype specified by the user's application.
  The hexadecimal value of datatype->flags is 0x102.
  It does not contain DT_FLAG_CONTIGUOUS, which indicates a derived datatype.

* I think that the problem occurs in the following parts of the ompi_ddt_copy_content_same_ddt function.
  Both the source and the destination are addressed using the same datatype information, which is
  the datatype specified by the user's application.
  But the source (the message received in bkmrk) is a simple contiguous buffer.

  -------------------
       destination += datatype->true_lb;
       source += datatype->true_lb;

  -------------------
  ptrdiff_t extent = (datatype->ub - datatype->lb);
       destination += extent;
       source += extent;

  -------------------
  pStack = (dt_stack_t*)alloca( sizeof(dt_stack_t) * (datatype->btypes[DT_LOOP] + 1) );
       source = (unsigned char*)source_base + pStack->disp;
       destination = (unsigned char*)destination_base + pStack->disp;

* If the source datatype is different from the destination datatype,
  should the ompi_ddt_copy_content_same_ddt function not be used?

-bash-3.2$ cat t_mpi_question-8.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60
#define ITEMNUM 10

int buf[ITEMNUM][ITEMNUM];
int main(int ac,char **av)
{
  int rank,size,cc,i,j;
  MPI_Request req;
  MPI_Status sts;
  MPI_Datatype newdt;

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  for (i=0;i<ITEMNUM;i++) {
    for (j=0;j<ITEMNUM;j++) {
      if (rank == 0) { buf[i][j] = (i*100)+j; }
      else { buf[i][j] = -1; }
    }
  }

  cc=MPI_Type_vector(10,1,10,MPI_INT,&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc=MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    cc=MPI_Irecv(&buf[0][0],1,newdt,0,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }

  for (i=0;i<ITEMNUM;i++) {
    printf(" rank=%d size=%d i=%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

  cc = MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-8-type_indexed.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60
#define ITEMNUM 10

int buf[ITEMNUM][ITEMNUM];
int main(int ac,char **av)
{
  int rank,size,cc,i,j;
  MPI_Request req;
  MPI_Status sts;
  MPI_Datatype newdt;
  int block_length[ITEMNUM];
  int disp[ITEMNUM];

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  for (i=0;i<ITEMNUM;i++) {
    block_length[i] = 1;
    disp[i] = (i * ITEMNUM);
    for (j=0;j<ITEMNUM;j++) {
      if (rank == 0) { buf[i][j] = (i*100)+j; }
      else { buf[i][j] = -1; }
    }
  }

  cc=MPI_Type_indexed(10,&block_length[0],&disp[0],MPI_INT,&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc=MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    cc=MPI_Isend(&buf[0][0],1,newdt,1,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    cc=MPI_Irecv(&buf[0][0],1,newdt,0,1000,MPI_COMM_WORLD,&req);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Wait(&req,&sts);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc=MPI_Type_free(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  }

  for (i=0;i<ITEMNUM;i++) {
    printf(" rank=%d size=%d i=%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

  cc = MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-8-LBUB.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define ITEMNUM_1 10
#define SLPTIME 60

int buf[ITEMNUM_1][ITEMNUM_1];
int main(int ac,char **av)
{
  int rank,size,cc,i,j,k;
  MPI_Request req;
  MPI_Status sts;
  MPI_Datatype newdt;
  int itmnum,newdt_size;
  int b_l[3];
  MPI_Aint dp[3],newdt_extent,newdt_lb,newdt_ub;
  MPI_Datatype dt[3];

  itmnum = 10;
  rank=0;
  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  for (i=0;i<ITEMNUM_1;i++) {
    for (j=0;j<ITEMNUM_1;j++) {
      if (rank == 0) { buf[i][j] = (i*100)+j; }
      else { buf[i][j] = -1; }
    }
  }

  b_l[0] = b_l[1] = b_l[2] = 1;
  dt[0] = MPI_LB;
  dt[1] = MPI_INT;
  dt[2] = MPI_UB;
  dp[0] = 0;
  dp[1] = 8;
  dp[2] = 20;
  MPI_Type_struct(3,&b_l[0],&dp[0],&dt[0],&newdt);
  MPI_Type_commit(&newdt);

  MPI_Type_size(newdt,&newdt_size);
  MPI_Type_extent(newdt,&newdt_extent);
  MPI_Type_lb(newdt,&newdt_lb);
  MPI_Type_ub(newdt,&newdt_ub);

  printf(" rank=%d newdt size=%d extent=%d lb=%d ub=%d itmnum=%d \n"
    ,rank,newdt_size,newdt_extent,newdt_lb,newdt_ub,itmnum);
  fflush(stdout);
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    MPI_Isend(&buf[0][0],itmnum,newdt,1,1000,MPI_COMM_WORLD,&req);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Irecv(&buf[0][0],itmnum,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  for (i=0;i<ITEMNUM_1;i++) {
    printf(" rank=%d size=%d i=%d/%d [%3d %3d %3d %3d %3d %3d %3d %3d %3d %3d] \n"
     ,rank,size,i,itmnum
     ,buf[i][0],buf[i][1],buf[i][2],buf[i][3],buf[i][4]
     ,buf[i][5],buf[i][6],buf[i][7],buf[i][8],buf[i][9]
    );
    fflush(stdout);
  }

  MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}