Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] some questions about checkpoint/restart (3)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-03-12 01:11:32


3rd question is as follows:

(3) If messages with the same matching condition are recorded in two or more lists,
    the assertion assert(need <= found) fails in the send_msg_details function.
    I built Open MPI with "--enable-debug" configure option.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : send_msg_details,do_recv_msg_detail_check_drain

Here's the code that causes the problem:

#define BLOCKNUM 1
#define SLPTIM 60

  if (rank == 0) {
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Wait(&sreq,&ssts);
  }
  else { /* rank 1 */
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq);
    MPI_Wait(&rreq,&rsts);
  }

* Take checkpoint while Process 0 and Process 1 are in sleep function

* Here's the tag,elements,type,and communicator of the message;
    message tag=100,number of elements=1,data type=MPI_INT,communicator=MPI_COMM_WORLD

* Send side(Rank 0):
  Messages with the same matching condition are recorded in both send_list and isend_list.

* Recv side(Rank 1):
  The message information exists in irecv_list only.
  I suspect there is a problem with message matching in the do_recv_msg_detail_check_drain function.

* Result
 rank=0 size=2
 rank=1 size=2
 rank=0 sleep start
 rank=1 sleep start
 rank=0 sleep end
 rank=1 sleep end
t_mpi_question-3.out: ../../../../../ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c:5471: send_msg_details: Assertion `need <= found' failed.
[camel0:24606] *** Process received signal ***
[camel0:24606] Signal: Aborted (6)
[camel0:24606] Signal code: (-6)

-bash-3.2$ cat t_mpi_question-3.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include "mpi.h"

#define BLOCKNUM 1
#define SLPTIM 60

int main(int ac,char **av)
{
  int i;
  int rank,size;
  int *wbuf;
  int *rbuf;
  MPI_Status rsts,ssts;
  MPI_Request rreq,sreq;

  MPI_Init(&ac,&av);

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  if (size != 2) { MPI_Abort(MPI_COMM_WORLD,-1); }

  rbuf = (int *)malloc(BLOCKNUM * sizeof(int));
  wbuf = (int *)malloc(BLOCKNUM * sizeof(int));
  if ((rbuf == NULL)||(wbuf == NULL)) { MPI_Abort(MPI_COMM_WORLD,-1); }

  printf(" rank=%d size=%d \n",rank,size); fflush(stdout);
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (100 + i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (200 + i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (300 + i); }
    MPI_Send(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD);

    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (400 + i); }
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (500 + i); }
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);
    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (600 + i); }
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq); MPI_Wait(&sreq,&ssts);

    for (i=0;i<BLOCKNUM;i++) { wbuf[i] = (700 + i); }
    MPI_Isend(wbuf,BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&sreq);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    MPI_Wait(&sreq,&ssts);
  }
  else {
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (100 + i)) { abort(); } }
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (200 + i)) { abort(); } }
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (300 + i)) { abort(); } }

    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (400 + i)) { abort(); } }
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (500 + i)) { abort(); } }
    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq); MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (600 + i)) { abort(); } }

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep end \n",rank); fflush(stdout);

    for (i=0;i<BLOCKNUM;i++) { rbuf[i] = 0; }
    MPI_Irecv(rbuf,BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&rreq);
    MPI_Wait(&rreq,&rsts);
    for (i=0;i<BLOCKNUM;i++) { if (rbuf[i] != (700 + i)) { abort(); } }
  }

  MPI_Barrier(MPI_COMM_WORLD);
  free(rbuf);
  free(wbuf);
  MPI_Finalize();
  if (rank == 0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}