Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Some questions about checkpoint/restart (1)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-03-08 23:53:14


I'm trying checkpoint/restart of Open MPI.
I'm using Open MPI 1.4.1 and BLCR 0.8.2.
However, it does not work well.
I have been looking into the source code,
and I have some questions about checkpoint/restart.
Could anyone answer my questions?

I will give them one by one.

My 1st question is as follows:

(1) Clearing the send_init_list, recv_init_list.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : ft_event_finalize_exchange

I found the comment /* Clear send_init_list */ in the ft_event_finalize_exchange function.
However, the corresponding source code seems to clear send_list rather than send_init_list.

Source code is as follows:
        /* Clear send_init_list */
        for(rm_item = opal_list_get_last(&peer_ref->send_list);
            rm_item != opal_list_get_begin(&peer_ref->send_list);

Is this correct?
send_list seems to have already been cleared by this point.

The same applies to clearing recv_init_list.
The comment is /* Clear recv_init_list */.
However, the corresponding source code seems to clear recv_list rather than recv_init_list.

recv_list seems to have already been cleared by this point.

Source code is as follows:
        /* Clear recv_init_list */
        for(rm_item = opal_list_get_last(&peer_ref->recv_list);
            rm_item != opal_list_get_begin(&peer_ref->recv_list);

Here's the code that causes the problem:

#define BLOCKNUM 1
#define SLPTIM 60

  if (rank == 0) {
    MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1);
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint(1st time) **/
    printf(" rank=%d sleep1 end \n",rank); fflush(stdout);
    MPI_Start(&req1);
    printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. **/
    printf(" rank=%d sleep2 end \n",rank); fflush(stdout);
    MPI_Wait(&req1,&sts1);
    MPI_Request_free(&req1);
  } else { /* rank 1 */
    MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1);
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint(1st time) **/
    printf(" rank=%d sleep1 end \n",rank); fflush(stdout);
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
    sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. **/
    printf(" rank=%d sleep2 end \n",rank); fflush(stdout);
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    MPI_Request_free(&req1);
  }

* Take a checkpoint twice.

* Take each checkpoint while process 0 is in the MPI_Send function and process 1 is in the sleep function.

* A deadlock occurs when the checkpoint is taken the second time.

* Here's my debugging output.

 rank=1 pass-1 100
 rank=1 sleep1 start /* 1st checkpoint */
 rank=0 sleep1 start /* 1st checkpoint */
 rank=1 sleep1 end
 rank=0 sleep1 end
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=1 /* MPI_Barrier */
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=2
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE matched=0 done=1 num_left_unresolved=2
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE matched=1 done=1 num_left_unresolved=1
DEBUG: num_left_unresolved=1 goto cleapup
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4
 rank=1 pass-2 200
 rank=1 sleep2 start /* 2nd checkpoint */
 rank=0 sleep2 start /* 2nd checkpoint */
 rank=1 sleep2 end
 rank=0 sleep2 end
DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=3
   /* The sender sent the wrong value ("3"). I think the correct value should be "1". */
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE matched=1 done=1 num_left_unresolved=3
DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE matched=2 done=1 num_left_unresolved=2
   /* The wrong values are set in the receiver's recv_init_list,
      because recv_init_list was not cleared when the first checkpoint was taken. */
DEBUG: num_left_unresolved=2 goto cleapup
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4
DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4
   /* An incorrect receive is issued. */

-bash-3.2$ cat t_mpi_question-1.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define BLOCKNUM 1
#define SLPTIM 60

/*
 * Reproducer for the checkpoint/restart question above.
 *
 * A two-rank program that exchanges messages through persistent requests
 * (MPI_Send_init on rank 0, MPI_Recv_init on the other rank) and sleeps
 * twice for SLPTIM seconds; per the description above, a checkpoint is
 * taken during each sleep.
 *
 * Rank 0 starts the send request three times, filling buf from 100, 200
 * and 300 respectively. The other rank starts the receive request three
 * times and prints buf[0] as pass-1, pass-2 and pass-3.
 *
 * Calls MPI_Abort if the communicator size is not 2 or if the buffer
 * allocation fails. Returns 0 on normal completion.
 */
int main(int ac,char **av)
{
  /* NOTE(review): k and cc are declared but not used in this function. */
  int i,k,rank,size,cc;
  int *buf;
  MPI_Status sts1;
  MPI_Request req1;

  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  /* The test is written for exactly two ranks. */
  if (size != 2) { MPI_Abort(MPI_COMM_WORLD,-1); }

  buf = (int *)malloc(sizeof(int)*BLOCKNUM);
  if (buf == NULL) { MPI_Abort(MPI_COMM_WORLD,-1); }

  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    /* Sender: persistent send to rank 1 with tag 100. */
    MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1);

    /* Two complete sends before the first sleep; the receiver only posts
       one receive before its own first sleep. */
    for (i=0;i<BLOCKNUM;i++) { buf[i] = (100+i); }
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    for (i=0;i<BLOCKNUM;i++) { buf[i] = (200+i); }
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);

    /* First sleep: window for the 1st checkpoint. */
    printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep1 end \n",rank); fflush(stdout);

    /* Third send is started here but not waited on until after the
       second sleep. */
    for (i=0;i<BLOCKNUM;i++) { buf[i] = (300+i); }
    MPI_Start(&req1);

    /* Second sleep: window for the 2nd checkpoint (where the deadlock
       is reported). */
    printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep2 end \n",rank); fflush(stdout);

    MPI_Wait(&req1,&sts1);
    MPI_Request_free(&req1);
  } else {
    /* Receiver: persistent receive from rank 0 with tag 100. */
    MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1);

    /* First receive completes before the first sleep. */
    for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d pass-1 %d \n",rank,buf[0]); fflush(stdout);

    /* First sleep: window for the 1st checkpoint. */
    printf(" rank=%d sleep1 start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep1 end \n",rank); fflush(stdout);

    /* Second receive is posted only after the first sleep. */
    for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d pass-2 %d \n",rank,buf[0]); fflush(stdout);

    /* Second sleep: window for the 2nd checkpoint (where the deadlock
       is reported). */
    printf(" rank=%d sleep2 start \n",rank); fflush(stdout);
    sleep(SLPTIM);
    printf(" rank=%d sleep2 end \n",rank); fflush(stdout);

    /* Third receive is posted only after the second sleep. */
    for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; }
    MPI_Start(&req1); MPI_Wait(&req1,&sts1);
    printf(" rank=%d pass-3 %d \n",rank,buf[0]); fflush(stdout);

    MPI_Request_free(&req1);
  }

  MPI_Barrier(MPI_COMM_WORLD);
  free(buf);
  MPI_Finalize();
  /* Only rank 0 reports the end of the program. */
  if (rank == 0) {
    printf(" rank=%d Program End \n",rank); fflush(stdout);
  }
  return(0);
}