Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: [OMPI devel] Some questions about checkpoint/restart (10)
From: Takayuki Seki (seki_at_[hidden])
Date: 2010-04-02 00:55:51


(10) A receive with an element count of 0 terminates abnormally after a checkpoint is taken.

Framework : crcp
Component : bkmrk
The source file : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : drain_message_copy_remove

  /* Excerpt of the reproducer: rank 0 posts a zero-count MPI_Isend before it
   * sleeps; the other rank posts the matching zero-count MPI_Irecv only after
   * its sleep.  The checkpoint is taken while both are sleeping. */
  if (rank == 0) {
    j=100;
    MPI_Isend(&j,0,MPI_INT,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /** take checkpoint at this point **/
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    MPI_Irecv(&j,0,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
  }
  /* Both ranks complete their single outstanding request here. */
  MPI_Wait(&req,&sts);

* Take a checkpoint while Process 0 and Process 1 are in the sleep function;
  the program then terminates abnormally with the following message:

  *** An error occurred in MPI_Irecv
  *** on communicator MPI_COMM_WORLD
  *** MPI_ERR_BUFFER: invalid buffer pointer
  *** MPI_ERRORS_ARE_FATAL (your MPI job will now abort)

* The ompi_ddt_copy_content_same_ddt function returns true
  inside the drain_message_copy_remove function, and an error occurs.

* In the drain_message_copy_remove function,
  if count is 0, it returns true.
  The code is as follows:
    /* empty data ? then do nothing. This should normally be trapped
     * at a higher level.
     */
    if( 0 == count ) return 1;

* If count is 0,
  is it necessary for the drain_message_copy_remove function to call
  the copy function (ompi_ddt_copy_content_same_ddt)?

-bash-3.2$ cat t_mpi_question-10.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

/*
 * Reproducer for a zero-count receive that is posted after a checkpoint.
 *
 * Rank 0 posts a zero-count MPI_Isend to rank 1 and then sleeps for SLPTIME
 * seconds.  Every other rank sleeps first and only afterwards posts a
 * zero-count MPI_Irecv from rank 0.  The checkpoint is meant to be taken
 * while the ranks are sleeping, i.e. after the send has been posted but
 * before the receive has been.
 *
 * The program is written for two ranks: rank 0 sends only to rank 1, so any
 * additional rank would wait on a receive that no send matches.
 *
 * Returns 0 on success.  Rank 1 calls MPI_Abort if the zero-count receive
 * modified the receive buffer `j`.
 */
int main(int ac,char **av)
{
  int rank,size,j;
  MPI_Request req;
  MPI_Status sts;

  rank=0;
  j=0;
  MPI_Init(&ac,&av);
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  /* Make sure every rank is up before the send is posted. */
  MPI_Barrier(MPI_COMM_WORLD);

  if (rank == 0) {
    /* The buffer holds a non-zero value, but count is 0, so no element
     * is transferred. */
    j=100;
    MPI_Isend(&j,0,MPI_INT,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
  }
  else {
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME); /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end \n",rank); fflush(stdout);
    /* Posted only after the sleep, i.e. after the checkpoint was taken. */
    MPI_Irecv(&j,0,MPI_INT,0,1000,MPI_COMM_WORLD,&req);
  }
  MPI_Wait(&req,&sts);
  printf(" rank=%d pass-2 %d \n",rank,j); fflush(stdout);
  /* A zero-count receive must leave j at its initial value of 0. */
  if ((rank == 1) && (j != 0)) { MPI_Abort(MPI_COMM_WORLD,1); }

  MPI_Finalize();
  if (rank ==0) {
    printf(" rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}