Dennis

In MPI, you must complete every MPI_Isend by MPI_Wait on the request handle (or a variant like MPI_Waitall or MPI_Test that returns TRUE). An un-completed MPI_Isend leaves resources tied up.

I do not know what symptom to expect from Open MPI with this particular application error, but the one you describe is plausible.


Dick Treumann - MPI Team
IBM Systems & Technology Group
Dept X2ZA / MS P963 -- 2455 South Road -- Poughkeepsie, NY 12601
Tele (845) 433-7846 Fax (845) 433-8363


users-bounces@open-mpi.org wrote on 09/09/2009 11:47:12 AM:

> [image removed]

>
> [OMPI users] Messages getting lost during transmission (?)

>
> Dennis Luxen

>
> to:

>
> users

>
> 09/09/2009 11:48 AM

>
> Sent by:

>
> users-bounces@open-mpi.org

>
> Please respond to Open MPI Users

>
> Hi all,
>
> I have a very strange behaviour in a program. It seems that messages
> that are sent from one processor to another are getting lost.
>
> The problem is isolated in the attached source code. The code works as
> follows. Two processes send each other 100k requests. Each request is
> answered and triggers a number of requests to the other process in
> return. As you might already suspect, the communication is asynchronous.
>
> I already debugged the application and found that at one point during
> the communication at least one of the processes does not receive any
> messages anymore and hangs in the while loop beginning in line 45.
>
> The program is started with two processes on a single machine and no
> other parameters: "mpirun -np 2 ./mpi_test2".
>
> I appreciate your help.
>
> Best wishes,
> Dennis
>
> --
> Dennis Luxen
> Universität Karlsruhe (TH)           | Fon  : +49 (721) 608-6781
> Institut für Theoretische Informatik | Fax  : +49 (721) 608-3088
> Am Fasanengarten 5, Zimmer 220       | WWW  : algo2.ira.uka.de/luxen
> D-76131 Karlsruhe, Germany           | Email: luxen@kit.edu
> --------------------------------------------------------------------
>
> #include <iostream>
> #include <fstream>
> #include <sstream>
> #include <cassert>
> #include <queue>
> #include <list>
> #include <cstdlib>
> #include <mpi.h>
>
> std::ofstream output_file;  // per-rank log stream; main() opens it as "output<rank>.txt"
>
> enum {REQUEST_TAG=4321, ANSWER_TAG, FINISHED_TAG};  // message tags; main() dispatches on status.MPI_TAG with these
>
> typedef int Answer_type;  // not referenced anywhere in the visible program
>
>
> int main(int argc, char *argv[])  // two-rank request/answer exchange test; NOTE(review): no MPI_Isend below is ever completed with MPI_Wait/MPI_Test
> {
>    MPI_Init (&argc, &argv);   // starts MPI
>    int number_of_PEs, my_PE_ID;
>    MPI_Comm_size(MPI_COMM_WORLD, &number_of_PEs);
>    assert(number_of_PEs == 2);  // the peer is always addressed as 1-my_PE_ID, so exactly two ranks are required
>    MPI_Comm_rank(MPI_COMM_WORLD, &my_PE_ID);
>
>    std::srand(123456);  // fixed seed, identical on both ranks
>
>    int number_of_requests_to_send = 100000;  // quota of REQUEST_TAG messages this rank still has to send
>    int number_of_requests_to_recv = number_of_requests_to_send;  // only decremented and printed; not used in any loop condition
>    int number_of_answers_to_recv  = number_of_requests_to_send;  // reaching 0 triggers the FINISHED_TAG send below
>
>    std::stringstream filename;
>    filename<<"output"<<my_PE_ID<<".txt";
>    output_file.open(filename.str().c_str());
>
>    int buffer[100];  // never initialized; shared by every send and every receive below
>    MPI_Request dummy_request;  // overwritten by each MPI_Isend; never passed to MPI_Wait/MPI_Test/MPI_Request_free
>
>    //Send the first request
>    MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID, REQUEST_TAG,
> MPI_COMM_WORLD, &dummy_request);
>    number_of_requests_to_send--;
>
>    int working_PEs = number_of_PEs;  // decremented once when this rank sends FINISHED_TAG and once when it receives one
>    bool lack_of_work_sent = false;  // guards the FINISHED_TAG send so it happens only once
>    bool there_was_change = true;  // true when the previous iteration handled a message or sent FINISHED_TAG
>    while(working_PEs > 0)
>    {
>       if(there_was_change)  // log the counters only after an iteration that changed something
>       {
>          there_was_change = false;
>          std::cout<<my_PE_ID<<": req_to_recv = "<<number_of_requests_to_recv
>                      <<", req_to_send = "<<number_of_requests_to_send
>                      <<", answers_to_recv = "<<number_of_answers_to_recv
>                      <<std::endl;
>          output_file<<my_PE_ID<<": req_to_recv = "<<number_of_requests_to_recv
>                      <<", req_to_send = "<<number_of_requests_to_send
>                      <<", answers_to_recv = "<<number_of_answers_to_recv
>                      <<std::endl;
>       }
>
>       MPI_Status status;
>       int flag = 1;  // initial value is overwritten by MPI_Iprobe
>       int number_of_answer;  // size of a pending answer in bytes (set via MPI_Get_count with MPI_BYTE)
> //      MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
>       MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag,&status);  // non-blocking poll: the loop spins when no message is pending
>       if(flag)
>       {
>          there_was_change = true;
>          switch(status.MPI_TAG){  // no default case: a probed message with any other tag would never be received
>             case(REQUEST_TAG):
>                MPI_Recv(buffer, 1, MPI_INT, status.MPI_SOURCE,
> REQUEST_TAG, MPI_COMM_WORLD, &status);
>                MPI_Isend(buffer, (1<<(std::rand()%5))*sizeof(int),
> MPI_BYTE, 1-my_PE_ID, ANSWER_TAG, MPI_COMM_WORLD, &dummy_request);  // answer is 1, 2, 4, 8 or 16 ints' worth of bytes, sent from the shared buffer
>                number_of_requests_to_recv--;
>             break;
>             case(ANSWER_TAG):
>                number_of_answers_to_recv--;
>                MPI_Get_count( &status, MPI_BYTE, &number_of_answer);  // byte length taken from the probe status, before the message is received
>
>                MPI_Recv(buffer, number_of_answer, MPI_BYTE,
> status.MPI_SOURCE, ANSWER_TAG, MPI_COMM_WORLD, &status);  // receives into the same buffer that earlier, uncompleted MPI_Isend calls were given
>
>                for(int i = (number_of_answer+3)/4; (i>0)&&
> (number_of_requests_to_send>0); i--)  // ceil(bytes/4) new requests, capped by the remaining quota; the literal 4 presumably stands for sizeof(int)
>                {
>                   MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID,
> REQUEST_TAG, MPI_COMM_WORLD, &dummy_request);
>                   number_of_requests_to_send--;
>                }
>             break;
>             case(FINISHED_TAG):
>                MPI_Recv(buffer, 1, MPI_INT, status.MPI_SOURCE,
> FINISHED_TAG, MPI_COMM_WORLD, &status);
>                working_PEs--;  // the peer has announced that it is done
>             break;
>          }
>       }
>       if((number_of_answers_to_recv == 0) && (!lack_of_work_sent))  // every answer has arrived: announce completion to the peer exactly once
>       {
>          there_was_change = true;
>          MPI_Isend(buffer, 1, MPI_INT, 1-my_PE_ID, FINISHED_TAG,
> MPI_COMM_WORLD, &dummy_request);
>          working_PEs--;  // this rank counts itself as finished
>          lack_of_work_sent = true;
>       }
>    }
>    MPI_Barrier(MPI_COMM_WORLD);  // both ranks meet here after leaving the loop
>    std::cout<<my_PE_ID<<": Finished normaly"<<std::endl;
>    MPI_Finalize();
>
>    return 0;
> }
>                  Package: Open MPI abuild@build26 Distribution
>                 Open MPI: 1.3.2
>    Open MPI SVN revision: r21054
>    Open MPI release date: Apr 21, 2009
>                 Open RTE: 1.3.2
>    Open RTE SVN revision: r21054
>    Open RTE release date: Apr 21, 2009
>                     OPAL: 1.3.2
>        OPAL SVN revision: r21054
>        OPAL release date: Apr 21, 2009
>             Ident string: 1.3.2
>                   Prefix: /usr/lib64/mpi/gcc/openmpi
>  Configured architecture: x86_64-suse-linux-gnu
>           Configure host: build26
>            Configured by: abuild
>            Configured on: Tue May  5 16:03:55 UTC 2009
>           Configure host: build26
>                 Built by: abuild
>                 Built on: Tue May  5 16:18:52 UTC 2009
>               Built host: build26
>               C bindings: yes
>             C++ bindings: yes
>       Fortran77 bindings: yes (all)
>       Fortran90 bindings: yes
>  Fortran90 bindings size: small
>               C compiler: gcc
>      C compiler absolute: /usr/bin/gcc
>             C++ compiler: g++
>    C++ compiler absolute: /usr/bin/g++
>       Fortran77 compiler: gfortran
>   Fortran77 compiler abs: /usr/bin/gfortran
>       Fortran90 compiler: gfortran
>   Fortran90 compiler abs: /usr/bin/gfortran
>              C profiling: yes
>            C++ profiling: yes
>      Fortran77 profiling: yes
>      Fortran90 profiling: yes
>           C++ exceptions: no
>           Thread support: posix (mpi: no, progress: no)
>            Sparse Groups: no
>   Internal debug support: no
>      MPI parameter check: runtime
> Memory profiling support: no
> Memory debugging support: no
>          libltdl support: yes
>    Heterogeneous support: no
>  mpirun default --prefix: no
>          MPI I/O support: yes
>        MPI_WTIME support: gettimeofday
> Symbol visibility support: yes
>    FT Checkpoint support: no  (checkpoint thread: no)
>            MCA backtrace: execinfo (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA memory: ptmalloc2 (MCA v2.0, API v2.0, Component v1.3.2)
>            MCA paffinity: linux (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA carto: auto_detect (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA carto: file (MCA v2.0, API v2.0, Component v1.3.2)
>            MCA maffinity: first_use (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA timer: linux (MCA v2.0, API v2.0, Component v1.3.2)
>          MCA installdirs: env (MCA v2.0, API v2.0, Component v1.3.2)
>          MCA installdirs: config (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA dpm: orte (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA pubsub: orte (MCA v2.0, API v2.0, Component v1.3.2)
>            MCA allocator: basic (MCA v2.0, API v2.0, Component v1.3.2)
>            MCA allocator: bucket (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: basic (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: hierarch (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: inter (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: self (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: sm (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: sync (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA coll: tuned (MCA v2.0, API v2.0, Component v1.3.2)
>                   MCA io: romio (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA mpool: fake (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA mpool: rdma (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA mpool: sm (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA pml: cm (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA pml: csum (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA pml: ob1 (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA pml: v (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA bml: r2 (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA rcache: vma (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA btl: self (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA btl: sm (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA btl: tcp (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA topo: unity (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA osc: pt2pt (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA osc: rdma (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA iof: hnp (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA iof: orted (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA iof: tool (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA oob: tcp (MCA v2.0, API v2.0, Component v1.3.2)
>                 MCA odls: default (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ras: slurm (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA rmaps: rank_file (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA rmaps: round_robin (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA rmaps: seq (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA rml: oob (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA routed: binomial (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA routed: direct (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA routed: linear (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA plm: rsh (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA plm: slurm (MCA v2.0, API v2.0, Component v1.3.2)
>                MCA filem: rsh (MCA v2.0, API v2.0, Component v1.3.2)
>               MCA errmgr: default (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ess: env (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ess: hnp (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ess: singleton (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ess: slurm (MCA v2.0, API v2.0, Component v1.3.2)
>                  MCA ess: tool (MCA v2.0, API v2.0, Component v1.3.2)
>              MCA grpcomm: bad (MCA v2.0, API v2.0, Component v1.3.2)
>              MCA grpcomm: basic (MCA v2.0, API v2.0, Component v1.3.2)
> _______________________________________________
> users mailing list
> users@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users