Open MPI logo

Open MPI User's Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Open MPI User's mailing list

Subject: [OMPI users] mpi program fails (big data)
From: Dr.Peer-Joachim Koch (pkoch_at_[hidden])
Date: 2014-06-24 06:31:15


Hi,

one of our cluster users reported a problem with openmpi.
He created a short sample (just a few lines) which will start and
crash after a short time.
We only see "Fatal error in PMPI_Gather: Other MPI error" - no further
details.
He is using an intel fortran compiler with a self compiled openmpi (just
tested 1.8.1).

I know nearly nothing about MPI (Open MPI), so I'm asking on this forum.
Has anybody some idea ?

Thanks, Peer

-----------------------makefile----------
# Intel Fortran flags: record lengths in bytes, run the preprocessor,
# allow non-preprocessor '!' comments, free-form source.
OPTIONS=-assume byterecl -fpp -allow nofpp_comments -free
# Debug build: symbols, D-lines, all runtime checks, trap FP exceptions,
# and print a traceback on abort.
DEBUG=-g -d-lines -check -debug -debug-parameters -fpe0 -traceback

# NOTE(review): $(SOURCE) is not defined in this fragment — presumably set
# elsewhere or an artifact of the quoted makefile; verify before use.
all:
         rm -f JeDi globe_mod.mod JeDi.out jedi_restart
         $(SOURCE) ; mpif90 $(OPTIONS) $(DEBUG) -o JeDi globe.f90

--------------------------

----------------globe.f90---------------------
       program globe
       use mpi
       implicit none

       integer :: mpinfo = 0
       integer :: myworld = 0
       integer :: mypid = 0
       integer :: npro = 1

! * The comments give some conditions required to reproduce the problem.

! * If the program runs at two hosts, the error message is shown two
times

       integer, parameter :: vv_g_d1 = 2432
       integer, parameter :: vv_p_d1 = vv_g_d1 / 16 ! requires 16 CPUs

       integer, parameter :: out_d1 = 2418 ! requires >=2416 (vv_g_d1
- 16)

       integer, parameter :: d2 = 5001 ! requires >=4282 @ ii=30 /
>=6682 @ ii=20 (depends on number of loops, but this limit can change
for unknown reason)

       integer :: ii, jj

       real :: vv_p(vv_p_d1,d2)
       real,allocatable :: vv_g(:,:)
! * requires the definition of the variable for write to be defined
below vv_g(:,:)
       real :: out(out_d1,d2)

       vv_p(:,:) = 0.0
       out(:,:) = 0.0

       call mpi_init(mpinfo)
       myworld = MPI_COMM_WORLD
       call mpi_comm_size(myworld, npro, mpinfo)
! * The problem requires 16 CPUs
       if (npro .ne. 16) then; write(*,*) "Works only with 16 CPUs";
stop; endif
       call mpi_comm_rank(myworld, mypid, mpinfo)

       if (mypid == 0) then
         open(11, FILE='jedi_restart', STATUS='replace', FORM='unformatted')
       endif

       write(6,*) "test1",mypid ; flush(6)

       do ii = 1, 25 ! number of loops depends on field size
         allocate(vv_g(vv_g_d1,d2))

         do jj = 1, d2
           call mpi_gather(vv_p(1,jj), vv_p_d1, MPI_REAL, vv_g(1,jj),
vv_p_d1, MPI_REAL, 0, myworld, mpinfo)
         enddo

         if (mypid == 0) then; write(11) out; flush(11); endif

         deallocate(vv_g)
       enddo

       write(6,*) "test2",mypid ; flush(6)

       if (mypid == 0) close(11)

       call mpi_barrier(myworld, mpinfo)
       call mpi_finalize(mpinfo)

       end
---------------------------------------------end
globe.f90----------------------

-- 
Mit freundlichem Gruß
     Peer-Joachim Koch
_________________________________________________________
Max-Planck-Institut für Biogeochemie
Dr. Peer-Joachim Koch
Hans-Knöll Str.10            Telefon: ++49 3641 57-6705
D-07745 Jena                 Telefax: ++49 3641 57-7705