Thanks for the quick reply and suggestions.

I have tried both isolating the output to a single OST and striping across multiple OSTs. Both will produce the same result. I have tried compiling with multiple versions of both pathscale and intel compilers all with the same result.

The odd thing is that this seems to work using HP-MPI 2.03 compiled with PathScale 3.2 and Intel 10.1.018. The operating system is XC 3.2.1, which is essentially RHEL 4.5. The kernel is 2.6.9-67.9hp.7sp.XCsmp. The Lustre version is lustre-1.4.11-2.3_0.6_xc3.2.1_k2.6.9_67.9hp.7sp.XCsmp.

Thanks for the info, Nate


On Tue, Mar 3, 2009 at 11:10 AM, Brian Dobbins <bdobbins@gmail.com> wrote:

Hi Nathan,

  I just ran your code here and it worked fine - CentOS 5 on dual Xeons w/ IB network, and the kernel is 2.6.18-53.1.14.el5_lustre.1.6.5smp.  I used an OpenMPI 1.3.0 install compiled with Intel 11.0.081 and, independently, one with GCC 4.1.2.  I tried a few different times with varying numbers of processors. 

  (Both executables were compiled with -O2)

  I'm sure the main OpenMPI guys will have better ideas, but in the meantime what kernel, OS and compilers are you using?  And does it happen when you write to a single OST?  Make a directory and try setting the stripe count to 1 (e.g., 'lfs setstripe <directory name> 1048576 0 1' will give you, I think, a 1MB stripe size starting at OST 0 with a stripe count of 1.)  I'm just wondering whether it's something with your hardware, maybe a particular OST, since it seems to work for me.

  ... Sorry I can't be of more help, but I imagine the regular experts will chime in shortly.

  Cheers,
  - Brian


On Tue, Mar 3, 2009 at 12:51 PM, Nathan Baca <nathan.baca@gmail.com> wrote:
Hello,

I am seeing inconsistent mpi-io behavior when writing to a Lustre file system using open mpi 1.3 with romio. What follows is a simple reproducer and output. Essentially one or more of the running processes does not read or write the correct amount of data to its part of a file residing on a Lustre (parallel) file system.

Any help figuring out what is happening is greatly appreciated. Thanks, Nate

program gcrm_test_io
  ! Reproducer for inconsistent MPI-IO read/write counts on a parallel
  ! file system.
  !
  ! Phases:
  !   1. Rank 0 creates "output.dat" and fills all ncells reals with 9.0.
  !   2. Every rank overwrites its own x_size-element slice with 1.0.
  !   3. Rank 0 reads the whole file back, reports the first element that
  !      is not 1.0 (if any), and deletes the file.
  !
  ! Element counts handed to MPI_FILE_WRITE / MPI_FILE_READ are default
  ! integers, as the Fortran binding requires; only file offsets use
  ! integer(kind=MPI_OFFSET_KIND).
  implicit none
  include "mpif.h"

  integer, parameter :: real_bytes = 4          ! bytes per real(kind=4) element
  character(len=*), parameter :: file_name = "output.dat"

  integer :: x_size                             ! elements written by each rank
  integer :: w_me, w_nprocs                     ! rank and size in MPI_COMM_WORLD
  integer :: my_info                            ! info handle passed to MPI_FILE_OPEN
  integer :: i
  integer :: ierr
  integer :: fileID
  integer (kind=MPI_OFFSET_KIND) :: offset      ! byte displacement of this rank's slice
  integer :: status(MPI_STATUS_SIZE)
  integer :: count                              ! elements actually transferred
  integer :: ncells                             ! total elements in the file
  real (kind=4), allocatable, dimension (:) :: array2
  logical :: sync
  logical :: data_ok

  call mpi_init(ierr)
  call MPI_COMM_SIZE(MPI_COMM_WORLD, w_nprocs, ierr)
  call MPI_COMM_RANK(MPI_COMM_WORLD, w_me, ierr)

  call mpi_info_create(my_info, ierr)
  call check_mpi(ierr, "MPI_INFO_CREATE")
! optional ways to set things in mpi-io
! call mpi_info_set   (my_info, "romio_ds_read" , "enable"   , ierr)
! call mpi_info_set   (my_info, "romio_ds_write", "enable"   , ierr)
! call mpi_info_set   (my_info, "romio_cb_write", "enable"    , ierr)

  x_size = 410011  ! A 'big' number, with bigger numbers it is more likely to fail
  sync = .true.    ! Extra file synchronization

  ncells = x_size * w_nprocs

! Use node zero to fill it with nines
  if (w_me .eq. 0) then
    call MPI_FILE_OPEN(MPI_COMM_SELF, file_name, &
                       ior(MPI_MODE_CREATE, MPI_MODE_WRONLY), &
                       my_info, fileID, ierr)
    call check_mpi(ierr, "MPI_FILE_OPEN (initial fill)")
    allocate (array2(ncells))
    array2(:) = 9.0
    offset = 0
    call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, "native", &
                           MPI_INFO_NULL, ierr)
    call check_mpi(ierr, "MPI_FILE_SET_VIEW (initial fill)")
    call MPI_FILE_WRITE(fileID, array2, ncells, MPI_REAL, status, ierr)
    call check_mpi(ierr, "MPI_FILE_WRITE (initial fill)")
    ! Count in units of the datatype that was actually written.
    call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
    if (count .ne. ncells) print*, "Wrong initial write count:", count, ncells
    deallocate (array2)
    if (sync) then
      call MPI_FILE_SYNC(fileID, ierr)
      call check_mpi(ierr, "MPI_FILE_SYNC (initial fill)")
    end if
    call MPI_FILE_CLOSE(fileID, ierr)
    call check_mpi(ierr, "MPI_FILE_CLOSE (initial fill)")
  end if

! All nodes now fill their area with ones
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)
  allocate (array2(x_size))
  array2(:) = 1.0
  ! Widen before multiplying so the byte offset cannot overflow a
  ! default integer when there are many ranks or a large x_size.
  offset = int(w_me, MPI_OFFSET_KIND) * int(x_size, MPI_OFFSET_KIND) * real_bytes
  call MPI_FILE_OPEN(MPI_COMM_WORLD, file_name, MPI_MODE_WRONLY, &
                     my_info, fileID, ierr)
  call check_mpi(ierr, "MPI_FILE_OPEN (parallel write)")
  print*, "node", w_me, "starting", (offset/real_bytes) + 1, &
          "ending", (offset/real_bytes) + x_size
  call MPI_FILE_SET_VIEW(fileID, offset, MPI_REAL, MPI_REAL, "native", &
                         MPI_INFO_NULL, ierr)
  call check_mpi(ierr, "MPI_FILE_SET_VIEW (parallel write)")
  call MPI_FILE_WRITE(fileID, array2, x_size, MPI_REAL, status, ierr)
  call check_mpi(ierr, "MPI_FILE_WRITE (parallel write)")
  call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
  if (count .ne. x_size) print*, "Wrong write count:", count, x_size, w_me
  deallocate (array2)
  if (sync) then
    call MPI_FILE_SYNC(fileID, ierr)
    call check_mpi(ierr, "MPI_FILE_SYNC (parallel write)")
  end if
  call MPI_FILE_CLOSE(fileID, ierr)
  call check_mpi(ierr, "MPI_FILE_CLOSE (parallel write)")

  ! Rank 0 reopens the file through a different handle below; make sure
  ! every rank has finished its write and close before that happens.
  call MPI_BARRIER(MPI_COMM_WORLD, ierr)

! Read it back on node zero to see if it is ok data
  if (w_me .eq. 0) then
    call MPI_FILE_OPEN(MPI_COMM_SELF, file_name, MPI_MODE_RDONLY, &
                       my_info, fileID, ierr)
    call check_mpi(ierr, "MPI_FILE_OPEN (read back)")
    allocate (array2(ncells))
    ! Pre-fill so that elements a short read leaves untouched are
    ! reliably detected below instead of holding undefined values.
    array2(:) = 0.0
    call MPI_FILE_READ(fileID, array2, ncells, MPI_REAL, status, ierr)
    call check_mpi(ierr, "MPI_FILE_READ (read back)")
    call MPI_GET_COUNT(status, MPI_REAL, count, ierr)
    if (count .ne. ncells) print*, "Wrong read count:", count, ncells

    ! Report only the first bad element, then stop scanning.
    data_ok = .true.
    do i = 1, ncells
      if (array2(i) .ne. 1) then
        print*, "ERROR", i, array2(i), ((i-1)*4), ((i-1)*4)/(1024d0*1024d0) ! Index, value, # of good bytes,MB
        data_ok = .false.
        exit
      end if
    end do
    if (data_ok) print*, "All done with nothing wrong"

    deallocate (array2)
    call MPI_FILE_CLOSE(fileID, ierr)
    call check_mpi(ierr, "MPI_FILE_CLOSE (read back)")
    call MPI_FILE_DELETE(file_name, MPI_INFO_NULL, ierr)
    call check_mpi(ierr, "MPI_FILE_DELETE")
  end if

  call MPI_INFO_FREE(my_info, ierr)
  call mpi_finalize(ierr)

contains

  ! Abort the whole job with a readable message when an MPI call returned
  ! an error code.
  !   code : return code from the MPI call just made
  !   what : short label identifying that call in the message
  subroutine check_mpi(code, what)
    integer, intent(in) :: code
    character(len=*), intent(in) :: what
    character(len=MPI_MAX_ERROR_STRING) :: msg
    integer :: msg_len, rc

    if (code .eq. MPI_SUCCESS) return

    msg = ""
    msg_len = 0
    call MPI_ERROR_STRING(code, msg, msg_len, rc)
    if (rc .ne. MPI_SUCCESS) msg_len = 0
    print*, "MPI error in ", what, " on node", w_me, ": ", msg(1:msg_len)
    call MPI_ABORT(MPI_COMM_WORLD, 1, rc)
  end subroutine check_mpi

end program gcrm_test_io  
 
1.3 Open MPI
 node           0 starting                     1 ending                410011
 node           1 starting                410012 ending                820022
 node           2 starting                820023 ending               1230033
 node           3 starting               1230034 ending               1640044
 node           4 starting               1640045 ending               2050055
 node           5 starting               2050056 ending               2460066
 All done with nothing wrong


 node           0 starting                     1 ending                410011
 node           1 starting                410012 ending                820022
 node           2 starting                820023 ending               1230033
 node           5 starting               2050056 ending               2460066
 node           4 starting               1640045 ending               2050055
 node           3 starting               1230034 ending               1640044
 Wrong write count:      228554                410011           2
 Wrong read count:     1048576               2460066
 ERROR     1048577  0.0000000E+00     4194304   4.00000000000000   


 node           1 starting                410012 ending                820022
 node           0 starting                     1 ending                410011
 node           2 starting                820023 ending               1230033
 node           3 starting               1230034 ending               1640044
 node           4 starting               1640045 ending               2050055
 node           5 starting               2050056 ending               2460066
 Wrong read count:     1229824               2460066
 ERROR     1229825  0.0000000E+00     4919296   4.69140625000000

--
Nathan Baca
nathan.baca@gmail.com

_______________________________________________
users mailing list
users@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users


_______________________________________________
users mailing list
users@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users



--
Nathan Baca
nathan.baca@gmail.com