
Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()
From: Alex A. Granovsky (gran_at_[hidden])
Date: 2013-11-12 12:15:25


Hello,

> It seems that argv[argc] should always be NULL according to the
> standard. So the OMPI failure is not actually a bug!

Could you please point to the exact document where this is explicitly
stated? Otherwise, I'd assume this is a bug.

Kind regards,
Alex Granovsky
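
For what it's worth, the clause Matthieu seems to have in mind is in the C
standard itself: C99 and C11 both state, in section 5.1.2.2.1, that
"argv[argc] shall be a null pointer" for the vector a hosted environment
passes to main. Whether MPI_Init may assume the same of a vector the caller
built by hand is the question this thread turns on. A minimal, standalone
check of the invariant (illustrative only, not code from this thread):

#include <cstdio>

int main( int argc, char **argv )
{
    // C99/C11 5.1.2.2.1: the hosted environment guarantees
    // argv[argc] == NULL. A hand-built vector only has this
    // property if its builder adds the sentinel explicitly.
    std::printf( "argv[%d] %s NULL\n", argc,
                 argv[argc] == NULL ? "is" : "is NOT" );
    return 0;
}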

-----Original Message-----
From: Matthieu Brucher
Sent: Tuesday, November 12, 2013 8:56 PM
To: Open MPI Users
Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing pointers allocated in main()

It seems that argv[argc] should always be NULL according to the
standard. So the OMPI failure is not actually a bug!

Cheers,

2013/11/12 Matthieu Brucher <matthieu.brucher_at_[hidden]>:
> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
> without the array length, so I suppose that in the usual argc/argv
> pair there is an additional trailing entry in argv which may be NULL.
> So try allocating 3 additional values, the last being NULL, and it may
> work (a sketch follows this message).
>
> Cheers,
>
> Matthieu
>
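For reference, a minimal sketch of the workaround Matthieu describes,
written in the style of the code quoted below: allocate one pointer more
than the arguments actually passed and leave a NULL sentinel in the last
slot. This is an illustration under the assumption that Open MPI walks
argv until it finds NULL; it is not a tested patch.

#include "mpi.h"
#include <cstdio>
#include <cstring>

int main( int argc, char **argv )
{
    // argc + 3 slots: the original argc entries, the two new
    // arguments, and one extra slot for the NULL terminator.
    char **argv_new = new char*[ argc + 3 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc ]     = new char[ 32 ];
    argv_new[ argc + 1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc + 1 ], "%d", 0 );
    argv_new[ argc + 2 ] = NULL;   // the sentinel is the point of the exercise
    argc += 2;                     // argc still counts only real arguments
    argv = argv_new;

    MPI_Init( &argc, &argv );
    MPI_Finalize();

    // Freeing like this assumes MPI_Init left argc/argv untouched,
    // which an MPI implementation is not strictly required to do.
    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
    return 0;
}
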
> 2013/11/12 Tang, Yu-Hang <yuhang_tang_at_[hidden]>:
>> I tried the following code without CUDA; the error is still there:
>>
>> #include "mpi.h"
>>
>> #include <cstdlib>
>> #include <cstring>
>> #include <cmath>
>>
>> int main(int argc, char **argv)
>> {
>>     // override command line arguments to make sure cudaengine gets the correct one
>>     char **argv_new = new char*[ argc + 2 ];
>>     for( int i = 0 ; i < argc ; i++ )
>>     {
>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>         strcpy( argv_new[i], argv[i] );
>>     }
>>     argv_new[ argc ] = new char[ 32 ];
>>     argv_new[ argc+1 ] = new char[ 32 ];
>>     strcpy( argv_new[argc], "-device" );
>>     sprintf( argv_new[argc+1], "%d", 0 );
>>
>>     argc += 2;
>>     argv = argv_new;
>>
>>     MPI_Init(&argc,&argv);
>>
>>     // do something...
>>
>>     MPI_Finalize();
>>
>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>     delete [] argv;
>> }
>>
>> At the end of the program the pointer stored in argv is exactly that of
>> argv_new, so this should not be a problem. Manually inserting printf tells
>> me that the fault occurred at MPI_Init. The code works fine if I use
>> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without
>> a problem on my laptop with mpich2-1.4.
>>
>> Best,
>> Yu-Hang
>>
>>
>>
>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
>> <matthieu.brucher_at_[hidden]> wrote:
>>>
>>> Hi,
>>>
>>> Are you sure this is the correct code? This seems strange and not a good
>>> idea:
>>>
>>> MPI_Init(&argc,&argv);
>>>
>>> // do something...
>>>
>>> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>> delete [] argv;
>>>
>>> Did you mean argc_new and argv_new instead?
>>> Do you have the same error without CUDA?
>>>
>>> Cheers,
>>>
>>> Matthieu
>>>
>>>
>>> 2013/11/12 Tang, Yu-Hang <yuhang_tang_at_[hidden]>:
>>> > Hi,
>>> >
>>> > I tried to augment the command-line argument list by allocating my own
>>> > list of strings and passing them to MPI_Init, yet I got a segmentation
>>> > fault for both Open MPI 1.6.3 and 1.7.2, while the code works fine with
>>> > MPICH2. The code is:
>>> >
>>> > #include "mpi.h"
>>> > #include "cuda_runtime.h"
>>> > #include <cstdlib>
>>> > #include <cstring>
>>> > #include <cmath>
>>> >
>>> > int main(int argc, char **argv)
>>> > {
>>> >     int device = 0;
>>> >     int skip = 0;
>>> >     bool skipmode = false;
>>> >     bool specified = false;
>>> >     for( int i = 0 ; i < argc ; i++ )
>>> >     {
>>> >         if ( strcmp( argv[i], "-device" ) == 0 )
>>> >         {
>>> >             i++;
>>> >             if ( argv[i][0] == '-' )
>>> >             {
>>> >                 skipmode = true;
>>> >                 skip = fabs( atoi( argv[i] ) );
>>> >             }
>>> >             else
>>> >             {
>>> >                 skipmode = false;
>>> >                 device = atoi( argv[i] );
>>> >             }
>>> >             specified = true;
>>> >         }
>>> >     }
>>> >
>>> >     if ( !specified || skipmode )
>>> >     {
>>> >         char* var;
>>> >         int dev_count, local_rank = 0;
>>> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
>>> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>>> >         cudaGetDeviceCount( &dev_count );
>>> >         if ( skipmode )
>>> >         {
>>> >             device = 0;
>>> >             if ( device == skip ) local_rank++;
>>> >             while( local_rank-- > 0 )
>>> >             {
>>> >                 device = ( device + 1 ) % dev_count;
>>> >                 if ( device == skip ) local_rank++;
>>> >             }
>>> >         }
>>> >         else device = local_rank % dev_count;
>>> >     }
>>> >
>>> >     // override command line arguments to make sure cudaengine gets the correct one
>>> >     char **argv_new = new char*[ argc + 2 ];
>>> >     for( int i = 0 ; i < argc ; i++ )
>>> >     {
>>> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>> >         strcpy( argv_new[i], argv[i] );
>>> >     }
>>> >     argv_new[ argc ] = new char[ 32 ];
>>> >     argv_new[ argc+1 ] = new char[ 32 ];
>>> >     strcpy( argv_new[argc], "-device" );
>>> >     sprintf( argv_new[argc+1], "%d", device );
>>> >     argc += 2;
>>> >     argv = argv_new;
>>> >
>>> >     cudaSetDevice( device );
>>> >
>>> >     MPI_Init(&argc,&argv);
>>> >
>>> >     // do something...
>>> >
>>> >     MPI_Finalize();
>>> >
>>> >     cudaDeviceReset();
>>> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>> >     delete [] argv;
>>> > }
>>> >
>>> > When compiled using nvcc -ccbin mpic++, the error I got was:
>>> >
>>> > [jueying:16317] *** Process received signal ***
>>> > [jueying:16317] Signal: Segmentation fault (11)
>>> > [jueying:16317] Signal code: Address not mapped (1)
>>> > [jueying:16317] Failing at address: 0x21
>>> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>>> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>>> > [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
>>> > [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
>>> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
>>> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>>> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
>>> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>>> > [jueying:16317] *** End of error message ***
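
The failing frame is opal_argv_join. A plausible reading of the fault,
sketched here as hypothetical code rather than Open MPI's actual source,
is a join that advances through the vector until it reaches a NULL
sentinel; with no terminator stored after the two appended strings, the
walk reads whatever garbage follows the array, which would fit the
"Address not mapped" fault at a small address like 0x21.

#include <cstdlib>
#include <cstring>

// Hypothetical NULL-terminated join: the loop condition dereferences
// each successive slot, so it relies entirely on argv[last + 1] == NULL.
char *argv_join_sketch( char **argv, int delimiter )
{
    size_t len = 0;
    for( char **p = argv ; *p != NULL ; ++p )
        len += strlen( *p ) + 1;
    char *out = (char *) malloc( len + 1 );
    out[0] = '\0';
    for( char **p = argv ; *p != NULL ; ++p )
    {
        strcat( out, *p );
        if ( *(p + 1) != NULL )          // reads one slot ahead, too
        {
            size_t n = strlen( out );
            out[n]     = (char) delimiter;
            out[n + 1] = '\0';
        }
    }
    return out;
}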
>>> >
>>> > Thanks for the help.
>>> >
>>> > Best regards,
>>> > Yu-Hang Tang
>>
>> --
>> Yu-Hang Tang
>> Room 105, 37 Manning St
>> Division of Applied Mathematics, Brown University
>> Providence, RI 02912

-- 
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/
_______________________________________________
users mailing list
users_at_[hidden]
http://www.open-mpi.org/mailman/listinfo.cgi/users