Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

Subject: Re: [OMPI devel] amd64 atomic.h warnings
From: George Bosilca (bosilca_at_[hidden])
Date: 2010-06-08 11:40:55


On Jun 8, 2010, at 14:49 , Jeff Squyres wrote:

> ## -cmdline +pgcc george2.c -O3 -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
> ## -asm george2.s
> ## lineno: 7
> .text
> .align 16
> opal_atomic_cmpset_32:
> ..Dcfb0:
> pushq %rbp
> ..Dcfi0:
> movq %rsp, %rbp
> ..Dcfi1:
> movl %esi, -4(%rbp)
> movl %edx, -8(%rbp)
> ..EN1:
> ## lineno: 16
> movl -8(%rbp), %edx
> movl -4(%rbp), %eax

oldval is moved into %eax ... once

> lock;cmpxchgl %edx,(%rdi)
> sete %cl

The CCR is retrieved

> movb %cl, -9(%rbp)

And stored.

But the loop disappeared. I really have doubts about the correctness of this assembly code.

  george.

> ## lineno: 17
> movzbl -9(%rbp), %eax
> ## lineno: 0
> popq %rbp
> ret
> .type opal_atomic_cmpset_32,@function
> .size opal_atomic_cmpset_32,.-opal_atomic_cmpset_32
> ..Dcfe0:
> __opal_atomic_cmpset_32END:
> .section .pgi_trace
> .align 8
> .quad opal_atomic_cmpset_32 ## address of routine
> .quad __opal_atomic_cmpset_32END - opal_atomic_cmpset_32 ## size of routine
> .2byte 0 ## flags for future use
> .2byte 21 ## length of following string
> ## name:opal_atomic_cmpset_32:
> .byte 0x6f,0x70,0x61,0x6c,0x5f,0x61,0x74,0x6f,0x6d,0x69,0x63
> .byte 0x5f,0x63,0x6d,0x70,0x73,0x65,0x74,0x5f,0x33,0x32,0x00
> ## lineno: 20
> .text
> .align 16
> .globl main
> main:
> ..Dcfb1:
> pushq %rbp
> ..Dcfi2:
> movq %rsp, %rbp
> ..Dcfi3:
> subq $16, %rsp
> movq %rbx, -16(%rbp)
> pushq %rax
> pushq %rax
> stmxcsr (%rsp)
> popq %rax
> orq $64, %rax
> pushq %rax
> ldmxcsr (%rsp)
> popq %rax
> popq %rax
> ## lineno: 0
> ..EN2:
> ## lineno: 21
> movl $0, -4(%rbp)
> .align 8
> .LB191:
> ## lineno: 24
> movl -4(%rbp), %ebx
> leaq -4(%rbp), %rdi
> leal 1(%rbx), %edx
> movl %ebx, %esi
> call opal_atomic_cmpset_32
> testl %eax, %eax
> je .LB191
> ## lineno: 27
> addl $1, %ebx
> movl %ebx, %eax
> ## lineno: 28
> movq -16(%rbp), %rbx
> leave
> ret
> .type main,@function
> .size main,.-main
> ..Dcfe1:
> __mainEND:
> .section .pgi_trace
> .align 8
> .quad main ## address of routine
> .quad __mainEND - main ## size of routine
> .2byte 0 ## flags for future use
> .2byte 4 ## length of following string
> ## name:main:
> .byte 0x6d,0x61,0x69,0x6e,0x00
> .data
> .section .debug_frame
> ..Dcieb0:
> .4byte ..Dciee0-..Dcieb0-4 ## CIE length
> .4byte 0xffffffff ## CIE ID
> .byte 0x1 ## CIE version
> .byte 0x0 ## no augmentation
> .byte 0x1 ## ULEB128 1, code alignment factor
> .byte 0x78 ## SLEB128 -8, data alignment factor
> .byte 0x10 ## return address column
> .byte 0xc ## DW_CFA_def_cfa (col 7)
> .byte 0x7 ## ULEB128 7
> .byte 0x8 ## ULEB128 8
> .byte 0x90 ## DW_CFA_offset (col 16)
> .byte 0x1 ## ULEB128 1
> .align 8
> ..Dciee0:
> .4byte ..Dfdee0-..Dfdeb0 ## FDE length
> ..Dfdeb0:
> .4byte ..Dcieb0 ## CIE pointer
> .quad ..Dcfb0 ## initial location
> .quad ..Dcfe0-..Dcfb0 ## address range
> .byte 0x4 ## DW_CFA_advance_loc4
> .4byte ..Dcfi0-..Dcfb0
> .byte 0xe ## DW_CFA_def_cfa_offset
> .byte 0x10 ## ULEB128 16
> .byte 0x86 ## DW_CFA_offset (col 6)
> .byte 0x2 ## ULEB128 2
> .byte 0x4 ## DW_CFA_advance_loc4
> .4byte ..Dcfi1-..Dcfi0
> .byte 0xd ## DW_CFA_def_cfa_register (col 6)
> .byte 0x6 ## ULEB128 6
> .align 8
> ..Dfdee0:
> .4byte ..Dfdee1-..Dfdeb1 ## FDE length
> ..Dfdeb1:
> .4byte ..Dcieb0 ## CIE pointer
> .quad ..Dcfb1 ## initial location
> .quad ..Dcfe1-..Dcfb1 ## address range
> .byte 0x4 ## DW_CFA_advance_loc4
> .4byte ..Dcfi2-..Dcfb1
> .byte 0xe ## DW_CFA_def_cfa_offset
> .byte 0x10 ## ULEB128 16
> .byte 0x86 ## DW_CFA_offset (col 6)
> .byte 0x2 ## ULEB128 2
> .byte 0x4 ## DW_CFA_advance_loc4
> .4byte ..Dcfi3-..Dcfi2
> .byte 0xd ## DW_CFA_def_cfa_register (col 6)
> .byte 0x6 ## ULEB128 6
> .align 8
> ..Dfdee1:
> .ident "PGC 7.0-7"
> [7:49] svbu-mpi:~/tmp %
>
> -----
>
>
>
> On Jun 8, 2010, at 10:46 AM, George Bosilca wrote:
>
>> It didn't work. Let's try with this small complete application:
>>
>> #include <stdint.h>
>>
>> #define SMPLOCK "lock;"
>>
>> static inline int opal_atomic_cmpset_32( volatile int32_t *addr,
>> int32_t oldval, int32_t newval)
>> {
>> unsigned char ret;
>> __asm__ __volatile__ (
>> SMPLOCK "cmpxchgl %1,%2 \n\t"
>> "sete %0 \n\t"
>> : "=qm" (ret)
>> : "q"(newval), "m"(*addr), "a"(oldval)
>> : "memory");
>>
>> return (int)ret;
>> }
>>
>> int main(int argc, char* argv[] )
>> {
>> int32_t value = 0, oldval = 0, delta = 1;
>> int32_t* addr = &value;
>>
>> do {
>> oldval = *addr;
>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>> return (oldval + delta);
>> }
>>
>>
>>
>> Thanks,
>> george.
>>
>>
>> On Jun 8, 2010, at 14:42 , Jeff Squyres wrote:
>>
>>> Look at my output -- I did...
>>>
>>> On Jun 8, 2010, at 10:40 AM, George Bosilca wrote:
>>>
>>>> Still no good, the opal_atomic_cmpset_32 is not inlined. Try to add -O3 to your command line, this helped for gcc.
>>>>
>>>> Thanks,
>>>> george.
>>>>
>>>> On Jun 8, 2010, at 14:14 , Jeff Squyres wrote:
>>>>
>>>>> On Jun 8, 2010, at 9:53 AM, George Bosilca wrote:
>>>>>
>>>>>> As you can see there is no explicit call, the opal_atomic_cmpset_32 is really inlined. I think the problem is that you didn't specify the -O3 flag on your command line.
>>>>>
>>>>> Ah, you wanted me to compile the OMPI code itself and send you the assembly. That's not what you asked for. :-)
>>>>>
>>>>> (I just took the code you sent in the mail, stuffed it into george.c, and compiled that with -s -- outside of the context of the Open MPI code tree)
>>>>>
>>>>> Here's the new output. It still didn't inline, but you can see the code for the _cmpset function:
>>>>>
>>>>> -----
>>>>> [7:13] svbu-mpi:~/tmp % cat george.c
>>>>> #include <stdint.h>
>>>>>
>>>>> #include "opal/sys/atomic.h"
>>>>>
>>>>> int foo(void) {
>>>>> int32_t oldval, delta;
>>>>> int32_t *addr = 0;
>>>>> do {
>>>>> oldval = *addr;
>>>>> } while (0 == opal_atomic_cmpset_32(addr, oldval, oldval + delta));
>>>>> return (oldval + delta);
>>>>> }
>>>>>
>>>>> [7:13] svbu-mpi:~/tmp % pgcc -O3 -I /home/jsquyres/svn/ompi4 -I/home/jsquyres/svn/ompi4/opal/include -c -s george.c
>>>>> [7:13] svbu-mpi:~/tmp % cat george.s
>>>>> .file "george.c"
>>>>> .version "01.01"
>>>>> ## PGC 7.0 -opt 1
>>>>> ## PGC 06/08/2010 05:10:04
>>>>> ## pgcc george.c -c -S
>>>>> ## /opt/pgi/7.0.7/linux86-64/7.0-7/bin/pgc
>>>>> ## george.c -opt 1 -terse 1 -inform warn -x 119 0xa10000 -x 122 0x40 -x 123 0x1000
>>>>> ## -x 127 4 -x 127 16 -x 19 0x400000 -x 28 0x40000 -x 70 0x8000 -x 122 1 -quad
>>>>> ## -x 59 4 -x 59 4 -tp p7-64 -astype 0 -stdinc /opt/pgi/7.0.7/linux86-64/7.0-7/include:/usr/local/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include:/usr/include
>>>>> ## -def unix -def __unix -def __unix__ -def linux -def __linux -def __linux__
>>>>> ## -def __NO_MATH_INLINES -def __x86_64__ -def __LONG_MAX__=9223372036854775807L
>>>>> ## -def __SIZE_TYPE__=unsigned long int -def __PTRDIFF_TYPE__=long int -def __THROW=
>>>>> ## -def __extension__= -def __amd64__ -def __SSE__ -def __MMX__ -def __SSE2__
>>>>> ## -def __SSE3__ -predicate #machine(x86_64) #lint(off) #system(posix) #cpu(x86_64)
>>>>> ## -cmdline +pgcc george.c -c -S -x 123 4 -x 123 0x80000000 -alwaysinline /opt/pgi/7.0.7/linux86-64/7.0-7/lib/libintrinsics.il 4
>>>>> ## -asm george.s
>>>>> ## lineno: 3
>>>>> .text
>>>>> .align 16
>>>>> .globl foo
>>>>> foo:
>>>>> ..Dcfb0:
>>>>> pushq %rbp
>>>>> ..Dcfi0:
>>>>> movq %rsp, %rbp
>>>>> ..Dcfi1:
>>>>> subq $16, %rsp
>>>>> ..EN1:
>>>>> ## lineno: 5
>>>>> movq $0, -8(%rbp)
>>>>> .p2align 4,,3
>>>>> .LB157:
>>>>> ## lineno: 6
>>>>> movq -8(%rbp), %rdi
>>>>> movl (%rdi), %esi
>>>>> movl %esi, -12(%rbp)
>>>>> movl -16(%rbp), %edx
>>>>> addl %esi, %edx
>>>>> xorl %eax, %eax
>>>>> call opal_atomic_cmpset_32
>>>>> testl %eax, %eax
>>>>> je .LB157
>>>>> movl -16(%rbp), %eax
>>>>> addl -12(%rbp), %eax
>>>>> ## lineno: 10
>>>>> leave
>>>>> ret
>>>>> .type foo,@function
>>>>> .size foo,.-foo
>>>>> ..Dcfe0:
>>>>> __fooEND:
>>>>> .section .pgi_trace
>>>>> .align 8
>>>>> .quad foo ## address of routine
>>>>> .quad __fooEND - foo ## size of routine
>>>>> .2byte 0 ## flags for future use
>>>>> .2byte 3 ## length of following string
>>>>> ## name:foo:
>>>>> .byte 0x66,0x6f,0x6f,0x00
>>>>> .data
>>>>> .globl opal_atomic_cmpset_32
>>>>> .section .debug_frame
>>>>> ..Dcieb0:
>>>>> .4byte ..Dciee0-..Dcieb0-4 ## CIE length
>>>>> .4byte 0xffffffff ## CIE ID
>>>>> .byte 0x1 ## CIE version
>>>>> .byte 0x0 ## no augmentation
>>>>> .byte 0x1 ## ULEB128 1, code alignment factor
>>>>> .byte 0x78 ## SLEB128 -8, data alignment factor
>>>>> .byte 0x10 ## return address column
>>>>> .byte 0xc ## DW_CFA_def_cfa (col 7)
>>>>> .byte 0x7 ## ULEB128 7
>>>>> .byte 0x8 ## ULEB128 8
>>>>> .byte 0x90 ## DW_CFA_offset (col 16)
>>>>> .byte 0x1 ## ULEB128 1
>>>>> .align 8
>>>>> ..Dciee0:
>>>>> .4byte ..Dfdee0-..Dfdeb0 ## FDE length
>>>>> ..Dfdeb0:
>>>>> .4byte ..Dcieb0 ## CIE pointer
>>>>> .quad ..Dcfb0 ## initial location
>>>>> .quad ..Dcfe0-..Dcfb0 ## address range
>>>>> .byte 0x4 ## DW_CFA_advance_loc4
>>>>> .4byte ..Dcfi0-..Dcfb0
>>>>> .byte 0xe ## DW_CFA_def_cfa_offset
>>>>> .byte 0x10 ## ULEB128 16
>>>>> .byte 0x86 ## DW_CFA_offset (col 6)
>>>>> .byte 0x2 ## ULEB128 2
>>>>> .byte 0x4 ## DW_CFA_advance_loc4
>>>>> .4byte ..Dcfi1-..Dcfi0
>>>>> .byte 0xd ## DW_CFA_def_cfa_register (col 6)
>>>>> .byte 0x6 ## ULEB128 6
>>>>> .align 8
>>>>> ..Dfdee0:
>>>>> .ident "PGC 7.0-7"
>>>>> [7:13] svbu-mpi:~/tmp %
>>>>> -----
>>>>>
>>>>> --
>>>>> Jeff Squyres
>>>>> jsquyres_at_[hidden]
>>>>> For corporate legal information go to:
>>>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> devel mailing list
>>>>> devel_at_[hidden]
>>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>>
>>>>
>>>> _______________________________________________
>>>> devel mailing list
>>>> devel_at_[hidden]
>>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>>>
>>>
>>>
>>> --
>>> Jeff Squyres
>>> jsquyres_at_[hidden]
>>> For corporate legal information go to:
>>> http://www.cisco.com/web/about/doing_business/legal/cri/
>>>
>>>
>>> _______________________________________________
>>> devel mailing list
>>> devel_at_[hidden]
>>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>>
>>
>> _______________________________________________
>> devel mailing list
>> devel_at_[hidden]
>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
>
>
> --
> Jeff Squyres
> jsquyres_at_[hidden]
> For corporate legal information go to:
> http://www.cisco.com/web/about/doing_business/legal/cri/
>
>
> _______________________________________________
> devel mailing list
> devel_at_[hidden]
> http://www.open-mpi.org/mailman/listinfo.cgi/devel