Open MPI logo

Open MPI Development Mailing List Archives

  |   Home   |   Support   |   FAQ   |   all Development mailing list

From: Tim Prins (tprins_at_[hidden])
Date: 2007-07-09 10:41:52


Gleb Natapov wrote:
> On Sun, Jul 08, 2007 at 12:41:58PM -0400, Tim Prins wrote:
>> On Sunday 08 July 2007 08:32:27 am Gleb Natapov wrote:
>>> On Fri, Jul 06, 2007 at 06:36:13PM -0400, Tim Prins wrote:
>>>> While looking into another problem I ran into an issue which made ob1
>>>> segfault on me. Using gm, and running the test test_dan1 in the onesided
>>>> test suite, if I limit the gm freelist by too much, I get a segfault.
>>>> That is,
>>>>
>>>> mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 1024 test_dan1
>>>>
>>>> works fine, but
>>>>
>>>> mpirun -np 2 -mca btl gm,self -mca btl_gm_free_list_max 512 test_dan1
>>> I cannot, unfortunately, reproduce this with openib BTL.
>>>
>>>> segfaults. Here is the relevant output from gdb:
>>>>
>>>> Program received signal SIGSEGV, Segmentation fault.
>>>> [Switching to Thread 1077541088 (LWP 15600)]
>>>> 0x404d81c1 in mca_pml_ob1_send_fin (proc=0x9bd9490, bml_btl=0xd323580,
>>>> hdr_des=0x9e54e78, order=255 '\377', status=1) at pml_ob1.c:267
>>>> 267 MCA_PML_OB1_DES_ALLOC(bml_btl, fin, order,
>>>> sizeof(mca_pml_ob1_fin_hdr_t));
>>> can you send me what's inside bml_btl?
>> It turns out that the order of arguments to mca_pml_ob1_send_fin was wrong. I
>> fixed this in r15304. But now we hang instead of segfault, and have both
>> processes just looping through opal_progress. I really don't know what to look
>> for. Any hints?
>>
> Can you look in gdb at mca_pml_ob1.rdma_pending?
Yeah, rank 0 has nothing on the list, and rank 1 has 48 things.

Here is the first item on the list:
$7 = {
   super = {
     super = {
       super = {
         obj_magic_id = 16046253926196952813,
         obj_class = 0x404f5980,
         obj_reference_count = 1,
         cls_init_file_name = 0x404f30f9 "pml_ob1_sendreq.c",
         cls_init_lineno = 1134
       },
       opal_list_next = 0x8f5d680,
       opal_list_prev = 0x404f57c8,
       opal_list_item_refcount = 1,
       opal_list_item_belong_to = 0x404f57b0
     },
     registration = 0x0,
     ptr = 0x0
   },
   rdma_bml = 0x8729098,
   rdma_hdr = {
     hdr_common = {
       hdr_type = 8 '\b',
       hdr_flags = 4 '\004'
     },
     hdr_match = {
       hdr_common = {
         hdr_type = 8 '\b',
         hdr_flags = 4 '\004'
       },
       hdr_ctx = 5,
       hdr_src = 1,
       hdr_tag = 142418176,
       hdr_seq = 0,
       hdr_padding = "\000"
     },
     hdr_rndv = {
       hdr_match = {
         hdr_common = {
           hdr_type = 8 '\b',
           hdr_flags = 4 '\004'
         },
         hdr_ctx = 5,
         hdr_src = 1,
         hdr_tag = 142418176,
         hdr_seq = 0,
         hdr_padding = "\000"
       },
       hdr_msg_length = 236982400,
       hdr_src_req = {
         lval = 0,
         ival = 0,
         pval = 0x0,
         sval = {
           uval = 0,
           lval = 0
         }
       }
     },
     hdr_rget = {
       hdr_rndv = {
         hdr_match = {
           hdr_common = {
             hdr_type = 8 '\b',
             hdr_flags = 4 '\004'
           },
           hdr_ctx = 5,
           hdr_src = 1,
           hdr_tag = 142418176,
           hdr_seq = 0,
           hdr_padding = "\000"
         },
         hdr_msg_length = 236982400,
         hdr_src_req = {
           lval = 0,
           ival = 0,
           pval = 0x0,
           sval = {
             uval = 0,
             lval = 0
           }
         }
       },
       hdr_seg_cnt = 1106481152,
       hdr_padding = "\000\000\000",
       hdr_des = {
         lval = 32768,
         ival = 32768,
         pval = 0x8000,
         sval = {
           uval = 32768,
           lval = 0
         }
       },
       hdr_segs = {{
           seg_addr = {
             lval = 0,
             ival = 0,
             pval = 0x0,
             sval = {
               uval = 0,
               lval = 0
             }
           },
           seg_len = 0,
           seg_padding = "\000\000\000",
           seg_key = {
             key32 = {0, 0},
             key64 = 0,
             key8 = "\000\000\000\000\000\000\000"
           }
         }}
     },
     hdr_frag = {
       hdr_common = {
         hdr_type = 8 '\b',
         hdr_flags = 4 '\004'
       },
       hdr_padding = "\005\000\001\000\000",
       hdr_frag_offset = 142418176,
       hdr_src_req = {
         lval = 236982400,
         ival = 236982400,
         pval = 0xe201080,
         sval = {
           uval = 236982400,
           lval = 0
         }
       },
       hdr_dst_req = {
         lval = 0,
         ival = 0,
         pval = 0x0,
         sval = {
           uval = 0,
           lval = 0
         }
       }
     },
     hdr_ack = {
       hdr_common = {
         hdr_type = 8 '\b',
         hdr_flags = 4 '\004'
       },
       hdr_padding = "\005\000\001\000\000",
       hdr_src_req = {
         lval = 142418176,
         ival = 142418176,
         pval = 0x87d2100,
         sval = {
           uval = 142418176,
           lval = 0
         }
       },
       hdr_dst_req = {
         lval = 236982400,
         ival = 236982400,
         pval = 0xe201080,
         sval = {
           uval = 236982400,
           lval = 0
         }
       },
       hdr_send_offset = 0
     },
     hdr_rdma = {
       hdr_common = {
         hdr_type = 8 '\b',
         hdr_flags = 4 '\004'
       },
       hdr_padding = "\005",
       hdr_seg_cnt = 1,
       hdr_req = {
         lval = 142418176,
         ival = 142418176,
         pval = 0x87d2100,
         sval = {
           uval = 142418176,
           lval = 0
         }
       },
       hdr_des = {
         lval = 236982400,
         ival = 236982400,
         pval = 0xe201080,
         sval = {
           uval = 236982400,
           lval = 0
         }
       },
       hdr_rdma_offset = 0,
       hdr_segs = {{
           seg_addr = {
             lval = 1106481152,
             ival = 1106481152,
             pval = 0x41f39000,
             sval = {
               uval = 1106481152,
               lval = 0
             }
           },
           seg_len = 32768,
           seg_padding = "\000\000\000",
           seg_key = {
             key32 = {0, 0},
             key64 = 0,
             key8 = "\000\000\000\000\000\000\000"
           }
         }}
     },
     hdr_fin = {
       hdr_common = {
         hdr_type = 8 '\b',
         hdr_flags = 4 '\004'
       },
       hdr_padding = "\005\000\001\000\000",
       hdr_des = {
         lval = 142418176,
         ival = 142418176,
         pval = 0x87d2100,
         sval = {
           uval = 142418176,
           lval = 0
         }
       },
       hdr_fail = 236982400
     }
   },
   rdma_state = MCA_PML_OB1_RDMA_PUT,
   rdma_length = 32768,
   rdma_segs = {{
       seg_addr = {
         lval = 1106481152,
         ival = 1106481152,
         pval = 0x41f39000,
         sval = {
           uval = 1106481152,
           lval = 0
         }
       },
       seg_len = 32768,
       seg_padding = "\000\000\000",
       seg_key = {
         key32 = {0, 0},
         key64 = 0,
         key8 = "\000\000\000\000\000\000\000"
       }
     }, {
       seg_addr = {
         lval = 0,
         ival = 0,
         pval = 0x0,
         sval = {
           uval = 0,
           lval = 0
         }
       },
       seg_len = 0,
       seg_padding = "\000\000\000",
       seg_key = {
         key32 = {0, 0},
         key64 = 0,
         key8 = "\000\000\000\000\000\000\000"
       }
     } <repeats 15 times>},
   rdma_req = 0x87d2100,
   rdma_ep = 0x8516f08,
   convertor = {
     super = {
       obj_magic_id = 0,
       obj_class = 0x0,
       obj_reference_count = 0,
       cls_init_file_name = 0x0,
       cls_init_lineno = 0
     },
     remoteArch = 4291428864,
     flags = 1855942,
     local_size = 32768,
     remote_size = 32768,
     pDesc = 0x8054620,
     use_desc = 0x80546b4,
     count = 32768,
     pBaseBuf = 0x41f39000 "",
     pStack = 0x8f5c3ec,
     stack_size = 5,
     fAdvance = 0,
     master = 0x84ad398,
     stack_pos = 4294967295,
     bConverted = 0,
     partial_length = 0,
     checksum = 0,
     csum_ui1 = 0,
     csum_ui2 = 0,
     static_stack = {{
         index = 0,
         type = 0,
         count = 0,
         disp = 0
       }, {
         index = 0,
         type = 0,
         count = 0,
         disp = 0
       }, {
         index = 0,
         type = 0,
         count = 0,
         disp = 0
       }, {
         index = 0,
         type = 0,
         count = 0,
         disp = 0
       }, {
         index = 0,
         type = 0,
         count = 0,
         disp = 0
       }}
   },
   reg = 0x8515e80,
   retries = 1
}

Thanks,

Tim