I am trying to use the checkpoint-restart functionality of Open MPI. Most of the time, checkpointing of an MPI application behaves correctly, but in some situations the MPI application hangs indefinitely after the checkpoint is taken. ompi-checkpoint terminates without error and I do get the snapshot reference, but the application does not resume (it seems to be busy-waiting somewhere in the MPI code). I have not been able to reproduce this problem on demand, so I could not find the exact scenario that leads to this issue.
1. The OpenIB BTL is used (using the TCP BTL does not produce this error).
2. The communication is of the form: Isends/Irecvs followed by Waitall(...).
I saw a ticket (#2397) that lists some bug fixes targeted for v1.7; I went through them, but I am not sure whether my problem is caused by those bugs. Are there any known issues specifically when the OpenIB BTL is used?
Please find the output of ompi_info and the config.log file attached.
In case it helps, I am providing the following backtraces of a single process, taken at different times. All the MPI application processes are in the running state.
Please let me know if additional information is required.
Back trace 1
#0 mca_btl_sm_component_progress () at btl_sm_component.c:560
#1 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#2 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#3 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#4 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#5 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#6 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#7 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058,
w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0,
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#8 0x0000000000402271 in cg_unit () at cg.f:502
#9 0x000000000040181b in MAIN__ () at cg.f:56
#10 0x0000000000406e8e in main ()
Back trace 2
#0 0x00002aaaaf710a8a in get_sw_cqe (cq=<value optimized out>, n=19) at src/cq.c:119
#1 0x00002aaaaf710f01 in next_cqe_sw (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:125
#2 mlx4_poll_one (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:205
#3 mlx4_poll_cq (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:352
#4 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#5 btl_openib_component_progress () at btl_openib_component.c:3540
#6 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#7 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#8 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#9 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#10 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#11 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#12 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058,
w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0,
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#13 0x0000000000402271 in cg_unit () at cg.f:502
#14 0x000000000040181b in MAIN__ () at cg.f:56
#15 0x0000000000406e8e in main ()
Back trace 3
#0 mlx4_poll_cq (ibcq=0x32a7cc60, ne=1, wc=<value optimized out>) at src/cq.c:360
#1 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#2 btl_openib_component_progress () at btl_openib_component.c:3540
#3 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#4 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#5 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#6 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#7 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#8 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#9 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058,
w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0,
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#10 0x0000000000402271 in cg_unit () at cg.f:502
#11 0x000000000040181b in MAIN__ () at cg.f:56
#12 0x0000000000406e8e in main ()
Back trace 4
#0 0x00002b09eb1d30f8 in opal_progress () at runtime/opal_progress.c:207
#1 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#2 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#3 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#4 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#5 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#6 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058,
w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0,
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#7 0x0000000000402271 in cg_unit () at cg.f:502
#8 0x000000000040181b in MAIN__ () at cg.f:56
#9 0x0000000000406e8e in main ()
Back trace 5
#0 0x0000003a6aa08cd6 in pthread_mutex_lock () from /lib64/libpthread.so.0
#1 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#2 btl_openib_component_progress () at btl_openib_component.c:3540
#3 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#4 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#5 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#6 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#7 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#8 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#9 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058,
w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0,
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#10 0x0000000000402271 in cg_unit () at cg.f:502
#11 0x000000000040181b in MAIN__ () at cg.f:56
#12 0x0000000000406e8e in main ()