You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
The run hangs after printing: # All processes entering MPI_Finalize
The same hang is reproduced with '-x UCX_TLS=rc,sm'.
Example stack traces (taken from two different processes):
Thread 1 (Thread 0x7ffff7fb5740 (LWP 29710)):
#0 0x00007ffff79091cd in write () from /usr/lib64/libpthread.so.0
#1 0x00007ffff12f1755 in ibv_cmd_dereg_mr () from /usr/lib64/libibverbs.so.1
#2 0x00007ffff041d85d in mlx5_dereg_mr () from /usr/lib64/libmlx5-rdmav2.so
#3 0x00007ffff12f827e in ibv_dereg_mr () from /usr/lib64/libibverbs.so.1
#4 0x00007fffea31bba8 in uct_ib_dereg_mr (mr=<optimized out>) at ib/base/ib_md.c:479
#5 uct_ib_memh_dereg (memh=0x2bedb9e8) at ib/base/ib_md.c:498
#6 uct_ib_mem_dereg_internal (memh=0x2bedb9e8) at ib/base/ib_md.c:780
#7 uct_ib_rcache_mem_dereg_cb (context=<optimized out>, rcache=<optimized out>, rregion=0x2bedb9c0) at ib/base/ib_md.c:933
#8 0x00007fffe9e0c219 in ucs_mem_region_destroy_internal (region=0x2bedb9c0, rcache=0x99a0f0) at sys/rcache.c:160
#9 ucs_rcache_region_invalidate (must_be_in_pgt=0, must_be_destroyed=0, region=0x2bedb9c0, rcache=0x99a0f0) at sys/rcache.c:193
#10 ucs_rcache_invalidate_range (end=<optimized out>, start=<optimized out>, rcache=0x99a0f0) at sys/rcache.c:211
#11 ucs_rcache_check_inv_queue (rcache=rcache@entry=0x99a0f0) at sys/rcache.c:233
#12 0x00007fffe9e0c2f5 in ucs_rcache_t_cleanup (self=0x99a0f0) at sys/rcache.c:619
#13 0x00007fffe9e0eca6 in ucs_class_call_cleanup_chain (cls=cls@entry=0x7fffea0e9460 <ucs_rcache_t_class>, obj=obj@entry=0x99a0f0, limit=limit@entry=-1) at type/class.c:50
#14 0x00007fffe9e0cc88 in ucs_rcache_destroy (self=0x99a0f0) at sys/rcache.c:633
#15 0x00007fffea31b0e7 in uct_ib_md_close (uct_md=0x957670) at ib/base/ib_md.c:1255
#16 0x00007fffea5679d6 in ucp_free_resources (context=0x9121d0) at core/ucp_context.c:419
#17 ucp_cleanup (context=0x9121d0) at core/ucp_context.c:845
#18 0x00007fffea78745c in mca_pml_ucx_close () at pml_ucx.c:171
#19 0x00007fffea789189 in mca_pml_ucx_component_close () at pml_ucx_component.c:87
#20 0x00007ffff700ea49 in mca_base_component_close (component=0x7fffea98b720 <mca_pml_ucx_component>, output_id=output_id@entry=-1) at mca_base_components_close.c:53
#21 0x00007ffff700eac2 in mca_base_components_close (output_id=-1, components=0x7ffff7dc7370 <ompi_pml_base_framework+80>, skip=0x0) at mca_base_components_close.c:85
#22 0x00007ffff70180f9 in mca_base_framework_close (framework=0x7ffff7dc7320 <ompi_pml_base_framework>) at mca_base_framework.c:214
#23 0x00007ffff7b5c170 in ompi_mpi_finalize () at runtime/ompi_mpi_finalize.c:343
#24 0x00000000004029ac in main ()
Thread 1 (Thread 0x7ffff7fb5740 (LWP 29727)):
#0 0x00007ffff0405110 in mlx5_poll_cq_1 () from /usr/lib64/libmlx5-rdmav2.so
#1 0x00007fffea3224a8 in ibv_poll_cq (wc=0x7fffffffb518, num_entries=<optimized out>, cq=<optimized out>) at /usr/include/infiniband/verbs.h:1360
#2 uct_ib_poll_cq (wcs=0x7fffffffb518, count=<synthetic pointer>, cq=<optimized out>) at /hpc/local/benchmarks/hpcx_install_Thursday/src/hpcx-gcc-redhat7.2/ucx-master/src/uct/ib/base/ib_device.h:270
#3 uct_rc_verbs_iface_poll_rx_common (iface=0xa6ac70) at ib/rc/verbs/rc_verbs_common.h:154
#4 uct_rc_verbs_iface_progress (arg=0xa6ac70) at ib/rc/verbs/rc_verbs_iface.c:129
#5 0x00007fffea31368e in ucs_callbackq_dispatch (cbq=0xa69e38, cbq=0xa69e38) at /hpc/local/benchmarks/hpcx_install_Thursday/src/hpcx-gcc-redhat7.2/ucx-master/src/ucs/datastruct/callbackq.inl:39
#6 uct_worker_progress (worker=0xa69e30) at base/uct_md.c:233
#7 0x00007fffea56d1dd in ucp_worker_progress (worker=0xa33850) at core/ucp_worker.c:719
#8 0x00007fffea785cc7 in mca_pml_ucx_progress () at pml_ucx.c:421
#9 0x00007ffff6feef2c in opal_progress () at runtime/opal_progress.c:225
#10 0x00007ffff7b5a1dd in ompi_request_wait_completion (req=0x2cb77708) at ../ompi/request/request.h:392
#11 ompi_request_default_wait (req_ptr=0x7fffffffb930, status=0x0) at request/req_wait.c:41
#12 0x00007fffea78825e in mca_pml_ucx_send (buf=0x7fff850ed010, count=262144, datatype=<optimized out>, dst=<optimized out>, tag=<optimized out>, mode=<optimized out>, comm=0x2bfff9e0) at pml_ucx.c:711
#13 0x00007ffff7b8513b in ompi_coll_base_sendrecv_nonzero_actual (sendbuf=0x7fff850ed010, scount=262144, sdatatype=sdatatype@entry=0x61a600 <ompi_mpi_byte>, dest=<optimized out>, stag=stag@entry=-14, recvbuf=<optimized out>, rcount=262144, rdatatype=rdatatype@entry=0x61a600 <ompi_mpi_byte>, source=source@entry=418, rtag=rtag@entry=-14, comm=comm@entry=0x2bfff9e0, status=status@entry=0x0) at base/coll_base_util.c:58
#14 0x00007ffff7b89823 in ompi_coll_base_sendrecv (stag=-14, rtag=-14, status=0x0, myid=884, comm=0x2bfff9e0, source=418, rdatatype=0x61a600 <ompi_mpi_byte>, rcount=<optimized out>, recvbuf=<optimized out>, dest=1350, sdatatype=0x61a600 <ompi_mpi_byte>, scount=<optimized out>, sendbuf=<optimized out>) at base/coll_base_util.h:67
#15 ompi_coll_base_alltoallv_intra_pairwise (sbuf=0x7fff6ff6d010, scounts=0xb869f0, sdisps=0xb88200, sdtype=0x61a600 <ompi_mpi_byte>, rbuf=0x7fffa87e8010, rcounts=0xb89a10, rdisps=0xb8b220, rdtype=0x61a600 <ompi_mpi_byte>, comm=0x2bfff9e0, module=0x2bf94100) at base/coll_base_alltoallv.c:167
#16 0x00007ffff7b6a0f5 in PMPI_Alltoallv (sendbuf=<optimized out>, sendcounts=<optimized out>, sdispls=<optimized out>, sendtype=<optimized out>, recvbuf=<optimized out>, recvcounts=<optimized out>, rdispls=0xb8b220, recvtype=0x61a600 <ompi_mpi_byte>, comm=0x2bfff9e0) at palltoallv.c:123
#17 0x000000000040e246 in IMB_alltoallv ()
#18 0x0000000000407256 in IMB_init_buffers_iter ()
#19 0x000000000040239b in main ()
yosefe
changed the title
Hang in MPI_Finalize with UCX_TLS=rc[_x],sm on the IMB-MPI1 test
Hang in MPI_Finalize with UCX_TLS=rc[_x],sm on IMB-MPI1 with datacheck
May 14, 2017
The command line to reproduce:
/hpc/local/benchmarks/hpcx_install_Thursday/hpcx-gcc-redhat7.2/ompi-v2.x/bin/mpirun -np 1536 -mca btl_openib_warn_default_gid_prefix 0 --debug-daemons --bind-to core --tag-output --timestamp-output --display-map -mca pml ucx -x UCX_NET_DEVICES=mlx5_0:1 -mca btl_openib_if_include mlx5_0:1 -mca coll_hcoll_enable 0 -x UCX_TLS=rc_x,sm -mca opal_pmix_base_async_modex 1 -mca mpi_add_procs_cutoff 0 -mca pmix_base_collect_data 0 --map-by node /hpc/scrap/users/mtt/scratch/ucx_ompi/20170511_191552_29012_735424_clx-hercules-065/installs/VVWm/tests/imb-datacheck/imb/src/IMB-MPI1 -npmin 1536 -iter 1000 -mem 0.9
http://e2e-gw.mellanox.com:4080/hpc/scrap/users/mtt/scratch/ucx_ompi/20170511_191552_29012_735424_clx-hercules-065/html/test_stdout_hPmhp2.txt
The run hangs after printing:
# All processes entering MPI_Finalize
The same hang is reproduced with '-x UCX_TLS=rc,sm'.
trace examples:
This may be related to #1502 and #1513.
The text was updated successfully, but these errors were encountered: