Since I'm trying to track down various netcdf MPI issues I'm seeing on
Fedora, here is another.
I started seeing this on Dec 3 trying to rebuild netcdf 4.2.1.1 for hdf5 1.8.10 with:
mpich2 1.5
gcc 4.7.2-8
A previous build on Nov 1 succeeded with:
hdf5 1.8.9
gcc 4.7.2-6
mpich2 1.5
So I suspect a change in hdf5 between 1.8.9 and 1.8.10.
I'm currently testing with netcdf 4.3.0-rc1, gcc 4.8.0-0.14, hdf5-1.8.10,
mpich2 1.5.
The test hangs here:
Testing very simple parallel I/O with 4 processors...
*** tst_parallel testing very basic parallel access.
*** tst_parallel testing whether we can create file for parallel access and write to it...
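For context, the pattern the test exercises boils down to the calls visible in the backtraces below: create a netCDF-4/HDF5 file for parallel access, call nc_enddef(), then do a parallel write with nc_put_vara_int(). Here is a minimal sketch of that sequence (my own reconstruction, not the actual tst_parallel.c source; the file name, dimension size, and omission of error checking are all simplifications):

#include <mpi.h>
#include <netcdf.h>
#include <netcdf_par.h>

#define DIMLEN 16  /* hypothetical size, divisible by the 4 ranks the test uses */

int main(int argc, char **argv)
{
    int rank, nprocs, ncid, dimid, varid, i;
    size_t start[1], count[1];
    int data[DIMLEN];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* Create a netCDF-4/HDF5 file for parallel access. */
    nc_create_par("tst_parallel_sketch.nc", NC_NETCDF4 | NC_MPIIO,
                  MPI_COMM_WORLD, MPI_INFO_NULL, &ncid);
    nc_def_dim(ncid, "x", DIMLEN, &dimid);
    nc_def_var(ncid, "v", NC_INT, 1, &dimid, &varid);

    /* Three of the four ranks hang inside this call below
       (nc_enddef -> H5Fflush -> ... -> MPI_Barrier). */
    nc_enddef(ncid);

    /* The remaining rank was in here (nc_put_vara_int -> H5Dwrite ->
       H5D__mpio_opt_possible -> MPI_Allreduce): each rank writes its
       own slab of the variable. */
    start[0] = rank * (DIMLEN / nprocs);
    count[0] = DIMLEN / nprocs;
    for (i = 0; i < (int)count[0]; i++)
        data[i] = rank;
    nc_put_vara_int(ncid, varid, start, count, data);

    nc_close(ncid);
    MPI_Finalize();
    return 0;
}

Built with mpicc and run under something like mpiexec -np 4, this is roughly the sequence that hangs below.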
Backtraces from three of the four processes, captured by attaching gdb, look like:
(gdb) bt
#0  0x0000003819ab86b1 in MPID_nem_tcp_connpoll () from /usr/lib64/mpich2/lib/libmpich.so.8
#1  0x0000003819aa5fd5 in MPIDI_CH3I_Progress () from /usr/lib64/mpich2/lib/libmpich.so.8
#2  0x0000003819a601ad in MPIC_Wait () from /usr/lib64/mpich2/lib/libmpich.so.8
#3  0x0000003819a60852 in MPIC_Sendrecv () from /usr/lib64/mpich2/lib/libmpich.so.8
#4  0x0000003819a60cb4 in MPIC_Sendrecv_ft () from /usr/lib64/mpich2/lib/libmpich.so.8
#5  0x0000003819adb172 in MPIR_Barrier_intra () from /usr/lib64/mpich2/lib/libmpich.so.8
#6  0x0000003819adb26d in MPIR_Barrier_or_coll_fn () from /usr/lib64/mpich2/lib/libmpich.so.8
#7  0x0000003819adb711 in MPIR_Barrier_impl () from /usr/lib64/mpich2/lib/libmpich.so.8
#8  0x0000003819adba58 in PMPI_Barrier () from /usr/lib64/mpich2/lib/libmpich.so.8
#9  0x0000003818a642a9 in H5AC_rsp__dist_md_write__flush (f=0x20d2780, dxpl_id=167772175, cache_ptr=0x7f3d0bc2b010) at ../../src/H5AC.c:4424
#10 0x0000003818a650c2 in H5AC_run_sync_point (f=0x20d2780, dxpl_id=167772175, sync_point_op=1) at ../../src/H5AC.c:4870
#11 0x0000003818a65532 in H5AC_flush_entries (f=0x20d2780) at ../../src/H5AC.c:5050
#12 0x0000003818a5c7d5 in H5AC_flush (f=0x20d2780, dxpl_id=167772174) at ../../src/H5AC.c:838
#13 0x0000003818ae490d in H5F_flush (f=0x20d2780, dxpl_id=167772174, closing=0) at ../../src/H5F.c:1758
#14 0x0000003818af0fba in H5F_flush_mounts_recurse (f=0x20d2780, dxpl_id=167772174) at ../../src/H5Fmount.c:659
#15 0x0000003818af1175 in H5F_flush_mounts (f=0x20d2780, dxpl_id=167772174) at ../../src/H5Fmount.c:698
#16 0x0000003818ae4648 in H5Fflush (object_id=16777216, scope=H5F_SCOPE_GLOBAL) at ../../src/H5F.c:1704
#17 0x00007f3d0e24199c in sync_netcdf4_file (h5=0x20d1270) at ../../libsrc4/nc4file.c:2964
#18 0x00007f3d0e242862 in NC4_enddef (ncid=<optimized out>) at ../../libsrc4/nc4file.c:2922
#19 0x00007f3d0e1f44d2 in nc_enddef (ncid=65536) at ../../libdispatch/dfile.c:786
#20 0x0000000000400f59 in main (argc=1, argv=0x7fffece50d88) at ../../nc_test4/tst_parallel.c:111
The other looks like:
(gdb) bt
#0  0x0000003819aa6005 in MPIDI_CH3I_Progress () from /usr/lib64/mpich2/lib/libmpich.so.8
#1  0x0000003819a601ad in MPIC_Wait () from /usr/lib64/mpich2/lib/libmpich.so.8
#2  0x0000003819a60436 in MPIC_Recv () from /usr/lib64/mpich2/lib/libmpich.so.8
#3  0x0000003819a60af9 in MPIC_Recv_ft () from /usr/lib64/mpich2/lib/libmpich.so.8
#4  0x0000003819addab2 in MPIR_Bcast_binomial.isra.1 () from /usr/lib64/mpich2/lib/libmpich.so.8
#5  0x0000003819addef3 in MPIR_Bcast_intra () from /usr/lib64/mpich2/lib/libmpich.so.8
#6  0x0000003819adeb7d in MPIR_Bcast_impl () from /usr/lib64/mpich2/lib/libmpich.so.8
#7  0x0000003819ad90c7 in MPIR_Allreduce_intra () from /usr/lib64/mpich2/lib/libmpich.so.8
#8  0x0000003819ada6f2 in MPIR_Allreduce_impl () from /usr/lib64/mpich2/lib/libmpich.so.8
#9  0x0000003819adacde in PMPI_Allreduce () from /usr/lib64/mpich2/lib/libmpich.so.8
#10 0x0000003818ac1f7b in H5D__mpio_opt_possible (io_info=0x7ffffa47e2a0, file_space=0x84e150, mem_space=0x867950, type_info=0x7ffffa47e220, fm=0x7ffffa47e380, dx_plist=0x854540) at ../../src/H5Dmpio.c:241
#11 0x0000003818ac0050 in H5D__ioinfo_adjust (io_info=0x7ffffa47e2a0, dset=0x854900, dxpl_id=167772189, file_space=0x84e150, mem_space=0x867950, type_info=0x7ffffa47e220, fm=0x7ffffa47e380) at ../../src/H5Dio.c:999
#12 0x0000003818abf1bc in H5D__write (dataset=0x854900, mem_type_id=50331660, mem_space=0x867950, file_space=0x84e150, dxpl_id=167772189, buf=0x7ffffa489170) at ../../src/H5Dio.c:667
#13 0x0000003818abd8e9 in H5Dwrite (dset_id=83886083, mem_type_id=50331660, mem_space_id=67108867, file_space_id=67108866, dxpl_id=167772189, buf=0x7ffffa489170) at ../../src/H5Dio.c:265
#14 0x00007f407ab6992a in nc4_put_vara (nc=<optimized out>, ncid=ncid@entry=65536, varid=varid@entry=0, startp=startp@entry=0x7ffffa489130, countp=countp@entry=0x7ffffa489150, mem_nc_type=mem_nc_type@entry=4, is_long=is_long@entry=0, data=data@entry=0x7ffffa489170) at ../../libsrc4/nc4hdf.c:795
#15 0x00007f407ab6418b in nc4_put_vara_tc (mem_type_is_long=0, op=0x7ffffa489170, countp=0x7ffffa489150, startp=0x7ffffa489130, mem_type=4, varid=0, ncid=65536) at ../../libsrc4/nc4var.c:1350
#16 NC4_put_vara (ncid=65536, varid=0, startp=0x7ffffa489130, countp=0x7ffffa489150, op=0x7ffffa489170, memtype=4) at ../../libsrc4/nc4var.c:1484
#17 0x00007f407ab17075 in NC_put_vara (ncid=ncid@entry=65536, varid=varid@entry=0, start=start@entry=0x7ffffa489130, edges=edges@entry=0x7ffffa489150, value=value@entry=0x7ffffa489170, memtype=memtype@entry=4) at ../../libdispatch/dvarput.c:79
#18 0x00007f407ab17f0f in nc_put_vara_int (ncid=65536, varid=0, startp=startp@entry=0x7ffffa489130, countp=countp@entry=0x7ffffa489150, op=op@entry=0x7ffffa489170) at ../../libdispatch/dvarput.c:628
#19 0x0000000000401010 in main (argc=1, argv=0x7ffffa489648) at ../../nc_test4/tst_parallel.c:138
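Taken together, the two traces suggest a mismatched-collective deadlock: three ranks are in an MPI_Barrier issued from H5AC_rsp__dist_md_write__flush() during the nc_enddef() flush, while the fourth has gotten past the flush and is in an MPI_Allreduce from H5D__mpio_opt_possible() during the write. If that reading is right, the hang is the classic pattern in this toy program (purely illustrative, not the HDF5 code; strictly speaking mismatched collectives are erroneous MPI, but in practice they hang exactly like this):

#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, in = 1, out = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank != 3) {
        /* Three ranks enter a barrier and wait for everyone... */
        MPI_Barrier(MPI_COMM_WORLD);
    } else {
        /* ...while the fourth is in a different collective, so
           neither call can ever complete. */
        MPI_Allreduce(&in, &out, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    }

    MPI_Finalize();  /* never reached when run with 4 ranks */
    return 0;
}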
With openmpi 1.6.3, it appears to hang at the previous test:
Testing very simple parallel I/O with 4 processors...
*** tst_parallel testing very basic parallel access.
The backtraces are similar. Three processes show:
(gdb) bt
#0  0x00000037a6cda4c7 in sched_yield () at ../sysdeps/unix/syscall-template.S:81
#1  0x000000381a317a5d in opal_progress () from /usr/lib64/openmpi/lib/libmpi.so.1
#2  0x000000381a261acd in ompi_request_default_wait_all () from /usr/lib64/openmpi/lib/libmpi.so.1
#3  0x00007f44f1d3a6e7 in ompi_coll_tuned_sendrecv_actual () from /usr/lib64/openmpi/lib/openmpi/mca_coll_tuned.so
#4  0x00007f44f1d423ae in ompi_coll_tuned_barrier_intra_recursivedoubling () from /usr/lib64/openmpi/lib/openmpi/mca_coll_tuned.so
#5  0x000000381a26fc86 in PMPI_Barrier () from /usr/lib64/openmpi/lib/libmpi.so.1
#6  0x00007f44f7cc85c4 in H5AC_rsp__dist_md_write__flush (f=0x1924a30, dxpl_id=167772175, cache_ptr=0x19256d0) at ../../src/H5AC.c:4424
#7  0x00007f44f7cc93e1 in H5AC_run_sync_point (f=0x1924a30, dxpl_id=167772175, sync_point_op=1) at ../../src/H5AC.c:4870
#8  0x00007f44f7cc9851 in H5AC_flush_entries (f=0x1924a30) at ../../src/H5AC.c:5050
#9  0x00007f44f7cc0ad0 in H5AC_flush (f=0x1924a30, dxpl_id=167772174) at ../../src/H5AC.c:838
#10 0x00007f44f7d48d15 in H5F_flush (f=0x1924a30, dxpl_id=167772174, closing=0) at ../../src/H5F.c:1758
#11 0x00007f44f7d553c2 in H5F_flush_mounts_recurse (f=0x1924a30, dxpl_id=167772174) at ../../src/H5Fmount.c:659
#12 0x00007f44f7d5557d in H5F_flush_mounts (f=0x1924a30, dxpl_id=167772174) at ../../src/H5Fmount.c:698
#13 0x00007f44f7d48a50 in H5Fflush (object_id=16777216, scope=H5F_SCOPE_GLOBAL) at ../../src/H5F.c:1704
#14 0x00007f44f8537adc in sync_netcdf4_file (h5=0x191cc50) at ../../libsrc4/nc4file.c:2964
#15 0x00007f44f85389a2 in NC4_enddef (ncid=<optimized out>) at ../../libsrc4/nc4file.c:2922
#16 0x00007f44f84ea612 in nc_enddef (ncid=65536) at ../../libdispatch/dfile.c:786
#17 0x0000000000400f88 in main (argc=1, argv=0x7fff476e1958) at ../../nc_test4/tst_parallel.c:111
The fourth shows:
(gdb) bt
#0  0x00000037a6cda4c7 in sched_yield () at ../sysdeps/unix/syscall-template.S:81
#1  0x000000381a317a5d in opal_progress () from /usr/lib64/openmpi/lib/libmpi.so.1
#2  0x000000381a261acd in ompi_request_default_wait_all () from /usr/lib64/openmpi/lib/libmpi.so.1
#3  0x00007f26db7d4c99 in ompi_coll_tuned_allreduce_intra_recursivedoubling () from /usr/lib64/openmpi/lib/openmpi/mca_coll_tuned.so
#4  0x000000381a26e66b in PMPI_Allreduce () from /usr/lib64/openmpi/lib/libmpi.so.1
#5  0x00007f26e17be321 in H5D__mpio_opt_possible (io_info=0x7fffbcbb0110, file_space=0x210d3b0, mem_space=0x21afc20, type_info=0x7fffbcbb0090, fm=0x7fffbcbb0200, dx_plist=0x219c7e0) at ../../src/H5Dmpio.c:241
#6  0x00007f26e17bc3ee in H5D__ioinfo_adjust (io_info=0x7fffbcbb0110, dset=0x219cba0, dxpl_id=167772189, file_space=0x210d3b0, mem_space=0x21afc20, type_info=0x7fffbcbb0090, fm=0x7fffbcbb0200) at ../../src/H5Dio.c:999
#7  0x00007f26e17bb550 in H5D__write (dataset=0x219cba0, mem_type_id=50331660, mem_space=0x21afc20, file_space=0x210d3b0, dxpl_id=167772189, buf=0x7fffbcbbaff0) at ../../src/H5Dio.c:667
#8  0x00007f26e17b9c7d in H5Dwrite (dset_id=83886083, mem_type_id=50331660, mem_space_id=67108867, file_space_id=67108866, dxpl_id=167772189, buf=0x7fffbcbbaff0) at ../../src/H5Dio.c:265
#9  0x00007f26e1fd9a6a in nc4_put_vara (nc=<optimized out>, ncid=ncid@entry=65536, varid=varid@entry=0, startp=startp@entry=0x7fffbcbbafb0, countp=countp@entry=0x7fffbcbbafd0, mem_nc_type=mem_nc_type@entry=4, is_long=is_long@entry=0, data=data@entry=0x7fffbcbbaff0) at ../../libsrc4/nc4hdf.c:795
#10 0x00007f26e1fd42cb in nc4_put_vara_tc (mem_type_is_long=0, op=0x7fffbcbbaff0, countp=0x7fffbcbbafd0, startp=0x7fffbcbbafb0, mem_type=4, varid=0, ncid=65536) at ../../libsrc4/nc4var.c:1350
#11 NC4_put_vara (ncid=65536, varid=0, startp=0x7fffbcbbafb0, countp=0x7fffbcbbafd0, op=0x7fffbcbbaff0, memtype=4) at ../../libsrc4/nc4var.c:1484
#12 0x00007f26e1f871b5 in NC_put_vara (ncid=ncid@entry=65536, varid=varid@entry=0, start=start@entry=0x7fffbcbbafb0, edges=edges@entry=0x7fffbcbbafd0, value=value@entry=0x7fffbcbbaff0, memtype=memtype@entry=4) at ../../libdispatch/dvarput.c:79
#13 0x00007f26e1f8804f in nc_put_vara_int (ncid=65536, varid=0, startp=startp@entry=0x7fffbcbbafb0, countp=countp@entry=0x7fffbcbbafd0, op=op@entry=0x7fffbcbbaff0) at ../../libdispatch/dvarput.c:628
#14 0x000000000040103b in main (argc=1, argv=0x7fffbcbbb528) at ../../nc_test4/tst_parallel.c:138
--
Orion Poplawski
Technical Manager 303-415-9701 x222
NWRA, Boulder Office FAX: 303-415-9702
3380 Mitchell Lane orion@xxxxxxxx
Boulder, CO 80301 http://www.nwra.com