The dump information shows there are actually 8 datasets in the file.
Below are the start offsets, sizes, and end offsets of the individual datasets.
There is not much padding space in between the datasets.
According to this, your file is expected to be roughly 16 GB (see the arithmetic after the table).
dataset name                     start offset           size      end offset
BS_K_linearized1                        2,379  8,100,000,000   8,100,002,379
BSE_RESONANT_COMPRESSED1_DONE   8,100,002,379  2,025,000,000  10,125,002,379
BSE_RESONANT_COMPRESSED2_DONE  10,125,006,475  2,025,000,000  12,150,006,475
BS_K_linearized2               12,150,006,475  3,127,549,440  15,277,555,915
BSE_RESONANT_COMPRESSED3_DONE  15,277,557,963    781,887,360  16,059,445,323
complex                        16,059,447,371              8  16,059,447,379
BS_K_compressed1               16,059,447,379     99,107,168  16,158,554,547
BSE_RESONANT_COMPRESSED1       16,158,554,547    198,214,336  16,356,768,883
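As a cross-check, summing the numbers in the table (all values in bytes):

  character data:  2,025,000,000 + 2,025,000,000 + 781,887,360 =  4,831,887,360
  float datasets:  8,100,000,000 + 3,127,549,440 + 99,107,168
                   + 198,214,336 + 8                           = 11,524,870,952
  header + gaps:   2,379 + 4,096 + 2,048 + 2,048               =         10,571
  total:                                                         16,356,768,883

So the ~4.5 GiB of character data is all there. The excess is the space
allocated for the float datasets, most of it the two large datasets
BS_K_linearized1 and BS_K_linearized2, which appear to be the HDF5
dimension-scale datasets that netCDF-4 creates for each dimension.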
Wei-keng
On May 2, 2020, at 11:28 AM, Davide Sangalli <davide.sangalli@xxxxxx> wrote:
h5dump -Hp ndb.BS_COMPRESS0.005000_Q1
HDF5 "ndb.BS_COMPRESS0.005000_Q1" {
GROUP "/" {
   ATTRIBUTE "_NCProperties" {
      DATATYPE H5T_STRING {
         STRSIZE 57;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_ASCII;
         CTYPE H5T_C_S1;
      }
      DATASPACE SCALAR
   }
   DATASET "BSE_RESONANT_COMPRESSED1" {
      DATATYPE H5T_IEEE_F32LE
      DATASPACE SIMPLE { ( 24776792, 2 ) / ( 24776792, 2 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 198214336
         OFFSET 16158554547
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE 9.96921e+36
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
         DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
      }
   }
   DATASET "BSE_RESONANT_COMPRESSED1_DONE" {
      DATATYPE H5T_STRING {
         STRSIZE 1;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_UTF8;
         CTYPE H5T_C_S1;
      }
      DATASPACE SIMPLE { ( 2025000000 ) / ( 2025000000 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 2025000000
         OFFSET 8100002379
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE ""
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
   DATASET "BSE_RESONANT_COMPRESSED2_DONE" {
      DATATYPE H5T_STRING {
         STRSIZE 1;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_UTF8;
         CTYPE H5T_C_S1;
      }
      DATASPACE SIMPLE { ( 2025000000 ) / ( 2025000000 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 2025000000
         OFFSET 10125006475
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE ""
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
   DATASET "BSE_RESONANT_COMPRESSED3_DONE" {
      DATATYPE H5T_STRING {
         STRSIZE 1;
         STRPAD H5T_STR_NULLTERM;
         CSET H5T_CSET_UTF8;
         CTYPE H5T_C_S1;
      }
      DATASPACE SIMPLE { ( 781887360 ) / ( 781887360 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 781887360
         OFFSET 15277557963
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE ""
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "DIMENSION_LIST" {
         DATATYPE H5T_VLEN { H5T_REFERENCE { H5T_STD_REF_OBJECT }}
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
   DATASET "BS_K_compressed1" {
      DATATYPE H5T_IEEE_F32BE
      DATASPACE SIMPLE { ( 24776792 ) / ( 24776792 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 99107168
         OFFSET 16059447379
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE H5D_FILL_VALUE_DEFAULT
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "CLASS" {
         DATATYPE H5T_STRING {
            STRSIZE 16;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "NAME" {
         DATATYPE H5T_STRING {
            STRSIZE 64;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "REFERENCE_LIST" {
         DATATYPE H5T_COMPOUND {
            H5T_REFERENCE { H5T_STD_REF_OBJECT } "dataset";
            H5T_STD_I32LE "dimension";
         }
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
   DATASET "BS_K_linearized1" {
      DATATYPE H5T_IEEE_F32BE
      DATASPACE SIMPLE { ( 2025000000 ) / ( 2025000000 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 8100000000
         OFFSET 2379
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE H5D_FILL_VALUE_DEFAULT
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "CLASS" {
         DATATYPE H5T_STRING {
            STRSIZE 16;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "NAME" {
         DATATYPE H5T_STRING {
            STRSIZE 64;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "REFERENCE_LIST" {
         DATATYPE H5T_COMPOUND {
            H5T_REFERENCE { H5T_STD_REF_OBJECT } "dataset";
            H5T_STD_I32LE "dimension";
         }
         DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
      }
   }
   DATASET "BS_K_linearized2" {
      DATATYPE H5T_IEEE_F32BE
      DATASPACE SIMPLE { ( 781887360 ) / ( 781887360 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 3127549440
         OFFSET 12150006475
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE H5D_FILL_VALUE_DEFAULT
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "CLASS" {
         DATATYPE H5T_STRING {
            STRSIZE 16;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "NAME" {
         DATATYPE H5T_STRING {
            STRSIZE 64;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "REFERENCE_LIST" {
         DATATYPE H5T_COMPOUND {
            H5T_REFERENCE { H5T_STD_REF_OBJECT } "dataset";
            H5T_STD_I32LE "dimension";
         }
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
   DATASET "complex" {
      DATATYPE H5T_IEEE_F32BE
      DATASPACE SIMPLE { ( 2 ) / ( 2 ) }
      STORAGE_LAYOUT {
         CONTIGUOUS
         SIZE 8
         OFFSET 16059447371
      }
      FILTERS {
         NONE
      }
      FILLVALUE {
         FILL_TIME H5D_FILL_TIME_IFSET
         VALUE H5D_FILL_VALUE_DEFAULT
      }
      ALLOCATION_TIME {
         H5D_ALLOC_TIME_EARLY
      }
      ATTRIBUTE "CLASS" {
         DATATYPE H5T_STRING {
            STRSIZE 16;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "NAME" {
         DATATYPE H5T_STRING {
            STRSIZE 64;
            STRPAD H5T_STR_NULLTERM;
            CSET H5T_CSET_ASCII;
            CTYPE H5T_C_S1;
         }
         DATASPACE SCALAR
      }
      ATTRIBUTE "REFERENCE_LIST" {
         DATATYPE H5T_COMPOUND {
            H5T_REFERENCE { H5T_STD_REF_OBJECT } "dataset";
            H5T_STD_I32LE "dimension";
         }
         DATASPACE SIMPLE { ( 1 ) / ( 1 ) }
      }
   }
}
}
On Sat, May 2, 2020 at 5:55 PM +0200, "Wei-Keng Liao" <wkliao@xxxxxxxxxxxxxxxx> wrote:
For HDF5 files, the command “h5dump -Hp ndb.BS_COMPRESS0.005000_Q1” shows
the chunk settings used by all datasets in the file.
The command “h5stat -Ss ndb.BS_COMPRESS0.005000_Q1” shows information about
free space, metadata, raw data, etc.
They may reveal why your file is abnormally big.
Most likely it is the chunk settings you used.
Wei-keng
On May 1, 2020, at 6:40 PM, Davide Sangalli wrote:
I also add the output of ncvalidator:
ncvalidator ndb.BS_COMPRESS0.005000_Q1
Error: Unknow file signature
Expecting "CDF1", "CDF2", or "CDF5", but got "�HDF"
File "ndb.BS_COMPRESS0.005000_Q1" fails to conform with CDF file format
specifications
Best,
D.
On 02/05/20 01:26, Davide Sangalli wrote:
Output of ncdump -hs
D.
ncdump -hs BSK_2-5B_X59RL-50B_SP_bse-io/ndb.BS_COMPRESS0.005000_Q1
netcdf ndb.BS_COMPRESS0 {
dimensions:
        BS_K_linearized1 = 2025000000 ;
        BS_K_linearized2 = 781887360 ;
        complex = 2 ;
        BS_K_compressed1 = 24776792 ;
variables:
        char BSE_RESONANT_COMPRESSED1_DONE(BS_K_linearized1) ;
                BSE_RESONANT_COMPRESSED1_DONE:_Storage = "contiguous" ;
        char BSE_RESONANT_COMPRESSED2_DONE(BS_K_linearized1) ;
                BSE_RESONANT_COMPRESSED2_DONE:_Storage = "contiguous" ;
        char BSE_RESONANT_COMPRESSED3_DONE(BS_K_linearized2) ;
                BSE_RESONANT_COMPRESSED3_DONE:_Storage = "contiguous" ;
        float BSE_RESONANT_COMPRESSED1(BS_K_compressed1, complex) ;
                BSE_RESONANT_COMPRESSED1:_Storage = "contiguous" ;
                BSE_RESONANT_COMPRESSED1:_Endianness = "little" ;

// global attributes:
                :_NCProperties = "version=1|netcdflibversion=4.4.1.1|hdf5libversion=1.8.18" ;
                :_SuperblockVersion = 0 ;
                :_IsNetcdf4 = 1 ;
                :_Format = "netCDF-4" ;
}
On Sat, May 2, 2020 at 12:24 AM +0200, "Dave Allured - NOAA Affiliate" wrote:
I agree that you should expect the file size to be about 1 byte per stored
character. IMO the most likely explanation is that you have a netCDF-4 file
with an inappropriately small chunk size. Another possibility is a 64-bit
offset file with crazy huge padding between file sections. This is very
unlikely, but I do not know what is inside your writer code.
Diagnose, please: ncdump -hs. If it is 64-bit offset, I think ncvalidator can
display the hidden pad sizes.
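If chunking does turn out to be the culprit, the chunk sizes can be set
explicitly when the variable is defined. A minimal Fortran sketch of the
idea, assuming the netCDF-4 format; the file name and the 64 MiB chunk size
are illustrative, and error checking is omitted:

program chunk_sketch
   use netcdf
   implicit none
   integer :: ncid, dimid, varid, ierr

   ! Create a netCDF-4 file and define one large char variable.
   ierr = nf90_create("chunk_sketch.nc", NF90_NETCDF4, ncid)
   ierr = nf90_def_dim(ncid, "BS_K_linearized1", 2025000000, dimid)
   ierr = nf90_def_var(ncid, "BSE_RESONANT_COMPRESSED1_DONE", NF90_CHAR, &
                       (/ dimid /), varid)
   ! Request 64 MiB chunks instead of the library default, so the
   ! per-chunk metadata overhead stays negligible.
   ierr = nf90_def_var_chunking(ncid, varid, NF90_CHUNKED, (/ 67108864 /))
   ierr = nf90_close(ncid)
end program chunk_sketch

ncdump -hs on the resulting file should then report a _ChunkSizes attribute
of 67108864 for that variable.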
On Fri, May 1, 2020 at 3:37 PM Davide Sangalli wrote:
Dear all,
I'm a developer of a Fortran code which uses netCDF for I/O.
In one of my runs I created a file with some huge arrays of characters.
The header of the file is the following:
netcdf ndb.BS_COMPRESS0 {
dimensions:
        BS_K_linearized1 = 2025000000 ;
        BS_K_linearized2 = 781887360 ;
variables:
        char BSE_RESONANT_COMPRESSED1_DONE(BS_K_linearized1) ;
        char BSE_RESONANT_COMPRESSED2_DONE(BS_K_linearized1) ;
        char BSE_RESONANT_COMPRESSED3_DONE(BS_K_linearized2) ;
}
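The file is created with the netCDF Fortran-90 API. A simplified sketch of
the definition step (not the actual Yambo source; error checking omitted):

program create_sketch
   use netcdf
   implicit none
   integer :: ncid, dim1, dim2, v1, v2, v3, ierr

   ! Definition step only: the on-disk layout is fixed here,
   ! before any data is written.
   ierr = nf90_create("ndb.BS_COMPRESS0", NF90_NETCDF4, ncid)
   ierr = nf90_def_dim(ncid, "BS_K_linearized1", 2025000000, dim1)
   ierr = nf90_def_dim(ncid, "BS_K_linearized2", 781887360, dim2)
   ierr = nf90_def_var(ncid, "BSE_RESONANT_COMPRESSED1_DONE", NF90_CHAR, (/ dim1 /), v1)
   ierr = nf90_def_var(ncid, "BSE_RESONANT_COMPRESSED2_DONE", NF90_CHAR, (/ dim1 /), v2)
   ierr = nf90_def_var(ncid, "BSE_RESONANT_COMPRESSED3_DONE", NF90_CHAR, (/ dim2 /), v3)
   ierr = nf90_enddef(ncid)
   ierr = nf90_close(ncid)
end program create_sketch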
The variables are declared as nf90_char which, according to the documentation,
is 1 byte per element.
Thus I would expect the total size of the file to be
1 byte*(2*2025000000+781887360) ~ 4.5 GiB.
Instead the file size is 16059445323 bytes ~ 14.96 GiB, i.e. 10.46 GiB more
and a factor 3.33 bigger.
This happens consistently if I consider the file
netcdf ndb {
dimensions:
        complex = 2 ;
        BS_K_linearized1 = 2025000000 ;
        BS_K_linearized2 = 781887360 ;
variables:
        float BSE_RESONANT_LINEARIZED1(BS_K_linearized1, complex) ;
        char BSE_RESONANT_LINEARIZED1_DONE(BS_K_linearized1) ;
        float BSE_RESONANT_LINEARIZED2(BS_K_linearized1, complex) ;
        char BSE_RESONANT_LINEARIZED2_DONE(BS_K_linearized1) ;
        float BSE_RESONANT_LINEARIZED3(BS_K_linearized2, complex) ;
        char BSE_RESONANT_LINEARIZED3_DONE(BS_K_linearized2) ;
}
The float component should weigh ~36 GiB while the char component should be
identical to before, i.e. 4.5 GiB, for a total of 40.5 GiB.
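Spelled out, using the dimensions in the header above (1 GiB = 2^30 bytes):

  floats: (2*2025000000 + 2*2025000000 + 2*781887360) * 4 bytes = 38,655,098,880 bytes ~ 36.0 GiB
  chars:  (2*2025000000 + 781887360) * 1 byte                   =  4,831,887,360 bytes ~  4.5 GiB
  total:                                                           43,486,986,240 bytes ~ 40.5 GiB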
The file is instead ~50.96 GiB, i.e. again 10.46 GiB bigger than expected.
Why?
My character variables contain something like
"tnnnntnnnntnnnnnnnntnnnnnttnnnnnnnnnnnnnnnnt..."
but the file size is already like that just after the file creation, i.e.
before filling it.
A few details about the library, compiled against HDF5 (hdf5-1.8.18) with
parallel I/O support:
Name: netcdf
Description: NetCDF Client Library for C
URL: http://www.unidata.ucar.edu/netcdf
Version: 4.4.1.1
Libs: -L${libdir} -lnetcdf -ldl -lm
/nfs/data/bin/Yambo/gcc-8.1.0/openmpi-3.1.0/yambo_ext_libs/gfortran/mpifort/v4/parallel/lib/libhdf5hl_fortran.a
/nfs/data/bin/Yambo/gcc-8.1.0/openmpi-3.1.0/yambo_ext_libs/gfortran/mpifort/v4/parallel/lib/libhdf5_fortran.a
/nfs/data/bin/Yambo/gcc-8.1.0/openmpi-3.1.0/yambo_ext_libs/gfortran/mpifort/v4/parallel/lib/libhdf5_hl.a
/nfs/data/bin/Yambo/gcc-8.1.0/openmpi-3.1.0/yambo_ext_libs/gfortran/mpifort/v4/parallel/lib/libhdf5.a
-lz -lm -ldl -lcurl
Cflags: -I${includedir}
Name: netcdf-fortran
Description: NetCDF Client Library for Fortran
URL: http://www.unidata.ucar.edu/netcdf
Version: 4.4.4
Requires.private: netcdf > 4.1.1
Libs: -L${libdir} -lnetcdff
Libs.private: -L${libdir} -lnetcdff -lnetcdf
Cflags: -I${includedir}
Best,
D.
--
Davide Sangalli, PhD
CNR-ISM, Division of Ultrafast Processes in Materials (FLASHit) and MaX Centre
Area della Ricerca di Roma 1, 00016 Monterotondo Scalo, Italy
http://www.ism.cnr.it/en/davide-sangalli-cv/
http://www.max-centre.eu/