Source code for ewoksdata.data.hdf5.config

"""HDF5 configuration of optimal data storage (IO speed, compression, ...)
"""

from typing import Optional
from numbers import Integral
from collections.abc import Mapping

import numpy
from numpy.typing import DTypeLike

try:
    import hdf5plugin
except ImportError:
    hdf5plugin = None

from .types import ShapeType, VarShapeType, VarH5pyShapeType

DEFAULT_CHUNK_NBYTES = 1 << 20
DEFAULT_COMPRESSION_LIMIT_NBYTES = 1 << 20
DEFAULT_CHUNK_SPLIT = 4
DEFAULT_COMPRESSION_SCHEME = "gzip-byteshuffle"

# Default data size
#  0D detector: 2 KB
#  1D detector: 2 MB
#  2D detector: 2 GB
DEFAULT_SCAN_DIM_SIZE = 512
DEFAULT_DETECTOR_DIM_SIZE = 1024
DEFAULT_DTYPE = numpy.int32
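# Illustrative arithmetic behind the sizes quoted above (assuming the default
# int32 dtype, 4 bytes per element): a 0D detector gives 512 * 4 B = 2 KB per
# scan, a 1D detector 512 * 1024 * 4 B = 2 MB and a 2D detector
# 512 * 1024 * 1024 * 4 B = 2 GB.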


def dtype_nbytes(dtype: DTypeLike) -> int:
    return numpy.dtype(dtype).itemsize

def shape_to_size(shape: ShapeType) -> int:
    # numpy.prod can silently overflow on Windows (32-bit default integer)
    n = 1
    for x in shape:
        n *= x
    return n

def shape_to_nbytes(shape: ShapeType, dtype: DTypeLike) -> int:
    return shape_to_size(shape) * dtype_nbytes(dtype)

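# Illustrative example (not part of the original module): the byte size of a
# scan of a 1D detector with the default dimensions and dtype.
#
#   >>> shape_to_nbytes((DEFAULT_SCAN_DIM_SIZE, DEFAULT_DETECTOR_DIM_SIZE), DEFAULT_DTYPE)
#   2097152
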
def guess_data_shape(
    scan_shape: VarShapeType,
    detector_shape: VarShapeType,
    max_shape: Optional[VarH5pyShapeType],
) -> ShapeType:
    scan_shape = tuple(n if n else DEFAULT_SCAN_DIM_SIZE for n in scan_shape)
    detector_shape = tuple(
        n if n else DEFAULT_DETECTOR_DIM_SIZE for n in detector_shape
    )
    data_shape = scan_shape + detector_shape
    if max_shape:
        assert len(max_shape) == len(
            data_shape
        ), "HDF5 dataset shape must have the same number of dimensions as maxshape"
        data_shape = tuple(
            n1 if not n2 else max(n1, n2) for n1, n2 in zip(data_shape, max_shape)
        )
    return data_shape

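# Illustrative example (not part of the original module): unknown (zero) scan
# dimensions fall back to the default sizes, known dimensions are kept.
#
#   >>> guess_data_shape(scan_shape=(0,), detector_shape=(2048, 2048), max_shape=None)
#   (512, 2048, 2048)
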
def guess_chunk_shape(
    data_shape: ShapeType,
    dtype: DTypeLike,
    chunk_split: Optional[Integral] = None,
    chunk_nbytes: Optional[Integral] = None,
) -> Optional[ShapeType]:
    """Try to guess the optimal chunk shape under these constraints:

    * allow partial access along any dimension
    * stay below the maximal chunk size (1 MB by default, uncompressed)

    The inner-most dimensions are split into `chunk_split` parts until
    `chunk_nbytes` is reached. The chunk size in the outer dimensions is 1,
    unless the data size is too small.
    """
    if chunk_nbytes is None:
        chunk_nbytes = DEFAULT_CHUNK_NBYTES
    if chunk_split is None:
        chunk_split = DEFAULT_CHUNK_SPLIT
    itemsize = dtype_nbytes(dtype)
    size = shape_to_size(data_shape)
    nbytes = size * itemsize
    if nbytes <= chunk_nbytes:
        return None
    max_size = chunk_nbytes // itemsize
    current_size = 1
    chunk_shape = []
    for n_i in data_shape[-1::-1]:
        if current_size >= max_size:
            c_i = 1
        else:
            a = int(numpy.ceil(n_i / chunk_split))
            b = int(numpy.ceil(max_size / current_size))
            c_i = min(a, b)
        chunk_shape.append(c_i)
        current_size *= c_i
    chunk_shape = tuple(chunk_shape[::-1])
    if chunk_shape == data_shape:
        return None
    return chunk_shape

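# Illustrative example (not part of the original module): a 2 GB stack of
# 1024x1024 int32 frames is chunked so that each chunk stays at the default
# 1 MB limit (4 x 256 x 256 elements of 4 bytes).
#
#   >>> guess_chunk_shape((512, 1024, 1024), numpy.int32)
#   (4, 256, 256)
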
def guess_compression(
    data_shape: ShapeType,
    dtype: DTypeLike,
    compression_limit_nbytes: Optional[Integral] = None,
) -> bool:
    """Compression is needed when the total data size exceeds the limit (1 MB by default)."""
    if compression_limit_nbytes is None:
        compression_limit_nbytes = DEFAULT_COMPRESSION_LIMIT_NBYTES
    nbytes = shape_to_nbytes(data_shape, dtype)
    return nbytes > compression_limit_nbytes

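# Illustrative example (not part of the original module): small datasets stay
# uncompressed, large ones trigger compression.
#
#   >>> guess_compression((512,), numpy.int32)
#   False
#   >>> guess_compression((512, 1024, 1024), numpy.int32)
#   True
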
def get_compression_arguments(compression_scheme: Optional[str] = None) -> Mapping:
    if compression_scheme:
        compression_scheme = compression_scheme.lower()
    if compression_scheme is None:
        compression_scheme = DEFAULT_COMPRESSION_SCHEME
    if compression_scheme == "none":
        return dict()
    elif compression_scheme == "gzip":
        return {"compression": "gzip"}
    elif compression_scheme == "byteshuffle":
        return {"shuffle": True}
    elif compression_scheme == "gzip-byteshuffle":
        return {"compression": "gzip", "shuffle": True}
    elif compression_scheme == "bitshuffle":
        if hdf5plugin is None:
            raise RuntimeError(
                "Writer does not support HDF5 'bitshuffle' compression. Install the hdf5plugin library"
            )
        return hdf5plugin.Bitshuffle(nelems=0, lz4=False)
    elif compression_scheme == "lz4-bitshuffle":
        if hdf5plugin is None:
            raise RuntimeError(
                "Writer does not support HDF5 'bitshuffle' compression. Install the hdf5plugin library"
            )
        return hdf5plugin.Bitshuffle(nelems=0, lz4=True)
    else:
        raise ValueError(f"Unknown HDF5 compression '{compression_scheme}'")

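# Illustrative example (not part of the original module): the default scheme
# maps to plain h5py keyword arguments; the bitshuffle schemes require the
# hdf5plugin package instead.
#
#   >>> get_compression_arguments("gzip-byteshuffle")
#   {'compression': 'gzip', 'shuffle': True}
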
def guess_dataset_config(
    scan_shape: VarShapeType,
    detector_shape: VarShapeType,
    dtype: Optional[DTypeLike] = None,
    chunk_split: Optional[Integral] = None,
    chunk_nbytes: Optional[Integral] = None,
    compression_limit_nbytes: Optional[Integral] = None,
    compression_scheme: Optional[str] = None,
    max_shape: Optional[VarH5pyShapeType] = None,
) -> dict:
    """Dataset configuration passed to `h5py.Group.create_dataset` for optimal
    storage (IO speed, compression, ...).
    """
    data_shape = guess_data_shape(
        scan_shape=scan_shape, detector_shape=detector_shape, max_shape=max_shape
    )
    if dtype is None:
        dtype = DEFAULT_DTYPE
    chunk_shape = guess_chunk_shape(
        data_shape=data_shape,
        dtype=dtype,
        chunk_split=chunk_split,
        chunk_nbytes=chunk_nbytes,
    )
    config = {"chunks": chunk_shape}
    compression = guess_compression(
        data_shape=data_shape,
        dtype=dtype,
        compression_limit_nbytes=compression_limit_nbytes,
    )
    if compression:
        config.update(get_compression_arguments(compression_scheme=compression_scheme))
    chunking_required = compression or max_shape is not None
    if chunking_required and chunk_shape is None:
        # Do not let h5py guess the chunk size
        config["chunks"] = data_shape
    return config

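# Illustrative usage sketch (not part of the original module): the returned
# mapping can be passed straight to h5py.Group.create_dataset; the file name
# "data.h5" and dataset name "data" are placeholders.
#
#   >>> import h5py
#   >>> config = guess_dataset_config(scan_shape=(512,), detector_shape=(1024, 1024))
#   >>> config
#   {'chunks': (4, 256, 256), 'compression': 'gzip', 'shuffle': True}
#   >>> with h5py.File("data.h5", "w") as f:
#   ...     dset = f.create_dataset(
#   ...         "data", shape=(512, 1024, 1024), dtype=numpy.int32, **config
#   ...     )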