h5py for HDF5 Files — Deep Dive

HDF5 architecture

HDF5 is a binary format built around two core abstractions:

  1. B-tree indexed chunks — chunked datasets are split into chunks that are indexed by a B-tree, so the chunks behind any slice can be located in O(log n) lookups.
  2. Hierarchical namespace — groups and datasets form a tree with absolute paths like /experiment/run_001/images.

The HDF5 C library handles all low-level I/O, caching, and compression. h5py provides a thin Python wrapper that maps HDF5 objects to familiar Python interfaces: files behave like dicts, datasets behave like NumPy arrays.

Creating and writing files

Basic file operations

import h5py
import numpy as np

# Create a new file; the with-block closes it when the body finishes
with h5py.File("experiment.h5", "w") as f:
    # Groups are the containers that make up the file's hierarchy
    raw = f.create_group("raw")
    processed = f.create_group("processed")

    # Dataset initialised directly from an in-memory array
    frames = np.random.randn(1000, 256, 256).astype(np.float32)
    images = raw.create_dataset("images", data=frames)

    # Empty dataset whose storage options are spelled out explicitly
    feature_options = {
        "shape": (1000, 512),
        "dtype": np.float32,
        "chunks": (100, 512),      # chunk shape
        "compression": "gzip",     # compression filter
        "compression_opts": 4,     # gzip level (0-9)
    }
    processed.create_dataset("features", **feature_options)

    # File-level attributes (metadata)
    file_metadata = {
        "experiment_name": "diffraction_study",
        "date": "2026-03-28",
        "version": 2,
    }
    for key, value in file_metadata.items():
        f.attrs[key] = value

    # Dataset-level attributes, attached to raw/images
    image_metadata = {
        "units": "counts",
        "exposure_ms": 50.0,
        "detector": "Pilatus 2M",
    }
    for key, value in image_metadata.items():
        images.attrs[key] = value

Data types

import h5py
import numpy as np

with h5py.File("types_demo.h5", "w") as f:
    # Standard numeric types: the dataset dtype follows the array dtype
    int_values = np.arange(100, dtype=np.int64)
    f.create_dataset("integers", data=int_values)
    float_values = np.random.randn(100).astype(np.float32)
    f.create_dataset("floats", data=float_values)

    # Boolean arrays
    random_mask = np.random.choice([True, False], size=(100, 100))
    f.create_dataset("mask", data=random_mask)

    # Fixed-length strings (length=50)
    fixed_str = h5py.string_dtype(encoding="utf-8", length=50)
    sample_names = ["sample_A", "sample_B", "sample_C"]
    f.create_dataset("labels", data=sample_names, dtype=fixed_str)

    # Variable-length strings (no length given)
    var_str = h5py.string_dtype(encoding="utf-8")
    texts = ["short", "a much longer description"]
    f.create_dataset("descriptions", data=texts, dtype=var_str)

    # Compound types (like structured arrays): two float64 coordinates
    # plus a 20-byte label per record
    fields = [
        ("x", np.float64),
        ("y", np.float64),
        ("label", "S20"),
    ]
    compound_dtype = np.dtype(fields)
    rows = [(1.0, 2.0, b"point_a"), (3.0, 4.0, b"point_b")]
    records = np.array(rows, dtype=compound_dtype)
    f.create_dataset("points", data=records)

Reading files

import h5py

def print_structure(path, node):
    """Print one visited object's path and its Python type name."""
    print(path, type(node).__name__)


with h5py.File("experiment.h5", "r") as f:
    # Names of the root's direct members (iterating a file yields its keys)
    print("Root contents:", list(f))

    # Recursive listing: the callback receives every object below the root
    f.visititems(print_structure)

    # Dataset handle: the properties below are queried without reading data
    dset = f["raw/images"]
    print(f"Shape: {dset.shape}")
    print(f"Dtype: {dset.dtype}")
    print(f"Chunks: {dset.chunks}")
    print(f"Compression: {dset.compression}")
    print(f"Size on disk: {dset.id.get_storage_size() / 1e6:.1f} MB")

    # Attributes live on the file and on individual datasets
    print(f"Experiment: {f.attrs['experiment_name']}")
    print(f"Exposure: {dset.attrs['exposure_ms']} ms")

Partial reads (the key advantage)

import h5py
import numpy as np

# Each indexing expression below reads only the selected part of the dataset
# written by the "Basic file operations" example (1000 float32 frames of
# 256x256).
with h5py.File("experiment.h5", "r") as f:
    dset = f["raw/images"]  # Shape: (1000, 256, 256)

    # Read a single image: 256 * 256 float32 values = ~256 KB, not the full
    # ~250 MB dataset
    image_42 = dset[42]

    # Read a contiguous run of images along the first axis
    batch = dset[100:110]  # 10 images -> shape (10, 256, 256)

    # Read the same spatial window from every image
    center_crop = dset[:, 64:192, 64:192]  # Center 128x128 from all frames

    # Fancy indexing: pick individual frames by position
    # (the index list here is in increasing order)
    indices = [0, 5, 42, 99, 500]
    selected = dset[indices]

    # Boolean masking (requires reading the full dimension)
    # For large datasets, prefer slice-based access

    # Read into a pre-allocated array (avoids temporary allocation).
    # `output` must match the selection: frames 100..109 -> (10, 256, 256).
    output = np.empty((10, 256, 256), dtype=np.float32)
    dset.read_direct(output, source_sel=np.s_[100:110])

Chunking strategies

Chunk shape determines I/O performance. Each read fetches whole chunks, so chunk shape should match your access pattern.

import h5py
import numpy as np

with h5py.File("chunking_demo.h5", "w") as f:
    shape = (10000, 1000, 1000)

    # One entry per access pattern: dataset name -> chunk layout.
    # Datasets are created in the order listed here.
    chunk_layouts = {
        # Pattern 1: row-oriented access. Each chunk holds one index along
        # axis 0, i.e. a full 1000x1000 slab.
        "row_access": (1, 1000, 1000),
        # Pattern 2: column-oriented access. Each chunk holds a single
        # (axis 1, axis 2) position across all 10000 indices of axis 0.
        "col_access": (10000, 1, 1),
        # Pattern 3: tile-based access. Cubic 100x100x100 blocks that span
        # both space and time.
        "tile_access": (100, 100, 100),
        # Auto-chunking: chunks=True lets h5py decide the chunk shape.
        "auto_chunks": True,
    }

    for name, chunk_shape in chunk_layouts.items():
        f.create_dataset(name, shape=shape, dtype="f4", chunks=chunk_shape)

Chunk size guidelines

| Factor | Recommendation |
| --- | --- |
| Target chunk size | 100 KB – 1 MB |
| Too small | Excessive metadata overhead, slow sequential reads |
| Too large | Wasted I/O for small reads |
| Match access pattern | If you read rows, chunk by rows |
| Compression | Chunks are compressed individually; larger chunks compress better |

Compression filters

import h5py
import numpy as np
import time

data = np.random.randn(1000, 1000).astype(np.float32)

# Dataset name -> filter keyword arguments, listed in creation order
filter_settings = {
    # No compression (baseline)
    "none": {},
    # gzip — good ratio, slow write, moderate read
    "gzip_4": {"compression": "gzip", "compression_opts": 4},
    "gzip_9": {"compression": "gzip", "compression_opts": 9},
    # LZF — lower ratio, very fast read/write
    "lzf": {"compression": "lzf"},
}

with h5py.File("compression_compare.h5", "w") as f:
    for name, options in filter_settings.items():
        f.create_dataset(name, data=data, **options)

    # SZIP — optional filter, left disabled here because it is not
    # available in every HDF5 build
    # f.create_dataset("szip", data=data, compression="szip",
    #                  compression_opts=("nn", 16))

# Compare sizes: uncompressed byte count vs. bytes actually stored
with h5py.File("compression_compare.h5", "r") as f:
    for name, dset in f.items():
        raw_size = dset.dtype.itemsize * np.prod(dset.shape)
        stored = dset.id.get_storage_size()
        if stored > 0:
            ratio = raw_size / stored
        else:
            # Nothing stored: report 0 rather than dividing by zero
            ratio = 0
        print(f"{name:10s}: {stored/1e6:.2f} MB (ratio: {ratio:.1f}x)")

Shuffle filter

The shuffle filter rearranges bytes before compression to improve ratio. It groups the first byte of every value together, then the second byte, etc. Since adjacent values often share high-order bytes, this creates long runs of similar bytes that compress well.

import h5py
import numpy as np

# Sample data defined here so the example runs on its own: the same
# 1000x1000 float32 array used in the compression comparison.
data = np.random.randn(1000, 1000).astype(np.float32)

with h5py.File("shuffle_demo.h5", "w") as f:
    # Baseline: gzip level 4 with no byte shuffling
    f.create_dataset("no_shuffle", data=data, compression="gzip", compression_opts=4)
    # Same gzip settings, with the shuffle filter enabled via shuffle=True
    f.create_dataset("with_shuffle", data=data, compression="gzip",
                     compression_opts=4, shuffle=True)

Resizable datasets

import h5py
import numpy as np

with h5py.File("growing.h5", "w") as f:
    # A None entry in maxshape leaves that axis unlimited; here the row
    # count can grow while the column count stays fixed at 10.
    dset = f.create_dataset(
        "log",
        shape=(0, 10),           # no rows yet
        maxshape=(None, 10),     # unlimited rows, 10 columns
        dtype=np.float64,
        chunks=(1000, 10),       # chunk shape used for storage
    )

    # Append five batches of 200 rows: grow the dataset first, then write
    # the new rows into the space that was just added.
    for _ in range(5):
        block = np.random.randn(200, 10)
        old_rows = dset.shape[0]
        new_rows = old_rows + 200
        dset.resize(new_rows, axis=0)
        dset[old_rows:new_rows] = block

    print(f"Final shape: {dset.shape}")  # (1000, 10)

Virtual datasets

Virtual datasets combine slices from multiple files into a single logical dataset without copying data:

import h5py
import numpy as np

# Four source files, each holding a (250, 100) float32 dataset named "data"
part_files = [f"part_{i}.h5" for i in range(4)]
for path in part_files:
    with h5py.File(path, "w") as f:
        part = np.random.randn(250, 100).astype(np.float32)
        f.create_dataset("data", data=part)

# Layout of the virtual dataset: the four parts stacked along axis 0
layout = h5py.VirtualLayout(shape=(1000, 100), dtype=np.float32)

for i, path in enumerate(part_files):
    first_row = i * 250
    # Map rows [first_row, first_row + 250) onto this part's "data" dataset
    layout[first_row:first_row + 250] = h5py.VirtualSource(
        path, "data", shape=(250, 100)
    )

with h5py.File("virtual_combined.h5", "w", libver="latest") as f:
    f.create_virtual_dataset("combined", layout)

# Read as if it were a single dataset
with h5py.File("virtual_combined.h5", "r") as f:
    combined = f["combined"]
    print(f"Virtual shape: {combined.shape}")  # (1000, 100)
    # Rows 200-249 come from part 0 and rows 250-299 from part 1
    slice_across_files = combined[200:300]

Parallel HDF5 (MPI)

For high-performance computing clusters:

# Requires HDF5 built with --enable-parallel, h5py built against it with
# MPI support (HDF5_MPI="ON"), and mpi4py
# from mpi4py import MPI
# import h5py
# import numpy as np

# comm = MPI.COMM_WORLD
# rank = comm.Get_rank()
# size = comm.Get_size()

# with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
#     dset = f.create_dataset("data", shape=(size * 1000, 100), dtype="f4")
#     start = rank * 1000
#     dset[start:start+1000] = np.random.randn(1000, 100).astype(np.float32)

Integration with other libraries

import h5py
import numpy as np

# With pandas
import pandas as pd

# pandas reads and writes HDF5 itself, going through PyTables
columns = {"a": range(1000), "b": np.random.randn(1000)}
df = pd.DataFrame(columns)
# mode="w" starts a fresh file; the frame is stored under the key "table1"
df.to_hdf("pandas_data.h5", key="table1", mode="w")
# Load the frame back from the same key
df_loaded = pd.read_hdf("pandas_data.h5", key="table1")

# With PyTorch (model weights)
# import torch
# model_state = model.state_dict()
# with h5py.File("weights.h5", "w") as f:
#     for key, tensor in model_state.items():
#         f.create_dataset(key, data=tensor.cpu().numpy())  # .cpu() so GPU-resident weights convert too

# Keras/TensorFlow can save models in the (legacy) HDF5 format; the .h5 extension selects it
# model.save("model.h5")

Performance optimization

import h5py
import numpy as np
import time

# 1. Batch writes (write large blocks, not one row at a time)
with h5py.File("perf_test.h5", "w") as f:
    dset = f.create_dataset("data", shape=(100000, 100), dtype="f4",
                            chunks=(1000, 100))

    # SLOW: row-by-row — 10,000 separate single-row writes
    start = time.time()
    for i in range(10000):
        dset[i] = np.random.randn(100).astype(np.float32)
    slow_time = time.time() - start

    # FAST: batch writes — the remaining 90,000 rows in 1000-row blocks,
    # each block matching one (1000, 100) chunk
    start = time.time()
    for batch_start in range(10000, 100000, 1000):
        dset[batch_start:batch_start+1000] = np.random.randn(1000, 100).astype(np.float32)
    fast_time = time.time() - start

    # The two timings cover different row counts, as the labels state
    print(f"Row-by-row (10k rows): {slow_time:.2f}s")
    print(f"Batch (90k rows): {fast_time:.2f}s")

# 2. Chunk cache tuning
# Default chunk cache is 1 MB — increase for large random access patterns.
# Re-open the file written in step 1 (it contains the "data" dataset);
# rdcc_nbytes sets the chunk cache size in bytes.
with h5py.File("perf_test.h5", "r", rdcc_nbytes=100*1024*1024) as f:  # 100 MB cache
    dset = f["data"]
    # Random access will hit cache more often

Common pitfalls

  1. Forgetting to close files. Always use with statements. An unclosed HDF5 file can become corrupted. The with block guarantees cleanup even if exceptions occur.
  2. Writing row by row. Each write to an HDF5 dataset has fixed overhead. Writing 1 million single rows is hundreds of times slower than writing 1000 batches of 1000 rows.
  3. Mismatched chunk shape and access pattern. If your chunks are (1, 1000000) but you read columns, every column read loads the entire dataset. Match chunks to your access pattern.
  4. Concurrent writes without MPI. Standard HDF5 does not support multiple processes writing to the same file simultaneously. Use swmr=True (Single Writer Multiple Reader) for one writer with concurrent readers, or parallel HDF5 with MPI for true multi-writer access.
  5. Storing many small arrays. HDF5 has per-dataset overhead. Storing millions of tiny arrays (< 1 KB each) is inefficient — consider concatenating them into larger datasets with an index.

The one thing to remember: h5py provides Python with dictionary-like access to HDF5’s high-performance, hierarchical, compressed array storage — the key being that you read only the slice you need from datasets that can be terabytes in size.

Tags: python, data-science, science