h5py for HDF5 Files — Deep Dive
HDF5 architecture
HDF5 is a binary format built around two core abstractions:
- B-tree indexed chunks — datasets are split into chunks stored in a B-tree, enabling O(log n) access to any slice.
- Hierarchical namespace — groups and datasets form a tree with absolute paths like `/experiment/run_001/images`.
The HDF5 C library handles all low-level I/O, caching, and compression. h5py provides a thin Python wrapper that maps HDF5 objects to familiar Python interfaces: files behave like dicts, datasets behave like NumPy arrays.
Creating and writing files
Basic file operations
import h5py
import numpy as np
# Create a new file
with h5py.File("experiment.h5", "w") as f:
# Create groups
raw = f.create_group("raw")
processed = f.create_group("processed")
# Create a dataset
data = np.random.randn(1000, 256, 256).astype(np.float32)
raw.create_dataset("images", data=data)
# Create dataset with explicit options
processed.create_dataset(
"features",
shape=(1000, 512),
dtype=np.float32,
chunks=(100, 512), # Chunk shape
compression="gzip", # Compression filter
compression_opts=4, # Compression level (0-9; h5py's default is 4)
)
# Add attributes (metadata)
f.attrs["experiment_name"] = "diffraction_study"
f.attrs["date"] = "2026-03-28"
f.attrs["version"] = 2
raw["images"].attrs["units"] = "counts"
raw["images"].attrs["exposure_ms"] = 50.0
raw["images"].attrs["detector"] = "Pilatus 2M"
Data types
import h5py
import numpy as np
with h5py.File("types_demo.h5", "w") as f:
# Standard numeric types
f.create_dataset("integers", data=np.arange(100, dtype=np.int64))
f.create_dataset("floats", data=np.random.randn(100).astype(np.float32))
# Boolean arrays
f.create_dataset("mask", data=np.random.choice([True, False], size=(100, 100)))
# Fixed-length strings
dt = h5py.string_dtype(encoding="utf-8", length=50)
f.create_dataset("labels", data=["sample_A", "sample_B", "sample_C"], dtype=dt)
# Variable-length strings
dt_var = h5py.string_dtype(encoding="utf-8")
f.create_dataset("descriptions", data=["short", "a much longer description"], dtype=dt_var)
# Compound types (like structured arrays)
compound_dtype = np.dtype([
("x", np.float64),
("y", np.float64),
("label", "S20"),
])
records = np.array(
[(1.0, 2.0, b"point_a"), (3.0, 4.0, b"point_b")],
dtype=compound_dtype,
)
f.create_dataset("points", data=records)
Reading files
Navigation and inspection
import h5py
with h5py.File("experiment.h5", "r") as f:
# List contents (like os.listdir)
print("Root contents:", list(f.keys()))
# Recursive listing
def print_structure(name, obj):
print(name, type(obj).__name__)
f.visititems(print_structure)
# Dataset info without reading data
dset = f["raw/images"]
print(f"Shape: {dset.shape}")
print(f"Dtype: {dset.dtype}")
print(f"Chunks: {dset.chunks}")
print(f"Compression: {dset.compression}")
print(f"Size on disk: {dset.id.get_storage_size() / 1e6:.1f} MB")
# Read attributes
print(f"Experiment: {f.attrs['experiment_name']}")
print(f"Exposure: {dset.attrs['exposure_ms']} ms")
Partial reads (the key advantage)
import h5py
import numpy as np
with h5py.File("experiment.h5", "r") as f:
dset = f["raw/images"] # Shape: (1000, 256, 256)
# Read a single image (loads only ~256 KB, not the full ~250 MB)
image_42 = dset[42]
# Read a slice of images
batch = dset[100:110] # 10 images
# Read a region from all images
center_crop = dset[:, 64:192, 64:192] # Center 128×128 from all frames
# Fancy indexing
indices = [0, 5, 42, 99, 500]
selected = dset[indices]
# Boolean masking (requires reading the full dimension)
# For large datasets, prefer slice-based access
# Read into a pre-allocated array (avoids temporary allocation)
output = np.empty((10, 256, 256), dtype=np.float32)
dset.read_direct(output, source_sel=np.s_[100:110])
Chunking strategies
Chunk shape determines I/O performance. Each read fetches whole chunks, so chunk shape should match your access pattern.
import h5py
import numpy as np
with h5py.File("chunking_demo.h5", "w") as f:
shape = (10000, 1000, 1000)
# Pattern 1: Row-oriented access (read one full row at a time)
f.create_dataset("row_access", shape=shape, dtype="f4",
chunks=(1, 1000, 1000))
# Pattern 2: Column-oriented access (read one column across all rows)
f.create_dataset("col_access", shape=shape, dtype="f4",
chunks=(10000, 1, 1))
# Pattern 3: Tile-based access (read spatial tiles across time)
f.create_dataset("tile_access", shape=shape, dtype="f4",
chunks=(100, 100, 100))
# Auto-chunking (let h5py decide)
f.create_dataset("auto_chunks", shape=shape, dtype="f4",
chunks=True)
Chunk size guidelines
| Factor | Recommendation |
|---|---|
| Target chunk size | 100 KB – 1 MB |
| Too small | Excessive metadata overhead, slow sequential reads |
| Too large | Wasted I/O for small reads |
| Match access pattern | If you read rows, chunk by rows |
| Compression | Chunks are compressed individually; larger chunks compress better |
Compression filters
import h5py
import numpy as np
import time
data = np.random.randn(1000, 1000).astype(np.float32)
with h5py.File("compression_compare.h5", "w") as f:
# No compression (baseline)
f.create_dataset("none", data=data)
# gzip — good ratio, slow write, moderate read
f.create_dataset("gzip_4", data=data, compression="gzip", compression_opts=4)
f.create_dataset("gzip_9", data=data, compression="gzip", compression_opts=9)
# LZF — lower ratio, very fast read/write
f.create_dataset("lzf", data=data, compression="lzf")
# SZIP — patent-encumbered filter; not available in all HDF5 builds
# f.create_dataset("szip", data=data, compression="szip",
# compression_opts=("nn", 16))
# Compare sizes
with h5py.File("compression_compare.h5", "r") as f:
for name in f.keys():
dset = f[name]
raw_size = dset.dtype.itemsize * np.prod(dset.shape)
stored = dset.id.get_storage_size()
ratio = raw_size / stored if stored > 0 else 0
print(f"{name:10s}: {stored/1e6:.2f} MB (ratio: {ratio:.1f}x)")
Shuffle filter
The shuffle filter rearranges bytes before compression to improve ratio. It groups the first byte of every value together, then the second byte, etc. Since adjacent values often share high-order bytes, this creates long runs of similar bytes that compress well.
with h5py.File("shuffle_demo.h5", "w") as f:
f.create_dataset("no_shuffle", data=data, compression="gzip", compression_opts=4)
f.create_dataset("with_shuffle", data=data, compression="gzip",
compression_opts=4, shuffle=True)
Resizable datasets
import h5py
import numpy as np
with h5py.File("growing.h5", "w") as f:
# Use None for an axis in maxshape to allow unlimited growth along that axis
dset = f.create_dataset(
"log",
shape=(0, 10), # Start empty
maxshape=(None, 10), # Unlimited rows, fixed columns
dtype=np.float64,
chunks=(1000, 10), # Chunk shape for efficient appending
)
# Append data in batches
for batch_idx in range(5):
new_data = np.random.randn(200, 10)
current_len = dset.shape[0]
dset.resize(current_len + 200, axis=0)
dset[current_len:current_len + 200] = new_data
print(f"Final shape: {dset.shape}") # (1000, 10)
Virtual datasets
Virtual datasets combine slices from multiple files into a single logical dataset without copying data:
import h5py
import numpy as np
# Create source files
for i in range(4):
with h5py.File(f"part_{i}.h5", "w") as f:
f.create_dataset("data", data=np.random.randn(250, 100).astype(np.float32))
# Create virtual dataset that spans all four files
layout = h5py.VirtualLayout(shape=(1000, 100), dtype=np.float32)
for i in range(4):
source = h5py.VirtualSource(f"part_{i}.h5", "data", shape=(250, 100))
layout[i*250:(i+1)*250] = source
with h5py.File("virtual_combined.h5", "w", libver="latest") as f:
f.create_virtual_dataset("combined", layout)
# Read as if it were a single dataset
with h5py.File("virtual_combined.h5", "r") as f:
print(f"Virtual shape: {f['combined'].shape}") # (1000, 100)
slice_across_files = f["combined"][200:300] # Spans parts 0 and 1
Parallel HDF5 (MPI)
For high-performance computing clusters:
# Requires HDF5 built with --enable-parallel, h5py built against it (HDF5_MPI="ON"), and mpi4py
# from mpi4py import MPI
# import h5py
# import numpy as np
# comm = MPI.COMM_WORLD
# rank = comm.Get_rank()
# size = comm.Get_size()
# with h5py.File("parallel.h5", "w", driver="mpio", comm=comm) as f:
# dset = f.create_dataset("data", shape=(size * 1000, 100), dtype="f4")
# start = rank * 1000
# dset[start:start+1000] = np.random.randn(1000, 100).astype(np.float32)
Integration with other libraries
import h5py
import numpy as np
# With pandas
import pandas as pd
# pandas has built-in HDF5 support via PyTables
df = pd.DataFrame({"a": range(1000), "b": np.random.randn(1000)})
df.to_hdf("pandas_data.h5", key="table1", mode="w")
df_loaded = pd.read_hdf("pandas_data.h5", "table1")
# With PyTorch (model weights)
# import torch
# model_state = model.state_dict()
# with h5py.File("weights.h5", "w") as f:
# for key, tensor in model_state.items():
# f.create_dataset(key, data=tensor.numpy())
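# Keras/TensorFlow can save models in the legacy HDF5 format (selected by the .h5 extension);
# recent Keras versions default to the native .keras format instead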
# model.save("model.h5")
Performance optimization
import h5py
import numpy as np
import time
# 1. Batch writes (write large blocks, not one row at a time)
with h5py.File("perf_test.h5", "w") as f:
dset = f.create_dataset("data", shape=(100000, 100), dtype="f4",
chunks=(1000, 100))
# SLOW: row-by-row
start = time.time()
for i in range(10000):
dset[i] = np.random.randn(100).astype(np.float32)
slow_time = time.time() - start
# FAST: batch writes
start = time.time()
for batch_start in range(10000, 100000, 1000):
dset[batch_start:batch_start+1000] = np.random.randn(1000, 100).astype(np.float32)
fast_time = time.time() - start
print(f"Row-by-row (10k rows): {slow_time:.2f}s")
print(f"Batch (90k rows): {fast_time:.2f}s")
# 2. Chunk cache tuning
# Default chunk cache is 1 MB — increase for large random access patterns
with h5py.File("data.h5", "r", rdcc_nbytes=100*1024*1024) as f: # 100 MB cache
dset = f["data"]
# Random access will hit cache more often
Common pitfalls
- Forgetting to close files. Always use `with` statements. An unclosed HDF5 file can become corrupted. The `with` block guarantees cleanup even if exceptions occur.
- Writing row by row. Each write to an HDF5 dataset has fixed overhead. Writing 1 million single rows is hundreds of times slower than writing 1000 batches of 1000 rows.
- Mismatched chunk shape and access pattern. If your chunks are (1, 1000000) but you read columns, every column read loads the entire dataset. Match chunks to your access pattern.
- Concurrent writes without MPI. Standard HDF5 does not support multiple processes writing to the same file simultaneously. Use `swmr=True` (Single Writer Multiple Reader) for one writer with concurrent readers, or parallel HDF5 with MPI for true multi-writer access.
- Storing many small arrays. HDF5 has per-dataset overhead. Storing millions of tiny arrays (< 1 KB each) is inefficient — consider concatenating them into larger datasets with an index.
The one thing to remember: h5py provides Python with dictionary-like access to HDF5’s high-performance, hierarchical, compressed array storage — the key being that you read only the slice you need from datasets that can be terabytes in size.