Xarray for Multidimensional Data — Deep Dive
Xarray’s data model
Xarray builds on NumPy with three additional concepts:
- Named dimensions (dims) — each axis has a string name. Operations use names instead of integer positions.
- Coordinate labels (coords) — tick marks along each dimension. A time dimension has actual datetime values; a latitude dimension has degree values.
- Attributes (attrs) — free-form metadata attached to arrays and datasets. Units, data source, processing history.
This model maps directly to the NetCDF data model, which is why Xarray handles NetCDF files natively.
Creating DataArrays and Datasets
import xarray as xr
import numpy as np
import pandas as pd
# Create a DataArray from scratch
times = pd.date_range("2026-01-01", periods=365, freq="D")
lats = np.arange(-90, 91, 2.5) # 73 latitude points
lons = np.arange(0, 360, 2.5) # 144 longitude points
# Random temperature data: (time, lat, lon)
np.random.seed(42)
temp_data = (
15 + 10 * np.sin(np.radians(lats))[:, np.newaxis] # latitude gradient
+ 5 * np.random.randn(365, len(lats), len(lons)) # noise
)
temperature = xr.DataArray(
data=temp_data,
dims=["time", "latitude", "longitude"],
coords={
"time": times,
"latitude": lats,
"longitude": lons,
},
attrs={
"units": "°C",
"long_name": "Surface Temperature",
"source": "Synthetic data for demonstration",
},
)
print(temperature)
# Create a Dataset with multiple variables
ds = xr.Dataset({
"temperature": temperature,
"pressure": xr.DataArray(
data=1013 + 5 * np.random.randn(365, len(lats), len(lons)),
dims=["time", "latitude", "longitude"],
coords={"time": times, "latitude": lats, "longitude": lons},
attrs={"units": "hPa"},
),
})
Advanced selection and indexing
import xarray as xr
# Assume ds is a climate Dataset with (time, latitude, longitude)
# Label-based selection
jan_data = ds.sel(time="2026-01") # All of January
point = ds.sel(latitude=40.0, longitude=280.0) # Single point
# Nearest-neighbor lookup
nearest_point = ds.sel(latitude=37.3, longitude=282.7, method="nearest")
# Range selection (inclusive)
europe = ds.sel(latitude=slice(35, 70), longitude=slice(350, 360))
# Boolean indexing
warm = ds.where(ds["temperature"] > 25, drop=True)
# Advanced: multi-dimensional selection with DataArray indexing
target_lats = xr.DataArray([30, 40, 50], dims="station")
target_lons = xr.DataArray([260, 280, 300], dims="station")
stations = ds.sel(latitude=target_lats, longitude=target_lons, method="nearest")
Computation and aggregation
import xarray as xr
# Mean over time → 2D map of average temperature
time_mean = ds["temperature"].mean(dim="time")
# Zonal mean → average across longitudes at each latitude
zonal_mean = ds["temperature"].mean(dim="longitude")
# Multiple dimensions at once
global_mean = ds["temperature"].mean(dim=["latitude", "longitude"])
# Weighted mean (area-weighted for lat-lon grids)
import numpy as np
weights = np.cos(np.radians(ds.latitude))
weights.name = "weights"
weighted_mean = ds["temperature"].weighted(weights).mean(dim=["latitude", "longitude"])
# Rolling window (30-day moving average)
rolling_temp = ds["temperature"].rolling(time=30, center=True).mean()
# Climatology: average for each day-of-year across all years
climatology = ds["temperature"].groupby("time.dayofyear").mean()
# Anomalies: deviation from climatology
anomalies = ds["temperature"].groupby("time.dayofyear") - climatology
GroupBy operations
import xarray as xr
# Monthly means
monthly = ds.groupby("time.month").mean()
# Seasonal means
seasonal = ds.groupby("time.season").mean()
# Custom grouping: group by latitude bands
lat_bands = np.digitize(ds.latitude, bins=[-60, -30, 0, 30, 60])
band_labels = xr.DataArray(lat_bands, dims="latitude", coords={"latitude": ds.latitude})
band_means = ds.groupby(band_labels).mean()
# Resample to lower time frequency
weekly = ds.resample(time="W").mean()
monthly_max = ds.resample(time="ME").max()
Working with NetCDF files
import xarray as xr
# Save to NetCDF
ds.to_netcdf("climate_data.nc")
# Load from NetCDF
ds_loaded = xr.open_dataset("climate_data.nc")
# Lazy loading (data stays on disk until needed)
ds_lazy = xr.open_dataset("climate_data.nc", chunks={"time": 30})
# Multiple files (e.g., one file per year)
ds_multi = xr.open_mfdataset("data_*.nc", combine="by_coords")
# Selective loading — only specific variables
ds_temp = xr.open_dataset("climate_data.nc")[["temperature"]]
Dask integration for out-of-core computation
import xarray as xr
# Open with chunking → data stays on disk, operations are lazy
ds = xr.open_dataset("large_climate_data.nc", chunks={"time": 100})
# Operations build a task graph (no computation yet)
annual_mean = ds["temperature"].groupby("time.year").mean()
# Trigger computation
result = annual_mean.compute()
# Or write directly to disk (computed chunk by chunk)
annual_mean.to_netcdf("annual_means.nc")
# Custom chunk sizes for different access patterns
# Chunked by time → fast time-series at a point
ds_time_chunks = xr.open_dataset("data.nc", chunks={"time": 365, "latitude": -1, "longitude": -1})
# Chunked by space → fast spatial maps at one time
ds_space_chunks = xr.open_dataset("data.nc", chunks={"time": -1, "latitude": 50, "longitude": 50})
Zarr: cloud-native storage
import xarray as xr
# Save to Zarr (chunked, cloud-friendly)
ds.to_zarr("climate_data.zarr", mode="w")
# Read from Zarr
ds_zarr = xr.open_zarr("climate_data.zarr")
# Cloud storage (S3, GCS)
# import s3fs
# ds_cloud = xr.open_zarr("s3://bucket/climate_data.zarr")
# Zarr advantages over NetCDF:
# - Parallel reads/writes (each chunk is a separate file)
# - Cloud-native (works with S3, GCS, Azure Blob)
# - Append-friendly (add new time steps without rewriting)
Visualization
import xarray as xr
import matplotlib.pyplot as plt
# Built-in plotting (wraps matplotlib)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
# 2D map: mean temperature
ds["temperature"].mean(dim="time").plot(ax=axes[0], cmap="RdBu_r")
axes[0].set_title("Mean Temperature")
# 1D line: time series at a point
ds["temperature"].sel(latitude=40, longitude=280, method="nearest").plot(ax=axes[1])
axes[1].set_title("Temperature at 40°N, 80°W")
plt.tight_layout()
plt.savefig("xarray_plots.png", dpi=150)
# Faceted plots: one panel per season
ds["temperature"].mean(dim="time").plot(col="season" if "season" in ds.dims else None)
Real-world pattern: climate anomaly analysis
import xarray as xr
import numpy as np
# Load ERA5 reanalysis data (example structure)
# ds = xr.open_dataset("era5_temperature.nc")
# For demonstration, use synthetic data
times = np.arange(np.datetime64("2000-01-01"), np.datetime64("2026-01-01"), np.timedelta64(1, "D"))
lats = np.arange(-90, 91, 2.5)
lons = np.arange(0, 360, 2.5)
np.random.seed(42)
ds = xr.Dataset({
"temperature": xr.DataArray(
data=15 + 10 * np.sin(np.radians(lats))[np.newaxis, :, np.newaxis]
+ 0.02 * np.arange(len(times))[:, np.newaxis, np.newaxis] # warming trend
+ 5 * np.random.randn(len(times), len(lats), len(lons)),
dims=["time", "latitude", "longitude"],
coords={"time": times, "latitude": lats, "longitude": lons},
),
})
# 1. Compute baseline climatology (2000-2020)
baseline = ds.sel(time=slice("2000", "2020"))
climatology = baseline["temperature"].groupby("time.dayofyear").mean()
# 2. Compute anomalies
anomalies = ds["temperature"].groupby("time.dayofyear") - climatology
# 3. Annual mean anomaly (area-weighted)
weights = np.cos(np.radians(ds.latitude))
annual_anomaly = anomalies.weighted(weights).mean(dim=["latitude", "longitude"]).resample(time="YE").mean()
# 4. Linear trend
years = np.arange(len(annual_anomaly))
coeffs = np.polyfit(years, annual_anomaly.values, 1)
print(f"Warming trend: {coeffs[0]:.4f} °C/year")
print(f"Total warming 2000-2025: {coeffs[0] * len(annual_anomaly):.2f} °C")
Performance tips
| Pattern | Slow | Fast |
|---|---|---|
| Point time series | Loop over isel | .sel(lat=x, lon=y) directly |
| Spatial operation | Large time chunks | Large spatial chunks |
| Global mean | .mean() on unchunked | .weighted().mean() on chunked |
| File I/O | Single NetCDF | Zarr with appropriate chunking |
| Multiple files | Loop + concatenate | open_mfdataset with parallel=True |
Memory management
# Check estimated memory usage
print(f"Dataset size: {ds.nbytes / 1e9:.2f} GB")
# Load only what you need
subset = ds.sel(latitude=slice(30, 50), longitude=slice(250, 300))
subset.load() # Bring into memory only the subset
Common pitfalls
- Forgetting that operations on chunked data are lazy. Nothing computes until you call
.compute(),.values, or.load(). Check the repr to see if you have a Dask array. - Inefficient chunk sizes. Chunks too small create overhead; too large exhaust memory. Target 100 MB – 500 MB per chunk.
- Using
.valueson large arrays. This forces the entire array into memory as a NumPy array. Use Xarray operations as long as possible before converting. - Ignoring the coordinate reference. When merging datasets from different sources, ensure coordinates align. Xarray will broadcast but may silently introduce NaN for non-overlapping regions.
The one thing to remember: Xarray transforms the experience of working with multidimensional scientific data — named dimensions and coordinate labels eliminate axis-number confusion while Dask integration enables analysis of datasets larger than available memory.