Python Ocean Data Analysis — Deep Dive
Architecture of an ocean data analysis system
A comprehensive ocean analysis pipeline integrates multiple data sources into a coherent view:
Argo Profiles → Quality Control → Gridding/Interpolation
Satellite SST/SSH → Regridding → Anomaly Computation
Model Output → Subsetting → Validation against observations
→ Integrated Products (heat content, MHWs, currents)
→ Visualization (maps, sections, T-S diagrams)
Loading and processing Argo float data
The argopy library provides streamlined access to the global Argo dataset:
import argopy
import xarray as xr
import numpy as np
def fetch_argo_region(
bbox: list[float], # [lon_min, lon_max, lat_min, lat_max]
depth_range: list[float], # [z_min, z_max]
date_range: list[str], # ["2024-01", "2024-12"]
) -> xr.Dataset:
"""Fetch Argo profiles for a region and time period."""
fetcher = argopy.DataFetcher(src="gdac", parallel=True)
ds = fetcher.region(
bbox + depth_range + date_range
).to_xarray()
# Apply QC flags — keep only good data (QC flag 1)
good_temp = ds["TEMP_QC"].isin([b"1"])
good_psal = ds["PSAL_QC"].isin([b"1"])
ds["TEMP"] = ds["TEMP"].where(good_temp)
ds["PSAL"] = ds["PSAL"].where(good_psal)
return ds
def compute_density_profile(
temperature: np.ndarray,
salinity: np.ndarray,
pressure: np.ndarray,
longitude: float,
latitude: float
) -> np.ndarray:
"""Compute in-situ density using TEOS-10 equations."""
import gsw
# Convert practical salinity to absolute salinity
SA = gsw.SA_from_SP(salinity, pressure, longitude, latitude)
# Convert in-situ temperature to conservative temperature
CT = gsw.CT_from_t(SA, temperature, pressure)
# Compute in-situ density
rho = gsw.rho(SA, CT, pressure)
return rho
def find_mixed_layer_depth(
temperature: np.ndarray,
depth: np.ndarray,
threshold: float = 0.2 # °C difference from surface
) -> float:
"""Find mixed layer depth using temperature threshold criterion."""
surface_temp = temperature[0]
below_threshold = np.abs(temperature - surface_temp) > threshold
if below_threshold.any():
mld_idx = np.argmax(below_threshold)
# Interpolate for more precise estimate
if mld_idx > 0:
frac = (threshold - abs(temperature[mld_idx-1] - surface_temp)) / \
(abs(temperature[mld_idx] - surface_temp) - abs(temperature[mld_idx-1] - surface_temp))
return depth[mld_idx-1] + frac * (depth[mld_idx] - depth[mld_idx-1])
return float(depth[mld_idx])
return float(depth[-1]) # Entire profile is mixed
Satellite SST analysis and anomalies
NOAA’s OISST product provides daily global SST at 0.25° resolution since 1981:
import xarray as xr
import numpy as np
def load_oisst(data_dir: str, year: int) -> xr.Dataset:
"""Load NOAA OISST daily data for a year."""
ds = xr.open_mfdataset(
f"{data_dir}/sst.day.mean.{year}.nc",
chunks={"time": 30} # Dask chunking for lazy computation
)
return ds
def compute_sst_anomaly(
ds: xr.Dataset,
climatology_years: tuple[int, int] = (1991, 2020)
) -> xr.DataArray:
"""Compute SST anomaly relative to a 30-year climatology."""
# Compute day-of-year climatology
clim = ds["sst"].sel(
time=slice(str(climatology_years[0]), str(climatology_years[1]))
).groupby("time.dayofyear").mean(dim="time")
# Subtract climatology from observations
anomaly = ds["sst"].groupby("time.dayofyear") - clim
return anomaly
def compute_ocean_heat_content(
temperature_profiles: xr.DataArray,
depth_levels: xr.DataArray,
max_depth: float = 700.0
) -> xr.DataArray:
"""Compute Ocean Heat Content (OHC) from temperature profiles."""
rho_cp = 4.09e6 # seawater density × specific heat (J/m³/K)
# Reference temperature (could be climatological mean or 0°C)
temp_anom = temperature_profiles # Assume already anomaly
# Integrate temperature anomaly over depth
depth_mask = depth_levels <= max_depth
dz = depth_levels.diff(dim="depth")
ohc = (temp_anom.where(depth_mask) * rho_cp * dz).sum(dim="depth")
return ohc # Units: J/m²
Marine heatwave detection
Implementing the Hobday et al. (2016) definition:
import numpy as np
import pandas as pd
def detect_marine_heatwaves(
sst: pd.Series,
climatology_period: tuple[str, str] = ("1991-01-01", "2020-12-31"),
percentile: float = 90,
min_duration_days: int = 5
) -> list[dict]:
"""Detect marine heatwaves in an SST time series."""
# Compute day-of-year climatology and threshold
clim_data = sst[climatology_period[0]:climatology_period[1]]
doy_mean = clim_data.groupby(clim_data.index.dayofyear).mean()
doy_threshold = clim_data.groupby(clim_data.index.dayofyear).quantile(
percentile / 100
)
# Smooth climatology and threshold (11-day window)
doy_mean_smooth = doy_mean.rolling(11, center=True, min_periods=1).mean()
doy_threshold_smooth = doy_threshold.rolling(11, center=True, min_periods=1).mean()
# Identify days exceeding threshold
threshold_series = sst.index.dayofyear.map(
lambda d: doy_threshold_smooth.get(d, np.nan)
)
climatology_series = sst.index.dayofyear.map(
lambda d: doy_mean_smooth.get(d, np.nan)
)
exceedance = sst.values > threshold_series
# Find contiguous periods
events = []
in_event = False
start_idx = 0
for i in range(len(exceedance)):
if exceedance[i] and not in_event:
start_idx = i
in_event = True
elif not exceedance[i] and in_event:
duration = i - start_idx
if duration >= min_duration_days:
event_sst = sst.iloc[start_idx:i]
event_clim = climatology_series[start_idx:i]
events.append({
"start": sst.index[start_idx],
"end": sst.index[i - 1],
"duration_days": duration,
"max_intensity": float(
(event_sst.values - event_clim).max()
),
"mean_intensity": float(
(event_sst.values - event_clim).mean()
),
"max_sst": float(event_sst.max()),
"category": classify_mhw(
float((event_sst.values - event_clim).max()),
float(doy_threshold_smooth.mean() - doy_mean_smooth.mean())
)
})
in_event = False
return events
def classify_mhw(max_intensity: float, threshold_diff: float) -> str:
"""Classify MHW category (Hobday et al. 2018)."""
if max_intensity > 4 * threshold_diff:
return "IV Extreme"
elif max_intensity > 3 * threshold_diff:
return "III Severe"
elif max_intensity > 2 * threshold_diff:
return "II Strong"
return "I Moderate"
Geostrophic current computation from altimetry
Sea surface height gradients reveal ocean currents:
import xarray as xr
import numpy as np
def compute_geostrophic_currents(
ssh: xr.DataArray,
latitude: xr.DataArray
) -> tuple[xr.DataArray, xr.DataArray]:
"""Compute geostrophic velocity from sea surface height.
Uses the geostrophic balance: f × v = g × ∂η/∂x
"""
g = 9.81 # m/s²
omega = 7.2921e-5 # Earth rotation rate, rad/s
# Coriolis parameter (avoid equator where f→0)
f = 2 * omega * np.sin(np.deg2rad(latitude))
f = f.where(np.abs(latitude) > 3) # Mask ±3° equatorial band
# Convert lat/lon gradients to meters
R = 6.371e6 # Earth radius
dlat = np.deg2rad(float(latitude.diff(dim="latitude").mean()))
dlon = np.deg2rad(float(ssh.longitude.diff(dim="longitude").mean()))
dy = R * dlat
dx = R * dlon * np.cos(np.deg2rad(latitude))
# Compute gradients
deta_dx = ssh.diff(dim="longitude") / dx
deta_dy = ssh.diff(dim="latitude") / dy
# Geostrophic velocities
# u = -(g/f) * ∂η/∂y (eastward velocity from north-south gradient)
# v = (g/f) * ∂η/∂x (northward velocity from east-west gradient)
u_geo = -(g / f) * deta_dy
v_geo = (g / f) * deta_dx
return u_geo, v_geo
def detect_eddies(
ssh_anomaly: xr.DataArray,
min_radius_km: float = 50,
max_radius_km: float = 300
) -> list[dict]:
"""Simple eddy detection from SSH anomaly closed contours."""
from scipy.ndimage import label, center_of_mass
eddies = []
# Cyclonic eddies (SSH < 0, Northern Hemisphere)
for threshold in np.arange(-0.05, -0.30, -0.05):
mask = ssh_anomaly.values < threshold
labeled, n_features = label(mask)
for i in range(1, n_features + 1):
region = labeled == i
area_pixels = region.sum()
# Estimate radius (assuming roughly circular)
pixel_area_km2 = 111 * 111 * 0.25 * 0.25 # at ~0.25° resolution
radius_km = np.sqrt(area_pixels * pixel_area_km2 / np.pi)
if min_radius_km <= radius_km <= max_radius_km:
cy, cx = center_of_mass(region)
eddies.append({
"type": "cyclonic",
"center_lat": float(ssh_anomaly.latitude[int(cy)]),
"center_lon": float(ssh_anomaly.longitude[int(cx)]),
"radius_km": round(radius_km, 1),
"amplitude_m": abs(threshold),
})
return eddies
Visualization with cartopy and cmocean
Oceanographic visualizations require proper map projections and perceptually uniform colormaps:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cmocean
def plot_sst_anomaly(
anomaly: xr.DataArray,
title: str = "Sea Surface Temperature Anomaly",
output_path: str | None = None
):
"""Create publication-quality SST anomaly map."""
fig, ax = plt.subplots(
figsize=(14, 8),
subplot_kw={"projection": ccrs.Robinson()}
)
im = anomaly.plot(
ax=ax,
transform=ccrs.PlateCarree(),
cmap=cmocean.cm.balance, # Diverging, red-blue
vmin=-3, vmax=3,
add_colorbar=False
)
ax.add_feature(cfeature.LAND, facecolor="lightgray")
ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
ax.set_global()
ax.set_title(title, fontsize=14, fontweight="bold")
cbar = plt.colorbar(im, ax=ax, orientation="horizontal",
pad=0.05, shrink=0.6)
cbar.set_label("SST Anomaly (°C)")
if output_path:
fig.savefig(output_path, dpi=150, bbox_inches="tight")
return fig, ax
Tradeoffs
In-situ accuracy vs. spatial coverage: Argo floats measure temperature to ±0.002°C but cover only ~300km × 300km per float. Satellites cover the globe daily but measure only the top ~1mm of the surface with ±0.3°C accuracy. Optimal analysis fuses both.
Resolution vs. compute cost: Global ocean models at 1/12° resolution produce ~1TB per simulated year. Analysis at this resolution requires Dask distributed computing or cloud platforms. Downsampling to 1° reduces data volume 100x but smooths away mesoscale features (eddies, fronts) that drive biological productivity.
Climatology period: The choice of reference period (1961-1990 vs. 1991-2020) changes anomaly magnitudes by 0.3-0.5°C globally. Climate studies must clearly state their baseline, and Python pipelines should make this configurable.
Real-time vs. delayed mode: Argo profiles are available in real-time (within hours) but may contain uncorrected sensor drift. Delayed-mode data (corrected after expert QC, released months later) is more accurate. Research requiring high precision should use delayed-mode; monitoring applications use real-time.
One thing to remember: Ocean data analysis in Python centers on xarray for multi-dimensional datasets, gsw for thermodynamic computations, and the constant challenge of integrating sparse in-situ observations with satellite-derived surface fields and model outputs — where understanding the limitations of each data source matters as much as the analysis code itself.
See Also
- Python Biodiversity Tracking How Python helps scientists count and protect every kind of animal and plant on Earth — from whales to wildflowers.
- Python Crop Disease Detection How Python looks at photos of plants and figures out if they're sick — like a doctor for crops.
- Python Deforestation Detection How Python spots disappearing forests from space — catching illegal logging and land clearing as it happens.
- Python Drone Image Processing How Python turns hundreds of overlapping drone photos into detailed maps and 3D models of the ground below.
- Python Precision Agriculture How Python helps farmers give every plant exactly what it needs instead of treating the whole field the same way.