Python Ocean Data Analysis — Deep Dive

Architecture of an ocean data analysis system

A comprehensive ocean analysis pipeline integrates multiple data sources into a coherent view:

Argo Profiles → Quality Control → Gridding/Interpolation
Satellite SST/SSH → Regridding → Anomaly Computation
Model Output → Subsetting → Validation against observations
    → Integrated Products (heat content, MHWs, currents)
        → Visualization (maps, sections, T-S diagrams)

Loading and processing Argo float data

The argopy library provides streamlined access to the global Argo dataset:

import argopy
import xarray as xr
import numpy as np

def fetch_argo_region(
    bbox: list[float],  # [lon_min, lon_max, lat_min, lat_max]
    depth_range: list[float],  # [z_min, z_max]
    date_range: list[str],  # ["2024-01", "2024-12"]
) -> xr.Dataset:
    """Fetch Argo profiles for a region and time period."""
    fetcher = argopy.DataFetcher(src="gdac", parallel=True)
    
    ds = fetcher.region(
        bbox + depth_range + date_range
    ).to_xarray()
    
    # Apply QC flags — keep only good data (QC flag 1)
    good_temp = ds["TEMP_QC"].isin([b"1"])
    good_psal = ds["PSAL_QC"].isin([b"1"])
    
    ds["TEMP"] = ds["TEMP"].where(good_temp)
    ds["PSAL"] = ds["PSAL"].where(good_psal)
    
    return ds

def compute_density_profile(
    temperature: np.ndarray,
    salinity: np.ndarray,
    pressure: np.ndarray,
    longitude: float,
    latitude: float
) -> np.ndarray:
    """Compute in-situ density using TEOS-10 equations."""
    import gsw
    
    # Convert practical salinity to absolute salinity
    SA = gsw.SA_from_SP(salinity, pressure, longitude, latitude)
    
    # Convert in-situ temperature to conservative temperature
    CT = gsw.CT_from_t(SA, temperature, pressure)
    
    # Compute in-situ density
    rho = gsw.rho(SA, CT, pressure)
    
    return rho

def find_mixed_layer_depth(
    temperature: np.ndarray,
    depth: np.ndarray,
    threshold: float = 0.2  # °C difference from surface
) -> float:
    """Find mixed layer depth using temperature threshold criterion."""
    surface_temp = temperature[0]
    
    below_threshold = np.abs(temperature - surface_temp) > threshold
    
    if below_threshold.any():
        mld_idx = np.argmax(below_threshold)
        # Interpolate for more precise estimate
        if mld_idx > 0:
            frac = (threshold - abs(temperature[mld_idx-1] - surface_temp)) / \
                   (abs(temperature[mld_idx] - surface_temp) - abs(temperature[mld_idx-1] - surface_temp))
            return depth[mld_idx-1] + frac * (depth[mld_idx] - depth[mld_idx-1])
        return float(depth[mld_idx])
    
    return float(depth[-1])  # Entire profile is mixed

Satellite SST analysis and anomalies

NOAA’s OISST product provides daily global SST at 0.25° resolution since 1981:

import xarray as xr
import numpy as np

def load_oisst(data_dir: str, year: int) -> xr.Dataset:
    """Load NOAA OISST daily data for a year."""
    ds = xr.open_mfdataset(
        f"{data_dir}/sst.day.mean.{year}.nc",
        chunks={"time": 30}  # Dask chunking for lazy computation
    )
    return ds

def compute_sst_anomaly(
    ds: xr.Dataset,
    climatology_years: tuple[int, int] = (1991, 2020)
) -> xr.DataArray:
    """Compute SST anomaly relative to a 30-year climatology."""
    # Compute day-of-year climatology
    clim = ds["sst"].sel(
        time=slice(str(climatology_years[0]), str(climatology_years[1]))
    ).groupby("time.dayofyear").mean(dim="time")
    
    # Subtract climatology from observations
    anomaly = ds["sst"].groupby("time.dayofyear") - clim
    
    return anomaly

def compute_ocean_heat_content(
    temperature_profiles: xr.DataArray,
    depth_levels: xr.DataArray,
    max_depth: float = 700.0
) -> xr.DataArray:
    """Compute Ocean Heat Content (OHC) from temperature profiles."""
    rho_cp = 4.09e6  # seawater density × specific heat (J/m³/K)
    
    # Reference temperature (could be climatological mean or 0°C)
    temp_anom = temperature_profiles  # Assume already anomaly
    
    # Integrate temperature anomaly over depth
    depth_mask = depth_levels <= max_depth
    dz = depth_levels.diff(dim="depth")
    
    ohc = (temp_anom.where(depth_mask) * rho_cp * dz).sum(dim="depth")
    
    return ohc  # Units: J/m²

Marine heatwave detection

Implementing the Hobday et al. (2016) definition:

import numpy as np
import pandas as pd

def detect_marine_heatwaves(
    sst: pd.Series,
    climatology_period: tuple[str, str] = ("1991-01-01", "2020-12-31"),
    percentile: float = 90,
    min_duration_days: int = 5
) -> list[dict]:
    """Detect marine heatwaves in an SST time series."""
    
    # Compute day-of-year climatology and threshold
    clim_data = sst[climatology_period[0]:climatology_period[1]]
    
    doy_mean = clim_data.groupby(clim_data.index.dayofyear).mean()
    doy_threshold = clim_data.groupby(clim_data.index.dayofyear).quantile(
        percentile / 100
    )
    
    # Smooth climatology and threshold (11-day window)
    doy_mean_smooth = doy_mean.rolling(11, center=True, min_periods=1).mean()
    doy_threshold_smooth = doy_threshold.rolling(11, center=True, min_periods=1).mean()
    
    # Identify days exceeding threshold
    threshold_series = sst.index.dayofyear.map(
        lambda d: doy_threshold_smooth.get(d, np.nan)
    )
    climatology_series = sst.index.dayofyear.map(
        lambda d: doy_mean_smooth.get(d, np.nan)
    )
    
    exceedance = sst.values > threshold_series
    
    # Find contiguous periods
    events = []
    in_event = False
    start_idx = 0
    
    for i in range(len(exceedance)):
        if exceedance[i] and not in_event:
            start_idx = i
            in_event = True
        elif not exceedance[i] and in_event:
            duration = i - start_idx
            if duration >= min_duration_days:
                event_sst = sst.iloc[start_idx:i]
                event_clim = climatology_series[start_idx:i]
                
                events.append({
                    "start": sst.index[start_idx],
                    "end": sst.index[i - 1],
                    "duration_days": duration,
                    "max_intensity": float(
                        (event_sst.values - event_clim).max()
                    ),
                    "mean_intensity": float(
                        (event_sst.values - event_clim).mean()
                    ),
                    "max_sst": float(event_sst.max()),
                    "category": classify_mhw(
                        float((event_sst.values - event_clim).max()),
                        float(doy_threshold_smooth.mean() - doy_mean_smooth.mean())
                    )
                })
            in_event = False
    
    return events

def classify_mhw(max_intensity: float, threshold_diff: float) -> str:
    """Classify MHW category (Hobday et al. 2018)."""
    if max_intensity > 4 * threshold_diff:
        return "IV Extreme"
    elif max_intensity > 3 * threshold_diff:
        return "III Severe"
    elif max_intensity > 2 * threshold_diff:
        return "II Strong"
    return "I Moderate"

Geostrophic current computation from altimetry

Sea surface height gradients reveal ocean currents:

import xarray as xr
import numpy as np

def compute_geostrophic_currents(
    ssh: xr.DataArray,
    latitude: xr.DataArray
) -> tuple[xr.DataArray, xr.DataArray]:
    """Compute geostrophic velocity from sea surface height.
    
    Uses the geostrophic balance: f × v = g × ∂η/∂x
    """
    g = 9.81  # m/s²
    omega = 7.2921e-5  # Earth rotation rate, rad/s
    
    # Coriolis parameter (avoid equator where f→0)
    f = 2 * omega * np.sin(np.deg2rad(latitude))
    f = f.where(np.abs(latitude) > 3)  # Mask ±3° equatorial band
    
    # Convert lat/lon gradients to meters
    R = 6.371e6  # Earth radius
    dlat = np.deg2rad(float(latitude.diff(dim="latitude").mean()))
    dlon = np.deg2rad(float(ssh.longitude.diff(dim="longitude").mean()))
    
    dy = R * dlat
    dx = R * dlon * np.cos(np.deg2rad(latitude))
    
    # Compute gradients
    deta_dx = ssh.diff(dim="longitude") / dx
    deta_dy = ssh.diff(dim="latitude") / dy
    
    # Geostrophic velocities
    # u = -(g/f) * ∂η/∂y  (eastward velocity from north-south gradient)
    # v =  (g/f) * ∂η/∂x  (northward velocity from east-west gradient)
    u_geo = -(g / f) * deta_dy
    v_geo = (g / f) * deta_dx
    
    return u_geo, v_geo

def detect_eddies(
    ssh_anomaly: xr.DataArray,
    min_radius_km: float = 50,
    max_radius_km: float = 300
) -> list[dict]:
    """Simple eddy detection from SSH anomaly closed contours."""
    from scipy.ndimage import label, center_of_mass
    
    eddies = []
    
    # Cyclonic eddies (SSH < 0, Northern Hemisphere)
    for threshold in np.arange(-0.05, -0.30, -0.05):
        mask = ssh_anomaly.values < threshold
        labeled, n_features = label(mask)
        
        for i in range(1, n_features + 1):
            region = labeled == i
            area_pixels = region.sum()
            
            # Estimate radius (assuming roughly circular)
            pixel_area_km2 = 111 * 111 * 0.25 * 0.25  # at ~0.25° resolution
            radius_km = np.sqrt(area_pixels * pixel_area_km2 / np.pi)
            
            if min_radius_km <= radius_km <= max_radius_km:
                cy, cx = center_of_mass(region)
                eddies.append({
                    "type": "cyclonic",
                    "center_lat": float(ssh_anomaly.latitude[int(cy)]),
                    "center_lon": float(ssh_anomaly.longitude[int(cx)]),
                    "radius_km": round(radius_km, 1),
                    "amplitude_m": abs(threshold),
                })
    
    return eddies

Visualization with cartopy and cmocean

Oceanographic visualizations require proper map projections and perceptually uniform colormaps:

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cmocean

def plot_sst_anomaly(
    anomaly: xr.DataArray,
    title: str = "Sea Surface Temperature Anomaly",
    output_path: str | None = None
):
    """Create publication-quality SST anomaly map."""
    fig, ax = plt.subplots(
        figsize=(14, 8),
        subplot_kw={"projection": ccrs.Robinson()}
    )
    
    im = anomaly.plot(
        ax=ax,
        transform=ccrs.PlateCarree(),
        cmap=cmocean.cm.balance,  # Diverging, red-blue
        vmin=-3, vmax=3,
        add_colorbar=False
    )
    
    ax.add_feature(cfeature.LAND, facecolor="lightgray")
    ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
    ax.set_global()
    ax.set_title(title, fontsize=14, fontweight="bold")
    
    cbar = plt.colorbar(im, ax=ax, orientation="horizontal",
                        pad=0.05, shrink=0.6)
    cbar.set_label("SST Anomaly (°C)")
    
    if output_path:
        fig.savefig(output_path, dpi=150, bbox_inches="tight")
    
    return fig, ax

Tradeoffs

In-situ accuracy vs. spatial coverage: Argo floats measure temperature to ±0.002°C but cover only ~300km × 300km per float. Satellites cover the globe daily but measure only the top ~1mm of the surface with ±0.3°C accuracy. Optimal analysis fuses both.

Resolution vs. compute cost: Global ocean models at 1/12° resolution produce ~1TB per simulated year. Analysis at this resolution requires Dask distributed computing or cloud platforms. Downsampling to 1° reduces data volume 100x but smooths away mesoscale features (eddies, fronts) that drive biological productivity.

Climatology period: The choice of reference period (1961-1990 vs. 1991-2020) changes anomaly magnitudes by 0.3-0.5°C globally. Climate studies must clearly state their baseline, and Python pipelines should make this configurable.

Real-time vs. delayed mode: Argo profiles are available in real-time (within hours) but may contain uncorrected sensor drift. Delayed-mode data (corrected after expert QC, released months later) is more accurate. Research requiring high precision should use delayed-mode; monitoring applications use real-time.

One thing to remember: Ocean data analysis in Python centers on xarray for multi-dimensional datasets, gsw for thermodynamic computations, and the constant challenge of integrating sparse in-situ observations with satellite-derived surface fields and model outputs — where understanding the limitations of each data source matters as much as the analysis code itself.

pythonoceanographydata-scienceclimategeospatial

See Also