Python Service Health Dashboards — Core Concepts

Design and build service health dashboards in Python: health check endpoints, status aggregation, and real-time visualization with Flask and Streamlit.

The three components

A service health dashboard has three parts:

1. Health checks — tests that verify each service is working
2. Aggregation — collecting results from all checks into a single status
3. Display — presenting the results in a web page or API

Writing health checks

A health check is a function that tests whether a service is working correctly and responding within an acceptable time:

import httpx
import psycopg2
import redis
import time
from dataclasses import dataclass
from enum import Enum

class Status(Enum):
    """Health state assigned to a single check result or to the overall report."""

    # The string values are what the aggregated report exposes via `.value`.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"

@dataclass
class HealthResult:
    """Outcome of one health check run against one service."""

    service: str  # display name of the service that was checked
    status: Status  # classification of this check's outcome
    response_time_ms: float  # elapsed time of the check, in milliseconds
    message: str = ""  # optional detail such as error text; empty when there is none

def check_http(name: str, url: str, timeout: float = 5.0) -> HealthResult:
    """Probe an HTTP endpoint with a GET request and classify the outcome.

    Args:
        name: Service name to record in the result.
        url: URL to request.
        timeout: Timeout, in seconds, passed to ``httpx.get``.

    Returns:
        A HealthResult that is HEALTHY for a 200 response, DEGRADED for a
        200 response that took longer than 2000 ms, and UNHEALTHY for any
        other status code or for any exception raised during the check
        (the exception text becomes the message).
    """
    began = time.monotonic()

    def elapsed_ms() -> float:
        # Milliseconds since the check started.
        return (time.monotonic() - began) * 1000

    try:
        reply = httpx.get(url, timeout=timeout)
        took_ms = elapsed_ms()

        if reply.status_code != 200:
            return HealthResult(name, Status.UNHEALTHY, took_ms,
                                f"Status code: {reply.status_code}")
        if took_ms > 2000:
            # The endpoint answered correctly, but too slowly to call healthy.
            return HealthResult(name, Status.DEGRADED, took_ms)
        return HealthResult(name, Status.HEALTHY, took_ms)
    except Exception as exc:
        # Never raise from a check: report the failure as a result instead.
        return HealthResult(name, Status.UNHEALTHY, elapsed_ms(), str(exc))

def check_postgres(name: str, dsn: str, timeout: float = 5.0) -> HealthResult:
    """Check a PostgreSQL server by connecting and running ``SELECT 1``.

    Args:
        name: Service name to record in the result.
        dsn: Connection string passed to ``psycopg2.connect``.
        timeout: Connection timeout in seconds. It is truncated to whole
            seconds and clamped to at least 1 before being handed to the
            driver.

    Returns:
        A HealthResult that is HEALTHY when connect + query + close finished
        within 1000 ms, DEGRADED when they succeeded but took longer, and
        UNHEALTHY when anything raised (the exception text becomes the
        message).
    """
    start = time.monotonic()
    # libpq expects whole seconds and treats 0 as "wait indefinitely", so a
    # sub-second timeout must not be truncated down to 0.
    connect_timeout = max(1, int(timeout))
    try:
        conn = psycopg2.connect(dsn, connect_timeout=connect_timeout)
        try:
            # NOTE: connect_timeout only bounds connection establishment,
            # not the query itself.
            with conn.cursor() as cursor:
                cursor.execute("SELECT 1")
        finally:
            # Always release the connection, even when the query fails;
            # otherwise every failed check leaks a server connection.
            conn.close()
        elapsed = (time.monotonic() - start) * 1000
        status = Status.DEGRADED if elapsed > 1000 else Status.HEALTHY
        return HealthResult(name, status, elapsed)
    except Exception as e:
        # Never raise from a check: report the failure as a result instead.
        elapsed = (time.monotonic() - start) * 1000
        return HealthResult(name, Status.UNHEALTHY, elapsed, str(e))

def check_redis(name: str, url: str, timeout: float = 3.0) -> HealthResult:
    """Check a Redis server by sending ``PING``.

    Args:
        name: Service name to record in the result.
        url: Redis URL passed to ``redis.from_url``.
        timeout: Socket timeout, in seconds, passed to the client.

    Returns:
        A HealthResult that is HEALTHY when the ping completed within
        500 ms, DEGRADED when it succeeded but took longer, and UNHEALTHY
        when anything raised (the exception text becomes the message).
    """
    start = time.monotonic()
    try:
        r = redis.from_url(url, socket_timeout=timeout)
        try:
            r.ping()
            # Measure before cleanup so the timing reflects the ping only.
            elapsed = (time.monotonic() - start) * 1000
        finally:
            # Each call builds a fresh client and connection pool; close it
            # so repeated checks do not accumulate open sockets.
            r.close()
        status = Status.DEGRADED if elapsed > 500 else Status.HEALTHY
        return HealthResult(name, status, elapsed)
    except Exception as e:
        # Never raise from a check: report the failure as a result instead.
        elapsed = (time.monotonic() - start) * 1000
        return HealthResult(name, Status.UNHEALTHY, elapsed, str(e))

Aggregating health status

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable

class HealthAggregator:
    """Runs registered health checks concurrently and builds a single report."""

    def __init__(self):
        # Zero-argument callables, each returning a HealthResult when invoked.
        self.checks: list[Callable[[], HealthResult]] = []

    def add_check(self, check_fn: Callable[[], HealthResult]):
        """Register a zero-argument callable that returns a HealthResult."""
        self.checks.append(check_fn)

    @staticmethod
    def _outcome(finished) -> HealthResult:
        """Return the future's result, or an UNHEALTHY placeholder if it raised."""
        try:
            return finished.result()
        except Exception as exc:
            return HealthResult("unknown", Status.UNHEALTHY, 0, str(exc))

    def run_all(self, max_workers: int = 10) -> dict:
        """Run every registered check in a thread pool and summarize the results.

        Args:
            max_workers: Upper bound on threads used to run the checks.

        Returns:
            A dict with ``'overall'`` (the worst status seen, as its string
            value; ``'healthy'`` when nothing is worse), ``'services'`` (one
            entry per result, sorted by service name) and ``'checked_at'``
            (``time.time()`` at report creation).
        """
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pending = [pool.submit(check) for check in self.checks]
            outcomes = [self._outcome(done) for done in as_completed(pending)]

        # The worst status present decides the overall status.
        seen = [outcome.status for outcome in outcomes]
        overall = Status.HEALTHY
        for severity in (Status.UNHEALTHY, Status.DEGRADED):
            if severity in seen:
                overall = severity
                break

        services = []
        for outcome in sorted(outcomes, key=lambda o: o.service):
            services.append({
                'name': outcome.service,
                'status': outcome.status.value,
                'response_time_ms': round(outcome.response_time_ms, 1),
                'message': outcome.message,
            })

        return {
            'overall': overall.value,
            'services': services,
            'checked_at': time.time(),
        }

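Running checks concurrently (with ThreadPoolExecutor) is important. If you have 10 services and each check takes up to 5 seconds on timeout, sequential checks could take 50 seconds. Concurrent checks complete in roughly the time of the slowest check, provided max_workers is at least the number of checks; with fewer workers, the remaining checks wait for a free thread.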

Building the dashboard API

With Flask

from flask import Flask, jsonify, render_template_string

# Flask application that the route handlers below are registered on.
app = Flask(__name__)

# Register each probe as a zero-argument lambda: the lambda binds the service
# name and target now, and the aggregator calls it later without arguments.
aggregator = HealthAggregator()
aggregator.add_check(lambda: check_http("API", "https://api.example.com/health"))
aggregator.add_check(lambda: check_http("Website", "https://www.example.com"))
aggregator.add_check(lambda: check_postgres("Database", "postgresql://localhost/mydb"))
aggregator.add_check(lambda: check_redis("Cache", "redis://localhost:6379"))

@app.route('/health')
def health():
    """Run all registered checks and return the report as JSON.

    Responds with HTTP 200 only when the overall status is 'healthy';
    any other overall status yields 503.
    """
    report = aggregator.run_all()
    if report['overall'] == 'healthy':
        http_status = 200
    else:
        http_status = 503
    return jsonify(report), http_status

@app.route('/')
def dashboard():
    """Render the HTML dashboard from a fresh run of all registered checks."""
    latest = aggregator.run_all()
    # NOTE(review): DASHBOARD_HTML is not defined in this snippet; presumably
    # a template string defined elsewhere — confirm before running.
    context = {'report': latest}
    return render_template_string(DASHBOARD_HTML, **context)

With Streamlit (for internal dashboards)

import streamlit as st
import time

# Seconds to wait before the script reruns itself and re-checks every service.
REFRESH_SECONDS = 30

st.set_page_config(page_title="Service Health", layout="wide")
st.title("🏥 Service Health Dashboard")

report = aggregator.run_all()

# Remember when this session last ran the checks so it can be displayed.
st.session_state.last_refresh = report['checked_at']

# Overall status banner
status_emoji = {"healthy": "🟢", "degraded": "🟡", "unhealthy": "🔴"}
st.header(f"{status_emoji.get(report['overall'], '⚪')} Overall: {report['overall'].upper()}")

# Service cards, laid out four per row
cols = st.columns(4)
for i, service in enumerate(report['services']):
    with cols[i % 4]:
        emoji = status_emoji.get(service['status'], '⚪')
        st.metric(
            label=f"{emoji} {service['name']}",
            value=f"{service['response_time_ms']} ms",
        )
        if service['message']:
            st.caption(service['message'])

checked_at = time.strftime('%H:%M:%S', time.localtime(st.session_state.last_refresh))
st.caption(f"Last checked at {checked_at} · refreshes every {REFRESH_SECONDS} s")

# Auto-refresh: the page is fully rendered at this point, so wait and then
# rerun the script from the top to pick up fresh check results.
time.sleep(REFRESH_SECONDS)
st.rerun()

Health check endpoint design

When building services that will be monitored, include a /health endpoint that reports the service’s own view of its dependencies:

@app.route('/health')
def health_endpoint():
    """Report this service's own view of its dependencies as JSON.

    Responds with HTTP 200 when every check reports healthy, otherwise 503.
    """
    # NOTE(review): the check_* helpers are not shown here; each is assumed
    # to return a dict containing a 'healthy' key — confirm.
    checks = {}
    checks['database'] = check_db_connection()
    checks['cache'] = check_redis_connection()
    checks['disk_space'] = check_disk_free()

    all_healthy = True
    for outcome in checks.values():
        if not outcome['healthy']:
            all_healthy = False
            break

    if all_healthy:
        label, http_status = 'healthy', 200
    else:
        label, http_status = 'unhealthy', 503

    # NOTE(review): app.start_time is not set anywhere in the visible code;
    # it must be assigned at startup for the uptime calculation to work.
    payload = {
        'status': label,
        'checks': checks,
        'version': '2.3.1',
        'uptime_seconds': time.time() - app.start_time,
    }
    return jsonify(payload), http_status

Common misconception

Many teams build dashboards that only check if a service responds to HTTP requests. This misses many failure modes: the database might be up but returning stale data, the cache might be full, or the disk might be 99% full. Effective health checks test the actual capabilities of the service — can it read from the database? Can it write to the cache? Is there enough disk space for logs?

Practical patterns

Traffic light dashboards for operations teams monitoring production services
Dependency maps that show which services depend on which, highlighting cascading failures
Historical views that track uptime percentages and response time trends over days and weeks
Alerting integration that pages on-call engineers when the dashboard turns red
Status pages that give customers a public view of service health

One thing to remember: A good health dashboard does not just check if services are running — it verifies they can actually do their job (connect to databases, respond in acceptable time, have enough resources), runs checks concurrently for speed, and presents the results at a glance so problems are obvious in seconds.

Tags: python, monitoring, dashboards, system-administration