Python Health Check Endpoints — Deep Dive
Architecture of a Health Check System
A production health check system has three layers:
- Individual checks — Functions that verify a single dependency
- Aggregator — Combines check results into an overall status
- Endpoint — Exposes the aggregated status over HTTP
This separation makes checks reusable, testable, and configurable.
Building Health Checks in FastAPI
from fastapi import FastAPI, Response
from enum import Enum
from dataclasses import dataclass, field
import asyncio
import time
class CheckStatus(str, Enum):
UP = "up"
DOWN = "down"
DEGRADED = "degraded"
@dataclass
class CheckResult:
name: str
status: CheckStatus
latency_ms: float = 0
error: str | None = None
details: dict = field(default_factory=dict)
class HealthChecker:
def __init__(self):
self.checks: list = []
def register(self, name: str, check_fn, critical: bool = True):
self.checks.append({"name": name, "fn": check_fn, "critical": critical})
async def run_all(self, timeout: float = 5.0) -> list[CheckResult]:
results = []
for check in self.checks:
start = time.perf_counter()
try:
result = await asyncio.wait_for(
check["fn"](), timeout=timeout
)
latency = (time.perf_counter() - start) * 1000
results.append(CheckResult(
name=check["name"],
status=CheckStatus.UP,
latency_ms=round(latency, 2),
details=result or {},
))
except asyncio.TimeoutError:
results.append(CheckResult(
name=check["name"],
status=CheckStatus.DOWN,
error=f"Timeout after {timeout}s",
))
except Exception as e:
latency = (time.perf_counter() - start) * 1000
results.append(CheckResult(
name=check["name"],
status=CheckStatus.DOWN,
latency_ms=round(latency, 2),
error=str(e),
))
return results
def aggregate(self, results: list[CheckResult]) -> CheckStatus:
critical_checks = [
r for r, c in zip(results, self.checks) if c["critical"]
]
if any(r.status == CheckStatus.DOWN for r in critical_checks):
return CheckStatus.DOWN
if any(r.status == CheckStatus.DEGRADED for r in results):
return CheckStatus.DEGRADED
return CheckStatus.UP
Registering Checks
from sqlalchemy.ext.asyncio import AsyncSession
# Application instance that the health endpoints below are attached to.
app = FastAPI()
# Shared check registry; checks are added to it with health.register(...).
health = HealthChecker()
async def check_database():
    """Verify that the database answers a trivial query.

    Returns:
        Details to attach to the check result when the query succeeds.

    Raises:
        Exception: Whatever the session or driver raises when the database
            cannot be reached; the caller is expected to treat that as a
            failed check.
    """
    # Imported locally because the snippet's import block never brings `text`
    # into scope; without it this check fails with NameError on every run.
    from sqlalchemy import text

    # async_session is the application's session factory, defined elsewhere.
    async with async_session() as session:
        await session.execute(text("SELECT 1"))
    return {"connection_pool": "active"}
async def check_redis():
    """Ping Redis and report its memory usage.

    Returns:
        A dict with ``used_memory_mb``: the ``used_memory`` value from the
        "memory" INFO section, converted to mebibytes and rounded to one
        decimal place.

    Raises:
        Exception: Whatever the client raises if the ping or INFO call fails.
    """
    # redis_client is the application's async Redis client, defined elsewhere.
    await redis_client.ping()
    memory_stats = await redis_client.info("memory")
    used_mb = memory_stats["used_memory"] / 1024 / 1024
    return {"used_memory_mb": round(used_mb, 1)}
async def check_disk(path: str = "/", min_free_gb: float = 1.0):
    """Fail when the filesystem holding ``path`` is nearly full.

    Args:
        path: Any path on the filesystem to inspect.
        min_free_gb: Minimum free space, in GiB, below which the check fails.

    Returns:
        A dict with ``free_gb``: free space in GiB, rounded to one decimal.

    Raises:
        RuntimeError: If less than ``min_free_gb`` GiB is free.
    """
    import shutil

    usage = shutil.disk_usage(path)
    free_gb = usage.free / (1024 ** 3)
    if free_gb < min_free_gb:
        raise RuntimeError(f"Low disk space: {free_gb:.1f}GB free")
    return {"free_gb": round(free_gb, 1)}
# Critical checks: a DOWN result from either one makes the aggregated status DOWN.
health.register("database", check_database, critical=True)
health.register("redis", check_redis, critical=True)
# Non-critical: a disk failure on its own never makes the aggregated status DOWN.
health.register("disk", check_disk, critical=False)
Endpoint Implementation
@app.get("/health/live")
async def liveness():
    """Report that the process is running and can answer HTTP requests.

    Returns a constant body and consults no dependency, so a dependency
    outage cannot make this endpoint fail.
    """
    payload = {"status": "up"}
    return payload
@app.get("/health/ready")
async def readiness(response: Response):
    """Run all registered checks and report whether the app can take traffic.

    Args:
        response: Injected by FastAPI; used only to set the status code.

    Returns:
        A dict with the overall ``status`` and a ``checks`` mapping of check
        name to its status, latency, optional error and reported details.
        The HTTP status is 503 when the overall status is DOWN, else 200.
    """
    # Each check must give up before the probe does: the Kubernetes readiness
    # probe shown in this document uses timeoutSeconds: 3, so a 3.0s check
    # timeout would let the probe time out first.
    results = await health.run_all(timeout=2.0)
    overall = health.aggregate(results)

    checks = {}
    for result in results:
        entry = {
            "status": result.status.value,
            "latency_ms": result.latency_ms,
        }
        if result.error:
            entry["error"] = result.error
        # Details are merged last, as before, so the response shape is unchanged.
        entry.update(result.details)
        checks[result.name] = entry

    if overall == CheckStatus.DOWN:
        response.status_code = 503
    return {"status": overall.value, "checks": checks}
@app.get("/health/startup")
async def startup_check(response: Response):
    """Report whether application initialization has finished.

    Args:
        response: Injected by FastAPI; used only to set the status code.

    Returns:
        ``{"status": "starting"}`` with HTTP 503 until ``app.state.ready`` is
        truthy, then ``{"status": "up"}`` with HTTP 200.
    """
    # app.state raises AttributeError for attributes that were never assigned.
    # Before startup code sets `ready`, answer "starting" rather than HTTP 500.
    if not getattr(app.state, "ready", False):
        response.status_code = 503
        return {"status": "starting"}
    return {"status": "up"}
Django Implementation
Django can use a simple view or the django-health-check package:
# health/views.py
from django.http import JsonResponse
from django.db import connection
def health_ready(request):
    """Readiness view: probe the database and Redis and report the outcome.

    Args:
        request: The incoming Django request (unused).

    Returns:
        A JsonResponse with an overall ``status`` of "healthy" or "unhealthy"
        and a per-dependency ``checks`` mapping. The HTTP status is 200 when
        every probe succeeded and 503 otherwise.
    """

    def ping_database():
        with connection.cursor() as cursor:
            cursor.execute("SELECT 1")

    def ping_redis():
        # Imported lazily so a missing django_redis is reported as a failed
        # check instead of breaking import of this module.
        from django_redis import get_redis_connection

        get_redis_connection("default").ping()

    checks = {}
    for name, probe in (("database", ping_database), ("redis", ping_redis)):
        try:
            probe()
        except Exception as exc:
            checks[name] = {"status": "down", "error": str(exc)}
        else:
            checks[name] = {"status": "up"}

    healthy = all(entry["status"] == "up" for entry in checks.values())
    return JsonResponse(
        {
            "status": "healthy" if healthy else "unhealthy",
            "checks": checks,
        },
        status=200 if healthy else 503,
    )
def health_live(request):
    """Liveness view: always answer with a constant "up" body.

    Args:
        request: The incoming Django request (unused).
    """
    body = {"status": "up"}
    return JsonResponse(body)
Wire into URLs:
# urls.py
from django.urls import path

# Assumes the views live in health/views.py, as the header comment on the
# views snippet indicates; adjust the import if they are placed elsewhere.
from health.views import health_live, health_ready

# Routes for the liveness and readiness views.
urlpatterns = [
    path("health/live", health_live),
    path("health/ready", health_ready),
]
Flask with Background Checks
For Flask, consider caching health check results to avoid hammering dependencies on every probe:
from flask import Flask, jsonify
import threading
import time
app = Flask(__name__)
class CachedHealthChecker:
    """Runs dependency checks on a background thread and caches the outcome.

    Probe requests read the cached status through ``get_status`` instead of
    touching the dependencies themselves.

    Args:
        interval: Seconds to sleep between check rounds.
    """

    def __init__(self, interval=10):
        self.interval = interval
        self.results = {}
        # "unknown" until the first round finishes.
        self.status = "unknown"
        self._lock = threading.Lock()

    def start_background(self):
        """Start the daemon thread that refreshes the cached results."""
        thread = threading.Thread(target=self._run, daemon=True)
        thread.start()

    def _check_database(self):
        """Return "up" if the database answers a trivial query, else "down"."""
        try:
            # This runs outside any request. If `db` is a Flask-SQLAlchemy
            # instance, db.session needs an active application context;
            # without one every round fails and the database is reported
            # "down" permanently. Leaving the context also tears down the
            # session, so a connection is not held between rounds.
            with app.app_context():
                db.session.execute(text("SELECT 1"))
        except Exception:
            # Record why the check failed instead of discarding it silently.
            app.logger.exception("Database health check failed")
            return "down"
        return "up"

    def _run(self):
        """Loop forever: run the checks, publish the results, then sleep."""
        while True:
            results = {"database": self._check_database()}
            status = (
                "healthy" if all(v == "up" for v in results.values())
                else "unhealthy"
            )
            # Publish both fields under the lock so readers never see a
            # status that does not match the results.
            with self._lock:
                self.results = results
                self.status = status
            time.sleep(self.interval)

    def get_status(self):
        """Return ``(status, results)``; the results dict is a copy."""
        with self._lock:
            return self.status, dict(self.results)
# Module-level checker that sleeps 10 seconds between check rounds.
checker = CachedHealthChecker(interval=10)
# Start the daemon thread that keeps the cached status fresh.
checker.start_background()
@app.route("/health/ready")
def readiness():
    """Serve the cached health status without running any check.

    Returns:
        A JSON body with ``status`` and ``checks``, with HTTP 200 when the
        cached status is "healthy" and 503 for anything else, including the
        initial "unknown".
    """
    status, checks = checker.get_status()
    http_code = 503 if status != "healthy" else 200
    body = jsonify({"status": status, "checks": checks})
    return body, http_code
This approach runs checks every 10 seconds in the background instead of on every probe request, reducing database load when you have dozens of Kubernetes pods each probed every 5 seconds.
Kubernetes Probe Configuration
# Probe settings only; the rest of the Deployment is omitted from this excerpt.
apiVersion: apps/v1
kind: Deployment
spec:
  template:
    spec:
      containers:
        - name: myapp
          # Liveness: 3 consecutive failures, 10s apart, before a restart.
          livenessProbe:
            httpGet:
              path: /health/live
              port: 8000
            initialDelaySeconds: 5
            periodSeconds: 10
            failureThreshold: 3
            timeoutSeconds: 2
          # Readiness: 2 consecutive failures, 5s apart, remove the pod from traffic.
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 8000
            initialDelaySeconds: 10
            periodSeconds: 5
            failureThreshold: 2
            timeoutSeconds: 3
          # Startup: up to 30 attempts x 5s = 150s for initialization to finish.
          startupProbe:
            httpGet:
              path: /health/startup
              port: 8000
            initialDelaySeconds: 0
            periodSeconds: 5
            failureThreshold: 30
            timeoutSeconds: 2
Key configuration decisions:
- Liveness has a higher `failureThreshold` (3) — don’t restart on transient hiccups
- Readiness has a lower threshold (2) — pull from traffic quickly when unhealthy
- Startup has a high `failureThreshold` (30 × 5s = 150s) — give the app time to initialize
- `timeoutSeconds` should be shorter than `periodSeconds` to prevent probe pile-up
Handling Degraded State
Not everything is binary healthy/unhealthy. A degraded state means the app works but with reduced capability:
async def check_recommendation_service():
    """Probe the recommendation service, degrading instead of failing.

    Returns:
        ``{"status": "up"}`` when the service answers with a success status;
        otherwise a dict with ``"status": "degraded"`` and a ``fallback``
        note. This check never raises.
    """
    try:
        # httpx.get() is synchronous: awaiting its return value raises
        # TypeError, which the broad except below would turn into a permanent
        # "degraded". AsyncClient makes the request without blocking the
        # event loop.
        async with httpx.AsyncClient(timeout=2) as client:
            resp = await client.get("http://recommendations/health")
        # A 4xx/5xx answer from the service's own health endpoint means it is
        # not healthy, so treat it like a connection failure.
        resp.raise_for_status()
    except Exception:
        # Recommendations are nice to have, not critical
        return {"status": "degraded", "fallback": "using cached recommendations"}
    return {"status": "up"}
Report degraded in the health response but return HTTP 200. The app still works, just not at full capacity. Use metrics and alerts to track degraded states separately from outages.
Security Considerations
Health endpoints should not require authentication — load balancers and Kubernetes probes don’t send tokens. However:
- Don’t expose sensitive data — No connection strings, credentials, or internal IPs
- Rate limit health endpoints — Even though they’re lightweight, protect against abuse
- Separate detailed health from public health — `/health/ready` returns minimal info publicly, `/health/detailed` requires authentication and shows debug information
@app.get("/health/detailed")
async def detailed_health(user=Depends(require_admin)):
    """Diagnostics endpoint guarded by the ``require_admin`` dependency.

    Placeholder: the body is not implemented yet and returns nothing.
    """
    # Full diagnostics including versions, connection pool stats, etc.
    return None
Common Pitfalls
Checking too much in liveness. If a liveness check depends on the database and the database is down, Kubernetes restarts all pods. Now you have a cascade: the database recovers, but all pods restart simultaneously, overloading it again. Keep liveness minimal.
No timeouts on checks. A health check that hangs on a slow DNS lookup blocks the probe, which Kubernetes interprets as failure. Always set explicit timeouts shorter than the probe timeout.
Health checks that modify state. A health check should be read-only and idempotent. Never write to the database, increment counters, or trigger side effects from a health check.
The one thing to remember: Production health checks need three separate endpoints (liveness, readiness, startup), cached results to avoid thundering herd on dependencies, and timeouts that are always shorter than the probe interval.
See Also
- Python Aiohttp Client: Understand Aiohttp Client through a practical analogy so your Python decisions become faster and clearer.
- Python Api Client Design: Why building your own API client in Python is like creating a TV remote that only has the buttons you actually need.
- Python Api Documentation Swagger: Swagger turns your Python API into an interactive playground where anyone can click buttons to try it out — no coding required.
- Python Api Mocking Responses: Why testing with fake API responses is like rehearsing a play with stand-ins before the real actors show up.
- Python Api Pagination Clients: Why APIs send data in pages, and how Python handles it — like reading a book one chapter at a time instead of swallowing the whole thing.