Python Graceful Degradation — Deep Dive
A Degradation Framework
Here’s a framework that manages degradation levels and feature availability:
import asyncio
import inspect
import logging
from dataclasses import dataclass, field
from enum import IntEnum
from typing import Any, Callable, Optional
logger = logging.getLogger(__name__)
class DegradationLevel(IntEnum):
    """How degraded the system currently is, from healthy (0) to worst (3).

    Being an ``IntEnum``, members compare by value and can be built from a
    plain integer, e.g. ``DegradationLevel(2)``.
    """

    NORMAL = 0       # lowest value: no degradation
    MILD = 1
    SIGNIFICANT = 2
    EMERGENCY = 3    # highest value: most severe level
class FeatureTier(IntEnum):
    """Priority class of a feature; a lower value means more essential.

    Tiers are integers so they can be compared against a numeric cutoff.
    """

    CRITICAL = 1      # most essential tier
    IMPORTANT = 2
    NICE_TO_HAVE = 3  # least essential tier
@dataclass
class Feature:
    """Describes one gateable capability of the application.

    Attributes:
        name: Key under which the feature is registered and looked up.
        tier: Priority class of the feature.
        fallback: Optional callable that produces a substitute payload when
            the feature is switched off; ``None`` means no substitute.
        description: Free-form human-readable summary.
    """

    name: str
    tier: FeatureTier
    fallback: Optional[Callable] = None
    description: str = ""
@dataclass
class DegradationManager:
    """Controls system degradation level and feature availability.

    Features are registered with a tier; the current degradation level
    determines the highest tier that stays enabled. Per-feature manual
    overrides take precedence over the level-based decision, and listeners
    registered via :meth:`on_change` are notified on every
    :meth:`set_level` call.
    """

    _level: DegradationLevel = DegradationLevel.NORMAL
    _features: dict[str, Feature] = field(default_factory=dict)
    _overrides: dict[str, bool] = field(default_factory=dict)
    _listeners: list[Callable] = field(default_factory=list)

    def register(self, feature: Feature) -> None:
        """Register ``feature`` under its name, replacing any previous entry."""
        self._features[feature.name] = feature

    def is_enabled(self, feature_name: str) -> bool:
        """Return whether ``feature_name`` is currently available.

        Resolution order: manual override, then "unknown feature" (treated
        as enabled), then the tier cutoff for the current level.
        """
        # Manual overrides take precedence
        if feature_name in self._overrides:
            return self._overrides[feature_name]
        feature = self._features.get(feature_name)
        if feature is None:
            return True  # Unknown features default to enabled
        # Features are enabled if their tier <= current cutoff
        return feature.tier.value <= self._active_tier_cutoff

    @property
    def _active_tier_cutoff(self) -> int:
        """Which tiers are active at the current degradation level."""
        mapping = {
            DegradationLevel.NORMAL: 3,       # All tiers
            DegradationLevel.MILD: 2,         # Tier 1 + 2
            DegradationLevel.SIGNIFICANT: 1,  # Tier 1 only
            DegradationLevel.EMERGENCY: 1,    # Tier 1 only (+ extra measures)
        }
        return mapping[self._level]

    async def set_level(self, level: DegradationLevel, reason: str = "") -> None:
        """Switch to ``level`` and notify every registered listener.

        Args:
            level: The new degradation level (a member or its integer value).
            reason: Free-form explanation included in the log line and
                passed to listeners.

        Raises:
            ValueError: If ``level`` is not a valid ``DegradationLevel``.
                The current level is left untouched in that case.
        """
        # Validate before mutating state so a bad value cannot leave the
        # manager holding something that is not a DegradationLevel.
        new_level = DegradationLevel(level)
        old_level = self._level
        self._level = new_level
        logger.warning(
            "Degradation level changed: %s → %s (reason: %s)",
            old_level.name, new_level.name, reason,
        )
        # Iterate over a snapshot so a listener may register further
        # listeners without disturbing this notification round.
        for listener in tuple(self._listeners):
            try:
                outcome = listener(old_level, new_level, reason)
                # Listeners may be plain functions or coroutine functions.
                if inspect.isawaitable(outcome):
                    await outcome
            except Exception:
                # One broken listener must not prevent the others from
                # running or make the level change itself look failed.
                logger.exception("Degradation listener %r failed", listener)

    def override(self, feature_name: str, enabled: bool) -> None:
        """Force ``feature_name`` on or off regardless of the current level."""
        self._overrides[feature_name] = enabled

    def clear_override(self, feature_name: str) -> None:
        """Remove a manual override, if any; unknown names are ignored."""
        self._overrides.pop(feature_name, None)

    @property
    def level(self) -> DegradationLevel:
        """The current degradation level."""
        return self._level

    def on_change(self, callback: Callable) -> None:
        """Register ``callback(old_level, new_level, reason)``.

        The callback is invoked on every :meth:`set_level` call and may be
        either a regular function or a coroutine function.
        """
        self._listeners.append(callback)

    def status(self) -> dict[str, Any]:
        """Return a snapshot of the level and each registered feature."""
        return {
            "level": self._level.name,
            "features": {
                name: {
                    "tier": f.tier.name,
                    "enabled": self.is_enabled(name),
                    "overridden": name in self._overrides,
                }
                for name, f in self._features.items()
            },
        }
# Global instance: one module-level manager shared by the feature
# registrations, route handlers and metrics listener in this file.
degradation = DegradationManager()
Registering Features
# At application startup: register every gated feature with its tier.
for _feature in (
    Feature(
        name="checkout",
        tier=FeatureTier.CRITICAL,
        description="Shopping cart and payment processing",
    ),
    Feature(
        name="search",
        tier=FeatureTier.IMPORTANT,
        description="Product search and filtering",
    ),
    Feature(
        name="recommendations",
        tier=FeatureTier.NICE_TO_HAVE,
        description="Personalized product recommendations",
        fallback=lambda: {"items": [], "source": "disabled"},
    ),
    Feature(
        name="reviews",
        tier=FeatureTier.NICE_TO_HAVE,
        description="Customer reviews and ratings",
        fallback=lambda: {"reviews": [], "average": None},
    ),
    Feature(
        name="live_chat",
        tier=FeatureTier.NICE_TO_HAVE,
        description="Real-time customer support chat",
    ),
):
    degradation.register(_feature)
FastAPI Integration
Use dependency injection and inline `is_enabled` checks to gate features (the same check could also live in middleware):
from fastapi import FastAPI, Depends, HTTPException, Request
from functools import wraps
# Application object; the route decorators below register handlers on it.
app = FastAPI()
def require_feature(feature_name: str):
    """FastAPI dependency that checks feature availability.

    The dependency resolves to:

    * ``None`` when the feature is enabled;
    * the feature's fallback payload when it is disabled and a fallback is
      registered (an awaitable fallback result is awaited first);
    * otherwise it raises ``HTTPException(503)``.

    Returning a fallback does NOT stop the route handler from running; the
    handler receives the payload as the dependency value and is responsible
    for returning it instead of doing the real work.

    Args:
        feature_name: Name the feature was registered under.

    Returns:
        A ``Depends(...)`` marker to use as a parameter default.
    """

    async def checker():
        if degradation.is_enabled(feature_name):
            return None
        feature = degradation._features.get(feature_name)
        if feature is None or not feature.fallback:
            raise HTTPException(
                status_code=503,
                detail=f"Feature '{feature_name}' is temporarily unavailable",
            )
        payload = feature.fallback()
        # Fallbacks are typed as a bare Callable, so support async ones too
        # rather than handing an un-awaited coroutine to the handler.
        if inspect.isawaitable(payload):
            payload = await payload
        return payload

    return Depends(checker)
@app.get("/api/products/{product_id}")
async def get_product(product_id: str):
    """Return a product plus whichever optional sections are enabled.

    The product itself is always fetched. ``recommendations`` and
    ``reviews`` are included only while their features are enabled; if an
    enabled section fails to load, the failure is logged and the section
    is returned as an empty list so the core response is still served.
    """
    product = await fetch_product(product_id)
    # Conditionally include optional sections
    response = {"product": product}
    if degradation.is_enabled("recommendations"):
        try:
            response["recommendations"] = await fetch_recommendations(
                product_id
            )
        except Exception:
            # Best-effort section: keep serving, but leave a trace so the
            # failure is visible instead of silently swallowed.
            logger.exception(
                "Failed to load recommendations for product %r", product_id
            )
            response["recommendations"] = []
    if degradation.is_enabled("reviews"):
        try:
            response["reviews"] = await fetch_reviews(product_id)
        except Exception:
            logger.exception(
                "Failed to load reviews for product %r", product_id
            )
            response["reviews"] = []
    return response
@app.get("/api/search")
async def search(query: str, _=require_feature("search")):
    """Run a product search, or serve the fallback while search is disabled.

    ``_`` receives the result of the ``require_feature("search")``
    dependency: ``None`` when search is enabled, otherwise the fallback
    payload registered for the feature. With no fallback registered the
    dependency raises 503 and this handler never runs.
    """
    # The parameter keeps its original name `_`; alias it for readability.
    fallback_payload = _
    if fallback_payload is not None:
        # Search is disabled: do not execute the real search.
        return fallback_payload
    return await perform_search(query)
# Admin endpoint to control degradation
# NOTE(review): no authentication/authorization is visible on this route in
# this file; confirm it is protected elsewhere before exposing it.
@app.post("/admin/degradation")
async def set_degradation(level: int, reason: str = "manual"):
    """Set the degradation level manually and return the resulting status.

    Args:
        level: Integer value of the target ``DegradationLevel``.
        reason: Explanation recorded with the change.

    Raises:
        HTTPException: 422 when ``level`` is not a valid degradation level.
    """
    try:
        target = DegradationLevel(level)
    except ValueError:
        # Without this, an out-of-range value surfaces as a 500 error.
        valid = ", ".join(f"{m.value}={m.name}" for m in DegradationLevel)
        raise HTTPException(
            status_code=422,
            detail=f"Invalid degradation level {level}; expected one of: {valid}",
        ) from None
    await degradation.set_level(target, reason)
    return degradation.status()
Automatic Degradation Based on Health
Connect degradation levels to real system metrics:
import psutil
import aiohttp
class HealthMonitor:
    """Monitors system health and adjusts degradation automatically.

    A health score in the range 0-100 is computed from CPU usage (up to 25
    points), memory usage (up to 25 points) and the fraction of healthy
    dependencies (up to 50 points). The score is mapped to a degradation
    level and applied to the manager whenever the level differs from the
    current one.
    """

    # NOTE(review): 5432 and 6379 are the usual PostgreSQL and Redis ports,
    # which normally do not answer HTTP requests; confirm these URLs point
    # at real HTTP health endpoints, or pass ``dependencies`` explicitly.
    DEFAULT_DEPENDENCIES = (
        ("database", "http://localhost:5432/health"),
        ("cache", "http://localhost:6379/health"),
        ("search", "http://localhost:9200/_cluster/health"),
    )

    def __init__(
        self,
        manager: DegradationManager,
        dependencies: Optional[list[tuple[str, str]]] = None,
        check_interval: float = 10,
    ):
        """Create a monitor bound to ``manager``.

        Args:
            manager: The degradation manager whose level is adjusted.
            dependencies: ``(name, url)`` pairs probed over HTTP; defaults
                to ``DEFAULT_DEPENDENCIES``.
            check_interval: Seconds to sleep between health checks.
        """
        self.manager = manager
        self._running = False
        self._check_interval = check_interval  # seconds
        self._dependencies = list(
            self.DEFAULT_DEPENDENCIES if dependencies is None else dependencies
        )

    async def start(self):
        """Run health checks in a loop until :meth:`stop` is called."""
        self._running = True
        while self._running:
            try:
                await self._check_health()
            except Exception:
                # A single failed check must not end monitoring for good;
                # keep the current level and try again next interval.
                logger.exception(
                    "Health check failed; keeping current degradation level"
                )
            await asyncio.sleep(self._check_interval)

    async def stop(self):
        """Ask the loop in :meth:`start` to exit after its current cycle."""
        self._running = False

    async def _check_health(self):
        """Compute the health score and apply the matching level."""
        score = await self._calculate_health_score()
        if score >= 90:
            target = DegradationLevel.NORMAL
        elif score >= 70:
            target = DegradationLevel.MILD
        elif score >= 40:
            target = DegradationLevel.SIGNIFICANT
        else:
            target = DegradationLevel.EMERGENCY
        if target != self.manager.level:
            await self.manager.set_level(
                target,
                reason=f"Health score: {score}",
            )

    async def _calculate_health_score(self) -> float:
        """Return the combined CPU + memory + dependency score (0-100)."""
        checks = []
        # CPU check (0-25 points). cpu_percent(interval=1) blocks for the
        # whole sampling interval, so run it in a worker thread to keep the
        # event loop responsive.
        cpu = await asyncio.to_thread(psutil.cpu_percent, interval=1)
        checks.append(max(0, 25 * (1 - cpu / 100)))
        # Memory check (0-25 points)
        memory = psutil.virtual_memory().percent
        checks.append(max(0, 25 * (1 - memory / 100)))
        # Dependency checks (0-50 points)
        dep_score = await self._check_dependencies()
        checks.append(dep_score * 50)
        return sum(checks)

    async def _check_dependencies(self) -> float:
        """Returns 0.0-1.0 based on dependency health."""
        if not self._dependencies:
            return 1.0
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=2)
        ) as session:
            # Probe concurrently so one slow dependency does not delay the
            # others; each probe handles its own errors.
            results = await asyncio.gather(
                *(
                    self._probe(session, name, url)
                    for name, url in self._dependencies
                )
            )
        return sum(results) / len(results)

    async def _probe(self, session, name: str, url: str) -> bool:
        """Return True if ``url`` answers with HTTP 200, else log and return False."""
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    return True
                logger.warning(
                    "Dependency %s is unhealthy (HTTP %s)", name, resp.status
                )
        except Exception:
            logger.warning("Dependency %s is unhealthy", name)
        return False
Prometheus Metrics for Degradation
from prometheus_client import Gauge, Counter, Info
# Numeric mirror of the current degradation level.
degradation_level_gauge = Gauge(
    name="app_degradation_level",
    documentation="Current degradation level (0=normal, 3=emergency)",
)

# How many registered features are switched off right now.
# NOTE(review): a `_total` suffix is conventionally used for counters in
# Prometheus naming; the name is kept because renaming a metric breaks
# existing dashboards. Confirm before changing.
features_disabled_gauge = Gauge(
    name="app_features_disabled_total",
    documentation="Number of features currently disabled",
)

# One increment per level transition, labelled by source and target level.
degradation_changes = Counter(
    name="app_degradation_changes_total",
    documentation="Number of degradation level changes",
    labelnames=["from_level", "to_level"],
)
async def metrics_listener(old_level, new_level, reason):
    """Mirror a degradation level change into the Prometheus metrics.

    Sets the level gauge, increments the transition counter for the
    ``old_level`` -> ``new_level`` pair, and refreshes the gauge counting
    disabled features. ``reason`` is accepted but not used.
    """
    degradation_level_gauge.set(new_level.value)

    transition = degradation_changes.labels(
        from_level=old_level.name,
        to_level=new_level.name,
    )
    transition.inc()

    # Each disabled feature contributes True (== 1) to the sum.
    disabled_count = sum(
        not degradation.is_enabled(name) for name in degradation._features
    )
    features_disabled_gauge.set(disabled_count)
# Subscribe the listener so each set_level() call on the global manager
# also updates the metrics.
degradation.on_change(metrics_listener)
Testing Degradation Levels
import pytest
@pytest.fixture
def manager():
    """Provide a fresh manager with one feature registered per tier."""
    mgr = DegradationManager()
    for feature_name, tier in (
        ("cart", FeatureTier.CRITICAL),
        ("search", FeatureTier.IMPORTANT),
        ("reviews", FeatureTier.NICE_TO_HAVE),
    ):
        mgr.register(Feature(feature_name, tier))
    return mgr
def test_normal_all_enabled(manager):
    """At the default level every registered feature is enabled."""
    for feature_name in ("cart", "search", "reviews"):
        assert manager.is_enabled(feature_name)
@pytest.mark.asyncio
async def test_mild_drops_tier3(manager):
    """MILD keeps the critical and important tiers and drops nice-to-have."""
    await manager.set_level(DegradationLevel.MILD)
    availability = {
        name: manager.is_enabled(name)
        for name in ("cart", "search", "reviews")
    }
    assert availability == {"cart": True, "search": True, "reviews": False}
@pytest.mark.asyncio
async def test_significant_keeps_only_critical(manager):
    """SIGNIFICANT leaves only the critical tier enabled."""
    await manager.set_level(DegradationLevel.SIGNIFICANT)
    availability = {
        name: manager.is_enabled(name)
        for name in ("cart", "search", "reviews")
    }
    assert availability == {"cart": True, "search": False, "reviews": False}
@pytest.mark.asyncio
async def test_override_keeps_feature_despite_level(manager):
    """A manual override beats the level-based cutoff."""
    await manager.set_level(DegradationLevel.SIGNIFICANT)
    manager.override("search", True)
    search_enabled = manager.is_enabled("search")
    assert search_enabled, "override should win over the degradation level"
Graceful Degradation Checklist
Before deploying a degradation-aware system, verify:
- Every feature is classified into a tier. Unclassified features default to enabled, which defeats the purpose.
- Fallbacks exist for Tier 2 and 3 features. Disabling a feature should show a reasonable alternative, not a blank space.
- Degradation can be triggered manually via an admin endpoint. Automation is great, but operators need an override.
- Each level has been tested in a staging environment. Switch to each level and verify the user experience makes sense.
- Metrics and alerts are configured for degradation level changes. Your team should know within minutes when the system degrades.
- Recovery is automatic. When health improves, the system should step back up through levels. Don’t require manual intervention to restore normal operation.
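- Hysteresis prevents flapping. Require sustained improvement before upgrading levels — don’t oscillate between NORMAL and MILD every 10 seconds. Note that the HealthMonitor shown above switches levels on every single reading, so add a sustained-improvement requirement before relying on it in production.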
One thing to remember: Graceful degradation is a product decision disguised as an engineering pattern. The hardest part isn’t the code — it’s agreeing with stakeholders on what can be sacrificed when the system is under pressure. Have that conversation before the outage, not during it.
See Also
- Python Aggregate Pattern — Why grouping related objects under a single gatekeeper prevents data chaos in your Python application.
- Python Bounded Contexts — Why the same word means different things in different parts of your code — and why that is perfectly fine.
- Python Bulkhead Pattern — Why smart Python apps put walls between their parts — like a ship that stays afloat even with a hole in the hull.
- Python Circuit Breaker Pattern — How a circuit breaker saves your app from crashing — explained with a home electrical fuse analogy.
- Python Clean Architecture — Why your Python app should look like an onion — and how that saves you from painful rewrites.