Data Anonymization in Python — Deep Dive
Building an anonymization pipeline
A production anonymization system needs a declarative configuration that maps each column to a transformation strategy. This makes the pipeline auditable and reproducible.
from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable
import pandas as pd
import numpy as np
from hashlib import sha256
class Strategy(Enum):
    """Closed set of transformations a column policy can select."""

    # Overwrite every value with a fixed placeholder.
    SUPPRESS = "suppress"
    # Replace values through a caller-supplied lookup table.
    GENERALIZE = "generalize"
    # Star out all but the trailing characters of each value.
    MASK = "mask"
    # Scale numeric values by random multiplicative noise.
    PERTURB = "perturb"
    # Assign numeric values to labelled ranges.
    BUCKET = "bucket"
@dataclass
class ColumnPolicy:
    """Declares which transformation to apply to one DataFrame column."""

    # Name of the DataFrame column this policy targets.
    column: str
    # Transformation strategy to apply to that column.
    strategy: Strategy
    # Optional strategy-specific settings; None leaves every setting at the
    # transform's own default.
    params: dict[str, Any] | None = None
class AnonymizationPipeline:
    """Apply per-column anonymization policies to a DataFrame.

    Each ``ColumnPolicy`` names a column and a ``Strategy``. ``run`` sends
    every configured column through the matching transform and returns a
    new DataFrame.
    """

    def __init__(self, policies: list[ColumnPolicy]):
        """Index the policies by column and build the strategy dispatch table.

        If several policies name the same column, the last one wins.
        """
        self.policies = {p.column: p for p in policies}
        self._transforms: dict[Strategy, Callable] = {
            Strategy.SUPPRESS: self._suppress,
            Strategy.GENERALIZE: self._generalize,
            Strategy.MASK: self._mask,
            Strategy.PERTURB: self._perturb,
            Strategy.BUCKET: self._bucket,
        }

    def run(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return an anonymized copy of ``df``; the input is not modified.

        Columns without a policy are copied through unchanged. Policies whose
        column is absent from ``df`` are skipped silently, so a misspelled
        column name leaves the real column untouched.
        """
        result = df.copy()
        for col, policy in self.policies.items():
            if col not in result.columns:
                continue
            transform = self._transforms[policy.strategy]
            result[col] = transform(result[col], policy.params or {})
        return result

    def _suppress(self, series: pd.Series, params: dict) -> pd.Series:
        """Replace every value with the literal ``"[REDACTED]"``."""
        return pd.Series(["[REDACTED]"] * len(series), index=series.index)

    def _generalize(self, series: pd.Series, params: dict) -> pd.Series:
        """Replace values found in ``params["mapping"]``.

        Lookup is by exact key, so values missing from the mapping (including
        values of a different type than the keys) pass through unchanged.
        """
        mapping = params.get("mapping", {})
        return series.map(lambda x: mapping.get(x, x))

    def _mask(self, series: pd.Series, params: dict) -> pd.Series:
        """Replace all but the last ``keep_last`` characters with ``*``.

        Values are converted to ``str`` first. Values no longer than
        ``keep_last`` (default 4) are returned unmasked.

        Raises:
            ValueError: if ``keep_last`` is negative.
        """
        keep_last = params.get("keep_last", 4)
        if keep_last < 0:
            raise ValueError(f"keep_last must be non-negative, got {keep_last}")

        def mask_value(text: str) -> str:
            hidden = len(text) - keep_last
            if hidden <= 0:
                return text
            # Slice from the front rather than with text[-keep_last:]:
            # text[-0:] is the whole string and would leak the value when
            # keep_last == 0.
            return "*" * hidden + text[hidden:]

        return series.astype(str).map(mask_value)

    def _perturb(self, series: pd.Series, params: dict) -> pd.Series:
        """Scale each value by ``1 + noise`` and round the result.

        ``noise`` is drawn per value from a normal distribution with mean 0
        and standard deviation ``noise_percent / 100`` (default 10), using
        NumPy's global random state. The result is rounded to ``decimals``
        places (default 2).
        """
        noise_pct = params.get("noise_percent", 10)
        noise = np.random.normal(0, noise_pct / 100, len(series))
        return (series * (1 + noise)).round(params.get("decimals", 2))

    def _bucket(self, series: pd.Series, params: dict) -> pd.Series:
        """Assign each value to a labelled, left-closed bin via ``pd.cut``.

        The default bins are age bands. Their top edge is unbounded so that
        the "60+" label also covers values of 100 and above; with a finite
        edge of 100 and ``right=False`` those values would fall outside every
        bin and become NaN. Values outside caller-supplied bins still become
        NaN.
        """
        bins = params.get("bins", [0, 18, 30, 45, 60, np.inf])
        labels = params.get("labels", ["<18", "18-29", "30-44", "45-59", "60+"])
        return pd.cut(series, bins=bins, labels=labels, right=False)
Usage looks clean and declarative:
policies = [
    # Direct identifiers are removed outright.
    ColumnPolicy("name", Strategy.SUPPRESS),
    ColumnPolicy("email", Strategy.SUPPRESS),
    ColumnPolicy("phone", Strategy.MASK, {"keep_last": 4}),
    ColumnPolicy("age", Strategy.BUCKET, {
        # The top edge is unbounded so "60+" also covers ages of 100 and
        # above; with a finite edge of 100 those rows would become NaN.
        "bins": [0, 18, 30, 45, 60, np.inf],
        "labels": ["<18", "18-29", "30-44", "45-59", "60+"],
    }),
    ColumnPolicy("salary", Strategy.PERTURB, {"noise_percent": 5}),
    ColumnPolicy("zip_code", Strategy.GENERALIZE, {
        # Cover every 5-digit ZIP string from "00000" to "99999", including
        # those with leading zeros. GENERALIZE passes unmapped values through
        # unchanged, so any ZIP missing here would be released in full.
        # The zip_code column must hold 5-character strings for the keys to
        # match.
        "mapping": {
            f"{z:05d}": f"{z:05d}"[:3] + "**" for z in range(100_000)
        },
    }),
]
pipeline = AnonymizationPipeline(policies)
anonymized_df = pipeline.run(original_df)
Implementing k-anonymity
K-anonymity requires that every combination of quasi-identifier values present in the dataset is shared by at least k records. Here’s how to check and enforce it:
def check_k_anonymity(df: pd.DataFrame, quasi_identifiers: list[str], k: int) -> dict:
    """Check if a dataset satisfies k-anonymity.

    Rows are grouped by the quasi-identifier columns; the dataset is
    k-anonymous when every group that occurs has at least ``k`` rows.
    Rows with a missing quasi-identifier value are excluded by pandas'
    default ``groupby`` handling.

    Returns:
        A dict with the verdict, ``k``, the smallest group size (0 when
        there are no groups), the number of violating groups, the total
        number of groups, and how many records sit in violating groups.
        An empty dataset is reported as satisfying k-anonymity.
    """
    # observed=True: count only combinations that actually occur. Without it,
    # categorical quasi-identifiers (e.g. the output of pd.cut) can add
    # zero-size groups for unobserved category combinations, which would
    # always be reported as violations.
    group_sizes = df.groupby(quasi_identifiers, observed=True).size()
    violations = group_sizes[group_sizes < k]
    return {
        "satisfies_k_anonymity": bool(violations.empty),
        "k": k,
        # min() of an empty Series is NaN, which int() cannot convert.
        "min_group_size": int(group_sizes.min()) if len(group_sizes) else 0,
        "violating_groups": len(violations),
        "total_groups": len(group_sizes),
        # The sizes of the violating groups add up to the number of records
        # they contain, so no merge back onto df is needed.
        "records_in_violating_groups": int(violations.sum()),
    }
def enforce_k_anonymity(
    df: pd.DataFrame,
    quasi_identifiers: list[str],
    k: int,
    generalization_hierarchies: dict[str, list[Callable]],
) -> pd.DataFrame:
    """Generalize quasi-identifiers step by step until the data is k-anonymous.

    Works on a copy of ``df``. Quasi-identifiers are processed in the order
    given; for each one, the functions in its hierarchy are applied one
    after another, each to the output of the previous level. The function
    returns as soon as ``check_k_anonymity`` reports success. If every
    hierarchy is exhausted and the data is still not k-anonymous, rows
    belonging to groups smaller than ``k`` are dropped. That final step
    goes through ``DataFrame.merge``, so the returned frame gets a fresh
    index.

    generalization_hierarchies maps each QI to a list of increasingly
    aggressive generalization functions; a QI without an entry is left as is.
    """
    anonymized = df.copy()

    def is_k_anonymous() -> bool:
        report = check_k_anonymity(anonymized, quasi_identifiers, k)
        return report["satisfies_k_anonymity"]

    for qi in quasi_identifiers:
        for generalize_fn in generalization_hierarchies.get(qi, []):
            # Test before each step so no column is coarsened more than needed.
            if is_k_anonymous():
                return anonymized
            anonymized[qi] = anonymized[qi].map(generalize_fn)

    if is_k_anonymous():
        return anonymized

    # Generalization alone was not enough: keep only rows whose
    # quasi-identifier combination occurs at least k times.
    counts = anonymized.groupby(quasi_identifiers).size().reset_index(name="count")
    large_enough = counts[counts["count"] >= k].drop(columns="count")
    return anonymized.merge(large_enough, on=quasi_identifiers)
Define generalization hierarchies for common quasi-identifiers:
def _floor_to(width):
    """Build a generalizer that rounds a number down to a multiple of ``width``."""
    return lambda x: (x // width) * width


def _keep_zip_prefix(digits):
    """Build a generalizer that keeps ``digits`` leading characters.

    The remainder is padded with ``*`` up to a total of five characters.
    """
    return lambda x: str(x)[:digits] + "*" * (5 - digits)


# Age: exact → 5-year bins → 10-year bins → 20-year bins
age_hierarchy = [
    _floor_to(5),   # 27 → 25
    _floor_to(10),  # 27 → 20
    _floor_to(20),  # 27 → 20
]

# Zip code: 5 digits → 4 digits → 3 digits
zip_hierarchy = [
    _keep_zip_prefix(4),  # "12345" → "1234*"
    _keep_zip_prefix(3),  # "12345" → "123**"
]

hierarchies = {"age": age_hierarchy, "zip_code": zip_hierarchy}
L-diversity and t-closeness
K-anonymity alone has weaknesses. If everyone in a k-anonymous group has the same sensitive attribute (e.g., all have “cancer” as their diagnosis), the attribute is revealed despite anonymity.
L-diversity extends k-anonymity by requiring that each equivalence class has at least l “well-represented” values for each sensitive attribute.
T-closeness goes further, requiring that the distribution of sensitive attributes within each group is close to the distribution in the overall dataset, measured by a threshold t.
from scipy.stats import wasserstein_distance
def check_l_diversity(
    df: pd.DataFrame,
    quasi_identifiers: list[str],
    sensitive_col: str,
    l: int,
) -> dict:
    """Check if dataset satisfies (distinct) l-diversity.

    Rows are grouped by the quasi-identifier columns, and each group must
    contain at least ``l`` distinct non-null values of ``sensitive_col``.

    Returns:
        A dict with the verdict, ``l``, the smallest number of distinct
        sensitive values in any group (0 when there are no groups), and the
        number of violating groups. An empty dataset is reported as
        satisfying l-diversity.
    """
    # observed=True: skip unobserved combinations of categorical
    # quasi-identifiers, which would otherwise show up as groups with zero
    # distinct values and always count as violations.
    distinct_counts = (
        df.groupby(quasi_identifiers, observed=True)[sensitive_col].nunique()
    )
    violating = distinct_counts[distinct_counts < l]
    # min() of an empty Series is NaN, which int() cannot convert.
    min_diversity = int(distinct_counts.min()) if len(distinct_counts) else 0
    return {
        "satisfies_l_diversity": bool(violating.empty),
        "l": l,
        "min_diversity": min_diversity,
        "violating_groups": len(violating),
    }
def check_t_closeness(
    df: pd.DataFrame,
    quasi_identifiers: list[str],
    sensitive_col: str,
    t: float,
) -> dict:
    """Check t-closeness using Earth Mover's Distance.

    For every quasi-identifier group, the distribution of ``sensitive_col``
    inside the group is compared with its distribution over the whole
    dataset. The dataset satisfies t-closeness when the largest distance
    does not exceed ``t``.

    The ground distance follows the usual t-closeness definitions, so the
    result always lies in [0, 1]:

    * numeric attribute: ordered distance. Values are ranked and the EMD
      over the ranks is divided by ``m - 1``, where ``m`` is the number of
      distinct values.
    * any other attribute: equal distance, for which the EMD reduces to the
      total variation distance ``0.5 * sum(|p_i - q_i|)``.

    Null sensitive values are ignored; a group with no non-null sensitive
    values is skipped.

    Returns:
        A dict with the verdict, ``t`` and the largest distance found,
        rounded to four decimals.
    """
    overall = df[sensitive_col].value_counts(normalize=True).sort_index()
    support = overall.index
    overall_probs = overall.to_numpy(dtype=float)
    n_values = len(support)
    ordered = pd.api.types.is_numeric_dtype(df[sensitive_col]) and n_values > 1
    ranks = np.arange(n_values)

    max_distance = 0.0
    # observed=True: do not visit unobserved combinations of categorical
    # quasi-identifiers, which contain no rows.
    for _, group in df.groupby(quasi_identifiers, observed=True):
        counts = group[sensitive_col].value_counts()
        total = counts.sum()
        if total == 0:
            continue
        # Every group value also occurs overall, so the overall index is the
        # common support; values absent from the group get probability 0.
        group_probs = (
            (counts / total).reindex(support, fill_value=0.0).to_numpy(dtype=float)
        )
        if ordered:
            # The probabilities are *weights* on the ranked values. Passing
            # them as the values themselves would compare the sets of
            # probability numbers and ignore which value each one belongs to.
            distance = wasserstein_distance(
                ranks, ranks, u_weights=overall_probs, v_weights=group_probs
            ) / (n_values - 1)
        else:
            distance = 0.5 * np.abs(overall_probs - group_probs).sum()
        max_distance = max(max_distance, float(distance))

    return {
        "satisfies_t_closeness": bool(max_distance <= t),
        "t": t,
        "max_distance": round(max_distance, 4),
    }
Synthetic data generation
When anonymization degrades data quality too much, synthetic data provides an alternative. The Synthetic Data Vault (SDV) library learns statistical properties from real data and generates new records that preserve correlations without corresponding to real individuals.
# pip install sdv
from sdv.evaluation.single_table import evaluate_quality
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer

# Describe the table: auto-detect column types from the real data, then
# explicitly set the sdtype of the two identifier columns.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_df)
for pii_column in ("email", "name"):
    # For both columns the sdtype happens to share the column's name.
    metadata.update_column(pii_column, sdtype=pii_column)

# Fit the synthesizer on the real data.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_df)

# Draw synthetic rows from the fitted model.
synthetic_df = synthesizer.sample(num_rows=10_000)

# Score the synthetic data against the real data.
quality_report = evaluate_quality(real_df, synthetic_df, metadata)
overall_score = quality_report.get_score()
print(f"Overall quality score: {overall_score:.2%}")
SDV preserves statistical correlations (e.g., “age correlates with salary”) while generating entirely fictional individuals. However, for small or highly unique datasets, there’s still a risk that synthetic records closely resemble real ones. Always validate with a membership inference test.
Re-identification risk assessment
Before releasing anonymized data, quantify the risk:
def assess_reidentification_risk(
    df: pd.DataFrame,
    quasi_identifiers: list[str],
) -> dict:
    """Estimate re-identification risk based on uniqueness.

    Rows are grouped by the quasi-identifier columns and three figures are
    derived from the group sizes:

    * ``prosecutor_risk``: share of records that are unique on the
      quasi-identifiers (group size 1).
    * ``journalist_risk``: ``1 / smallest group size``, the highest
      re-identification probability of any record.
    * ``marketer_risk``: average re-identification probability per record.
      Each record contributes ``1 / its group size``, so the sum over all
      records equals the number of groups and the average is
      ``groups / records``.

    For an empty dataset the prosecutor and marketer risks are reported as
    0.0 instead of dividing by zero; the journalist risk keeps its
    worst-case fallback of 1.0. The recommendation is derived from
    ``prosecutor_risk`` only.
    """
    # observed=True: ignore unobserved combinations of categorical
    # quasi-identifiers, whose size of 0 would otherwise cause a division by
    # zero in the journalist risk.
    group_sizes = df.groupby(quasi_identifiers, observed=True).size()
    total_records = len(df)
    total_groups = len(group_sizes)
    unique_records = int((group_sizes == 1).sum())

    if total_records == 0:
        prosecutor_risk = 0.0
        marketer_risk = 0.0
    else:
        prosecutor_risk = unique_records / total_records
        # Summing 1/size once per *group* would undercount: each group of
        # size s holds s records contributing 1/s each, i.e. 1 per group.
        marketer_risk = total_groups / total_records

    journalist_risk = float(1.0 / group_sizes.min()) if total_groups > 0 else 1.0

    if prosecutor_risk < 0.05:
        recommendation = "LOW RISK"
    elif prosecutor_risk < 0.2:
        recommendation = "MODERATE RISK"
    else:
        recommendation = "HIGH RISK — further anonymization needed"

    return {
        "unique_records": unique_records,
        "total_records": total_records,
        "prosecutor_risk": round(prosecutor_risk, 4),
        "journalist_risk": round(journalist_risk, 4),
        "marketer_risk": round(marketer_risk, 4),
        "recommendation": recommendation,
    }
Tradeoffs
Privacy vs. utility: Every anonymization step reduces data quality. The goal is finding the minimum transformation that achieves your privacy threshold. Over-anonymizing makes data useless; under-anonymizing risks exposure.
Deterministic vs. probabilistic: Deterministic techniques (like bucketing) are reproducible but predictable. Probabilistic techniques (like perturbation) are harder to reverse but introduce statistical uncertainty.
Linkage attacks: Even well-anonymized datasets can be de-anonymized by linking with external data. Always consider what auxiliary information an adversary might have when designing your anonymization strategy.
The one thing to remember: Effective anonymization combines multiple techniques — suppression for direct identifiers, generalization for quasi-identifiers, and formal guarantees like k-anonymity — and then validates with re-identification risk assessment before releasing the data.
See Also
- Python Compliance Audit Trails: Why your Python app needs a tamper-proof diary that records every important action — like a security camera for your data
- Python Consent Management: How Python apps ask permission like a polite guest — and remember exactly what you said yes and no to
- Python Data Retention Policies: Why your Python app needs an expiration date for data — just like the one on milk cartons — and what happens when data goes stale
- Python Differential Privacy: How adding a pinch of random noise to data lets companies learn from millions of people without knowing anything about any single person
- Python GDPR Compliance: Why Europe's privacy law is like a restaurant that must tell you every ingredient — and how Python apps follow the recipe