A/B Testing ML Models in Python — Deep Dive

Hash-Based Traffic Splitting

Consistent, deterministic traffic splitting is essential. Using random assignment per request creates noisy results. Hash-based splitting ensures the same user always sees the same model variant:

import hashlib

def assign_variant(
    user_id: str,
    experiment_name: str,
    variants: dict[str, float]  # {"control": 0.5, "treatment": 0.5}
) -> str:
    """Map a user to a variant deterministically by hashing experiment and user.

    The SHA-256 digest of ``"<experiment_name>:<user_id>"`` is reduced to one
    of 10,000 buckets in [0, 1). Variants claim consecutive slices of that
    range in dict order, each slice as wide as the variant's weight.

    Args:
        user_id: Identifier of the user being assigned.
        experiment_name: Experiment identifier; part of the hash input, so the
            same user can land in different buckets for different experiments.
        variants: Mapping of variant name to traffic weight.

    Returns:
        The first variant whose cumulative weight exceeds the user's bucket,
        or the last variant when no slice contains the bucket (for example
        when the weights sum to less than 1).
    """
    digest = hashlib.sha256(f"{experiment_name}:{user_id}".encode()).digest()
    # Reading the raw digest big-endian yields the same integer as parsing
    # its hexadecimal form.
    position = (int.from_bytes(digest, "big") % 10000) / 10000  # 0.0 to 0.9999

    names = list(variants.keys())
    upper_edge = 0.0
    for name in names:
        upper_edge += variants[name]
        if position < upper_edge:
            return name

    # No slice contained the bucket: fall back to the last variant.
    return names[-1]

# Usage: "control" owns buckets [0.0, 0.8), "treatment" owns [0.8, 1.0)
variant = assign_variant(
    user_id="user_12345",
    experiment_name="rec_model_v2_test",
    variants={"control": 0.8, "treatment": 0.2},
)

This approach is stateless — no database lookup needed to determine a user’s assignment. The same user-experiment combination always produces the same variant.

Sample Size Calculation

from scipy import stats
import math

def required_sample_size(
    baseline_rate: float,
    minimum_detectable_effect: float,
    alpha: float = 0.05,
    power: float = 0.8
) -> int:
    """Calculate minimum users per group for a two-sided two-proportion z-test.

    Args:
        baseline_rate: Conversion rate of the control group, strictly between
            0 and 1.
        minimum_detectable_effect: Relative change to detect, e.g. 0.10 for a
            10% relative lift (5% -> 5.5%). May be negative (to detect a
            drop) but not zero.
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Desired statistical power (1 - beta), strictly between 0 and 1.

    Returns:
        The required number of users in each group, rounded up.

    Raises:
        ValueError: If any argument is out of range, or if the treatment rate
            implied by the effect falls outside (0, 1).
    """
    if not 0 < baseline_rate < 1:
        raise ValueError("baseline_rate must be strictly between 0 and 1")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    p1 = baseline_rate
    p2 = baseline_rate * (1 + minimum_detectable_effect)

    # A zero effect would need an infinite sample (division by zero below).
    if p2 == p1:
        raise ValueError("minimum_detectable_effect must be non-zero")
    if not 0 < p2 < 1:
        raise ValueError(
            "baseline_rate * (1 + minimum_detectable_effect) must be "
            "strictly between 0 and 1"
        )

    z_alpha = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value
    z_beta = stats.norm.ppf(power)

    p_avg = (p1 + p2) / 2

    # The first term uses the averaged rate; the second uses each group's
    # own variance.
    numerator = (z_alpha * math.sqrt(2 * p_avg * (1 - p_avg)) +
                 z_beta * math.sqrt(p1 * (1 - p1) + p2 * (1 - p2))) ** 2
    denominator = (p2 - p1) ** 2

    return int(math.ceil(numerator / denominator))

# Example: baseline CTR 5%, want to detect 10% relative lift (5% → 5.5%)
# with the default alpha (0.05) and power (0.8).
n = required_sample_size(baseline_rate=0.05, minimum_detectable_effect=0.10)
print(f"Need {n:,} users per group")  # ~31,234 per group

Frequentist Analysis

Two-Proportion Z-Test

import numpy as np
from scipy import stats

def analyze_ab_test(
    control_conversions: int,
    control_total: int,
    treatment_conversions: int,
    treatment_total: int,
    alpha: float = 0.05
) -> dict:
    """Analyze A/B test results with a two-sided two-proportion z-test.

    Args:
        control_conversions: Number of conversions in the control group.
        control_total: Number of users in the control group (must be > 0).
        treatment_conversions: Number of conversions in the treatment group.
        treatment_total: Number of users in the treatment group (must be > 0).
        alpha: Significance level, used for both the ``significant`` flag and
            the width of the confidence interval.

    Returns:
        A dict with:
          - ``control_rate`` / ``treatment_rate``: observed conversion rates.
          - ``relative_lift_pct``: (treatment - control) / control in percent;
            NaN when the control rate is zero, since the lift is undefined.
          - ``p_value``: two-sided p-value from the pooled z-test.
          - ``significant``: True when ``p_value < alpha``.
          - ``confidence_interval``: (lower, upper) bounds for the absolute
            difference, computed with the unpooled standard error.

    Raises:
        ValueError: If a total is not positive or a conversion count is
            outside ``[0, total]``.
    """
    if control_total <= 0 or treatment_total <= 0:
        raise ValueError("control_total and treatment_total must be positive")
    if not 0 <= control_conversions <= control_total:
        raise ValueError("control_conversions must be between 0 and control_total")
    if not 0 <= treatment_conversions <= treatment_total:
        raise ValueError("treatment_conversions must be between 0 and treatment_total")

    p_control = control_conversions / control_total
    p_treatment = treatment_conversions / treatment_total
    diff = p_treatment - p_control
    p_pool = (control_conversions + treatment_conversions) / (
        control_total + treatment_total
    )

    # Under H0 both groups share one rate, so the test uses the pooled SE.
    se = math.sqrt(p_pool * (1 - p_pool) * (1/control_total + 1/treatment_total))
    if se > 0:
        z_stat = diff / se
        # sf() evaluates the upper tail directly. 1 - cdf() rounds to exactly
        # 0.0 once |z| exceeds roughly 8.
        p_value = float(2 * stats.norm.sf(abs(z_stat)))
    else:
        # Pooled rate is exactly 0 or 1: the groups are identical, so there
        # is no evidence of a difference.
        p_value = 1.0

    # Confidence interval for the difference (unpooled standard error)
    se_diff = math.sqrt(
        p_control * (1 - p_control) / control_total +
        p_treatment * (1 - p_treatment) / treatment_total
    )
    z_crit = float(stats.norm.ppf(1 - alpha / 2))
    ci_lower = diff - z_crit * se_diff
    ci_upper = diff + z_crit * se_diff

    # Relative lift is undefined when the control never converts.
    relative_lift = diff / p_control * 100 if p_control > 0 else math.nan

    return {
        "control_rate": p_control,
        "treatment_rate": p_treatment,
        "relative_lift_pct": relative_lift,
        "p_value": p_value,
        "significant": bool(p_value < alpha),
        "confidence_interval": (ci_lower, ci_upper)
    }

# Example: 5.2% vs 5.7% conversion with 10,000 users per arm.
# The two-sided p-value is about 0.12, so this is not significant at the
# default alpha of 0.05.
result = analyze_ab_test(
    control_conversions=520,
    control_total=10000,
    treatment_conversions=570,
    treatment_total=10000,
)

Bayesian A/B Testing

Bayesian analysis provides intuitive results: “There is a 94% probability that Model B is better than Model A.”

import numpy as np

def bayesian_ab_test(
    control_conversions: int,
    control_total: int,
    treatment_conversions: int,
    treatment_total: int,
    n_simulations: int = 100_000
) -> dict:
    """Bayesian A/B test using Beta-Binomial model."""
    # Prior: Beta(1, 1) = uniform
    alpha_prior, beta_prior = 1, 1

    # Posterior distributions
    control_samples = np.random.beta(
        alpha_prior + control_conversions,
        beta_prior + control_total - control_conversions,
        n_simulations
    )
    treatment_samples = np.random.beta(
        alpha_prior + treatment_conversions,
        beta_prior + treatment_total - treatment_conversions,
        n_simulations
    )

    # Probability that treatment is better
    prob_treatment_better = (treatment_samples > control_samples).mean()

    # Expected lift distribution
    lift_samples = (treatment_samples - control_samples) / control_samples
    expected_lift = lift_samples.mean()
    lift_ci = (np.percentile(lift_samples, 2.5), np.percentile(lift_samples, 97.5))

    # Expected loss: how much we lose if we pick the wrong variant
    loss_if_choose_treatment = np.maximum(control_samples - treatment_samples, 0).mean()
    loss_if_choose_control = np.maximum(treatment_samples - control_samples, 0).mean()

    return {
        "prob_treatment_better": prob_treatment_better,
        "expected_lift": expected_lift,
        "lift_95_ci": lift_ci,
        "expected_loss_treatment": loss_if_choose_treatment,
        "expected_loss_control": loss_if_choose_control
    }

The “expected loss” metric is particularly useful: it answers “If I choose Model B and I’m wrong, how much do I lose on average?” When expected loss drops below a threshold (e.g., 0.1%), the decision is safe regardless of statistical significance.

Multi-Armed Bandits for Model Selection

Traditional A/B tests keep the traffic split fixed (often 50/50) for the whole experiment. Multi-armed bandits adaptively route more traffic to the better-performing model, reducing regret (the cost of showing the worse model):

import numpy as np

class ThompsonSamplingRouter:
    """Route traffic between models using Thompson Sampling."""

    def __init__(self, model_names: list[str]):
        self.models = {
            name: {"alpha": 1, "beta": 1}  # Beta prior
            for name in model_names
        }

    def select_model(self) -> str:
        """Sample from each model's posterior and pick the highest."""
        samples = {
            name: np.random.beta(params["alpha"], params["beta"])
            for name, params in self.models.items()
        }
        return max(samples, key=samples.get)

    def update(self, model_name: str, success: bool):
        """Update posterior with observed outcome."""
        if success:
            self.models[model_name]["alpha"] += 1
        else:
            self.models[model_name]["beta"] += 1

    def get_allocation(self) -> dict:
        """Current estimated win probability per model."""
        n_samples = 10_000
        win_counts = {name: 0 for name in self.models}

        for _ in range(n_samples):
            samples = {
                name: np.random.beta(params["alpha"], params["beta"])
                for name, params in self.models.items()
            }
            winner = max(samples, key=samples.get)
            win_counts[winner] += 1

        return {name: count / n_samples for name, count in win_counts.items()}

# Usage
# Every model starts from the same Beta(1, 1) prior, so early selections are
# effectively random until updates accumulate.
router = ThompsonSamplingRouter(["model_v1", "model_v2", "model_v3"])

# Simulation loop
# NOTE: incoming_requests, serve_model and observe_outcome are not defined in
# this section; they are presumably supplied by the serving application.
for request in incoming_requests:
    # Pick the model for this request by sampling each posterior.
    chosen = router.select_model()
    prediction = serve_model(chosen, request)
    # The observed outcome is passed as the success flag, so it should be
    # truthy for a success and falsy otherwise.
    outcome = observe_outcome(request)
    router.update(chosen, success=outcome)

When to Use Bandits vs A/B Tests

| Criterion | A/B Test | Multi-Armed Bandit |
| --- | --- | --- |
| Goal | Rigorous statistical proof | Minimize regret / maximize reward |
| Duration | Fixed (2-4 weeks typical) | Continuous |
| Traffic split | Even | Adaptive |
| Statistical validity | Strong (if run correctly) | Weaker (harder to get p-values) |
| Best for | Product launches, regulatory needs | Ongoing optimization, many variants |

Experiment Platform Architecture

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Experiment:
    """Configuration record describing a single experiment."""

    # Experiment identifier; ExperimentPlatform stores experiments under it.
    name: str
    variants: dict[str, float]  # variant_name -> traffic_weight
    # Presumably the name of the metric the experiment is judged on.
    primary_metric: str
    # Presumably names of metrics that must not degrade -- TODO confirm.
    guardrail_metrics: list[str]
    start_date: datetime
    end_date: datetime
    # Presumably per group, matching required_sample_size -- TODO confirm.
    min_sample_size: int
    # ExperimentPlatform.get_variant only assigns variants while this is
    # "running".
    status: str = "running"  # running, completed, stopped

class ExperimentPlatform:
    """In-memory registry of experiments with stateless variant lookup."""

    def __init__(self):
        # Registered experiments, keyed by experiment name.
        self.experiments: dict[str, Experiment] = {}
        # user_id -> {exp: variant}
        self.assignments: dict[str, dict] = {}

    def create_experiment(self, experiment: Experiment):
        """Register an experiment under its name, replacing any earlier entry."""
        key = experiment.name
        self.experiments[key] = experiment

    def get_variant(self, user_id: str, experiment_name: str) -> str | None:
        """Return the user's variant, or None if the experiment is not running.

        None is also returned when no experiment with that name is registered.
        """
        experiment = self.experiments.get(experiment_name)
        if experiment and experiment.status == "running":
            return assign_variant(user_id, experiment_name, experiment.variants)
        return None

    def log_event(self, user_id: str, experiment_name: str, metric: str, value: float):
        """Log metric observation for analysis (placeholder: does nothing yet)."""
        # In production: write to analytics warehouse
        return None

    def check_guardrails(self, experiment_name: str) -> dict:
        """Auto-stop experiment if guardrail metrics degrade.

        Placeholder: not implemented yet, so it currently returns None.
        """
        # Query warehouse, run statistical test on each guardrail
        # Return {"healthy": True/False, "violations": [...]}
        return None

Common Pitfalls

  1. Network effects — if users interact (social networks, marketplaces), treatment effects leak between groups. Use cluster randomization (randomize by city, school, or network cluster).

  2. Novelty effects — users may engage more with a new model simply because it is different. Run tests long enough for the novelty to wear off (typically 2+ weeks).

  3. Simpson’s paradox — overall metrics look flat but hide segment-level improvements and degradations that cancel out. Always segment results by key dimensions (device, country, user cohort).

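  4. Multiple testing — testing 10 metrics inflates the false positive rate: at α = 0.05, the chance of at least one false positive across 10 independent metrics is about 40% (1 − 0.95^10). Apply Bonferroni correction or use a hierarchical testing framework.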

  5. Survivorship bias — if Model B causes more users to drop off, comparing metrics only among surviving users makes Model B look better than it is. Always analyze on an intent-to-treat basis (count every user in the variant they were assigned to, whether or not they stayed).

One thing to remember: A well-designed model A/B test requires consistent user-level assignment, pre-calculated sample sizes, guardrail metrics, and patience — rushing to significance is the fastest way to deploy a worse model.

Tags: python, ab-testing, machine-learning, mlops

See Also