Court Case Prediction with Python — Deep Dive

Build a Python court case prediction system with feature engineering from court records, judge profiling, legal text embeddings, and calibrated probability estimation.

Data collection from court records

Public court databases provide the training data. CourtListener’s bulk data API offers millions of opinions with structured metadata:

import httpx
from dataclasses import dataclass, field
from datetime import date


@dataclass
class CaseRecord:
    """A single court case as used by the collection and modelling code.

    Field order matters: it defines the positional constructor signature.
    Dates are kept as strings; downstream code parses them with
    ``datetime.fromisoformat``.
    """

    # Identifier of the case; the collector stores the API "id" as a string.
    case_id: str
    # Human-readable case caption.
    case_name: str
    # Court identifier as returned by the data source.
    court: str
    # Filing date string; parsed with datetime.fromisoformat when features
    # are extracted. May be empty when the source omits it.
    date_filed: str
    # Decision date string, or None when unknown. Cases without it are
    # skipped by the temporal train/test split.
    # NOTE(review): the collector fills this from the API's "dateArgued"
    # field — confirm that is the intended source.
    date_decided: str | None
    # Judge name; used as the grouping key for judge profiles.
    judge: str
    # Case category supplied by the caller of the collector.
    case_type: str
    # Outcome label: plaintiff_win, defendant_win, mixed, dismissed, settled.
    # The collector leaves this empty ("") until cases are labeled separately.
    outcome: str  # plaintiff_win, defendant_win, mixed, dismissed, settled
    # Citation count reported by the data source.
    citations_count: int
    # Text used for word counts and embeddings; the collector fills this
    # from the search result "snippet".
    opinion_text: str
    # Procedural posture; left empty by the collector.
    procedural_posture: str
    # Statutes cited in the case; each instance gets its own fresh list.
    statutes_cited: list[str] = field(default_factory=list)


class CourtDataCollector:
    """Fetch opinion search results from the CourtListener REST API.

    The instance owns one authenticated ``httpx.Client``. Call
    :meth:`close`, or use the collector as a context manager, to release
    the client's connections when finished.
    """

    BASE_URL = "https://www.courtlistener.com/api/rest/v4"

    def __init__(self, api_token: str):
        """Create a client that sends ``api_token`` in a ``Token`` auth header."""
        self.client = httpx.Client(
            headers={"Authorization": f"Token {api_token}"},
            timeout=60,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def collect_cases(
        self,
        court: str,
        case_type: str,
        start_date: date,
        end_date: date,
        max_results: int = 5000,
    ) -> list[CaseRecord]:
        """Collect case records for a specific court and filing-date range.

        Args:
            court: Court identifier sent as the ``court`` query parameter.
            case_type: Label stored on every returned record. It is not
                sent to the API as a filter.
            start_date: Inclusive lower bound on the filing date.
            end_date: Inclusive upper bound on the filing date.
            max_results: Maximum number of records to return.

        Returns:
            At most ``max_results`` records. ``outcome`` and
            ``procedural_posture`` are left empty for separate labeling.

        Raises:
            httpx.HTTPStatusError: If any page request returns an error
                status (via ``raise_for_status``).
        """
        cases: list[CaseRecord] = []
        url = f"{self.BASE_URL}/search/"
        # Query parameters are only needed for the first request; the
        # server-provided ``next`` URL already embeds them.
        params = {
            "court": court,
            "type": "o",
            "date_filed__gte": start_date.isoformat(),
            "date_filed__lte": end_date.isoformat(),
            "order_by": "dateFiled",
            "page_size": 100,
        }

        while url and len(cases) < max_results:
            response = self.client.get(url, params=params)
            response.raise_for_status()
            data = response.json()
            results = data.get("results", [])

            if not results:
                break

            cases.extend(
                CaseRecord(
                    case_id=str(r.get("id", "")),
                    case_name=r.get("caseName", ""),
                    court=r.get("court", ""),
                    date_filed=r.get("dateFiled", ""),
                    date_decided=r.get("dateArgued"),
                    judge=r.get("judge", ""),
                    case_type=case_type,
                    outcome="",  # labeled separately
                    citations_count=r.get("citeCount", 0),
                    opinion_text=r.get("snippet", ""),
                    procedural_posture="",
                    statutes_cited=[],
                )
                for r in results
            )

            # Pagination: follow the "next" link on the following
            # iteration. Passing params=None keeps that URL's own query
            # string intact; a missing link ends the loop.
            url = data.get("next")
            params = None

        return cases[:max_results]

Feature engineering

The quality of features determines model performance. Legal case features span structured metadata and extracted text features:

import numpy as np
from dataclasses import dataclass
from collections import Counter
from sklearn.preprocessing import LabelEncoder


@dataclass
class CaseFeatures:
    """Typed container describing the feature set for one case.

    NOTE(review): in the code visible here, ``FeatureExtractor.extract_features``
    returns a plain dict and does not build this class; several fields
    (e.g. ``num_parties``, ``num_prior_motions``, ``case_age_days``) have no
    visible producer — confirm where they are populated.
    """

    # Structured features
    # Integer codes for categorical values (presumably produced by the
    # LabelEncoders held by FeatureExtractor — TODO confirm).
    court_encoded: int
    judge_encoded: int
    case_type_encoded: int
    # Calendar components of the filing date.
    year_filed: int
    month_filed: int
    day_of_week_filed: int
    citations_in_opinion: int
    opinion_word_count: int

    # Judge historical features
    judge_total_cases: int
    # Fraction of the judge's cases with a "plaintiff_win" outcome.
    judge_plaintiff_win_rate: float
    judge_reversal_rate: float
    judge_years_on_bench: int

    # Case complexity features
    num_parties: int
    num_statutes_cited: int
    num_prior_motions: int
    case_age_days: int

    # Text-derived features
    text_embedding: np.ndarray  # from Legal-BERT


class FeatureExtractor:
    """Turn case records into structured features, including judge history."""

    def __init__(self):
        """Create empty encoders and an empty judge-statistics table."""
        self.court_encoder = LabelEncoder()
        self.judge_encoder = LabelEncoder()
        self.type_encoder = LabelEncoder()
        # Maps judge name -> statistics dict (see build_judge_profiles).
        self.judge_stats: dict[str, dict] = {}

    def build_judge_profiles(self, cases: list[CaseRecord]) -> None:
        """Compute historical statistics for each judge.

        Results are stored in ``self.judge_stats``; an existing entry for
        a judge that appears in ``cases`` is overwritten.

        Args:
            cases: Records to aggregate. ``total_cases`` counts every
                record for the judge, while ``plaintiff_win_rate`` is
                computed over labeled records only (non-empty
                ``outcome``). Unlabeled cases therefore do not count as
                plaintiff losses. A judge with no labeled cases gets the
                neutral prior 0.5.

        NOTE(review): profiles built from the same cases that are later
        featurized include each case's own outcome. Build them from
        strictly earlier cases if target leakage is a concern.
        """
        judge_cases: dict[str, list[CaseRecord]] = {}
        for case in cases:
            judge_cases.setdefault(case.judge, []).append(case)

        for judge, jcases in judge_cases.items():
            labeled_outcomes = [c.outcome for c in jcases if c.outcome]
            plaintiff_wins = sum(
                1 for o in labeled_outcomes if o == "plaintiff_win"
            )

            self.judge_stats[judge] = {
                "total_cases": len(jcases),
                "plaintiff_win_rate": (
                    plaintiff_wins / len(labeled_outcomes)
                    if labeled_outcomes
                    else 0.5
                ),
                "reversal_rate": 0.0,  # computed from appellate data
                "years_on_bench": 0,    # from judge metadata
            }

    def extract_features(self, case: CaseRecord) -> dict:
        """Extract all structured features for a single case.

        Args:
            case: The record to featurize. ``date_filed`` is parsed with
                ``datetime.fromisoformat``; when it is empty the current
                time is used instead.

        Returns:
            A dict of raw categorical values, filing-date components,
            text and citation counts, and the judge's statistics. Judges
            without a profile get zero counts and a 0.5 win rate.
        """
        from datetime import datetime

        filed = datetime.fromisoformat(case.date_filed) if case.date_filed else datetime.now()

        # Fallback profile for judges not seen by build_judge_profiles.
        judge_info = self.judge_stats.get(case.judge, {
            "total_cases": 0,
            "plaintiff_win_rate": 0.5,
            "reversal_rate": 0.0,
            "years_on_bench": 0,
        })

        return {
            "court": case.court,
            "judge": case.judge,
            "case_type": case.case_type,
            "year_filed": filed.year,
            "month_filed": filed.month,
            "day_of_week": filed.weekday(),
            "citations_count": case.citations_count,
            "opinion_word_count": len(case.opinion_text.split()),
            "judge_total_cases": judge_info["total_cases"],
            "judge_plaintiff_win_rate": judge_info["plaintiff_win_rate"],
            "judge_reversal_rate": judge_info["reversal_rate"],
            "judge_years_on_bench": judge_info["years_on_bench"],
            "num_statutes_cited": len(case.statutes_cited),
        }

Legal text embeddings

Text features capture nuances that structured features miss. Legal-BERT generates embeddings tuned for legal language:

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np


class LegalTextEncoder:
    """Produce fixed-size embeddings for legal text with a pretrained model."""

    def __init__(self, model_name: str = "nlpaueb/legal-bert-base-uncased"):
        """Load the tokenizer and model for ``model_name`` in eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()

    def encode(self, text: str, max_length: int = 512) -> np.ndarray:
        """Embed ``text`` as the model's first-token (CLS) hidden state.

        Input longer than ``max_length`` tokens is truncated.
        """
        tokenizer_kwargs = {
            "max_length": max_length,
            "truncation": True,
            "padding": True,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(text, **tokenizer_kwargs)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
            cls_vector = hidden_states[:, 0, :].squeeze()

        return cls_vector.numpy()

    def encode_long_document(
        self, text: str, chunk_size: int = 512, stride: int = 256
    ) -> np.ndarray:
        """Embed a long document by averaging overlapping window embeddings.

        The text is tokenized once without special tokens and cut into
        windows of ``chunk_size`` tokens starting every ``stride`` tokens.
        Windows shorter than 50 tokens are ignored. If no window
        qualifies, the first 1000 characters of ``text`` are embedded
        directly.
        """
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)

        windows = (
            token_ids[start:start + chunk_size]
            for start in range(0, len(token_ids), stride)
        )
        window_embeddings = [
            self.encode(self.tokenizer.decode(window))
            for window in windows
            if len(window) >= 50  # drop tiny trailing windows
        ]

        if window_embeddings:
            # Mean-pool the per-window embeddings into one vector.
            return np.mean(window_embeddings, axis=0)

        return self.encode(text[:1000])

Model training with calibrated probabilities

The prediction model needs well-calibrated probabilities, not just accuracy. Calibration means that, across all cases given a predicted probability of 75%, the predicted outcome actually occurs about 75% of the time:

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    accuracy_score, roc_auc_score, brier_score_loss,
    classification_report,
)
import pandas as pd


class CaseOutcomePredictor:
    """Predict plaintiff-win probability from structured and text features.

    A gradient-boosting classifier is wrapped in isotonic calibration.
    Each model input row is the structured features listed in
    ``_STRUCTURED_KEYS`` followed by the text embedding.
    """

    # Order of the structured features in every model input row. Training
    # and inference both read this single definition so they cannot drift.
    _STRUCTURED_KEYS = (
        "year_filed",
        "month_filed",
        "citations_count",
        "opinion_word_count",
        "judge_total_cases",
        "judge_plaintiff_win_rate",
        "judge_reversal_rate",
        "num_statutes_cited",
    )

    def __init__(self):
        """Build the calibrated model, the text encoder and the feature extractor."""
        base_model = GradientBoostingClassifier(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            min_samples_leaf=20,
        )
        # Calibrate probabilities using isotonic regression
        self.model = CalibratedClassifierCV(
            base_model, cv=5, method="isotonic"
        )
        self.text_encoder = LegalTextEncoder()
        self.feature_extractor = FeatureExtractor()

    def _build_feature_vector(self, case: CaseRecord) -> np.ndarray:
        """Return one model input row: structured features then text embedding."""
        structured = self.feature_extractor.extract_features(case)
        text_emb = self.text_encoder.encode(case.opinion_text)

        structured_vec = np.array(
            [structured[key] for key in self._STRUCTURED_KEYS],
            dtype=np.float32,
        )
        return np.concatenate([structured_vec, text_emb])

    def prepare_training_data(
        self, cases: list[CaseRecord]
    ) -> tuple[np.ndarray, np.ndarray]:
        """Convert cases to a feature matrix and binary labels.

        Judge profiles are rebuilt from ``cases`` first. Cases with an
        empty ``outcome`` are skipped. The label is 1 for
        ``"plaintiff_win"`` and 0 for every other outcome.

        Returns:
            ``(X, y)`` with one row per labeled case, in input order.
            Pass cases in chronological order if the result is fed to
            :meth:`train_and_evaluate`, which splits by row position.
        """
        self.feature_extractor.build_judge_profiles(cases)

        features_list = []
        labels = []

        for case in cases:
            if not case.outcome:
                continue

            features_list.append(self._build_feature_vector(case))
            labels.append(1 if case.outcome == "plaintiff_win" else 0)

        return np.array(features_list), np.array(labels)

    def train_and_evaluate(
        self, X: np.ndarray, y: np.ndarray
    ) -> dict:
        """Train with temporal cross-validation and report mean metrics.

        ``TimeSeriesSplit`` splits by row position, so the rows of ``X``
        and ``y`` must already be in chronological order for the folds
        to be free of future leakage.

        Returns:
            Mean ``accuracy``, ``auc`` and ``brier`` over the folds.
            ROC AUC is undefined for a test fold that contains a single
            class; such folds are left out of the AUC mean. If that
            leaves no fold, ``auc`` is NaN.

        After this call ``self.model`` is fitted on the training part of
        the last fold only.
        """
        tscv = TimeSeriesSplit(n_splits=5)
        metrics = {
            "accuracy": [], "auc": [], "brier": [],
        }

        for train_idx, test_idx in tscv.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)
            y_proba = self.model.predict_proba(X_test)[:, 1]

            metrics["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics["brier"].append(brier_score_loss(y_test, y_proba))
            # AUC needs both classes in the fold; skip it otherwise
            # instead of aborting the whole evaluation.
            if np.unique(y_test).size > 1:
                metrics["auc"].append(roc_auc_score(y_test, y_proba))

        return {
            k: np.mean(v) if v else float("nan")
            for k, v in metrics.items()
        }

    def predict(self, case: CaseRecord) -> dict:
        """Predict the outcome for a new case with the fitted model.

        Returns:
            A dict with the plaintiff and defendant win probabilities,
            ``confidence`` (the larger of the two) and ``prediction``,
            which is ``"plaintiff_win"`` when its probability exceeds 0.5
            and ``"defendant_win"`` otherwise.
        """
        combined = self._build_feature_vector(case).reshape(1, -1)
        proba = self.model.predict_proba(combined)[0]

        return {
            "plaintiff_win_probability": float(proba[1]),
            "defendant_win_probability": float(proba[0]),
            "confidence": float(max(proba)),
            "prediction": "plaintiff_win" if proba[1] > 0.5 else "defendant_win",
        }

Feature importance and explainability

Lawyers need to understand why a model predicts a certain outcome, not just the probability:

import shap
import numpy as np


class PredictionExplainer:
    """Rank features by SHAP contribution for a single prediction."""

    def __init__(self, model, feature_names: list[str]):
        """Wrap the tree estimator inside the first calibrated classifier.

        Args:
            model: Fitted object exposing ``calibrated_classifiers_``.
            feature_names: Names paired positionally with SHAP values.
        """
        tree_model = model.calibrated_classifiers_[0].estimator
        self.explainer = shap.TreeExplainer(tree_model)
        self.feature_names = feature_names

    def explain_prediction(
        self, features: np.ndarray
    ) -> list[tuple[str, float]]:
        """Return up to ten ``(feature, shap_value)`` pairs, largest impact first.

        Impact is the absolute SHAP value. Names and values are paired
        positionally, so pairing stops at the shorter of the two.
        """
        single_row = features.reshape(1, -1)
        raw = self.explainer.shap_values(single_row)

        # A list holds one array per class: take class 1. Otherwise take
        # the only row of the returned array.
        per_feature = raw[1][0] if isinstance(raw, list) else raw[0]

        ranked = sorted(
            zip(self.feature_names, per_feature),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        return ranked[:10]  # Top 10 most influential features

Temporal validation and avoiding data leakage

Legal prediction has a critical pitfall: using future information to predict past outcomes. The model must only train on cases decided before the case being predicted:

from datetime import datetime


def temporal_train_test_split(
    cases: list[CaseRecord],
    cutoff_date: str,
) -> tuple[list[CaseRecord], list[CaseRecord]]:
    """Partition cases around a cutoff date to avoid temporal leakage.

    Args:
        cases: Records to split. Those without a ``date_decided`` value
            are dropped from both partitions.
        cutoff_date: ISO-format date string marking the boundary.

    Returns:
        ``(train, test)``: cases decided strictly before the cutoff, and
        cases decided on or after it. Input order is preserved.
    """
    boundary = datetime.fromisoformat(cutoff_date)
    before_cutoff = []
    from_cutoff_on = []

    for record in cases:
        if record.date_decided:
            decided_at = datetime.fromisoformat(record.date_decided)
            bucket = before_cutoff if decided_at < boundary else from_cutoff_on
            bucket.append(record)

    return before_cutoff, from_cutoff_on

Ethical considerations and production guardrails

Bias detection — Models trained on historical decisions can perpetuate judicial biases. Monitor prediction disparities across demographic groups and case types. If the model predicts significantly different outcomes for similar cases based on party demographics, that’s a bias signal.

Confidence thresholds — Don’t present predictions below a confidence threshold. A model that says “55% plaintiff win” is barely better than a coin flip. Set minimum confidence levels (e.g., 65%) before surfacing predictions.

Disclaimer requirements — Any prediction tool must clearly state that predictions are statistical estimates, not legal advice. Courts have not recognized ML predictions as admissible evidence, and presenting them as definitive could be sanctionable.

Model drift — Legal landscapes change. New legislation, landmark rulings, and shifts in judicial philosophy make older training data less relevant. Retrain models regularly and monitor prediction accuracy on recent outcomes.

The one thing to remember: A production case prediction system combines structured case features, judge profiling, legal text embeddings, and calibrated gradient boosting with temporal validation — always framed as a strategic tool for litigation planning rather than a substitute for legal judgment.

Tags: python, legal-tech, machine-learning, prediction