Court Case Prediction with Python — Deep Dive
Data collection from court records
Public court databases provide the training data. CourtListener’s REST API (which complements its bulk data exports) offers millions of opinions with structured metadata; the collector below pages through its search endpoint:
import httpx
from dataclasses import dataclass, field
from datetime import date
@dataclass
class CaseRecord:
case_id: str
case_name: str
court: str
date_filed: str
date_decided: str | None
judge: str
case_type: str
outcome: str # plaintiff_win, defendant_win, mixed, dismissed, settled
citations_count: int
opinion_text: str
procedural_posture: str
statutes_cited: list[str] = field(default_factory=list)
class CourtDataCollector:
    """Page through CourtListener search results and build ``CaseRecord`` objects.

    The instance owns an ``httpx.Client``; call ``close()`` (or use the
    collector as a context manager) when finished so the connection pool is
    released.
    """

    BASE_URL = "https://www.courtlistener.com/api/rest/v4"

    def __init__(self, api_token: str):
        # Every request carries the token in the Authorization header.
        self.client = httpx.Client(
            headers={"Authorization": f"Token {api_token}"},
            timeout=60,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self.client.close()

    def __enter__(self) -> "CourtDataCollector":
        return self

    def __exit__(self, *exc_info) -> None:
        self.close()

    def collect_cases(
        self,
        court: str,
        case_type: str,
        start_date: date,
        end_date: date,
        max_results: int = 5000,
    ) -> list[CaseRecord]:
        """Collect case records for a specific court and case type.

        Args:
            court: Value sent as the ``court`` query parameter.
            case_type: Stored on every returned record. It is NOT sent to the
                API, so it does not filter the search.
            start_date: Lower bound (inclusive) for ``date_filed``.
            end_date: Upper bound (inclusive) for ``date_filed``.
            max_results: Maximum number of records to return.

        Returns:
            At most ``max_results`` records, in the order the API returned
            them. ``outcome``, ``procedural_posture`` and ``statutes_cited``
            are left empty for later labeling.

        Raises:
            httpx.HTTPStatusError: If any page request returns an error status.
        """
        cases: list[CaseRecord] = []
        url = f"{self.BASE_URL}/search/"
        params = {
            "court": court,
            # NOTE(review): "o" presumably selects opinions - confirm against
            # the API documentation.
            "type": "o",
            "date_filed__gte": start_date.isoformat(),
            "date_filed__lte": end_date.isoformat(),
            "order_by": "dateFiled",
            "page_size": 100,
        }

        while url and len(cases) < max_results:
            response = self.client.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            results = data.get("results", [])
            if not results:
                break
            cases.extend(self._to_record(r, case_type) for r in results)

            # Pagination: the `next` URL already includes all params, so it is
            # requested as-is on the following iteration. (Previously the next
            # page was fetched and thrown away, and the loop re-requested the
            # base URL with no filters.)
            url = data.get("next")
            params = None

        return cases[:max_results]

    @staticmethod
    def _to_record(result: dict, case_type: str) -> CaseRecord:
        """Map one search-result dict onto a ``CaseRecord``."""
        return CaseRecord(
            case_id=str(result.get("id", "")),
            case_name=result.get("caseName", ""),
            court=result.get("court", ""),
            date_filed=result.get("dateFiled", ""),
            # NOTE(review): this stores the *argued* date in `date_decided`;
            # confirm that is the intended source field.
            date_decided=result.get("dateArgued"),
            judge=result.get("judge", ""),
            case_type=case_type,
            outcome="",  # labeled separately
            citations_count=result.get("citeCount", 0),
            # NOTE(review): `snippet` is presumably an excerpt, not the full
            # opinion text - verify before relying on word counts/embeddings.
            opinion_text=result.get("snippet", ""),
            procedural_posture="",
            statutes_cited=[],
        )
Feature engineering
The quality of features determines model performance. Legal case features span structured metadata and extracted text features:
import numpy as np
from dataclasses import dataclass
from collections import Counter
from sklearn.preprocessing import LabelEncoder
@dataclass
class CaseFeatures:
    """Container for the features describing one case.

    Fields are positional in the generated ``__init__`` in the order
    declared here.
    """

    # --- Structured features ---
    court_encoded: int
    judge_encoded: int
    case_type_encoded: int
    year_filed: int
    month_filed: int
    day_of_week_filed: int
    citations_in_opinion: int
    opinion_word_count: int

    # --- Judge historical features ---
    judge_total_cases: int
    judge_plaintiff_win_rate: float
    judge_reversal_rate: float
    judge_years_on_bench: int

    # --- Case complexity features ---
    num_parties: int
    num_statutes_cited: int
    num_prior_motions: int
    case_age_days: int

    # --- Text-derived features ---
    text_embedding: np.ndarray  # from Legal-BERT
class FeatureExtractor:
    """Derive per-case feature dictionaries, including per-judge statistics.

    Call ``build_judge_profiles`` first so that ``extract_features`` can look
    up each judge's history; judges that were never profiled get neutral
    defaults.
    """

    # Profile used for a judge that has not been seen by build_judge_profiles.
    _UNKNOWN_JUDGE = {
        "total_cases": 0,
        "plaintiff_win_rate": 0.5,
        "reversal_rate": 0.0,
        "years_on_bench": 0,
    }

    def __init__(self):
        self.court_encoder = LabelEncoder()
        self.judge_encoder = LabelEncoder()
        self.type_encoder = LabelEncoder()
        # judge name -> {"total_cases", "plaintiff_win_rate", ...}
        self.judge_stats: dict[str, dict] = {}

    def build_judge_profiles(self, cases: list[CaseRecord]):
        """Compute historical statistics for each judge.

        ``total_cases`` counts every case for the judge. The plaintiff win
        rate is computed over *labeled* cases only (non-empty ``outcome``):
        unlabeled records carry ``outcome == ""`` and would otherwise be
        counted as non-wins and drag the rate down. A judge with no labeled
        cases gets the neutral rate 0.5.

        Results are written into ``self.judge_stats``; entries for judges not
        present in ``cases`` are left untouched.
        """
        grouped: dict[str, list[CaseRecord]] = {}
        for case in cases:
            grouped.setdefault(case.judge, []).append(case)

        for judge, judge_cases in grouped.items():
            labeled = [c.outcome for c in judge_cases if c.outcome]
            plaintiff_wins = labeled.count("plaintiff_win")
            self.judge_stats[judge] = {
                "total_cases": len(judge_cases),
                "plaintiff_win_rate": (
                    plaintiff_wins / len(labeled) if labeled else 0.5
                ),
                "reversal_rate": 0.0,  # computed from appellate data
                "years_on_bench": 0,  # from judge metadata
            }

    def extract_features(self, case: CaseRecord) -> dict:
        """Extract all features for a single case.

        Returns:
            A flat dict with the raw ``court``/``judge``/``case_type`` values,
            date parts of the filing date, citation and word counts, the
            judge's profile values, and the number of statutes cited.

        Raises:
            ValueError: If ``case.date_filed`` is non-empty but not parseable
                by ``datetime.fromisoformat``.
        """
        from datetime import datetime

        # NOTE(review): a missing filing date falls back to the current time,
        # so the date features then reflect when extraction ran.
        filed = (
            datetime.fromisoformat(case.date_filed)
            if case.date_filed
            else datetime.now()
        )
        judge_info = self.judge_stats.get(case.judge, self._UNKNOWN_JUDGE)

        return {
            "court": case.court,
            "judge": case.judge,
            "case_type": case.case_type,
            "year_filed": filed.year,
            "month_filed": filed.month,
            "day_of_week": filed.weekday(),
            "citations_count": case.citations_count,
            # Whitespace-separated token count of the opinion text.
            "opinion_word_count": len(case.opinion_text.split()),
            "judge_total_cases": judge_info["total_cases"],
            "judge_plaintiff_win_rate": judge_info["plaintiff_win_rate"],
            "judge_reversal_rate": judge_info["reversal_rate"],
            "judge_years_on_bench": judge_info["years_on_bench"],
            "num_statutes_cited": len(case.statutes_cited),
        }
Legal text embeddings
Text features capture nuances that structured features miss. Legal-BERT generates embeddings tuned for legal language:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
class LegalTextEncoder:
    """Produce fixed-size embeddings for legal text with a pretrained model."""

    def __init__(self, model_name: str = "nlpaueb/legal-bert-base-uncased"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: switch the model to evaluation mode.
        self.model.eval()

    def encode(self, text: str, max_length: int = 512) -> np.ndarray:
        """Generate a fixed-size embedding for legal text.

        Text that tokenizes to more than ``max_length`` tokens is truncated.

        Args:
            text: The text to embed.
            max_length: Token limit passed to the tokenizer.

        Returns:
            The first-position (CLS) vector of the model's last hidden state,
            with the batch dimension squeezed away.
        """
        inputs = self.tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use CLS token embedding (position 0 of the sequence axis).
        embedding = outputs.last_hidden_state[:, 0, :].squeeze()
        return embedding.numpy()

    def encode_long_document(
        self,
        text: str,
        chunk_size: int = 512,
        stride: int = 256,
        min_chunk_tokens: int = 50,
    ) -> np.ndarray:
        """Handle documents longer than model's max length using sliding window.

        The text is tokenized once, split into windows of ``chunk_size``
        tokens that start every ``stride`` tokens, each window is embedded
        with ``encode``, and the embeddings are averaged.

        Windowing stops as soon as a window reaches the end of the document.
        Any later window would be a strict subset of that one and would
        over-weight the tail of the document in the average.

        Args:
            text: The document to embed.
            chunk_size: Window length in tokens.
            stride: Distance in tokens between window starts.
            min_chunk_tokens: Windows shorter than this are skipped.

        Returns:
            The mean of the window embeddings, or - when no window is long
            enough - the embedding of the first 1000 characters of ``text``.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        chunks = []
        for start in range(0, len(tokens), stride):
            window = tokens[start:start + chunk_size]
            if len(window) >= min_chunk_tokens:  # skip tiny trailing chunks
                # NOTE(review): the window is decoded and re-tokenized inside
                # encode(); if that adds special tokens, a full-size window
                # may lose its last tokens to truncation - confirm.
                chunks.append(self.encode(self.tokenizer.decode(window)))
            if start + chunk_size >= len(tokens):
                # This window already covers the end of the document.
                break

        if not chunks:
            return self.encode(text[:1000])
        # Average pool chunk embeddings
        return np.mean(chunks, axis=0)
Model training with calibrated probabilities
The prediction model needs well-calibrated probabilities, not just accuracy. A predicted 75% should mean the outcome occurs 75% of the time:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
accuracy_score, roc_auc_score, brier_score_loss,
classification_report,
)
import pandas as pd
class CaseOutcomePredictor:
    """Binary plaintiff-win classifier over structured and text features.

    A gradient-boosting model is wrapped in isotonic probability calibration.
    Each feature vector is eight structured values followed by the text
    embedding of the opinion.
    """

    # Order of the structured values at the front of every feature vector.
    # Training and prediction share this so the two can never drift apart.
    _STRUCTURED_KEYS = (
        "year_filed",
        "month_filed",
        "citations_count",
        "opinion_word_count",
        "judge_total_cases",
        "judge_plaintiff_win_rate",
        "judge_reversal_rate",
        "num_statutes_cited",
    )

    def __init__(self):
        base_model = GradientBoostingClassifier(
            n_estimators=500,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.8,
            min_samples_leaf=20,
        )
        # Calibrate probabilities using isotonic regression
        self.model = CalibratedClassifierCV(
            base_model, cv=5, method="isotonic"
        )
        self.text_encoder = LegalTextEncoder()
        self.feature_extractor = FeatureExtractor()

    def _build_feature_vector(self, case: CaseRecord) -> np.ndarray:
        """Return the 1-D feature vector (structured values + text embedding)."""
        structured = self.feature_extractor.extract_features(case)
        # encode() truncates long text; only the start of the opinion is used.
        text_emb = self.text_encoder.encode(case.opinion_text)
        structured_vec = np.array(
            [structured[key] for key in self._STRUCTURED_KEYS],
            dtype=np.float32,
        )
        return np.concatenate([structured_vec, text_emb])

    def prepare_training_data(
        self, cases: list[CaseRecord]
    ) -> tuple[np.ndarray, np.ndarray]:
        """Convert cases to feature matrix and labels.

        Cases without an ``outcome`` are skipped. The remaining rows are
        returned in chronological order of ``date_filed`` (NOT input order),
        because ``train_and_evaluate`` uses ``TimeSeriesSplit``, which treats
        row order as time order.

        Returns:
            ``(X, y)`` where ``y`` is 1 for ``"plaintiff_win"`` and 0 for
            every other labeled outcome.
        """
        # NOTE(review): judge profiles are built from ALL cases passed in,
        # including each case's own label and cases filed later, so
        # `judge_plaintiff_win_rate` leaks label information into the
        # features. Consider building profiles from earlier cases only.
        self.feature_extractor.build_judge_profiles(cases)

        # Sorting the date strings is chronological provided they share one
        # ISO-style format; the sort is stable, so ties keep input order.
        labeled = sorted(
            (case for case in cases if case.outcome),
            key=lambda case: case.date_filed or "",
        )

        features_list = [self._build_feature_vector(case) for case in labeled]
        # NOTE(review): mixed/dismissed/settled all become class 0, which
        # predict() reports as "defendant_win" - confirm this is intended.
        labels = [
            1 if case.outcome == "plaintiff_win" else 0 for case in labeled
        ]
        return np.array(features_list), np.array(labels)

    def train_and_evaluate(
        self, X: np.ndarray, y: np.ndarray
    ) -> dict:
        """Train with temporal cross-validation (no future leakage).

        Rows of ``X``/``y`` must be in chronological order: each split trains
        on an initial segment of the rows and tests on the rows after it.
        After the call, ``self.model`` is the model fitted on the last split.

        Returns:
            Mean ``accuracy``, ``auc`` and ``brier`` over the five splits.
        """
        tscv = TimeSeriesSplit(n_splits=5)
        metrics = {
            "accuracy": [], "auc": [], "brier": [],
        }
        for train_idx, test_idx in tscv.split(X):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            self.model.fit(X_train, y_train)

            y_pred = self.model.predict(X_test)
            y_proba = self.model.predict_proba(X_test)[:, 1]

            metrics["accuracy"].append(accuracy_score(y_test, y_pred))
            metrics["auc"].append(roc_auc_score(y_test, y_proba))
            metrics["brier"].append(brier_score_loss(y_test, y_proba))
        return {k: np.mean(v) for k, v in metrics.items()}

    def predict(self, case: CaseRecord) -> dict:
        """Predict outcome for a new case.

        Returns:
            A dict with both class probabilities, ``confidence`` (the larger
            of the two) and ``prediction`` (``"plaintiff_win"`` only when its
            probability exceeds 0.5).
        """
        combined = self._build_feature_vector(case).reshape(1, -1)
        proba = self.model.predict_proba(combined)[0]
        return {
            "plaintiff_win_probability": float(proba[1]),
            "defendant_win_probability": float(proba[0]),
            "confidence": float(max(proba)),
            "prediction": "plaintiff_win" if proba[1] > 0.5 else "defendant_win",
        }
Feature importance and explainability
Lawyers need to understand why a model predicts a certain outcome, not just the probability:
import shap
import numpy as np
class PredictionExplainer:
    """Rank features by their SHAP contribution to a single prediction."""

    def __init__(self, model, feature_names: list[str]):
        """Build a tree explainer for a fitted calibrated model.

        Args:
            model: Fitted object exposing ``calibrated_classifiers_``; only
                the FIRST calibrated classifier's underlying estimator is
                explained.
            feature_names: Names aligned with the feature vector positions.
        """
        self.explainer = shap.TreeExplainer(
            model.calibrated_classifiers_[0].estimator
        )
        self.feature_names = feature_names

    def explain_prediction(
        self, features: np.ndarray, top_k: int = 10
    ) -> list[tuple[str, float]]:
        """Return features ranked by their impact on the prediction.

        Args:
            features: Feature vector for one case (reshaped to a single row).
            top_k: Maximum number of (name, value) pairs to return.

        Returns:
            Up to ``top_k`` ``(feature_name, shap_value)`` pairs, largest
            absolute SHAP value first. If ``feature_names`` is shorter than
            the feature vector, only the named leading features are ranked.
        """
        shap_values = self.explainer.shap_values(features.reshape(1, -1))

        # The explainer's return shape varies; handle each form:
        #   - list of per-class arrays      -> use class 1
        #   - (samples, features) array     -> use the single row
        #   - (samples, features, outputs)  -> use class 1 of the single row
        if isinstance(shap_values, list):
            values = shap_values[1][0]
        else:
            values = np.asarray(shap_values)[0]
            if values.ndim == 2:
                values = values[:, 1] if values.shape[1] > 1 else values[:, 0]

        # Pair features with their SHAP values, most influential first.
        ranked = sorted(
            zip(self.feature_names, values),
            key=lambda pair: abs(pair[1]),
            reverse=True,
        )
        return ranked[:top_k]
Temporal validation and avoiding data leakage
Legal prediction has a critical pitfall: using future information to predict past outcomes. The model must only train on cases decided before the case being predicted:
from datetime import datetime
def temporal_train_test_split(
    cases: list[CaseRecord],
    cutoff_date: str,
) -> tuple[list[CaseRecord], list[CaseRecord]]:
    """Partition cases into (train, test) around a decision-date cutoff.

    Cases decided strictly before ``cutoff_date`` go to the first list;
    cases decided on or after it go to the second. Cases with no
    ``date_decided`` appear in neither list. Both dates are parsed with
    ``datetime.fromisoformat``.
    """
    boundary = datetime.fromisoformat(cutoff_date)
    earlier: list = []
    later: list = []
    for record in cases:
        decided_on = record.date_decided
        if not decided_on:
            # Undated cases cannot be placed on either side of the cutoff.
            continue
        is_before = datetime.fromisoformat(decided_on) < boundary
        (earlier if is_before else later).append(record)
    return earlier, later
Ethical considerations and production guardrails
Bias detection — Models trained on historical decisions can perpetuate judicial biases. Monitor prediction disparities across demographic groups and case types. If the model predicts significantly different outcomes for similar cases based on party demographics, that’s a bias signal.
Confidence thresholds — Don’t present predictions below a confidence threshold. A model that says “55% plaintiff win” is barely better than a coin flip. Set minimum confidence levels (e.g., 65%) before surfacing predictions.
Disclaimer requirements — Any prediction tool must clearly state that predictions are statistical estimates, not legal advice. Courts have not recognized ML predictions as admissible evidence, and presenting them as definitive could be sanctionable.
Model drift — Legal landscapes change. New legislation, landmark rulings, and shifts in judicial philosophy make older training data less relevant. Retrain models regularly and monitor prediction accuracy on recent outcomes.
The one thing to remember: A production case prediction system combines structured case features, judge profiling, legal text embeddings, and calibrated gradient boosting with temporal validation — always framed as a strategic tool for litigation planning rather than a substitute for legal judgment.
See Also
- Activation Functions: Why neural networks need these tiny mathematical functions — and how ReLU's simplicity accidentally made deep learning possible.
- AI Agents Architecture: How AI systems go from answering questions to actually doing things — the design patterns that turn language models into autonomous agents that browse, code, and plan.
- AI Agents: ChatGPT answers questions. AI agents actually do things — browse the web, write code, send emails, and keep going until the job is done. Here's the difference.
- AI Ethics: Why building AI fairly is harder than it sounds — bias, accountability, privacy, and who gets to decide what AI is allowed to do.
- AI Hallucinations: ChatGPT sometimes makes up facts with total confidence. Here's the weird reason why — and why it's not as simple as 'the AI lied.'