ROC and AUC Curves in Python — Deep Dive
Computing ROC and AUC in Scikit-Learn
Binary Classification
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split
# reproducible. X and y are expected to be defined before this snippet runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the baseline classifier on the training portion only.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is used as the score
# for the positive class (assumes a binary problem — confirm against y).
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# ROC operating points and the area under the curve, both from the same scores.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.4f}")
The roc_curve function returns arrays of FPR, TPR, and the corresponding thresholds. Each triple defines one point on the curve.
Plotting
import matplotlib.pyplot as plt
# Square canvas; combined with the equal aspect below, both rate axes share
# the same scale.
fig, ax = plt.subplots(figsize=(7, 7))

# Model curve first, then the chance diagonal, so the legend keeps that order.
ax.plot(fpr, tpr, linewidth=2, label=f"Logistic Regression (AUC = {auc:.3f})")
ax.plot([0, 1], [0, 1], "k--", label="Random (AUC = 0.5)")

# Axis labels, title and aspect ratio set in a single call.
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve",
    aspect="equal",
)
ax.legend(loc="lower right")

# Tighten the layout before writing the figure to disk.
fig.tight_layout()
fig.savefig("roc_curve.png", dpi=150)
Comparing Multiple Models
Plot several models on the same axes for visual comparison:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
def _positive_class_scores(estimator, features):
    """Return one continuous score per sample for ROC analysis.

    Uses column 1 of ``predict_proba`` when the estimator provides that
    method, and falls back to ``decision_function`` otherwise.
    """
    if not hasattr(estimator, "predict_proba"):
        return estimator.decision_function(features)
    return estimator.predict_proba(features)[:, 1]


# Candidate classifiers, keyed by the label shown in the legend.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
}

fig, ax = plt.subplots(figsize=(8, 8))

# Fit every candidate on the same training split and overlay its ROC curve.
for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_score = _positive_class_scores(clf, X_test)
    fpr_m, tpr_m, _ = roc_curve(y_test, y_score)
    auc_m = roc_auc_score(y_test, y_score)
    ax.plot(fpr_m, tpr_m, linewidth=2, label=f"{name} (AUC = {auc_m:.3f})")

# Unlabelled chance diagonal for reference (kept out of the legend).
ax.plot([0, 1], [0, 1], "k--", alpha=0.5)
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Model Comparison — ROC Curves",
)
ax.legend()
fig.tight_layout()
Finding the Optimal Threshold
Youden’s J Statistic
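Youden's J statistic is J = TPR − FPR, which equals sensitivity + specificity − 1. It is largest at the point on the ROC curve farthest (vertically) from the diagonal, so maximizing J maximizes the sum of sensitivity and specificity: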
import numpy as np
# Youden's J at every ROC operating point: J = TPR - FPR.
j_scores = np.subtract(tpr, fpr)

# Position of the largest J (the first one if several points tie), and the
# score threshold that produces that operating point.
best_idx = j_scores.argmax()
best_threshold = thresholds[best_idx]

print(f"Optimal threshold (Youden's J): {best_threshold:.3f}")
print(f"TPR: {tpr[best_idx]:.3f}, FPR: {fpr[best_idx]:.3f}")
Cost-Based Threshold Selection
When false positives and false negatives have different costs:
cost_fp = 10   # Cost of false alarm
cost_fn = 500  # Cost of missed positive

# Share of positives in the evaluation labels (assumes y_test is coded 0/1 —
# confirm against the dataset).
prevalence = y_test.mean()

# Expected per-sample cost at every ROC operating point, computed in one
# vectorised pass instead of an index loop:
#   false alarms:     FPR * (1 - prevalence) * cost_fp
#   missed positives: (1 - TPR) * prevalence * cost_fn
costs = (
    fpr * (1 - prevalence) * cost_fp
    + (1 - tpr) * prevalence * cost_fn
)

# Operating point with the lowest expected cost and its score threshold.
best_cost_idx = np.argmin(costs)
print(f"Cost-optimal threshold: {thresholds[best_cost_idx]:.3f}")
print(f"Expected cost: ${costs[best_cost_idx]:.2f}")
Multi-Class ROC Curves
One-vs-Rest (OvR)
Compute a separate ROC curve for each class against all others:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
# One indicator column per class (labels are expected to be 0, 1, 2), and one
# probability column per class from the fitted model.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_score = model.predict_proba(X_test)

class_names = ["Setosa", "Versicolor", "Virginica"]
fig, ax = plt.subplots(figsize=(8, 8))

# Treat each class in turn as "positive" against all the others.
for i, name in enumerate(class_names):
    is_this_class = y_test_bin[:, i]
    this_class_score = y_score[:, i]
    fpr_i, tpr_i, _ = roc_curve(is_this_class, this_class_score)
    auc_i = auc(fpr_i, tpr_i)
    ax.plot(fpr_i, tpr_i, label=f"{name} (AUC = {auc_i:.3f})")

# Chance diagonal for reference.
ax.plot([0, 1], [0, 1], "k--")
ax.set(title="Multi-Class ROC (One-vs-Rest)")
ax.legend()
Macro and Weighted AUC
from sklearn.metrics import roc_auc_score
# Same one-vs-rest computation under two averaging schemes: "macro" and
# "weighted".
auc_macro, auc_weighted = (
    roc_auc_score(y_test_bin, y_score, average=scheme, multi_class="ovr")
    for scheme in ("macro", "weighted")
)
print(f"Macro AUC: {auc_macro:.4f}")
print(f"Weighted AUC: {auc_weighted:.4f}")
Confidence Intervals via Bootstrapping
A single AUC number needs uncertainty quantification:
from sklearn.utils import resample
n_bootstraps = 1000
rng = np.random.RandomState(42)

# Work on plain NumPy arrays so the resampled integer indices are positional.
# If y_test is a pandas Series, y_test[indices] would be a label-based lookup
# against the shuffled index left behind by train_test_split.
y_true_arr = np.asarray(y_test)
y_score_arr = np.asarray(y_proba)
n_samples = len(y_true_arr)

aucs = []
for _ in range(n_bootstraps):
    # Resample row positions with replacement.
    indices = rng.randint(0, n_samples, n_samples)
    sampled_truth = y_true_arr[indices]
    # AUC is undefined when a resample contains a single class; skip it.
    if len(np.unique(sampled_truth)) < 2:
        continue
    aucs.append(roc_auc_score(sampled_truth, y_score_arr[indices]))
aucs = np.array(aucs)

# Percentile bootstrap interval: central 95% of the resampled AUCs.
ci_lower, ci_upper = np.percentile(aucs, [2.5, 97.5])

# Compute the point estimate here instead of reusing the name `auc`, which
# this file also imports as a function from sklearn.metrics.
point_auc = roc_auc_score(y_true_arr, y_score_arr)
print(f"AUC: {point_auc:.4f} (95% CI: {ci_lower:.4f} - {ci_upper:.4f})")
DeLong Test for Comparing Two AUCs
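To determine whether two models have statistically different AUCs, the DeLong test is the standard analytic choice. The snippet below uses a simpler alternative, a paired bootstrap of the AUC difference, as an approximation: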
# NOTE: this is a paired-bootstrap comparison of two AUCs, not the analytic
# DeLong test. Both models are scored on the same resampled rows, so the
# difference is paired.
# `rng` is the RandomState created in the bootstrapping snippet; y_proba_a and
# y_proba_b are the two models' positive-class scores on the test set.

# Plain NumPy arrays keep the resampled integer indices positional even when
# the inputs are pandas objects with a shuffled index.
y_true_arr = np.asarray(y_test)
scores_a = np.asarray(y_proba_a)
scores_b = np.asarray(y_proba_b)
n_samples = len(y_true_arr)

diffs = []
for _ in range(1000):
    idx = rng.randint(0, n_samples, n_samples)
    truth = y_true_arr[idx]
    # AUC is undefined when a resample contains a single class; skip it.
    if len(np.unique(truth)) < 2:
        continue
    auc_a = roc_auc_score(truth, scores_a[idx])
    auc_b = roc_auc_score(truth, scores_b[idx])
    diffs.append(auc_a - auc_b)
diffs = np.array(diffs)

# Two-sided bootstrap p-value: twice the smaller tail mass at zero, capped at 1.
# Picking a single tail after seeing the sign of the mean would understate it.
p_value = min(1.0, 2 * min(np.mean(diffs <= 0), np.mean(diffs >= 0)))
print(f"Mean AUC difference: {np.mean(diffs):.4f}, p-value: {p_value:.4f}")
When to Use Precision-Recall Instead
ROC curves can be misleadingly optimistic on heavily imbalanced datasets. Consider a dataset with 10,000 negatives and 100 positives. A model that correctly flags all 100 positives but also produces 100 false positives has an FPR of only 100/10,000 = 1 percent, which looks tiny on the ROC curve. But its precision is just 50 percent (100 true positives out of 200 predicted positives), which is concerning.
from sklearn.metrics import precision_recall_curve, average_precision_score
# Precision-recall curve and its summary score from the same test-set scores.
precision, recall, pr_thresholds = precision_recall_curve(y_test, y_proba)
ap = average_precision_score(y_test, y_proba)

# Compute the ROC AUC here instead of reusing the name `auc`, which this file
# also imports as a function from sklearn.metrics; formatting that function
# with ":.3f" would raise a TypeError.
roc_auc = roc_auc_score(y_test, y_proba)

# Side-by-side panels: ROC on the left, precision-recall on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

axes[0].plot(fpr, tpr)
axes[0].set_title(f"ROC Curve (AUC = {roc_auc:.3f})")
axes[0].set_xlabel("FPR")
axes[0].set_ylabel("TPR")

axes[1].plot(recall, precision)
axes[1].set_title(f"Precision-Recall (AP = {ap:.3f})")
axes[1].set_xlabel("Recall")
axes[1].set_ylabel("Precision")

plt.tight_layout()
Rule of thumb: If your positive class is less than 5 percent of the data, always look at the PR curve alongside the ROC curve.
Production Monitoring
Track AUC over rolling windows in production:
from collections import deque
class AUCMonitor:
    """Track ROC AUC over a rolling window of recent predictions.

    Labels and scores are kept in two parallel bounded deques, so only the
    most recent ``window_size`` observations contribute to the AUC.
    """

    def __init__(self, window_size: int = 10000) -> None:
        """Create an empty monitor holding at most ``window_size`` samples."""
        # Bounded deques: appending past maxlen discards the oldest entries.
        self.y_true = deque(maxlen=window_size)
        self.y_score = deque(maxlen=window_size)

    def update(self, y_true_batch, y_score_batch) -> None:
        """Append a batch of labels and their matching scores to the window.

        Args:
            y_true_batch: Iterable of ground-truth labels.
            y_score_batch: Iterable of model scores, one per label.

        Raises:
            ValueError: If the two batches differ in length. Extending the
                deques unevenly would pair labels with the wrong scores for
                every later AUC computation.
        """
        # Materialise first so generators can be length-checked, and so
        # nothing is appended if validation fails.
        labels = list(y_true_batch)
        scores = list(y_score_batch)
        if len(labels) != len(scores):
            raise ValueError(
                "y_true_batch and y_score_batch must have the same length "
                f"(got {len(labels)} and {len(scores)})"
            )
        self.y_true.extend(labels)
        self.y_score.extend(scores)

    def current_auc(self):
        """Return the AUC over the current window.

        Returns ``None`` when the window holds fewer than two distinct
        labels, which includes the empty window.
        """
        yt = np.array(self.y_true)
        if len(np.unique(yt)) < 2:
            return None
        return roc_auc_score(yt, np.array(self.y_score))
Alert when AUC drops below a baseline (e.g., 0.05 below training AUC), which signals data drift or model degradation.
One thing to remember: ROC-AUC is the standard yardstick for classifier comparison, but it is not the only one — pair it with precision-recall analysis and cost-based threshold selection to make decisions that actually work in production.
See Also
- Python Confusion Matrix — See how a simple grid of right and wrong answers reveals what your computer is actually getting confused about.
- Python Cross Validation — Find out why testing a computer's homework on different practice sets keeps it from cheating.
- Python Model Evaluation Metrics — Discover why asking 'how good is my model?' needs more than one number to get an honest answer.
- Python Sklearn Learning Curves — Why your machine learning model might need more data — or a simpler brain — explained with zero jargon.
- Activation Functions — Why neural networks need these tiny mathematical functions — and how ReLU's simplicity accidentally made deep learning possible.