Visual Regression Testing — Deep Dive
Playwright visual comparison setup
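Playwright’s screenshot assertions compare the rendered page against a stored baseline image with configurable thresholds. Note that `to_have_screenshot` originates in Playwright’s Node.js test runner; with the Python bindings, confirm that your installed version (or a snapshot plugin) actually provides it before relying on the example below: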
# tests/visual/test_pages.py
import pytest
from playwright.sync_api import Page, expect
@pytest.fixture(scope="session")
def browser_context_args(browser_context_args):
    """Pin the viewport and pixel density used for every browser context.

    This overrides the ``browser_context_args`` fixture that the
    pytest-playwright plugin provides.  The plugin's value is requested as
    an argument and merged in, so options it supplies are kept rather than
    silently replaced; only the two rendering settings below are forced.

    Args:
        browser_context_args: Context options inherited from the fixture
            being overridden.

    Returns:
        dict: The inherited options with a 1280x720 viewport and a device
        scale factor of 1 layered on top.
    """
    return {
        **browser_context_args,
        "viewport": {"width": 1280, "height": 720},
        "device_scale_factor": 1,
    }
class TestDashboardVisuals:
    """Screenshot-based regression tests for the dashboard page."""

    DASHBOARD_URL = "http://localhost:3000/dashboard"

    # Zero-length animations and transitions, so a screenshot never captures
    # an element part-way through a movement or fade.
    DISABLE_ANIMATIONS_CSS = """
        *, *::before, *::after {
            animation-duration: 0s !important;
            transition-duration: 0s !important;
        }
    """

    def _open_dashboard(self, page: Page, viewport: dict | None = None) -> None:
        """Load the dashboard and settle it for a stable screenshot.

        Args:
            page: Playwright page supplied by the ``page`` fixture.
            viewport: Optional ``{"width": ..., "height": ...}`` mapping
                applied before navigation; ``None`` keeps the current size.
        """
        if viewport is not None:
            # Resize first so the page lays out once, at the target size.
            page.set_viewport_size(viewport)
        page.goto(self.DASHBOARD_URL)
        page.wait_for_load_state("networkidle")
        page.add_style_tag(content=self.DISABLE_ANIMATIONS_CSS)

    def test_dashboard_layout(self, page: Page):
        """Compare the dashboard at the default viewport with its baseline."""
        self._open_dashboard(page)
        expect(page).to_have_screenshot(
            "dashboard.png",
            max_diff_pixels=100,
        )

    def test_dashboard_mobile(self, page: Page):
        """Compare the dashboard at a 375x812 viewport with its baseline."""
        self._open_dashboard(page, viewport={"width": 375, "height": 812})
        expect(page).to_have_screenshot(
            "dashboard-mobile.png",
            max_diff_pixels=100,
        )
Update baselines when changes are intentional:
pytest tests/visual/ --update-snapshots
Custom pixel diffing with Pillow
For more control over comparison logic, build a custom differ:
# visual_diff.py
from PIL import Image, ImageChops, ImageDraw
import numpy as np
from pathlib import Path
from dataclasses import dataclass
@dataclass
class DiffResult:
match: bool
diff_percentage: float
diff_pixels: int
total_pixels: int
diff_image_path: str | None
def compare_screenshots(
    baseline_path: str,
    current_path: str,
    diff_output_path: str,
    threshold: float = 0.1,
    pixel_tolerance: int = 10,
) -> DiffResult:
    """Compare two screenshots pixel by pixel and write a diff visualization.

    Both images are converted to RGB.  A pixel counts as changed when its
    largest single-channel difference exceeds ``pixel_tolerance``.

    Args:
        baseline_path: Path of the approved reference image.
        current_path: Path of the freshly captured image.
        diff_output_path: Where the diff visualization is saved; missing
            parent directories are created.
        threshold: Maximum share of changed pixels, expressed as a
            percentage on a 0-100 scale (the default ``0.1`` means 0.1 %),
            for the result to count as a match.
        pixel_tolerance: Largest per-channel difference that is still
            treated as unchanged.

    Returns:
        DiffResult: The comparison outcome.  When the image sizes differ,
        the result is a 100 % mismatch sized by the baseline, and no diff
        image is written (``diff_image_path`` is ``None``).
    """
    # convert() returns an independent in-memory copy, so the underlying
    # files can be closed as soon as each conversion finishes.
    with Image.open(baseline_path) as source:
        baseline = source.convert("RGB")
    with Image.open(current_path) as source:
        current = source.convert("RGB")

    if baseline.size != current.size:
        baseline_area = baseline.size[0] * baseline.size[1]
        return DiffResult(
            match=False,
            diff_percentage=100.0,
            diff_pixels=baseline_area,
            total_pixels=baseline_area,
            diff_image_path=None,
        )

    # int16 keeps the subtraction from wrapping around as uint8 would.
    base_arr = np.asarray(baseline, dtype=np.int16)
    curr_arr = np.asarray(current, dtype=np.int16)

    # Per-pixel distance: the largest difference across the three channels.
    channel_delta = np.abs(base_arr - curr_arr).max(axis=2)
    changed_mask = channel_delta > pixel_tolerance

    diff_pixels = int(np.count_nonzero(changed_mask))
    total_pixels = baseline.size[0] * baseline.size[1]
    diff_percentage = (diff_pixels / total_pixels) * 100 if total_pixels else 0.0

    # Visualization: changed pixels in hot pink over a dimmed copy of the
    # current screenshot, so the changes can be located in context.
    dimmed = (curr_arr * 0.3).astype(np.uint8)
    highlight = np.array([255, 0, 80], dtype=np.uint8)
    diff_arr = np.where(changed_mask[..., None], highlight, dimmed)

    # Saving fails if the target directory does not exist yet.
    Path(diff_output_path).parent.mkdir(parents=True, exist_ok=True)
    Image.fromarray(diff_arr).save(diff_output_path)

    return DiffResult(
        match=diff_percentage <= threshold,
        diff_percentage=diff_percentage,
        diff_pixels=diff_pixels,
        total_pixels=total_pixels,
        diff_image_path=diff_output_path,
    )
Usage in tests:
def test_homepage_visual(capture_screenshot):
    """Fail when the homepage drifts from its stored baseline image."""
    comparison_options = {
        "baseline_path": "baselines/homepage.png",
        "diff_output_path": "diffs/homepage-diff.png",
        "threshold": 0.05,
        "pixel_tolerance": 15,
    }

    current = capture_screenshot("homepage", "http://localhost:3000/")
    result = compare_screenshots(current_path=current, **comparison_options)

    # Built up front so the assertion line itself stays short.
    failure_message = (
        f"Visual regression: {result.diff_percentage:.2f}% pixels changed "
        f"({result.diff_pixels}/{result.total_pixels}). "
        f"Diff image: {result.diff_image_path}"
    )
    assert result.match, failure_message
Multi-viewport testing
Test across breakpoints systematically:
import pytest
from playwright.sync_api import Page
# Breakpoints under test, keyed by the short label used in snapshot names.
VIEWPORTS = {
    label: {"width": width, "height": height}
    for label, width, height in (
        ("mobile", 375, 812),
        ("tablet", 768, 1024),
        ("desktop", 1280, 720),
        ("wide", 1920, 1080),
    )
}

# (URL path, snapshot label) pairs for every page in the matrix.
PAGES = list(
    {
        "/": "homepage",
        "/pricing": "pricing",
        "/dashboard": "dashboard",
        "/settings": "settings",
    }.items()
)
@pytest.mark.parametrize(
    "viewport_name,viewport", VIEWPORTS.items(), ids=list(VIEWPORTS)
)
@pytest.mark.parametrize(
    "path,page_name", PAGES, ids=[label for _, label in PAGES]
)
def test_visual_matrix(
    page: Page, viewport_name: str, viewport: dict,
    path: str, page_name: str,
):
    """Compare one page at one viewport size against its own baseline.

    The two stacked parametrizations produce every page/viewport
    combination.  Explicit ids make each test id read
    ``<page_name>-<viewport_name>``, matching the snapshot file name,
    instead of an auto-generated placeholder for the viewport dict.

    Args:
        page: Playwright page supplied by the ``page`` fixture.
        viewport_name: Label of the viewport, used in the snapshot name.
        viewport: ``{"width": ..., "height": ...}`` mapping to apply.
        path: URL path appended to the local server address.
        page_name: Label of the page, used in the snapshot name.
    """
    # This snippet's module-level import only brings in Page; import the
    # assertion entry point here so the name is defined when the test runs.
    from playwright.sync_api import expect

    # Resize before navigating so the page lays out once, at the target size.
    page.set_viewport_size(viewport)
    page.goto(f"http://localhost:3000{path}")
    page.wait_for_load_state("networkidle")
    freeze_dynamic_content(page)
    expect(page).to_have_screenshot(
        f"{page_name}-{viewport_name}.png",
        max_diff_pixel_ratio=0.001,
    )
def freeze_dynamic_content(page: Page):
    """Replace dynamic content with static values for stable screenshots."""
    # Stylesheet that switches off every animation and transition.
    no_motion_css = """
        *, *::before, *::after {
            animation: none !important;
            transition: none !important;
        }
    """
    # Script that overwrites the text of elements tagged as timestamps or
    # live counters with fixed placeholder values.
    pin_values_js = """
        // Freeze timestamps
        document.querySelectorAll('[data-testid="timestamp"]')
            .forEach(el => el.textContent = '2026-01-01 12:00:00');
        // Freeze counters
        document.querySelectorAll('[data-testid="live-count"]')
            .forEach(el => el.textContent = '42');
    """

    page.add_style_tag(content=no_motion_css)
    page.evaluate(pin_values_js)
This creates a matrix of 4 viewports × 4 pages = 16 screenshots, each compared against its own baseline.
Element-level visual testing
Instead of full-page screenshots, test individual components:
def test_navigation_component(page: Page):
    """Compare only the main navigation element with its baseline."""
    page.goto("http://localhost:3000/")
    # Asserting on a locator limits the screenshot to that element.
    expect(page.locator("nav.main-navigation")).to_have_screenshot(
        "navigation.png"
    )
def test_pricing_card(page: Page):
    """Compare only the pro-plan pricing card with its baseline."""
    page.goto("http://localhost:3000/pricing")
    # Asserting on a locator limits the screenshot to that element.
    expect(page.locator('[data-testid="pro-plan-card"]')).to_have_screenshot(
        "pro-plan-card.png"
    )
Element-level testing is more stable than full-page testing because it’s isolated from layout changes in unrelated parts of the page.
CI pipeline integration
Visual tests need a consistent rendering environment. Docker ensures screenshots are identical across developer machines and CI:
# Dockerfile.visual-tests
FROM mcr.microsoft.com/playwright/python:v1.42.0
WORKDIR /app
# Install consistent fonts before copying any project files, so this slow
# layer stays cached when only source code or requirements change.
# Removing the apt lists afterwards keeps the layer smaller.
RUN apt-get update && apt-get install -y fonts-noto fonts-liberation \
    && fc-cache -fv \
    && rm -rf /var/lib/apt/lists/*
# Copy the requirements file on its own so the dependency layer is rebuilt
# only when requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["pytest", "tests/visual/", "-v", "--screenshot=only-on-failure"]
In GitHub Actions:
jobs:
  visual-tests:
    runs-on: ubuntu-latest
    # The PR-comment step writes through GITHUB_TOKEN. Grant that explicitly,
    # because the default token may be read-only.
    permissions:
      contents: read
      pull-requests: write
    steps:
      - uses: actions/checkout@v4
      - name: Run visual tests
        run: |
          # Create the bind-mount directories up front so Docker does not
          # create them itself and later steps always find diffs/.
          mkdir -p baselines diffs
          docker build -f Dockerfile.visual-tests -t visual-tests .
          docker run --network=host \
            -v "$(pwd)/baselines:/app/baselines" \
            -v "$(pwd)/diffs:/app/diffs" \
            visual-tests
      - name: Upload diff images
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: visual-diffs
          path: diffs/
      - name: Comment on PR with diffs
        if: failure() && github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            // A failure before the test step leaves no diffs/ directory;
            // treat that as "no diff images" rather than throwing.
            const diffs = fs.existsSync('diffs')
              ? fs.readdirSync('diffs').filter(f => f.endsWith('.png'))
              : [];
            if (diffs.length > 0) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: `## Visual Regression Detected\n\n${diffs.length} screenshots differ from baseline. Check the artifacts for diff images.`,
              });
            }
Masking regions
Some parts of the page always change. Mask them instead of freezing the entire page:
from PIL import Image, ImageDraw
def apply_mask(image_path: str, regions: list[tuple]) -> Image.Image:
    """Mask dynamic regions before comparison.

    Args:
        image_path: Path of the screenshot to load.
        regions: ``(x, y, width, height)`` boxes in pixels.  Boxes with a
            non-positive width or height are skipped.

    Returns:
        The loaded image with every region painted a flat mid-grey, so the
        content under it cannot contribute to a diff.
    """
    img = Image.open(image_path)
    draw = ImageDraw.Draw(img)
    for x, y, w, h in regions:
        if w <= 0 or h <= 0:
            # Nothing to cover, and the corners below would be inverted.
            continue
        # rectangle() paints both corners, so the far corner must be the
        # last covered pixel; x + w would mask one extra column and row.
        draw.rectangle([x, y, x + w - 1, y + h - 1], fill=(128, 128, 128))
    return img
# Each region is an (x, y, width, height) box in pixels.
_DASHBOARD_HEADER_TIMESTAMP = (850, 20, 200, 30)
_DASHBOARD_LIVE_USER_COUNT = (10, 600, 300, 50)

# Regions to mask per page, keyed by page name.
DYNAMIC_REGIONS = {
    "dashboard": [
        _DASHBOARD_HEADER_TIMESTAMP,
        _DASHBOARD_LIVE_USER_COUNT,
    ],
}
Better yet, use data-visual-test="ignore" attributes in your HTML and have the test harness automatically mask elements with that attribute.
Threshold strategies
Different pages need different sensitivity:
# Comparison option per page type, keyed by page name.  Each value is a
# single-entry mapping of option name to limit.
VISUAL_THRESHOLDS = {
    page_name: {option: limit}
    for page_name, option, limit in (
        ("landing-page", "max_diff_pixel_ratio", 0.0001),  # Very strict
        ("dashboard", "max_diff_pixel_ratio", 0.005),  # Moderate (dynamic data)
        ("admin-panel", "max_diff_pixel_ratio", 0.01),  # Relaxed (internal tool)
        ("email-template", "max_diff_pixels", 0),  # Pixel-perfect
    )
}
Pixel-perfect comparison works for email templates and static marketing pages. Ratio-based thresholds work better for complex interactive pages where minor rendering variations are acceptable.
The one thing to remember: Effective visual regression testing requires consistent environments (Docker), smart handling of dynamic content (masking and freezing), multi-viewport coverage, and CI integration that surfaces diff images for fast human review.
See Also
- Python Acceptance Testing Patterns — How Python teams verify software does what real users actually asked for.
- Python Approval Testing — How approval testing lets you verify complex Python output by comparing it to a saved 'golden' copy you already checked.
- Python Behavior Driven Development — Get an intuitive feel for Behavior Driven Development so Python behavior stops feeling unpredictable.
- Python Browser Automation Testing — How Python can control a web browser like a robot to test websites automatically.
- Python Chaos Testing Applications — Why breaking your own Python systems on purpose makes them stronger.