Legal Citation Extraction with Python — Deep Dive

Build a Python citation extraction system using eyecite, custom regex patterns, CourtListener API integration, and citation network analysis with NetworkX

Citation extraction with eyecite

The eyecite library from the Free Law Project is the gold standard for US legal citation detection and parsing:

from eyecite import get_citations, resolve_citations
from eyecite.models import FullCaseCitation, ShortCaseCitation, SupraCitation
from dataclasses import dataclass


@dataclass
class ParsedCitation:
    """One case citation found in a document, flattened into plain fields.

    Instances are built by ``extract_citations`` from eyecite citation
    objects; the per-field notes describe what that function stores.
    """

    raw_text: str  # the citation's matched_text()
    # "full_case" or "short_case" are the only values extract_citations
    # emits; "supra", "statute" and "id" are not produced by it.
    citation_type: str
    volume: str | None  # "volume" capture group as a string ("" when the group is absent)
    reporter: str | None  # "reporter" capture group ("" when the group is absent)
    page: str | None  # "page" capture group as a string ("" when the group is absent)
    year: int | None  # cite.year for full citations, None for short ones
    plaintiff: str | None  # metadata.plaintiff, or metadata.antecedent_guess for short citations
    defendant: str | None  # metadata.defendant; always None for short citations
    pinpoint: str | None  # metadata.pin_cite
    position: int  # first element of cite.span() — presumably the start offset in the text


def extract_citations(text: str) -> list[ParsedCitation]:
    """Run eyecite over ``text`` and flatten its case citations.

    Only full and short case citations are converted; every other citation
    object eyecite yields is skipped.

    Args:
        text: The document text to scan.

    Returns:
        One ``ParsedCitation`` per full or short case citation, in the order
        ``get_citations`` produced them.
    """
    results: list[ParsedCitation] = []

    for found in get_citations(text):
        # The two supported kinds differ only in where the party name and
        # the year come from; everything else is read the same way below.
        if isinstance(found, FullCaseCitation):
            kind = "full_case"
            year = found.year
            plaintiff = found.metadata.plaintiff
            defendant = found.metadata.defendant
        elif isinstance(found, ShortCaseCitation):
            kind = "short_case"
            year = None
            # A short cite only offers a guess at the case it refers back to.
            plaintiff = found.metadata.antecedent_guess
            defendant = None
        else:
            continue

        groups = found.groups
        results.append(
            ParsedCitation(
                raw_text=found.matched_text(),
                citation_type=kind,
                volume=str(groups.get("volume", "")),
                reporter=groups.get("reporter", ""),
                page=str(groups.get("page", "")),
                year=year,
                plaintiff=plaintiff,
                defendant=defendant,
                pinpoint=found.metadata.pin_cite,
                position=found.span()[0],
            )
        )

    return results


# Example usage: a short passage with three full case citations; the
# Loving citation also carries a pinpoint page ("12") after the first page.
text = """
In Brown v. Board of Education, 347 U.S. 483 (1954), the Supreme Court
held that racial segregation in public schools violated the Equal Protection
Clause. This principle was later extended in Loving v. Virginia, 388 U.S. 1,
12 (1967), to strike down laws banning interracial marriage. See also
Obergefell v. Hodges, 576 U.S. 644 (2015).
"""

citations = extract_citations(text)
# Expected: 3 parsed citations (Brown, Loving, Obergefell) with full metadata

Custom statutory citation patterns

eyecite handles case citations well, but statutory and regulatory citations often need custom patterns:

import re
from dataclasses import dataclass


@dataclass
class StatutoryCitation:
    """One statutory or regulatory citation matched by a STATUTORY_PATTERNS regex.

    ``title``, ``section`` and ``subsection`` hold the regex's first, second
    and third capture groups, so their meaning depends on ``citation_type``.
    """

    raw_text: str  # the full regex match
    citation_type: str  # label from STATUTORY_PATTERNS: usc, cfr, public_law, fed_reg
    title: str  # group 1: title number (usc/cfr), Congress number (public_law), volume (fed_reg)
    section: str  # group 2: section (usc/cfr), law number (public_law), page (fed_reg)
    subsection: str | None  # group 3: only the usc pattern defines one; None otherwise
    position: int  # start offset of the match in the scanned text


# Ordered (label, compiled regex) pairs for statutory/regulatory citations.
# In every pattern group 1 is the leading number (title / Congress / volume)
# and group 2 is the section, law number or page; only "usc" has a group 3.
STATUTORY_PATTERNS = [
    # US Code: "42 U.S.C. § 1983", "42 USC § 1983", "42 U.S.C. § 2000e-2(a)".
    # Section numbers may carry letter suffixes and hyphenated extensions
    # ("78dd-1", "1395w-4"); both are kept so the section is not truncated
    # after its first letter. Group 3 is the first parenthesised subsection.
    (
        "usc",
        re.compile(
            r"(\d+)\s+U\.?S\.?C\.?\s*§\s*(\d+[a-z]*(?:-\d+[a-z]*)*)"
            r"(?:\(([a-z0-9]+)\))?",
            re.IGNORECASE,
        ),
    ),
    # Code of Federal Regulations: "17 C.F.R. § 240.10b-5"
    (
        "cfr",
        re.compile(
            r"(\d+)\s+C\.?F\.?R\.?\s*§\s*(\d+(?:\.\d+[a-z]?(?:-\d+)?)?)",
            re.IGNORECASE,
        ),
    ),
    # Public Law: "Pub. L. No. 116-283"
    (
        "public_law",
        re.compile(
            r"Pub\.?\s*L\.?\s*(?:No\.?)?\s*(\d+)-(\d+)",
            re.IGNORECASE,
        ),
    ),
    # Federal Register: "85 Fed. Reg. 44,992".
    # The page must start with a digit and may contain comma-separated
    # thousands groups, so a comma that merely follows the page (as in
    # "44,992, 44,995") or follows "Reg." is not captured as part of it.
    (
        "fed_reg",
        re.compile(
            r"(\d+)\s+Fed\.?\s*Reg\.?\s*(\d+(?:,\d{3})*)",
            re.IGNORECASE,
        ),
    ),
]


def extract_statutory_citations(text: str) -> list[StatutoryCitation]:
    """Scan ``text`` with every regex in ``STATUTORY_PATTERNS``.

    Args:
        text: The legal text to search.

    Returns:
        One ``StatutoryCitation`` per regex match. Matches are grouped by
        pattern, in ``STATUTORY_PATTERNS`` order, not sorted by position.
    """
    found: list[StatutoryCitation] = []

    for label, regex in STATUTORY_PATTERNS:
        for hit in regex.finditer(text):
            captured = hit.groups()
            # Pad so patterns with fewer than two groups fall back to "".
            title, section = (captured + ("", ""))[:2]
            # Only patterns that define a third group can supply a subsection.
            subsection = captured[2] if len(captured) > 2 else None
            found.append(
                StatutoryCitation(
                    raw_text=hit.group(0),
                    citation_type=label,
                    title=title,
                    section=section,
                    subsection=subsection,
                    position=hit.start(),
                )
            )

    return found

Citation resolution via CourtListener API

The Free Law Project’s CourtListener provides a free API for resolving citations to actual opinions:

import httpx
from dataclasses import dataclass
from functools import lru_cache


@dataclass
class ResolvedCase:
    """A case record built from the first CourtListener search result.

    Populated by ``CourtListenerResolver.resolve_citation``; each field notes
    the response key it is read from ("" when the key is missing).
    """

    case_name: str  # result["caseName"]
    citation: str  # the "<volume> <reporter> <page>" string that was searched for
    court: str  # result["court"]
    date_filed: str  # result["dateFiled"], stored as returned (no date parsing)
    opinion_url: str  # "https://www.courtlistener.com" + result["absolute_url"]
    status: str  # result["status"]; presumably published / unpublished — confirm against the API
    precedential_status: str  # result["precedentialStatus"]


class CourtListenerResolver:
    """Small client for looking up case citations on CourtListener.

    Wraps an ``httpx.Client`` that sends the API token with every request.
    The client owns network connections, so call :meth:`close` when finished
    or use the resolver as a context manager::

        with CourtListenerResolver(token) as resolver:
            resolver.resolve_citation("347", "U.S.", "483")
    """

    BASE_URL = "https://www.courtlistener.com/api/rest/v4"

    def __init__(self, api_token: str):
        """Create the underlying HTTP client.

        Args:
            api_token: CourtListener API token, sent as
                ``Authorization: Token <api_token>``.
        """
        self.client = httpx.Client(
            headers={"Authorization": f"Token {api_token}"},
            timeout=30,
        )

    def close(self) -> None:
        """Close the underlying HTTP client and release its connections."""
        self.client.close()

    def __enter__(self) -> "CourtListenerResolver":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def resolve_citation(
        self, volume: str, reporter: str, page: str
    ) -> ResolvedCase | None:
        """Resolve a case citation to its full record.

        Args:
            volume: Reporter volume, e.g. ``"347"``.
            reporter: Reporter abbreviation, e.g. ``"U.S."``.
            page: First page of the opinion, e.g. ``"483"``.

        Returns:
            A ``ResolvedCase`` built from the first search result, or ``None``
            when any part of the citation is blank or the search is empty.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        # An incomplete citation cannot identify an opinion. Searching with
        # blanks would effectively drop the filter, and the first result
        # would then be accepted as if it were the cited case.
        if not all(part and part.strip() for part in (volume, reporter, page)):
            return None

        citation = f"{volume} {reporter} {page}"
        response = self.client.get(
            f"{self.BASE_URL}/search/",
            params={
                "type": "o",  # opinions
                "citation": citation,
            },
        )
        response.raise_for_status()
        results = response.json().get("results") or []

        if not results:
            return None

        # Only the top-ranked hit is used; later results are ignored.
        result = results[0]
        return ResolvedCase(
            case_name=result.get("caseName", ""),
            citation=citation,
            court=result.get("court", ""),
            date_filed=result.get("dateFiled", ""),
            opinion_url=f"https://www.courtlistener.com{result.get('absolute_url', '')}",
            status=result.get("status", ""),
            precedential_status=result.get("precedentialStatus", ""),
        )

    def check_citing_cases(
        self, opinion_id: int
    ) -> dict[str, list[str]]:
        """Group the cases citing an opinion by how they treat it.

        Args:
            opinion_id: CourtListener identifier of the cited opinion.

        Returns:
            A dict with the keys ``followed``, ``distinguished``,
            ``overruled``, ``questioned`` and ``cited``, each mapping to a
            list of citing case names. Entries whose treatment is missing,
            null or unrecognised are filed under ``cited``. Only the results
            in the first response page are examined.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        # NOTE(review): confirm that this endpoint and a per-result
        # "treatment" field exist in the CourtListener API version in use.
        response = self.client.get(
            f"{self.BASE_URL}/opinions/{opinion_id}/cited-by/",
        )
        response.raise_for_status()
        results = response.json().get("results") or []

        # Categorize treatment
        treatment: dict[str, list[str]] = {
            "followed": [],
            "distinguished": [],
            "overruled": [],
            "questioned": [],
            "cited": [],
        }

        for case in results:
            # "or" also covers an explicit JSON null, on which .lower()
            # would otherwise raise AttributeError.
            relation = (case.get("treatment") or "cited").lower()
            case_name = case.get("caseName", "Unknown")
            bucket = relation if relation in treatment else "cited"
            treatment[bucket].append(case_name)

        return treatment

Citation network analysis

Building a citation graph reveals the structure of legal authority:

import networkx as nx
from dataclasses import dataclass


@dataclass
class CitationEdge:
    """A single citing-case -> cited-case relationship with its context."""

    citing_case: str  # identifier of the document that contains the citation
    cited_case: str  # identifier of the authority being cited
    treatment: str  # positive, negative, neutral, distinguished
    context: str     # the sentence containing the citation


class CitationNetworkAnalyzer:
    """Directed citation graph with a few authority metrics.

    Nodes are citing documents (``type="source"``) and cited authorities
    (``type="authority"``); each edge points from a document to an authority
    it cites and carries ``treatment`` and ``position`` attributes.
    """

    def __init__(self):
        """Start with an empty directed graph."""
        self.graph = nx.DiGraph()

    def add_document_citations(
        self,
        document_id: str,
        citations: list[ParsedCitation],
        treatments: dict[str, str] | None = None,
    ):
        """Add a document's citations to the network.

        Args:
            document_id: Node name for the citing document.
            citations: Citations found in that document.
            treatments: Optional map from citation key
                (``"<volume> <reporter> <page>"``) to a treatment label;
                citations without an entry are labelled ``"cited"``.
        """
        self.graph.add_node(document_id, type="source")
        treatments = treatments or {}

        for cite in citations:
            cite_key = f"{cite.volume} {cite.reporter} {cite.page}"
            self.graph.add_node(cite_key, type="authority")
            treatment = treatments.get(cite_key, "cited")
            # A DiGraph keeps one edge per (document, authority) pair, so a
            # repeated citation overwrites the earlier edge's attributes.
            self.graph.add_edge(
                document_id,
                cite_key,
                treatment=treatment,
                position=cite.position,
            )

    def find_most_cited(self, top_n: int = 20) -> list[tuple[str, int]]:
        """Find the most frequently cited authorities.

        Args:
            top_n: Maximum number of entries to return.

        Returns:
            ``(authority, in_degree)`` pairs, highest in-degree first. The
            in-degree is the number of distinct documents citing the node.
        """
        in_degrees = dict(self.graph.in_degree())
        authorities = [
            (node, degree)
            for node, degree in in_degrees.items()
            if self.graph.nodes[node].get("type") == "authority"
        ]
        return sorted(authorities, key=lambda x: x[1], reverse=True)[:top_n]

    def find_authority_score(self) -> dict[str, float]:
        """Calculate authority strength using PageRank.

        Returns:
            PageRank score for every node in the graph, source documents
            included.
        """
        return nx.pagerank(self.graph)

    def find_citation_communities(self) -> list[set[str]]:
        """Detect clusters of frequently co-cited authorities.

        Returns:
            The communities found by greedy modularity maximisation on the
            undirected view of the graph. When the graph has no edges, every
            node is returned as its own community (an empty list for an
            empty graph).
        """
        # Modularity is normalised by the edge count, so an edgeless graph
        # can fail inside networkx; with no edges each node stands alone.
        if self.graph.number_of_edges() == 0:
            return [{node} for node in self.graph.nodes]

        from networkx.algorithms.community import greedy_modularity_communities

        undirected = self.graph.to_undirected()
        communities = greedy_modularity_communities(undirected)
        return [set(c) for c in communities]

    def detect_negative_treatment(self) -> list[tuple[str, str]]:
        """Find authorities that have been negatively treated.

        Returns:
            ``(authority, treatment)`` for every edge whose treatment is
            ``"overruled"``, ``"questioned"`` or ``"criticized"``. An
            authority appears once per such edge.
        """
        negative_labels = ("overruled", "questioned", "criticized")
        return [
            (cited, data["treatment"])
            for _citing, cited, data in self.graph.edges(data=True)
            if data.get("treatment") in negative_labels
        ]

Putting it together: brief analysis

A complete brief analyzer combines extraction, resolution, and validation:

@dataclass
class BriefAnalysis:
    """Summary returned by ``analyze_legal_brief``."""

    total_citations: int  # case_citations + statutory_citations
    case_citations: int  # number of citations returned by extract_citations
    statutory_citations: int  # number returned by extract_statutory_citations
    unresolved_citations: list[str]  # raw text of full case citations the resolver could not find
    negative_authority: list[dict]  # analyze_legal_brief never adds to this; it is always empty
    citation_frequency: dict[str, int]  # "<volume> <reporter> <page>" -> occurrences in the brief
    # Filled from CitationNetworkAnalyzer.find_most_cited(10), whose second
    # element is an integer in-degree rather than a fractional score.
    strongest_authorities: list[tuple[str, float]]


def analyze_legal_brief(
    text: str,
    resolver: CourtListenerResolver | None = None,
) -> BriefAnalysis:
    """Full citation analysis of a legal brief.

    Extracts case and statutory citations, counts how often each case
    citation appears, and optionally checks each full case citation against
    CourtListener.

    Args:
        text: The brief's text.
        resolver: Optional resolver. When given, every distinct full case
            citation is looked up once and those without a match are
            reported as unresolved.

    Returns:
        A ``BriefAnalysis``. ``negative_authority`` is always empty because
        no treatment data is gathered here.

    Raises:
        Whatever ``resolver.resolve_citation`` raises (for example on an
        HTTP error status).
    """
    # Extract all citation types
    case_cites = extract_citations(text)
    stat_cites = extract_statutory_citations(text)

    # Count citation frequency
    frequency: dict[str, int] = {}
    for cite in case_cites:
        key = f"{cite.volume} {cite.reporter} {cite.page}"
        frequency[key] = frequency.get(key, 0) + 1

    # Build citation network. Only the single "brief" node cites anything
    # here, so each authority ends up with an in-degree of at most one.
    analyzer = CitationNetworkAnalyzer()
    analyzer.add_document_citations("brief", case_cites)

    # Resolve and validate if resolver available
    unresolved: list[str] = []
    negative: list[dict] = []
    if resolver is not None:
        # Briefs repeat citations, so remember each lookup and issue one
        # network request per distinct citation rather than per occurrence.
        found_by_citation: dict[tuple[str, str, str], bool] = {}
        for cite in case_cites:
            if cite.citation_type != "full_case":
                continue
            lookup = (cite.volume or "", cite.reporter or "", cite.page or "")
            if lookup not in found_by_citation:
                found_by_citation[lookup] = bool(resolver.resolve_citation(*lookup))
            if not found_by_citation[lookup]:
                # Still reported once per occurrence in the brief.
                unresolved.append(cite.raw_text)

    return BriefAnalysis(
        total_citations=len(case_cites) + len(stat_cites),
        case_citations=len(case_cites),
        statutory_citations=len(stat_cites),
        unresolved_citations=unresolved,
        negative_authority=negative,
        citation_frequency=frequency,
        strongest_authorities=analyzer.find_most_cited(10),
    )

Tradeoffs and limitations

Jurisdiction coverage — eyecite and CourtListener focus on US federal and state law. International citations (EU Court of Justice, UK courts, ICC) require custom patterns and different resolution APIs. No single library covers all jurisdictions.

Precision vs. recall — Strict pattern matching misses non-standard citations (lower recall) but avoids false positives (higher precision). Looser matching catches more citations but also flags text that is not a citation. Production systems typically use strict patterns for automated processing and looser patterns for human review queues.

Short-form citation resolution — “Id. at 495” or “Brown, 347 U.S. at 490” require tracking context to resolve which case is referenced. eyecite’s resolve_citations function handles common short forms, but complex chains of “Id.” references in footnote-heavy briefs can trip up any parser.

Historical citations — Older cases use citation formats that have changed over time. Pre-1874 Supreme Court cases use nominative reporters (e.g., “5 Cranch 137” instead of “9 U.S. 137”). Comprehensive extraction requires mapping historical reporter names.

The one thing to remember: A production citation extraction system chains eyecite for detection, custom regex for statutory patterns, API-based resolution for validation, and NetworkX for authority analysis — turning dense legal text into a verified, navigable map of legal authority.

Tags: python, legal-tech, citations, nlp