Legal Document Parsing with Python — Deep Dive

Build a Python legal document parser with hierarchical structure extraction, cross-reference resolution, defined term linking, and legislative XML processing

Hierarchical structure extraction

Legal documents follow nested numbering schemes that vary by jurisdiction and document type. US federal regulations use Title → Part → Subpart → Section → Paragraph (a) → Subparagraph (1) → Clause (i). Contracts use Article → Section → Subsection patterns. Building a parser that handles multiple conventions requires a flexible approach:

import re
from dataclasses import dataclass, field
from enum import Enum


class NodeLevel(Enum):
    """Structural levels a parsed document node can belong to.

    Members are declared from outermost to innermost. That declaration
    order is significant: build_document_tree ranks levels with
    ``list(NodeLevel)`` to decide which node nests inside which.
    """

    # NOTE: LEVEL_PATTERNS has no pattern for TITLE or SUBSECTION, so the
    # regex-based tree builder never produces nodes at those two levels.
    TITLE = "title"
    ARTICLE = "article"
    SECTION = "section"
    SUBSECTION = "subsection"
    PARAGRAPH = "paragraph"
    SUBPARAGRAPH = "subparagraph"
    CLAUSE = "clause"


@dataclass
class DocumentNode:
    """One numbered unit of a legal document (article, section, paragraph, ...).

    Nodes form a tree: ``children`` holds the nested units and ``parent``
    points back at the enclosing one (``None`` for a root node).
    """

    level: NodeLevel
    number: str      # marker as written, e.g. "IV", "3.2", "a", "ii"
    heading: str
    text: str
    children: list["DocumentNode"] = field(default_factory=list)
    # Back-pointer only. It is excluded from the generated __eq__ and
    # __repr__: comparing two equal-but-distinct trees would otherwise
    # bounce parent -> children -> parent until RecursionError, and the repr
    # would only ever show a "..." placeholder for it.
    parent: "DocumentNode | None" = field(
        default=None, repr=False, compare=False
    )
    references: list[str] = field(default_factory=list)

    def path(self) -> str:
        """Return the chain of ancestors down to this node, root first.

        Each step is rendered as ``"<level value> <number>"`` and steps are
        joined with ``" > "``, e.g. ``'article 3 > section 3.2 > paragraph a'``.
        """
        parts = []
        node = self
        while node is not None:
            parts.append(f"{node.level.value} {node.number}")
            node = node.parent
        return " > ".join(reversed(parts))


# Marker regexes, one (level, compiled pattern) pair per recognised level.
# Every pattern is anchored at a line start (re.MULTILINE) and captures
# group 1 = the number/letter and group 2 = the rest of the line (heading).
# NOTE: the single letters "i", "v" and "x" satisfy both the PARAGRAPH and
# the CLAUSE pattern, so one "(i)" marker yields a match from each of them.
# NOTE: the leading "\s+" of the parenthesised patterns also matches
# newlines, so such a match may begin on a preceding blank line.
LEVEL_PATTERNS = [
    (level, re.compile(regex, re.MULTILINE))
    for level, regex in (
        # "ARTICLE IV. Heading" / "Article 4 Heading"
        (
            NodeLevel.ARTICLE,
            r"^(?:ARTICLE|Article)\s+([IVXLCDM]+|\d+)[.\s]+(.*)",
        ),
        # "Section 3.2. Heading"
        (
            NodeLevel.SECTION,
            r"^(?:Section|SECTION)\s+(\d+(?:\.\d+)*)[.\s]+(.*)",
        ),
        # "    (a) text" — a single lowercase letter
        (NodeLevel.PARAGRAPH, r"^\s+\(([a-z])\)\s+(.*)"),
        # "    (1) text" — digits
        (NodeLevel.SUBPARAGRAPH, r"^\s+\((\d+)\)\s+(.*)"),
        # "    (iv) text" — lowercase roman numerals built from i, v, x
        (NodeLevel.CLAUSE, r"^\s+\(([ivx]+)\)\s+(.*)"),
    )
]


def _roman_to_int(numeral: str) -> int:
    """Return the value of a lowercase roman numeral written with i, v, x."""
    values = {"i": 1, "v": 5, "x": 10}
    total = 0
    previous = 0
    for char in reversed(numeral):
        value = values.get(char, 0)
        # A smaller digit to the left of a larger one is subtractive (iv = 4).
        total += -value if value < previous else value
        previous = value
    return total


def _continues_clause_sequence(number: str, stack: list[DocumentNode]) -> bool:
    """Tell whether roman marker *number* is the next clause expected here.

    True for "i" directly under a subparagraph, or for the numeral that
    follows the clause currently on top of *stack*.
    """
    if not stack:
        return False
    innermost = stack[-1]
    if innermost.level is NodeLevel.SUBPARAGRAPH:
        return number == "i"
    if innermost.level is NodeLevel.CLAUSE:
        return _roman_to_int(number) == _roman_to_int(innermost.number) + 1
    return False


def build_document_tree(text: str) -> list[DocumentNode]:
    """Parse legal text into a hierarchical tree of document nodes.

    Every LEVEL_PATTERNS match becomes one node whose ``text`` runs from the
    marker up to the next marker of any level. A node is attached to the
    closest preceding node of a strictly outer level (outer = declared
    earlier in NodeLevel); nodes without such an ancestor are returned as
    roots. Text before the first marker is not part of any node.

    A marker such as "(i)" matches both the PARAGRAPH and the CLAUSE
    pattern. It yields a single node: a clause when it continues the
    current clause numbering, otherwise a paragraph. This is a heuristic —
    a paragraph "(i)" that directly follows a subparagraph is read as that
    subparagraph's first clause.

    Args:
        text: Plain text of the document.

    Returns:
        The root nodes in document order.
    """
    # Rank levels once instead of calling list.index() for every comparison.
    rank = {level: order for order, level in enumerate(NodeLevel)}

    # Group candidates by start offset so that one marker matched by several
    # patterns is handled once instead of producing duplicate nodes.
    candidates: dict[int, dict[NodeLevel, tuple[str, str]]] = {}
    for level, pattern in LEVEL_PATTERNS:
        for match in pattern.finditer(text):
            candidates.setdefault(match.start(), {})[level] = (
                match.group(1),
                match.group(2),
            )

    positions = sorted(candidates)
    root_nodes: list[DocumentNode] = []
    stack: list[DocumentNode] = []  # chain of currently open ancestors

    for i, pos in enumerate(positions):
        options = candidates[pos]
        if len(options) == 1:
            (level,) = options
        elif set(options) == {NodeLevel.PARAGRAPH, NodeLevel.CLAUSE}:
            roman = options[NodeLevel.CLAUSE][0]
            if _continues_clause_sequence(roman, stack):
                level = NodeLevel.CLAUSE
            else:
                level = NodeLevel.PARAGRAPH
        else:
            # Not reachable with the bundled patterns; prefer the outermost.
            level = min(options, key=rank.__getitem__)
        number, heading = options[level]

        # The node's text extends to the next marker (or end of input).
        end_pos = positions[i + 1] if i + 1 < len(positions) else len(text)
        node = DocumentNode(
            level=level,
            number=number,
            heading=heading.strip(),
            text=text[pos:end_pos].strip(),
        )

        # Close every open node that is at the same or a deeper level.
        while stack and rank[stack[-1].level] >= rank[level]:
            stack.pop()

        if stack:
            node.parent = stack[-1]
            stack[-1].children.append(node)
        else:
            root_nodes.append(node)

        stack.append(node)

    return root_nodes

Cross-reference resolution

Legal cross-references come in many forms: “Section 3.2(a),” “Article VII,” “paragraph (b) above,” “as defined herein.” A resolver needs to handle all of these:

import re
from dataclasses import dataclass


@dataclass
class CrossReference:
    """A reference found in the text that points at another part of it."""

    source_position: int  # offset in the scanned text where the match starts
    raw_text: str  # the matched text exactly as written, e.g. "Section 3.2(a)"
    # First capture group of the matching pattern (e.g. "3.2(a)", "VII", "b"),
    # or "document" when the pattern defines no group.
    target_section: str
    # Filled in by resolve_references; stays None when no node was found.
    resolved_node: DocumentNode | None = None


# Cross-reference regexes. Patterns that name a target capture it in group 1;
# a pattern without a capture group marks a reference to the whole document.
REFERENCE_PATTERNS = [
    # "Section 3.2(a)" style
    re.compile(
        r"(?:Section|section|§)\s*(\d+(?:\.\d+)*(?:\([a-z]\))?(?:\(\d+\))?)"
    ),
    # "Article VII" style
    re.compile(r"(?:Article|ARTICLE)\s+([IVXLCDM]+|\d+)"),
    # "paragraph (b)" style
    re.compile(r"(?:paragraph|clause)\s+\(([a-z]|\d+|[ivx]+)\)"),
    # "herein" / "hereof" / "hereunder" / "hereby" — refers to the whole
    # document. Deliberately NON-capturing: extract_cross_references maps a
    # match without groups (match.lastindex is None) to the target
    # "document"; with a capturing group that fallback could never trigger.
    # NOTE(review): "hereby" is usually performative ("the parties hereby
    # agree") rather than a reference — confirm it belongs in this list.
    re.compile(r"\b(?:herein|hereof|hereunder|hereby)\b"),
]


def extract_cross_references(text: str) -> list[CrossReference]:
    """Collect every cross-reference that REFERENCE_PATTERNS finds in *text*.

    Results are grouped by pattern (in REFERENCE_PATTERNS order) and, within
    one pattern, ordered by position. The target is the pattern's first
    capture group, or "document" for a match without any group.
    """
    found: list[CrossReference] = []
    for regex in REFERENCE_PATTERNS:
        found.extend(
            CrossReference(
                source_position=hit.start(),
                raw_text=hit.group(0),
                target_section=hit.group(1) if hit.lastindex else "document",
            )
            for hit in regex.finditer(text)
        )
    return found


def resolve_references(
    refs: list[CrossReference],
    tree: list[DocumentNode],
) -> list[CrossReference]:
    """Link cross-references to their target nodes in the document tree.

    Each reference's ``resolved_node`` is set in place (``None`` when no
    target is found) and the same list is returned.

    Lookup rules:
      * The kind named in ``raw_text`` is preferred, so "Article 3" resolves
        to article 3 even when a section is also numbered 3.
      * A composite target such as "3.2(a)(1)" is resolved by finding "3.2"
        and then descending through children numbered "a" and "1". If any
        step is missing the reference stays unresolved rather than pointing
        at a partial match.
      * When several nodes share a number (e.g. paragraph "a" in different
        sections) the last one in document order wins for bare lookups.

    Args:
        refs: References to resolve; mutated in place.
        tree: Root nodes of the parsed document.

    Returns:
        ``refs``, with ``resolved_node`` populated.
    """
    # Build lookup index: bare number and "<level> <number>" per node.
    index: dict[str, DocumentNode] = {}

    def index_tree(nodes: list[DocumentNode]) -> None:
        for node in nodes:
            index[node.number] = node
            index[f"{node.level.value} {node.number}"] = node
            index_tree(node.children)

    index_tree(tree)

    def lookup(number: str, raw_text: str) -> "DocumentNode | None":
        """Find *number*, trying the kind named in *raw_text* first."""
        kind = raw_text.lstrip().lower()
        if kind.startswith("article"):
            keys = (f"article {number}", number, f"section {number}")
        elif kind.startswith(("section", "§")):
            keys = (f"section {number}", number, f"article {number}")
        else:
            keys = (number, f"section {number}", f"article {number}")
        for key in keys:
            node = index.get(key)
            if node is not None:
                return node
        return None

    for ref in refs:
        target = ref.target_section
        node = lookup(target, ref.raw_text)
        if node is None and "(" in target:
            # "3.2(a)(1)" -> base "3.2", parts ["a", "1"]
            base, _, tail = target.partition("(")
            node = lookup(base, ref.raw_text)
            for part in tail.rstrip(")").split(")("):
                if node is None:
                    break
                node = next(
                    (child for child in node.children if child.number == part),
                    None,
                )
        ref.resolved_node = node

    return refs

Defined term extraction and linking

Legal documents establish defined terms (often capitalized or in quotes) and use them throughout. Parsing the definitions section and linking every usage creates a semantic layer:

import re


def extract_definitions(text: str) -> dict[str, str]:
    """Extract defined terms and their definitions from a definitions section.

    Recognises a quoted term followed by "means", "shall mean", "refers to"
    or "has the meaning". Both straight quotes ("Term") and typographic
    quotes (\u201cTerm\u201d) are accepted. A definition runs until the next
    line that starts with a quoted capitalised term, a blank line, or the
    end of the text; one trailing ";" or "." is removed.

    Args:
        text: Text of the definitions section.

    Returns:
        Mapping of term to definition. If a term is defined more than once
        the last definition wins.
    """
    definitions = {}

    # \u201c / \u201d are the left / right typographic double quotes.
    pattern = re.compile(
        r'["\u201c]([^"\u201c\u201d]+)["\u201d]\s+'
        r"(?:means|shall mean|refers to|has the meaning)\s+"
        r'(.*?)(?=\n\s*["\u201c][A-Z]|\n\s*\n|\Z)',
        re.DOTALL,
    )

    for match in pattern.finditer(text):
        term = match.group(1).strip()
        if not term:
            # Quotes around whitespace only: nothing to define.
            continue
        definition = match.group(2).strip()
        # Clean trailing punctuation
        definition = re.sub(r"\s*[;.]\s*$", "", definition)
        definitions[term] = definition

    return definitions


def link_defined_terms(
    text: str, definitions: dict[str, str]
) -> list[dict]:
    """Find all occurrences of defined terms in document text.

    Matching is case-sensitive and whole-term: a term is not matched when it
    is directly preceded or followed by a word character. Occurrences never
    overlap — where several terms could match at the same place the longest
    one is reported (so "Effective Date" does not also count as "Date").

    Args:
        text: Document text to scan.
        definitions: Mapping of defined term to its definition.

    Returns:
        One dict per occurrence, in document order, with the keys "term",
        "position" (start offset), "definition", and "context" (up to 50
        characters on either side of the match).
    """
    # Longest first so the alternation prefers "Effective Date" over "Date".
    terms = sorted((t for t in definitions if t), key=len, reverse=True)
    if not terms:
        return []

    # Lookarounds instead of \b: \b needs a word character on one side, so
    # it can never match at the end of a term like "Plan (2020)".
    alternation = "|".join(re.escape(term) for term in terms)
    pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)")

    return [
        {
            "term": match.group(0),
            "position": match.start(),
            "definition": definitions[match.group(0)],
            "context": text[max(0, match.start() - 50):match.end() + 50],
        }
        for match in pattern.finditer(text)
    ]

Processing legislative XML

Government databases increasingly publish legislation as structured XML. Two standards dominate: United States Legislative Markup (USLM) and the international Akoma Ntoso standard. Both provide rich semantic structure:

from lxml import etree
from dataclasses import dataclass


@dataclass
class LegislativeSection:
    """One section/article extracted from a legislative XML document."""

    # Value of the element's "identifier" (USLM) or "eId" (Akoma Ntoso)
    # attribute; empty string when the attribute is absent.
    identifier: str
    heading: str  # heading text, or "" when the element has no heading
    text: str  # body text, individual parts joined with newlines
    amendments: list[str]  # amendment note texts; always [] for Akoma Ntoso
    # "effectiveDate" attribute for USLM; always None for Akoma Ntoso.
    effective_date: str | None


def parse_uslm(xml_path: str) -> list[LegislativeSection]:
    """Parse US Legislative Markup XML into structured sections.

    The XML namespace is taken from the document's root element, so files
    from different USLM schema versions are handled alike. Text is gathered
    from the whole subtree of each element (including inline children such
    as references or defined terms) and whitespace runs are collapsed to a
    single space.

    Args:
        xml_path: Path of the USLM XML file.

    Returns:
        One LegislativeSection per ``section`` element, in document order.
    """
    # NOTE(review): lxml's default parser resolves entities; if xml_path can
    # hold untrusted XML, parse with a hardened parser instead.
    tree = etree.parse(xml_path)

    # Published USLM files do not all share one namespace URI, so prefer the
    # one the document itself declares. The literal is the previous
    # hard-coded value, kept as a fallback for a root without a namespace.
    root_ns = etree.QName(tree.getroot()).namespace
    ns = {"uslm": root_ns or "https://xml.house.gov/schemas/uslm/2.0"}

    def full_text(element) -> str:
        """Return all text inside *element* with whitespace normalised."""
        # element.text alone stops at the first child element.
        return " ".join("".join(element.itertext()).split())

    sections = []
    for section in tree.xpath("//uslm:section", namespaces=ns):
        heading_el = section.find("uslm:heading", namespaces=ns)

        # Collect all text content, skipping elements that are empty.
        content_parts = [
            part
            for part in map(full_text, section.xpath(
                ".//uslm:content | .//uslm:chapeau", namespaces=ns
            ))
            if part
        ]

        # Extract amendment notes
        amendments = [
            note
            for note in map(full_text, section.xpath(
                ".//uslm:note[@type='amendment']", namespaces=ns
            ))
            if note
        ]

        sections.append(LegislativeSection(
            identifier=section.get("identifier", ""),
            heading=full_text(heading_el) if heading_el is not None else "",
            text="\n".join(content_parts),
            amendments=amendments,
            effective_date=section.get("effectiveDate"),
        ))

    return sections


def parse_akoma_ntoso(xml_path: str) -> list[LegislativeSection]:
    """Parse Akoma Ntoso XML (used by EU, UN, and many countries).

    The XML namespace is taken from the document's root element, falling
    back to the Akoma Ntoso 3.0 namespace. For every ``article`` the text of
    all ``p`` elements directly inside each paragraph's ``content`` is
    collected, including text nested in inline children; whitespace runs are
    collapsed to a single space and empty paragraphs are skipped.

    Args:
        xml_path: Path of the Akoma Ntoso XML file.

    Returns:
        One LegislativeSection per ``article`` element, in document order.
        ``amendments`` is always empty and ``effective_date`` always None.
    """
    # NOTE(review): lxml's default parser resolves entities; if xml_path can
    # hold untrusted XML, parse with a hardened parser instead.
    tree = etree.parse(xml_path)

    # Documents written against other schema releases use other namespace
    # URIs, so prefer the one declared on the root element.
    root_ns = etree.QName(tree.getroot()).namespace
    ns = {"akn": root_ns or "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"}

    def full_text(element) -> str:
        """Return all text inside *element* with whitespace normalised."""
        # element.text alone stops at the first child element.
        return " ".join("".join(element.itertext()).split())

    sections = []
    for article in tree.xpath("//akn:article", namespaces=ns):
        heading_el = article.find("akn:heading", namespaces=ns)

        paragraphs = []
        for para in article.xpath(".//akn:paragraph", namespaces=ns):
            content = para.find("akn:content", namespaces=ns)
            if content is None:
                continue
            # A content block may hold several <p> elements; keep them all.
            for p_el in content.findall("akn:p", namespaces=ns):
                p_text = full_text(p_el)
                if p_text:
                    paragraphs.append(p_text)

        sections.append(LegislativeSection(
            identifier=article.get("eId", ""),
            heading=full_text(heading_el) if heading_el is not None else "",
            text="\n".join(paragraphs),
            amendments=[],
            effective_date=None,
        ))

    return sections

Table extraction from legal PDFs

Legal documents frequently contain tables — schedules, fee structures, comparison charts. Standard text extraction flattens these into gibberish. Position-aware extraction preserves the structure:

import pdfplumber


def extract_legal_tables(pdf_path: str) -> list[list[list[str]]]:
    """Pull every table out of a PDF as rows of stripped cell strings.

    Tables are detected from the ruling lines drawn on each page. The result
    is a flat list of tables in page order; each table is a list of rows and
    each row a list of cells, with empty or missing cells returned as "".
    """
    line_based_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "snap_y_tolerance": 5,
        "join_x_tolerance": 5,
    }

    collected = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Hand each call its own dict, as if it were built per page.
            page_tables = page.extract_tables(
                table_settings=dict(line_based_settings)
            )
            collected.extend(
                [[cell.strip() if cell else "" for cell in row] for row in raw]
                for raw in page_tables
            )
    return collected

Production architecture

A production legal parsing system typically uses a pipeline pattern:

1. Ingestion service — accepts documents via API and queues them for processing
2. Format normalizer — converts all formats to a standard intermediate representation
3. Structure parser — builds the document tree using jurisdiction-specific rule sets
4. NLP enrichment — runs entity extraction, defined term linking, and cross-reference resolution
5. Storage layer — stores the parsed structure in a graph database (Neo4j) or document store (Elasticsearch) for querying
6. API layer — exposes search, comparison, and navigation endpoints

The key architectural decision is whether to use rule-based parsing (reliable for well-formatted documents, fragile for edge cases) or ML-based parsing (handles variety better, requires training data). Most production systems use a hybrid: rules for structural parsing where conventions are clear, ML for classification and extraction tasks where language varies.

The one thing to remember: Legal document parsing combines structural pattern matching for hierarchical extraction with NLP for entity and reference resolution, transforming flat legal text into a navigable, linked knowledge structure that supports search, comparison, and compliance workflows.

Tags: python, legal-tech, document-parsing, nlp