Legal Document Parsing with Python — Deep Dive
Hierarchical structure extraction
Legal documents follow nested numbering schemes that vary by jurisdiction and document type. US federal regulations use Title → Part → Subpart → Section → Paragraph (a) → Subparagraph (1) → Clause (i). Contracts use Article → Section → Subsection patterns. Building a parser that handles multiple conventions requires a flexible approach:
import re
from dataclasses import dataclass, field
from enum import Enum
class NodeLevel(Enum):
    """Structural levels a parsed legal-document node can occupy.

    Members are declared from the outermost level to the innermost one.
    ``build_document_tree`` ranks levels by this declaration order when it
    decides which open node becomes the parent of a new node, so the order
    of the members below is significant.
    """

    # Outer containers.
    TITLE = "title"
    ARTICLE = "article"
    SECTION = "section"
    SUBSECTION = "subsection"

    # Enumerated items nested inside a section.
    PARAGRAPH = "paragraph"
    SUBPARAGRAPH = "subparagraph"
    CLAUSE = "clause"
@dataclass
class DocumentNode:
    """One structural unit (article, section, paragraph, ...) of a parsed document.

    Nodes form a tree that is linked in both directions: ``children`` points
    down and ``parent`` points back up.
    """

    level: "NodeLevel"
    number: str
    heading: str
    text: str
    children: list["DocumentNode"] = field(default_factory=list)
    # The back-pointer is left out of ``==`` and ``repr``. If it took part in
    # the generated ``__eq__``, comparing two equal-but-distinct linked nodes
    # would bounce parent -> children -> parent until RecursionError, and the
    # repr of every node would drag its whole ancestor chain along.
    parent: "DocumentNode | None" = field(default=None, repr=False, compare=False)
    references: list[str] = field(default_factory=list)

    def path(self) -> str:
        """Return the path from the root to this node.

        Each step is rendered as ``"<level value> <number>"`` and steps are
        joined with ``" > "``, e.g. ``'article 3 > section 3.2 > paragraph a'``.
        """
        parts = []
        node = self
        # Walk up through the parent links, collecting leaf-to-root.
        while node is not None:
            parts.append(f"{node.level.value} {node.number}")
            node = node.parent
        return " > ".join(reversed(parts))
# Patterns for different numbering conventions.
#
# Every regex is anchored to the start of a line (re.MULTILINE) and captures
# (1) the number / designator and (2) the heading text that follows it.
#
# ARTICLE / SECTION: the separator ``[.\s]+`` may consume a line break, so a
# heading printed on the line below the label is still captured.
#
# Enumerators "(a)", "(1)", "(i)": must be indented. Horizontal whitespace is
# spelled ``[ \t]`` instead of ``\s`` because ``\s`` also matches "\n"; with
# re.MULTILINE that let a match begin on a preceding blank line and let the
# *next* line be captured as the heading. An enumerator that stands alone on
# its line now yields an empty heading.
#
# NOTE: "(i)", "(v)" and "(x)" satisfy both the PARAGRAPH and the CLAUSE
# pattern; consumers have to disambiguate matches that share a start offset.
LEVEL_PATTERNS = [
    (NodeLevel.ARTICLE, re.compile(
        r"^(?:ARTICLE|Article)\s+([IVXLCDM]+|\d+)[.\s]+(.*)", re.MULTILINE
    )),
    (NodeLevel.SECTION, re.compile(
        r"^(?:Section|SECTION)\s+(\d+(?:\.\d+)*)[.\s]+(.*)", re.MULTILINE
    )),
    (NodeLevel.PARAGRAPH, re.compile(
        r"^[ \t]+\(([a-z])\)[ \t]*(.*)", re.MULTILINE
    )),
    (NodeLevel.SUBPARAGRAPH, re.compile(
        r"^[ \t]+\((\d+)\)[ \t]*(.*)", re.MULTILINE
    )),
    (NodeLevel.CLAUSE, re.compile(
        r"^[ \t]+\(([ivx]+)\)[ \t]*(.*)", re.MULTILINE
    )),
]
def _choose_level(candidates, stack):
    """Pick one level for an enumerator that several patterns matched.

    "(i)", "(v)" and "(x)" match both the PARAGRAPH (single letter) and the
    CLAUSE (roman numeral) pattern. Heuristic: when the innermost open node
    is a subparagraph or a clause, the enumerator continues that nesting and
    is a CLAUSE; otherwise it is a PARAGRAPH. A real paragraph "(i)" that
    directly follows the subparagraphs of paragraph "(h)" is therefore
    still misread as a clause.
    """
    if NodeLevel.PARAGRAPH in candidates and NodeLevel.CLAUSE in candidates:
        nested = bool(stack) and stack[-1].level in (
            NodeLevel.SUBPARAGRAPH,
            NodeLevel.CLAUSE,
        )
        return NodeLevel.CLAUSE if nested else NodeLevel.PARAGRAPH
    # Otherwise keep the first pattern in LEVEL_PATTERNS order that matched.
    return next(iter(candidates))


def build_document_tree(text: str) -> list[DocumentNode]:
    """Parse legal text into a hierarchical tree of document nodes.

    Every ``LEVEL_PATTERNS`` match becomes one node. A node's ``text`` runs
    from the start of its match up to the start of the next match, so text
    that precedes the first match is not attached to any node.

    Args:
        text: The full document text.

    Returns:
        The nodes that have no parent, in document order. Descendants are
        reachable through ``children``.
    """
    # Group matches by start offset so that one enumerator matched by two
    # patterns produces a single node instead of an empty ghost node plus
    # the real one.
    candidates_at: dict[int, dict[NodeLevel, tuple[str, str]]] = {}
    for level, pattern in LEVEL_PATTERNS:
        for match in pattern.finditer(text):
            candidates_at.setdefault(match.start(), {})[level] = (
                match.group(1),
                match.group(2),
            )
    positions = sorted(candidates_at)

    # Outermost level gets rank 0; computed once instead of per iteration.
    rank = {level: order for order, level in enumerate(NodeLevel)}

    root_nodes = []
    stack: list[DocumentNode] = []
    for i, pos in enumerate(positions):
        candidates = candidates_at[pos]
        level = _choose_level(candidates, stack)
        number, heading = candidates[level]

        # Extract text between this match and the next
        end_pos = positions[i + 1] if i + 1 < len(positions) else len(text)
        node = DocumentNode(
            level=level,
            number=number,
            heading=heading.strip(),
            text=text[pos:end_pos].strip(),
        )

        # Find parent: close every open node at the same or a deeper level.
        while stack and rank[stack[-1].level] >= rank[level]:
            stack.pop()
        if stack:
            node.parent = stack[-1]
            stack[-1].children.append(node)
        else:
            root_nodes.append(node)
        stack.append(node)
    return root_nodes
Cross-reference resolution
Legal cross-references come in many forms: “Section 3.2(a),” “Article VII,” “paragraph (b) above,” “as defined herein.” A resolver needs to handle all of these:
import re
from dataclasses import dataclass
@dataclass
class CrossReference:
    """A single textual reference to another part of the document."""

    # Offset in the scanned text at which the reference starts.
    source_position: int
    # The complete matched reference, e.g. "Section 3.2(a)".
    raw_text: str
    # The captured designator, or "document" when the pattern captured nothing.
    target_section: str
    # Filled in by resolve_references(); stays None when nothing matches.
    resolved_node: DocumentNode | None = None
# Reference styles recognised in running text. Group 1, when present, is the
# designator of the referenced unit.
REFERENCE_PATTERNS = [
    # "Section 3.2(a)" style, optionally continued down to the clause:
    # "Section 3.2(a)(1)(ii)"
    re.compile(
        r"(?:Section|section|§)\s*"
        r"(\d+(?:\.\d+)*(?:\([a-z]\))?(?:\(\d+\))?(?:\([ivx]+\))?)"
    ),
    # "Article VII" style. The trailing \b stops ordinary words that merely
    # start with a roman-numeral letter ("Article Mentioned") from matching.
    re.compile(r"\b(?:Article|ARTICLE)\s+([IVXLCDM]+|\d+)\b"),
    # "paragraph (b)" style
    re.compile(r"(?:paragraph|clause)\s+\(([a-z]|\d+|[ivx]+)\)"),
    # "herein" / "hereof" / "hereunder" — refers to the whole document.
    # Deliberately has no capture group: extract_cross_references() uses
    # ``match.lastindex`` to decide between group 1 and the "document"
    # target, and a captured group here made that fallback unreachable.
    re.compile(r"\b(?:herein|hereof|hereunder|hereby)\b"),
]
def extract_cross_references(text: str) -> list[CrossReference]:
    """Collect every cross-reference that ``REFERENCE_PATTERNS`` finds in *text*.

    Results are grouped by pattern (in ``REFERENCE_PATTERNS`` order) and, within
    one pattern, ordered by position. A match without a captured group is
    given the target ``"document"``.
    """
    return [
        CrossReference(
            source_position=hit.start(),
            raw_text=hit.group(0),
            target_section=hit.group(1) if hit.lastindex else "document",
        )
        for regex in REFERENCE_PATTERNS
        for hit in regex.finditer(text)
    ]
# One parenthesised designator such as "(a)" or "(ii)" inside a target.
_SUB_DESIGNATOR = re.compile(r"\(([^()]+)\)")


def _child_numbered(node, number):
    """Return the direct child of *node* whose ``number`` equals *number*, or None."""
    return next((child for child in node.children if child.number == number), None)


def resolve_references(
    refs: "list[CrossReference]",
    tree: "list[DocumentNode]",
) -> "list[CrossReference]":
    """Link cross-references to their target nodes in the document tree.

    Each reference's ``resolved_node`` is set in place to the matching node,
    or to ``None`` when nothing matches.

    Resolution steps:
      1. Split a compound target such as ``"3.2(a)(1)"`` into the base
         ``"3.2"`` and the designators ``["a", "1"]``.
      2. Look the base up. When ``raw_text`` names the kind of unit
         ("Section ...", "§ ...", "Article ..."), the kind-qualified key is
         tried first so that "Section 3" cannot land on a subparagraph "(3)".
      3. Walk down from the base node, one child per designator.

    References without a base, such as "paragraph (b)", carry no information
    about where they occur, so they fall back to the document-wide number
    index, in which the last node with a given number wins.

    Args:
        refs: References to resolve; mutated in place.
        tree: Root nodes of the document tree.

    Returns:
        The same ``refs`` list.
    """
    # Build lookup index: bare number and "<level> <number>".
    index = {}

    def add_to_index(nodes):
        for node in nodes:
            index[node.number] = node
            index[f"{node.level.value} {node.number}"] = node
            add_to_index(node.children)

    add_to_index(tree)

    for ref in refs:
        target = ref.target_section
        base = target.partition("(")[0]
        if base:
            designators = _SUB_DESIGNATOR.findall(target[len(base):])
        else:
            base, designators = target, []

        label = ref.raw_text.lstrip().lower()
        if label.startswith(("section", "§")):
            keys = [f"section {base}"]
        elif label.startswith("article"):
            keys = [f"article {base}"]
        else:
            keys = []
        keys += [base, f"section {base}", f"article {base}"]

        node = next((index[key] for key in keys if key in index), None)
        for designator in designators:
            if node is None:
                break
            node = _child_numbered(node, designator)
        ref.resolved_node = node
    return refs
Defined term extraction and linking
Legal documents establish defined terms (often capitalized or in quotes) and use them throughout. Parsing the definitions section and linking every usage creates a semantic layer:
import re
# "Term" means / shall mean ... up to the next quoted capitalised term, a
# blank line, or the end of the text. Both straight quotes and typographic
# quotes (U+201C / U+201D) are accepted around the term.
_DEFINITION_PATTERN = re.compile(
    r'["\u201c]([^"\u201c\u201d]+)["\u201d]\s+'
    r"(?:means|shall mean|refers to|has the meaning)\s+"
    r'(.*?)(?=\n\s*["\u201c][A-Z]|\n\s*\n|\Z)',
    re.DOTALL,
)


def extract_definitions(text: str) -> dict[str, str]:
    """Extract defined terms and their definitions from a definitions section.

    Args:
        text: Text containing clauses of the form ``"Term" means ...``.

    Returns:
        Mapping of term to definition, with one trailing ``;`` or ``.``
        removed from each definition. When a term is defined more than once,
        the last definition wins.
    """
    definitions = {}
    for match in _DEFINITION_PATTERN.finditer(text):
        term = match.group(1).strip()
        # Clean trailing punctuation
        definition = re.sub(r"\s*[;.]\s*$", "", match.group(2).strip())
        definitions[term] = definition
    return definitions
def link_defined_terms(
    text: str, definitions: dict[str, str]
) -> list[dict]:
    """Find all occurrences of defined terms in document text.

    Terms are matched case-sensitively as whole words, longest term first.
    Text already claimed by a longer term is not reported again for a shorter
    one, so "Date" is not linked inside "Effective Date".

    Args:
        text: Document text to scan.
        definitions: Mapping of defined term to its definition.

    Returns:
        One dict per occurrence with the keys ``term``, ``position``,
        ``definition`` and ``context`` (up to 50 characters either side).
        Occurrences are grouped by term, longest term first, and ordered by
        position within each term.
    """
    occurrences = []
    # One flag per character: set once a (longer) term has matched there.
    claimed = bytearray(len(text))
    for term in sorted(definitions, key=len, reverse=True):
        if not term:
            # An empty term would match at every position.
            continue
        # Lookarounds instead of \b: \b needs a word character on one side,
        # so a term ending in punctuation such as "401(k)" could never match
        # when followed by a space.
        pattern = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)")
        for match in pattern.finditer(text):
            start, end = match.span()
            if any(claimed[start:end]):
                continue
            claimed[start:end] = b"\x01" * (end - start)
            occurrences.append({
                "term": term,
                "position": start,
                "definition": definitions[term],
                "context": text[max(0, start - 50):end + 50],
            })
    return occurrences
Processing legislative XML
Government databases increasingly publish structured XML. The US Legislative Markup (USLM) and international Akoma Ntoso standards provide rich semantic structure:
from lxml import etree
from dataclasses import dataclass
@dataclass
class LegislativeSection:
identifier: str
heading: str
text: str
amendments: list[str]
effective_date: str | None
def parse_uslm(
    xml_path: str,
    namespace: str = "https://xml.house.gov/schemas/uslm/2.0",
) -> list[LegislativeSection]:
    """Parse US Legislative Markup XML into structured sections.

    Args:
        xml_path: Path of the XML file to read.
        namespace: Namespace URI of the USLM elements in the file.
            NOTE(review): namespace URIs are compared character for
            character; confirm the default matches the ``xmlns`` declared by
            the files being parsed, and pass the correct URI otherwise.

    Returns:
        One ``LegislativeSection`` per ``section`` element, in document order.
    """
    tree = etree.parse(xml_path)
    ns = {"uslm": namespace}

    def full_text(element) -> str:
        # ``element.text`` stops at the first child tag, which cut off any
        # sentence containing inline markup. itertext() also yields the text
        # inside and after child elements; whitespace runs are collapsed.
        return " ".join("".join(element.itertext()).split())

    sections = []
    for section in tree.xpath("//uslm:section", namespaces=ns):
        heading_el = section.find("uslm:heading", namespaces=ns)

        # Collect all text content
        content_parts = [
            part
            for element in section.xpath(
                ".//uslm:content | .//uslm:chapeau", namespaces=ns
            )
            if (part := full_text(element))
        ]

        # Extract amendment notes
        amendments = [
            note_text
            for note in section.xpath(
                ".//uslm:note[@type='amendment']", namespaces=ns
            )
            if (note_text := full_text(note))
        ]

        sections.append(LegislativeSection(
            identifier=section.get("identifier", ""),
            # Always a str, even for a heading element without text.
            heading=full_text(heading_el) if heading_el is not None else "",
            text="\n".join(content_parts),
            amendments=amendments,
            effective_date=section.get("effectiveDate"),
        ))
    return sections
def parse_akoma_ntoso(
    xml_path: str,
    namespace: str = "http://docs.oasis-open.org/legaldocml/ns/akn/3.0",
) -> list[LegislativeSection]:
    """Parse Akoma Ntoso XML (used by EU, UN, and many countries).

    Args:
        xml_path: Path of the XML file to read.
        namespace: Namespace URI of the Akoma Ntoso elements in the file.

    Returns:
        One ``LegislativeSection`` per ``article`` element, in document
        order. ``text`` holds one line per ``paragraph`` that has a
        ``content`` child with at least one ``p``. ``amendments`` is always
        empty and ``effective_date`` is always ``None``.
    """
    tree = etree.parse(xml_path)
    ns = {"akn": namespace}

    def full_text(element) -> str:
        # itertext() includes text inside and after inline child elements,
        # which ``element.text`` alone drops; whitespace runs are collapsed.
        return " ".join("".join(element.itertext()).split())

    sections = []
    for article in tree.xpath("//akn:article", namespaces=ns):
        heading_el = article.find("akn:heading", namespaces=ns)

        paragraphs = []
        for para in article.xpath(".//akn:paragraph", namespaces=ns):
            content = para.find("akn:content", namespaces=ns)
            if content is None:
                continue
            blocks = content.findall("akn:p", namespaces=ns)
            if not blocks:
                continue
            # Use every <p> of the paragraph, not only the first one.
            paragraphs.append(" ".join(full_text(block) for block in blocks))

        sections.append(LegislativeSection(
            identifier=article.get("eId", ""),
            heading=full_text(heading_el) if heading_el is not None else "",
            text="\n".join(paragraphs),
            amendments=[],
            effective_date=None,
        ))
    return sections
Table extraction from legal PDFs
Legal documents frequently contain tables — schedules, fee structures, comparison charts. Standard text extraction flattens these into gibberish. Position-aware extraction preserves the structure:
import pdfplumber
def extract_legal_tables(pdf_path: str) -> list[list[list[str]]]:
    """Return every table found in the PDF as rows of cleaned cell strings.

    Tables are detected from the lines drawn on each page (both strategies
    are set to "lines"). Cell text is stripped of surrounding whitespace and
    empty or missing cells become "". Tables are returned in page order.
    """
    collected = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            raw_tables = page.extract_tables(
                table_settings={
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                    "snap_y_tolerance": 5,
                    "join_x_tolerance": 5,
                }
            )
            # Clean cells
            collected.extend(
                [[cell.strip() if cell else "" for cell in row] for row in raw_table]
                for raw_table in raw_tables
            )
    return collected
Production architecture
A production legal parsing system typically uses a pipeline pattern:
- Ingestion service — accepts documents via API, queues them for processing
- Format normalizer — converts all formats to a standard intermediate representation
- Structure parser — builds the document tree using jurisdiction-specific rule sets
- NLP enrichment — runs entity extraction, defined term linking, cross-reference resolution
- Storage layer — stores the parsed structure in a graph database (Neo4j) or document store (Elasticsearch) for querying
- API layer — exposes search, comparison, and navigation endpoints
The key architectural decision is whether to use rule-based parsing (reliable for well-formatted documents, fragile for edge cases) or ML-based parsing (handles variety better, requires training data). Most production systems use a hybrid: rules for structural parsing where conventions are clear, ML for classification and extraction tasks where language varies.
The one thing to remember: Legal document parsing combines structural pattern matching for hierarchical extraction with NLP for entity and reference resolution, transforming flat legal text into a navigable, linked knowledge structure that supports search, comparison, and compliance workflows.
See Also
- Python Contract Analysis NLP — How Python reads through legal contracts to find the important parts, risky clauses, and hidden surprises before you sign.
- Python eDiscovery Processing — How Python helps lawyers find the right emails, documents, and messages when companies get sued or investigated.
- Python Legal Citation Extraction — How Python finds and understands references to laws, court cases, and regulations buried inside legal documents.
- Activation Functions — Why neural networks need these tiny mathematical functions, and how ReLU's simplicity accidentally made deep learning possible.
- AI Agents Architecture — How AI systems go from answering questions to actually doing things: the design patterns that turn language models into autonomous agents that browse, code, and plan.