Python DOCX Generation — Deep Dive
python-docx provides a clean API for common document operations, but production use often requires reaching into the underlying XML, handling style inheritance, building complex layouts, and generating documents at scale.
Understanding DOCX Internals
A .docx file is a ZIP archive containing XML files:
document.docx (ZIP)
├── [Content_Types].xml
├── _rels/.rels
├── word/
│ ├── document.xml ← Main content
│ ├── styles.xml ← Style definitions
│ ├── header1.xml ← Header content
│ ├── footer1.xml ← Footer content
│ ├── numbering.xml ← List numbering
│ └── media/ ← Embedded images
│ └── image1.png
python-docx wraps lxml to manipulate these XML trees. When the API doesn’t support something, you can modify the XML directly.
Direct XML Access
from docx import Document
from docx.oxml.ns import qn
from lxml import etree
# Build a minimal document and keep a handle on its only paragraph.
doc = Document()
para = doc.add_paragraph("Regular text")

# `_element` is the XML element behind the paragraph; serialize it to inspect the markup.
paragraph_xml = etree.tostring(para._element, pretty_print=True)
print(paragraph_xml.decode())
# Printed structure (namespace declarations presumably omitted here for brevity):
# <w:p>
#   <w:r>
#     <w:t>Regular text</w:t>
#   </w:r>
# </w:p>
Adding Features Not in the API
Page numbers in footer:
from docx.oxml import OxmlElement
def add_page_number(paragraph):
    """Append a dynamic PAGE field to *paragraph*.

    A new run is added to the paragraph and filled, in order, with a
    ``begin`` field character, the `` PAGE `` instruction text, and an
    ``end`` field character.
    """
    field_run = paragraph.add_run()._element

    def _field_char(kind):
        # One <w:fldChar> marker of the requested type ("begin" / "end").
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = " PAGE "

    for node in (_field_char("begin"), instruction, _field_char("end")):
        field_run.append(node)
# Usage: write "Page " into the first section's footer, then append the field
para = doc.sections[0].footer.paragraphs[0]
para.text = "Page "
add_page_number(para)
Hyperlinks:
def add_hyperlink(paragraph, url, text):
    """Add a clickable hyperlink to a paragraph.

    Args:
        paragraph: Paragraph that receives the link; the link is appended
            after the paragraph's existing content.
        url: Link target, registered on the paragraph's part as an
            external relationship.
        text: Visible link text.

    Returns:
        The ``w:hyperlink`` element that was appended to the paragraph.
    """
    r_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)

    run = OxmlElement("w:r")
    rPr = OxmlElement("w:rPr")

    # Blue underlined text
    color = OxmlElement("w:color")
    color.set(qn("w:val"), "0563C1")
    rPr.append(color)
    u = OxmlElement("w:u")
    u.set(qn("w:val"), "single")
    rPr.append(u)
    run.append(rPr)

    run_text = OxmlElement("w:t")
    # Keep leading/trailing spaces in the link text; without this attribute
    # they are not treated as significant whitespace.
    run_text.set(qn("xml:space"), "preserve")
    run_text.text = text
    run.append(run_text)

    hyperlink.append(run)
    paragraph._element.append(hyperlink)
    return hyperlink
# Usage: the link is appended after the introductory text
para = doc.add_paragraph("Visit our website: ")
add_hyperlink(para, url="https://example.com", text="example.com")
Style Inheritance and Custom Styles
from docx.shared import Pt, RGBColor
from docx.enum.style import WD_STYLE_TYPE
# Register two custom styles on the document, then apply them.
styles = doc.styles

# Paragraph style based on the built-in "Heading 1"
custom_heading = styles.add_style("CustomHeading", WD_STYLE_TYPE.PARAGRAPH)
custom_heading.base_style = styles["Heading 1"]
heading_font = custom_heading.font
heading_font.color.rgb = RGBColor(0x2C, 0x3E, 0x50)
heading_font.size = Pt(24)
heading_spacing = custom_heading.paragraph_format
heading_spacing.space_before = Pt(24)
heading_spacing.space_after = Pt(12)

# Character style: bold, colored text for inline emphasis
highlight = styles.add_style("Highlight", WD_STYLE_TYPE.CHARACTER)
highlight_font = highlight.font
highlight_font.bold = True
highlight_font.color.rgb = RGBColor(0xE7, 0x4C, 0x3C)

# Usage: the paragraph style goes on a whole paragraph, the character style on one run
doc.add_paragraph("Important Section", style="CustomHeading")
para = doc.add_paragraph("This has a ")
para.add_run("highlighted keyword", style="Highlight")
para.add_run(" in the middle.")
Complex Table Layouts
Merged Cells
table = doc.add_table(rows=5, cols=4)

# Title banner: merge the first row from column 0 through column 3
merged = table.cell(0, 0).merge(table.cell(0, 3))
merged.text = "Quarterly Revenue Report"

# Region labels: each one spans two consecutive rows of the first column
for top_row, label in ((1, "North"), (3, "South")):
    region_cell = table.cell(top_row, 0)
    region_cell.merge(table.cell(top_row + 1, 0))
    region_cell.text = label
Table Cell Styling
from docx.shared import Cm
from docx.enum.table import WD_TABLE_ALIGNMENT
# Column widths are applied cell by cell in every row
column_widths = (Cm(4), Cm(3))
for row in table.rows:
    for index, width in enumerate(column_widths):
        row.cells[index].width = width
# Cell shading via XML
def shade_cell(cell, color):
    """Apply background color to a table cell.

    Args:
        cell: Table cell to shade.
        color: Value written to the ``w:fill`` attribute of the cell's
            ``<w:shd>`` element.
    """
    tcPr = cell._element.get_or_add_tcPr()

    # Drop any earlier shading so a repeated call replaces the fill instead
    # of stacking a second <w:shd> element in the cell properties.
    for existing in tcPr.findall(qn("w:shd")):
        tcPr.remove(existing)

    shading = OxmlElement("w:shd")
    shading.set(qn("w:val"), "clear")
    shading.set(qn("w:fill"), color)
    tcPr.append(shading)
# Usage: shade the table's top-left cell
shade_cell(table.cell(0, 0), "2C3E50")
Mail Merge at Scale
For generating hundreds or thousands of personalized documents:
from docx import Document
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import io
def generate_single_document(args):
    """Fill one copy of the template and save it; meant to run in a worker process.

    ``args`` is a single ``(template_bytes, data, output_path)`` tuple so the
    function can be used directly with ``executor.map``. Placeholders are
    replaced in the body paragraphs and in the paragraphs of every table
    cell. Returns ``output_path``.
    """
    template_bytes, data, output_path = args
    doc = Document(io.BytesIO(template_bytes))

    def _all_paragraphs():
        # Body paragraphs first, then each paragraph of each table cell.
        yield from doc.paragraphs
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    for paragraph in _all_paragraphs():
        replace_in_paragraph(paragraph, data)

    doc.save(output_path)
    return output_path
def replace_in_paragraph(para, data):
    """Replace placeholders in *para*, keeping run formatting where possible.

    Args:
        para: Paragraph-like object exposing ``runs``; each run has a
            readable and writable ``text`` attribute.
        data: Mapping of placeholder string -> replacement. Replacement
            values are converted with ``str()``, so numbers are accepted.

    Placeholders that sit entirely inside one run are replaced in that run,
    leaving every other run untouched. Only when a placeholder is split
    across runs is the paragraph text collapsed into the first run (and the
    remaining runs emptied), which loses the formatting of those later runs.
    """
    runs = para.runs
    if not runs:
        return

    # Pass 1: placeholders contained in a single run.
    for run in runs:
        before = run.text
        after = before
        for key, value in data.items():
            after = after.replace(key, str(value))
        if after != before:
            run.text = after

    # Pass 2: placeholders that span run boundaries are only visible in the
    # concatenated paragraph text.
    joined = "".join(run.text for run in runs)
    replaced = joined
    for key, value in data.items():
        replaced = replaced.replace(key, str(value))

    if replaced != joined:
        # Rewrite: put all text in first run, clear others
        runs[0].text = replaced
        for run in runs[1:]:
            run.text = ""
def mail_merge_batch(template_path, records, output_dir, workers=4):
    """Generate documents for all records in parallel.

    Args:
        template_path: Path of the .docx template. It is read once and
            passed to every task as bytes.
        records: Iterable of placeholder -> replacement mappings. A record's
            ``"{{ID}}"`` value names its output file.
        output_dir: Directory for the generated files; created together
            with any missing parent directories.
        workers: Maximum number of worker processes.

    Returns:
        List of output paths, in the same order as ``records``.
    """
    out_dir = Path(output_dir)
    # parents=True so a nested target such as "output/contracts" works even
    # when "output" does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Read template once
    template_bytes = Path(template_path).read_bytes()

    tasks = []
    for index, record in enumerate(records, start=1):
        # Records without an ID get a numbered fallback name so they do not
        # all overwrite the same "doc.docx".
        stem = record.get("{{ID}}") or f"doc_{index}"
        tasks.append((template_bytes, record, str(out_dir / f"{stem}.docx")))

    if not tasks:
        # Nothing to generate: do not start worker processes.
        return []

    with ProcessPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(generate_single_document, tasks))
# Usage
# The __main__ guard matters here: when worker processes are started with the
# "spawn" method they import this module again, and an unguarded call would
# try to launch the batch from inside every worker.
if __name__ == "__main__":
    records = [
        {"{{NAME}}": "Alice Chen", "{{ID}}": "1001", "{{AMOUNT}}": "$5,000"},
        {"{{NAME}}": "Bob Smith", "{{ID}}": "1002", "{{AMOUNT}}": "$7,500"},
        # ... hundreds more
    ]
    mail_merge_batch("template.docx", records, "output/contracts")
Performance
- Single document generation: ~50-100ms
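- Batch with 4 workers: ~40-80 documents/second (4 workers × 10-20 documents/second each at 50-100ms per document; actual throughput depends on template size and hardware)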
- Template loading: Read once, serialize as bytes, pass to workers
Working with Sections
from docx.enum.section import WD_ORIENT
from docx.shared import Inches
# Landscape section for wide tables (start_type=2: the section begins on a new page)
new_section = doc.add_section(start_type=2)
new_section.orientation = WD_ORIENT.LANDSCAPE
new_section.page_width, new_section.page_height = Inches(11), Inches(8.5)

# Half-inch margins on all four sides
for side in ("top", "bottom", "left", "right"):
    setattr(new_section, f"{side}_margin", Inches(0.5))

doc.add_paragraph("This page is landscape.")

# Back to portrait for the content that follows
portrait = doc.add_section(start_type=2)
portrait.orientation = WD_ORIENT.PORTRAIT
portrait.page_width, portrait.page_height = Inches(8.5), Inches(11)
Table of Contents
python-docx can’t build the TOC entries itself (Word has to populate them when the field is refreshed), but you can insert the field code and let Word fill it in:
def add_toc(doc):
    """Append a paragraph containing a Table of Contents field.

    Only the field code and a placeholder text are written; the user has to
    update the field in Word to get the actual entries.
    """
    def _field_char(kind):
        # One <w:fldChar> marker of the requested type.
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    toc_paragraph = doc.add_paragraph()

    # First run: field start, the TOC instruction, and the separator
    field_run = toc_paragraph.add_run()._element
    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '
    for node in (_field_char("begin"), instruction, _field_char("separate")):
        field_run.append(node)

    # Second run: grey hint text for the reader, followed by the field end
    placeholder = toc_paragraph.add_run(
        "Right-click and select 'Update Field' to generate TOC"
    )
    placeholder.font.color.rgb = RGBColor(0x80, 0x80, 0x80)
    placeholder._element.append(_field_char("end"))
Testing Generated Documents
import pytest
from docx import Document
from pathlib import Path
def test_report_generation():
    """Verify generated document has expected structure.

    Generates a report into ``test_report.docx``, reopens it, and checks the
    headings, the first table's header row, and that at least one image
    relationship exists. The file is removed afterwards, including when an
    assertion fails.
    """
    from docx.opc.constants import RELATIONSHIP_TYPE as RT

    report_path = Path("test_report.docx")
    try:
        generate_report(test_data, str(report_path))
        doc = Document(str(report_path))

        # Check headings
        headings = [
            p.text for p in doc.paragraphs if p.style.name.startswith("Heading")
        ]
        assert "Executive Summary" in headings
        assert "Financial Results" in headings

        # Check table data
        assert len(doc.tables) >= 1
        first_table = doc.tables[0]
        header_texts = [cell.text for cell in first_table.rows[0].cells]
        assert "Product" in header_texts

        # Check images: match the image relationship type exactly
        image_rels = [r for r in doc.part.rels.values() if r.reltype == RT.IMAGE]
        assert len(image_rels) >= 1
    finally:
        # Always clean up, so a failing assertion does not leave the file behind.
        report_path.unlink(missing_ok=True)
The one thing to remember: python-docx shines with template-based generation — design in Word, automate with Python — and when the API falls short, direct XML manipulation via lxml unlocks nearly every Word feature.
See Also
- Python Excel Openpyxl — openpyxl lets Python read and write real Excel files — no Excel needed on the computer.
- Python Pdf Generation Reportlab — ReportLab lets Python draw professional PDFs from scratch — invoices, reports, certificates — without needing Word or a designer.
- Ci Cd — Why big apps can ship updates every day without turning your phone into a glitchy mess — CI/CD is the behind-the-scenes quality gate and delivery truck.
- Containerization — Why does software that works on your computer break on everyone else's? Containers fix that — and they're why Netflix can deploy 100 updates a day without the site going down.
- Python 310 New Features — Python 3.10 gave programmers a shape-sorting machine, friendlier error messages, and cleaner ways to say 'this or that' in type hints.