Python DOCX Generation — Deep Dive

Advanced python-docx: XML manipulation, style inheritance, complex table layouts, mail merge at scale, and working around library limitations.

python-docx provides a clean API for common document operations, but production use often requires reaching into the underlying XML, handling style inheritance, building complex layouts, and generating documents at scale.

Understanding DOCX Internals

A .docx file is a ZIP archive containing XML files:

document.docx (ZIP)
├── [Content_Types].xml
├── _rels/.rels
├── word/
│   ├── document.xml       ← Main content
│   ├── styles.xml         ← Style definitions
│   ├── header1.xml        ← Header content
│   ├── footer1.xml        ← Footer content
│   ├── numbering.xml      ← List numbering
│   └── media/             ← Embedded images
│       └── image1.png

python-docx wraps lxml to manipulate these XML trees. When the API doesn’t support something, you can modify the XML directly.

Direct XML Access

from docx import Document
from docx.oxml.ns import qn
from lxml import etree

# Build a minimal document holding one paragraph to inspect.
doc = Document()
para = doc.add_paragraph("Regular text")

# `para._element` is the lxml node behind the paragraph proxy; serialize it
# to see the markup python-docx generated.
paragraph_xml = etree.tostring(para._element, pretty_print=True)
print(paragraph_xml.decode())
# Abridged output (any xmlns declarations on the root are left out here):
# <w:p>
#   <w:r>
#     <w:t>Regular text</w:t>
#   </w:r>
# </w:p>

Adding Features Not in the API

Page numbers in footer:

from docx.oxml import OxmlElement

def add_page_number(paragraph):
    """Append a dynamic PAGE field to ``paragraph``.

    A new run is added to the paragraph and filled, in order, with a
    ``begin`` field marker, the `` PAGE `` instruction text, and an ``end``
    field marker.
    """
    begin_marker = OxmlElement("w:fldChar")
    begin_marker.set(qn("w:fldCharType"), "begin")

    # xml:space="preserve" keeps the spaces around the instruction keyword.
    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = " PAGE "

    end_marker = OxmlElement("w:fldChar")
    end_marker.set(qn("w:fldCharType"), "end")

    field_run = paragraph.add_run()._element
    for piece in (begin_marker, instruction, end_marker):
        field_run.append(piece)

# Usage: label the first section's footer with "Page " followed by the field.
section = doc.sections[0]
footer = section.footer
para = footer.paragraphs[0]  # first paragraph of the footer
para.text = "Page "  # static label; add_page_number() adds its run after it
add_page_number(para)

Hyperlinks:

def add_hyperlink(paragraph, url, text):
    """Append a clickable external hyperlink to ``paragraph``.

    Args:
        paragraph: Paragraph that receives the link; it must expose ``part``
            (for the relationship) and ``_element`` (for the XML).
        url: Target address, stored as an external relationship of the part.
        text: Visible link text.

    Returns:
        The ``w:hyperlink`` element that was appended, so callers can tweak
        it further. Earlier versions returned ``None``; ignoring the result
        keeps working.
    """
    # The XML only carries a relationship id; the URL itself is registered
    # on the owning part as an external target.
    r_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)

    run = OxmlElement("w:r")
    rPr = OxmlElement("w:rPr")

    # Direct formatting: blue (0563C1) text with a single underline.
    color = OxmlElement("w:color")
    color.set(qn("w:val"), "0563C1")
    rPr.append(color)

    u = OxmlElement("w:u")
    u.set(qn("w:val"), "single")
    rPr.append(u)

    run.append(rPr)

    run_text = OxmlElement("w:t")
    # Without xml:space="preserve" leading/trailing spaces in the link text
    # are treated as insignificant and dropped when the document is read.
    run_text.set(qn("xml:space"), "preserve")
    run_text.text = text
    run.append(run_text)

    hyperlink.append(run)
    paragraph._element.append(hyperlink)
    return hyperlink

# Usage: the hyperlink element is appended after the paragraph's lead-in text.
para = doc.add_paragraph("Visit our website: ")
add_hyperlink(para, "https://example.com", "example.com")

Style Inheritance and Custom Styles

from docx.shared import Pt, RGBColor
from docx.enum.style import WD_STYLE_TYPE

# Register two reusable styles on the document's style collection.
styles = doc.styles

# Paragraph style based on the built-in "Heading 1"; only the color, size and
# spacing are overridden here.
custom_heading = styles.add_style("CustomHeading", WD_STYLE_TYPE.PARAGRAPH)
custom_heading.base_style = styles["Heading 1"]
heading_font = custom_heading.font
heading_font.color.rgb = RGBColor(0x2C, 0x3E, 0x50)
heading_font.size = Pt(24)
heading_spacing = custom_heading.paragraph_format
heading_spacing.space_before = Pt(24)
heading_spacing.space_after = Pt(12)

# Character style for emphasising individual runs: bold and red.
highlight = styles.add_style("Highlight", WD_STYLE_TYPE.CHARACTER)
highlight_font = highlight.font
highlight_font.bold = True
highlight_font.color.rgb = RGBColor(0xE7, 0x4C, 0x3C)

# Usage: styles are referenced by name; a run added with style=None keeps the
# paragraph's default character formatting.
doc.add_paragraph("Important Section", style="CustomHeading")
para = doc.add_paragraph("This has a ")
for fragment, char_style in (
    ("highlighted keyword", "Highlight"),
    (" in the middle.", None),
):
    para.add_run(fragment, style=char_style)

Complex Table Layouts

Merged Cells

table = doc.add_table(rows=5, cols=4)

# Row 0 becomes one banner cell spanning columns 0-3; merge() hands back the
# merged cell, which receives the title.
cell = table.cell(0, 0)
merged = cell.merge(table.cell(0, 3))
merged.text = "Quarterly Revenue Report"

# Column 0 holds the region labels, each one spanning two rows.
region_cell = table.cell(1, 0).merge(table.cell(2, 0))
region_cell.text = "North"

region_cell2 = table.cell(3, 0).merge(table.cell(4, 0))
region_cell2.text = "South"

Table Cell Styling

from docx.shared import Cm
from docx.enum.table import WD_TABLE_ALIGNMENT

# Fix the width of the first two columns by setting it on the matching cell
# of every row (column 0 -> 4 cm, column 1 -> 3 cm).
for row in table.rows:
    for col_idx, col_width in ((0, Cm(4)), (1, Cm(3))):
        row.cells[col_idx].width = col_width

# Cell shading via XML
def shade_cell(cell, color):
    """Apply a background color to a table cell.

    Args:
        cell: Table cell whose ``_element`` provides ``get_or_add_tcPr()``.
        color: Fill color as a hex RGB string such as ``"2C3E50"``.

    Any ``w:shd`` already present is removed first, so calling this again
    re-colors the cell instead of stacking duplicate shading elements. The
    new element is inserted ahead of the ``w:tcPr`` children that the
    WordprocessingML schema orders after ``w:shd`` (e.g. ``w:vAlign``),
    keeping the properties in schema order.
    """
    tcPr = cell._element.get_or_add_tcPr()

    # Drop earlier shading so repeated calls do not accumulate elements.
    for stale in tcPr.findall(qn("w:shd")):
        tcPr.remove(stale)

    shading = OxmlElement("w:shd")
    shading.set(qn("w:fill"), color)
    shading.set(qn("w:val"), "clear")

    # Children that must come after w:shd inside w:tcPr.
    later_tags = {
        qn(tag)
        for tag in (
            "w:noWrap",
            "w:tcMar",
            "w:textDirection",
            "w:tcFitText",
            "w:vAlign",
            "w:hideMark",
            "w:headers",
            "w:cellIns",
            "w:cellDel",
            "w:cellMerge",
            "w:tcPrChange",
        )
    }
    for child in tcPr:
        if child.tag in later_tags:
            child.addprevious(shading)
            break
    else:
        # No later sibling present: the end of tcPr is the right place.
        tcPr.append(shading)

# Shade the table's top-left cell (row 0, column 0) with fill color 2C3E50.
shade_cell(table.cell(0, 0), "2C3E50")

Mail Merge at Scale

For generating hundreds or thousands of personalized documents:

from docx import Document
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
import io

def generate_single_document(args):
    """Render one document from the template; intended to run in a worker process.

    ``args`` is a single ``(template_bytes, data, output_path)`` tuple so the
    function can be handed straight to ``executor.map``. The template is
    opened from the in-memory bytes, placeholders are substituted, and the
    result is saved to ``output_path``, which is also returned.

    NOTE(review): only ``doc.paragraphs`` and the cells of ``doc.tables`` are
    visited; headers, footers and nested tables are presumably not covered -
    confirm against the templates in use.
    """
    template_bytes, data, output_path = args
    doc = Document(io.BytesIO(template_bytes))

    def _iter_paragraphs():
        # Body paragraphs first, then every paragraph inside table cells.
        yield from doc.paragraphs
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cell.paragraphs

    for paragraph in _iter_paragraphs():
        replace_in_paragraph(paragraph, data)

    doc.save(output_path)
    return output_path

def replace_in_paragraph(para, data):
    """Replace placeholder keys in a paragraph, including keys split across runs.

    The texts of all runs are concatenated before searching, so a placeholder
    that the editor split over several runs is still found.

    Args:
        para: Object exposing ``runs``, a sequence of objects with a writable
            ``text`` attribute.
        data: Mapping of placeholder string -> replacement value. Values are
            converted with ``str()``; ``None`` is rendered as an empty string,
            so numeric or missing fields no longer raise ``TypeError``.

    When at least one replacement happens, the whole resulting text is written
    to the first run and every other run is emptied. That keeps the first
    run's formatting only; formatting carried by later runs is lost. A
    paragraph without matches is left untouched.
    """
    # Read the runs once so the same objects are used for reading and writing.
    runs = para.runs
    if not runs:
        return

    original_text = "".join(run.text for run in runs)
    new_text = original_text
    for key, value in data.items():
        if key in new_text:
            replacement = "" if value is None else str(value)
            new_text = new_text.replace(key, replacement)

    if new_text == original_text:
        return

    first_run, *other_runs = runs
    first_run.text = new_text
    for run in other_runs:
        run.text = ""

def mail_merge_batch(template_path, records, output_dir, workers=4):
    """Generate one document per record, in parallel worker processes.

    Args:
        template_path: Path of the .docx template; read once and shipped to
            the workers as bytes.
        records: Iterable of dicts mapping placeholder -> value. The
            ``"{{ID}}"`` entry names the output file (``<ID>.docx``), with
            ``"doc"`` used when it is absent.
        output_dir: Destination directory; created together with any missing
            parent directories.
        workers: Maximum number of worker processes.

    Returns:
        List of output paths in the order of ``records``; an empty list when
        there are no records (no process pool is started in that case).

    NOTE(review): the ID is used in the file name as-is, and records that
    share an ID (or lack one) write to the same path and overwrite each other.
    """
    out_dir = Path(output_dir)
    # parents=True so nested targets such as "output/contracts" work even
    # when the parent directory does not exist yet.
    out_dir.mkdir(parents=True, exist_ok=True)

    # Read template once
    template_bytes = Path(template_path).read_bytes()

    tasks = []
    for record in records:
        record_id = record.get("{{ID}}", "doc")
        output_path = str(out_dir / f"{record_id}.docx")
        tasks.append((template_bytes, record, output_path))

    if not tasks:
        return []

    with ProcessPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(generate_single_document, tasks))

# Usage
records = [
    {"{{NAME}}": "Alice Chen", "{{ID}}": "1001", "{{AMOUNT}}": "$5,000"},
    {"{{NAME}}": "Bob Smith", "{{ID}}": "1002", "{{AMOUNT}}": "$7,500"},
    # ... hundreds more
]

# Worker processes started with the "spawn" method re-import this module;
# without the guard each of them would try to launch the batch again.
if __name__ == "__main__":
    mail_merge_batch("template.docx", records, "output/contracts")

Performance

Single document generation: ~50-100ms
Batch with 4 workers: ~40-80 documents/second (4 workers ÷ 50-100 ms per document); actual throughput depends on available cores and template size
Template loading: Read once, serialize as bytes, pass to workers

Working with Sections

from docx.enum.section import WD_ORIENT
from docx.shared import Inches

# Landscape section for wide tables; start_type=2 begins it on a new page.
new_section = doc.add_section(start_type=2)
new_section.orientation = WD_ORIENT.LANDSCAPE
# The orientation flag is set together with swapped page dimensions.
new_section.page_width, new_section.page_height = Inches(11), Inches(8.5)

# Half-inch margins on all four sides.
for side in ("top", "bottom", "left", "right"):
    setattr(new_section, f"{side}_margin", Inches(0.5))

doc.add_paragraph("This page is landscape.")

# A further new-page section switches back to portrait dimensions.
portrait = doc.add_section(start_type=2)
portrait.orientation = WD_ORIENT.PORTRAIT
portrait.page_width, portrait.page_height = Inches(8.5), Inches(11)

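Table of Contents: python-docx can’t build the TOC entries itself, because page numbers are only known once Word lays out the document, but you can insert the TOC field code and let Word fill it in when the field is updated: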

def add_toc(doc):
    """Append a paragraph holding a Table of Contents field to ``doc``.

    Only the field code and a grey placeholder text are written; the entries
    themselves appear once the user updates the field in Word.
    """
    def _field_char(kind):
        # Build a <w:fldChar> marker of the given type.
        marker = OxmlElement("w:fldChar")
        marker.set(qn("w:fldCharType"), kind)
        return marker

    toc_para = doc.add_paragraph()

    # Field instruction; xml:space="preserve" keeps its surrounding spaces.
    instruction = OxmlElement("w:instrText")
    instruction.set(qn("xml:space"), "preserve")
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '

    # First run: begin marker, instruction, separator.
    code_run = toc_para.add_run()._element
    code_run.append(_field_char("begin"))
    code_run.append(instruction)
    code_run.append(_field_char("separate"))

    # Second run: grey placeholder shown as the field result, then the end marker.
    placeholder = toc_para.add_run("Right-click and select 'Update Field' to generate TOC")
    placeholder.font.color.rgb = RGBColor(0x80, 0x80, 0x80)
    placeholder._element.append(_field_char("end"))

Testing Generated Documents

import pytest
from docx import Document
from pathlib import Path

def test_report_generation():
    """Verify the generated report has the expected headings, table and image.

    The report is written to ``test_report.docx`` in the working directory
    and removed afterwards, also when an assertion fails, so a red run does
    not leave the file behind.
    """
    from docx.opc.constants import RELATIONSHIP_TYPE as RT

    report_path = Path("test_report.docx")
    try:
        generate_report(test_data, str(report_path))

        doc = Document(str(report_path))

        # Check headings
        headings = [
            p.text for p in doc.paragraphs if p.style.name.startswith("Heading")
        ]
        assert "Executive Summary" in headings
        assert "Financial Results" in headings

        # Check table data
        assert len(doc.tables) >= 1
        first_table = doc.tables[0]
        header_texts = [cell.text for cell in first_table.rows[0].cells]
        assert "Product" in header_texts

        # Check images: at least one image relationship on the document part
        image_rels = [r for r in doc.part.rels.values() if r.reltype == RT.IMAGE]
        assert len(image_rels) >= 1
    finally:
        # Clean up even if generation or an assertion above failed.
        if report_path.exists():
            report_path.unlink()

The one thing to remember: python-docx shines with template-based generation — design in Word, automate with Python — and when the API falls short, direct XML manipulation via lxml unlocks nearly every Word feature.

Tags: python, docx, Word, document-generation, XML, production, templates