Python Text Normalization — Deep Dive
Text normalization is deceptively simple in concept but complex in practice. Production systems deal with multilingual input, invisible Unicode characters, homoglyph attacks, and performance constraints on large corpora. This deep dive builds a comprehensive normalization pipeline and explores the edge cases that break naive approaches.
Unicode Normalization Forms in Detail
Python’s unicodedata.normalize() supports four forms. Understanding the difference is crucial for correct behavior.
import unicodedata
# The character é can be represented two ways
composed = "\u00e9" # é as single codepoint (U+00E9)
decomposed = "e\u0301" # e + combining acute accent (U+0301)
# They look identical but aren't equal: the two strings hold different
# code point sequences (one code point vs. two)
print(composed == decomposed) # False
print(len(composed)) # 1
print(len(decomposed)) # 2
# NFC composes: decomposed → composed
print(unicodedata.normalize('NFC', decomposed) == composed) # True
# NFD decomposes: composed → decomposed
print(unicodedata.normalize('NFD', composed) == decomposed) # True
NFC vs NFKC
# Compatibility decomposition handles visual equivalents
ligature = "\ufb01" # ﬁ ligature (U+FB01, a single code point)
print(unicodedata.normalize('NFC', ligature)) # ﬁ (unchanged — NFC applies only canonical mappings)
print(unicodedata.normalize('NFKC', ligature)) # fi (two characters: f + i)
# Superscripts and subscripts
superscript_2 = "\u00b2" # ²
print(unicodedata.normalize('NFKC', superscript_2)) # 2
# Fullwidth characters (common in CJK text)
fullwidth_A = "\uff21" # Ａ (U+FF21, fullwidth Latin capital A)
print(unicodedata.normalize('NFKC', fullwidth_A)) # A (plain ASCII U+0041)
Use NFC for storage and interchange. Use NFKC for search indexing and comparison.
Production Normalization Pipeline
import re
import unicodedata
from typing import Callable
class TextNormalizer:
"""Configurable text normalization pipeline."""
def __init__(self, steps: list[Callable[[str], str]] | None = None):
self.steps = steps or [
self.unicode_normalize,
self.strip_control_chars,
self.normalize_whitespace,
self.normalize_punctuation,
self.case_fold,
]
def __call__(self, text: str) -> str:
for step in self.steps:
text = step(text)
return text
@staticmethod
def unicode_normalize(text: str, form: str = 'NFKC') -> str:
return unicodedata.normalize(form, text)
@staticmethod
def strip_control_chars(text: str) -> str:
"""Remove control characters except common whitespace."""
return ''.join(
c for c in text
if unicodedata.category(c) != 'Cc' or c in '\n\r\t'
)
@staticmethod
def normalize_whitespace(text: str) -> str:
"""Replace all Unicode whitespace with space, collapse runs."""
# \s in re with UNICODE flag catches all Unicode whitespace
text = re.sub(r'[\s\u200b\u200c\u200d\ufeff]+', ' ', text)
return text.strip()
@staticmethod
def normalize_punctuation(text: str) -> str:
"""Normalize common punctuation variants to ASCII."""
replacements = {
'\u2018': "'", '\u2019': "'", # Smart single quotes
'\u201c': '"', '\u201d': '"', # Smart double quotes
'\u2013': '-', '\u2014': '-', # En/em dashes
'\u2026': '...', # Ellipsis
'\u00a0': ' ', # Non-breaking space
}
for old, new in replacements.items():
text = text.replace(old, new)
return text
@staticmethod
def case_fold(text: str) -> str:
return text.casefold()
@staticmethod
def remove_accents(text: str) -> str:
"""Remove combining marks (diacritics)."""
nfkd = unicodedata.normalize('NFKD', text)
return ''.join(c for c in nfkd if unicodedata.category(c) != 'Mn')
# Usage
normalize = TextNormalizer()
print(normalize(" Café DÉLICIEUX\u2019s "))
# "café délicieux's" — the default steps do not include remove_accents,
# so the accents survive; only case, whitespace and the quote change
# With accent removal
normalize_search = TextNormalizer(steps=[
    TextNormalizer.unicode_normalize,
    TextNormalizer.remove_accents,
    TextNormalizer.strip_control_chars,
    TextNormalizer.normalize_whitespace,
    TextNormalizer.normalize_punctuation,
    TextNormalizer.case_fold,
])
print(normalize_search("Ñoño está aquí"))
# "nono esta aqui"
Case Folding Edge Cases
# German sharp s
print("straße".casefold()) # "strasse" — ß becomes ss
# Greek sigma: two lowercase forms
# σ (medial) and ς (final) both casefold to σ
# (so the word-final Σ below comes out as σ, not ς)
print("ΟΔΥΣΣΕΥΣ".casefold()) # "οδυσσευσ"
# Note: this is locale-independent folding
# Turkish dotted/dotless I problem
# In Turkish, I → ı (dotless) and İ → i
# Python's casefold() uses universal rules, not Turkish-specific
print("ISTANBUL".casefold()) # "istanbul" (not "ıstanbul")
# For Turkish-aware folding, use locale-specific logic:
def turkish_casefold(text: str) -> str:
    """Case-fold ``text`` using the Turkish dotted/dotless I rules.

    Capital ``I`` folds to dotless ``ı`` and capital ``İ`` folds to ``i``,
    which the locale-independent ``str.casefold()`` does not do. Everything
    else goes through ``str.casefold()``, so full foldings such as
    ``ß`` -> ``ss`` still apply.

    Args:
        text: The string to fold.

    Returns:
        The folded string, suitable for caseless comparison of Turkish text.
    """
    # A decomposed İ (I followed by U+0307 COMBINING DOT ABOVE) must be
    # handled before the bare-I rule; otherwise it turns into a dotless ı
    # followed by a stray combining dot.
    text = text.replace('I\u0307', 'i').replace('\u0130', 'i')
    text = text.replace('I', '\u0131')
    return text.casefold()
Invisible Character Detection
Invisible Unicode characters cause silent bugs in data pipelines:
def find_invisible(text: str) -> list[tuple[int, str, str]]:
    """Find invisible, zero-width, or easy-to-miss characters.

    A character is reported when it is:

    * in any ``C*`` category (control, format, ...), except newline,
      carriage return and tab;
    * a separator (``Zs``, ``Zl`` or ``Zp``) other than the ordinary space,
      which covers U+2028 LINE SEPARATOR and U+2029 PARAGRAPH SEPARATOR;
    * a nonspacing mark (``Mn``) with no base character to attach to: at
      the start of the text, or directly after a separator or a ``C*``
      character such as a newline.

    Args:
        text: The string to inspect.

    Returns:
        One ``(index, 'U+XXXX', name)`` tuple per reported character, in
        order of appearance. ``name`` is ``'UNKNOWN'`` when the character
        has no Unicode name.
    """
    found = []
    prev_cat = None  # category of the preceding character; None at index 0
    for i, c in enumerate(text):
        cat = unicodedata.category(c)
        if cat[0] == 'C':
            flagged = c not in '\n\r\t'
        elif cat[0] == 'Z':
            flagged = c != ' '
        elif cat == 'Mn':
            flagged = prev_cat is None or prev_cat[0] in 'ZC'
        else:
            flagged = False
        if flagged:
            found.append(
                (i, f'U+{ord(c):04X}', unicodedata.name(c, 'UNKNOWN'))
            )
        prev_cat = cat
    return found
# Text with zero-width spaces (common in copy-paste from web)
sneaky = "hello\u200bworld" # Zero-width space between hello and world
print(repr(sneaky)) # 'hello\u200bworld'
# Each hit is (index, code point, Unicode name)
print(find_invisible(sneaky)) # [(5, 'U+200B', 'ZERO WIDTH SPACE')]
Homoglyph Detection
Homoglyphs are characters from different scripts that look identical — a security concern for phishing and identity spoofing:
def detect_mixed_scripts(text: str) -> dict[str, list[str]]:
    """Group the alphabetic characters of ``text`` by script label.

    The label is a simplification: the first word of the character's
    Unicode name (for example ``LATIN`` or ``CYRILLIC``), or ``UNKNOWN``
    when the character has no name. Non-alphabetic characters are ignored.
    The whole string is scanned; it is not split into words.

    Returns:
        A dict mapping each label to the characters that carry it, in order
        of appearance. More than one key means the text mixes scripts.
    """
    by_script: dict[str, list[str]] = {}
    for char in filter(str.isalpha, text):
        # An empty name yields an empty first word, which falls back to
        # the UNKNOWN label.
        label = unicodedata.name(char, '').partition(' ')[0] or 'UNKNOWN'
        by_script.setdefault(label, []).append(char)
    return by_script
# Latin 'a' vs Cyrillic 'а' (U+0430) — visually identical
latin = "admin"
spoofed = "\u0430dmin" # First char is Cyrillic а
print(latin == spoofed) # False!
# Two keys in the result show that the string mixes two scripts
print(detect_mixed_scripts(spoofed))
# {'CYRILLIC': ['а'], 'LATIN': ['d', 'm', 'i', 'n']}
# Normalization approach: confusable mapping
# The 'confusables' package or ICU provides official Unicode confusable mappings
Performance Optimization
Compiled Regex for Hot Paths
# Built once at import time so the hot path does no per-call setup.
_WHITESPACE_RE = re.compile(r'[\s\u200b\u200c\u200d\ufeff]+')
_PUNCT_MAP = str.maketrans({
    **dict.fromkeys('\u2018\u2019', "'"),  # smart single quotes
    **dict.fromkeys('\u201c\u201d', '"'),  # smart double quotes
    **dict.fromkeys('\u2013\u2014', '-'),  # en/em dashes
})


def fast_normalize(text: str) -> str:
    """Normalize ``text`` in one straight-line pass for high throughput.

    Applies, in order: NFKC normalization, smart quote/dash mapping to
    ASCII, collapsing of whitespace and zero-width runs to a single space,
    case folding, and trimming of both ends.
    """
    ascii_punct = unicodedata.normalize('NFKC', text).translate(_PUNCT_MAP)
    collapsed = _WHITESPACE_RE.sub(' ', ascii_punct)
    return collapsed.casefold().strip()
Batch Processing with Multiprocessing
from multiprocessing import Pool
def normalize_batch(texts: list[str], workers: int = 4) -> list[str]:
    """Normalize a large list of strings across worker processes.

    Args:
        texts: The strings to normalize.
        workers: Number of processes in the pool.

    Returns:
        The normalized strings, in the same order as ``texts``.
    """
    pipeline = TextNormalizer()
    with Pool(processes=workers) as worker_pool:
        normalized = worker_pool.map(pipeline, texts)
    return normalized
# 1M strings: ~4x speedup on 4 cores
str.translate() vs re.sub()
For character-level replacements, str.translate() with str.maketrans() is significantly faster than regex:
import timeit
# Sample input: a short phrase with three typographic characters, repeated
text = "Hello\u2019s world\u2014it\u2018s great" * 100
# translate approach
trans = str.maketrans({'\u2019': "'", '\u2014': '-', '\u2018': "'"})
t1 = timeit.timeit(lambda: text.translate(trans), number=10000)
# regex approach (same three replacements, done through a callback)
pattern = re.compile(r'[\u2018\u2019\u2014]')
def replace_fn(m):
    # Look up the ASCII replacement for the matched character
    return {'\u2018': "'", '\u2019': "'", '\u2014': '-'}[m.group()]
t2 = timeit.timeit(lambda: pattern.sub(replace_fn, text), number=10000)
print(f"translate: {t1:.3f}s, regex: {t2:.3f}s")
# translate is typically 5-10x faster
# NOTE(review): that ratio is the article's figure; re-measure on your own data
Domain-Specific Normalization
Email Addresses
def normalize_email(email: str) -> str:
    """Normalize an email address for deduplication.

    Surrounding whitespace is removed and the whole address is lowercased.
    For ``gmail.com`` / ``googlemail.com`` addresses the ``+suffix`` and all
    dots in the local part are dropped, and the domain is reported as
    ``gmail.com``.

    Args:
        email: The address to normalize.

    Returns:
        The normalized address. Input that contains no ``@`` is returned
        stripped and lowercased, with no ``@`` appended.
    """
    cleaned = email.strip()
    # Split on the LAST '@': the domain cannot contain one, while a quoted
    # local part may.
    local, sep, domain = cleaned.rpartition('@')
    if not sep:
        # Not an address; avoid inventing a trailing '@'.
        return cleaned.lower()
    domain = domain.lower()
    # Remove dots and +suffix from Gmail-style addresses
    if domain in ('gmail.com', 'googlemail.com'):
        local = local.split('+')[0].replace('.', '')
        domain = 'gmail.com'
    return f"{local.lower()}@{domain}"
# Dots and the +suffix are dropped for Gmail addresses
print(normalize_email("John.Doe+spam@Gmail.com")) # johndoe@gmail.com
URLs
from urllib.parse import urlparse, urlunparse, unquote
_PCT_ESCAPE_RE = re.compile(r'%([0-9A-Fa-f]{2})')


def _normalize_pct_escape(match: re.Match) -> str:
    """Decode an escape of an unreserved character; else upper-case its hex."""
    char = chr(int(match.group(1), 16))
    if char.isascii() and (char.isalnum() or char in '-._~'):
        return char
    return match.group(0).upper()


def normalize_url(url: str) -> str:
    """Normalize a URL for comparison.

    * Scheme and host are lowercased; a missing scheme defaults to
      ``https``. Scheme-less input such as ``example.com/a`` is read as
      host plus path.
    * Default ports (80 for http, 443 for https) are removed.
    * Percent-escapes in the path are normalized: escapes of unreserved
      characters (letters, digits, ``-._~``) are decoded and all others are
      kept with upper-case hex digits. Reserved characters are never
      decoded, because turning ``%2F`` or ``%3F`` into ``/`` or ``?`` would
      change what the URL refers to.
    * Trailing slashes are removed from the path; an empty path becomes
      ``/``.
    * Userinfo, path parameters and the fragment are dropped; the query
      string is kept as-is.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL string.

    Raises:
        ValueError: If the port in ``url`` is not a valid port number
            (raised by ``urllib.parse``).
    """
    url = url.strip()
    parsed = urlparse(url)
    # With no scheme and no '//' prefix, urlparse puts the host into the
    # path. Re-parse as a network-path reference so the host is recognised.
    if (not parsed.scheme and not parsed.netloc
            and parsed.path and not parsed.path.startswith('/')):
        parsed = urlparse('//' + url)
    # Lowercase scheme and host
    scheme = parsed.scheme.lower() or 'https'
    host = parsed.hostname.lower() if parsed.hostname else ''
    if ':' in host:
        # .hostname strips the brackets of an IPv6 literal; restore them so
        # the host is not confused with the port separator.
        host = f'[{host}]'
    # Remove default ports
    port = parsed.port
    if (scheme == 'http' and port == 80) or (scheme == 'https' and port == 443):
        port = None
    # Normalize escapes, then remove trailing slash from path
    path = _PCT_ESCAPE_RE.sub(_normalize_pct_escape, parsed.path)
    path = path.rstrip('/') or '/'
    netloc = host + (f':{port}' if port else '')
    # Params and fragment are deliberately left empty
    return urlunparse((scheme, netloc, path, '', parsed.query, ''))
Testing Normalization
import pytest
@pytest.mark.parametrize(
    ("input_text", "expected"),
    [
        (" Hello World ", "hello world"),
        ("café", "cafe"),  # With accent removal
        ("\u201cQuoted\u201d", '"quoted"'),
        ("straße", "strasse"),
        ("hello\u200bworld", "hello world"),  # Zero-width space
        ("\uff21\uff22\uff23", "abc"),  # Fullwidth
    ],
)
def test_normalize(input_text, expected):
    """Check the accent-removing search pipeline against known pairs."""
    search_steps = [
        TextNormalizer.unicode_normalize,
        TextNormalizer.remove_accents,
        TextNormalizer.strip_control_chars,
        TextNormalizer.normalize_whitespace,
        TextNormalizer.normalize_punctuation,
        TextNormalizer.case_fold,
    ]
    pipeline = TextNormalizer(steps=search_steps)
    actual = pipeline(input_text)
    assert actual == expected
One Thing to Remember
Text normalization is an ordered pipeline — Unicode normalization form, invisible-character removal, whitespace collapse, punctuation mapping, and finally case folding — and production systems also need locale awareness, homoglyph detection, and str.translate() for performance at scale.
See Also
- Python Fuzzy Matching Fuzzywuzzy: Find out how Python's FuzzyWuzzy library matches messy, misspelled text — like a friend who understands you even when you mumble.
- Python Regex Lookahead Lookbehind: Learn how Python regex can peek ahead or behind without grabbing text — like checking what's next in line without stepping forward.
- Python Regex Named Groups: Learn how Python regex named groups let you label the pieces you capture — like putting name tags on your search results.
- Python Regex Patterns: Discover how Python regex patterns work like a secret code for finding hidden text treasures in any document.
- Python Regular Expressions: Learn how Python can find tricky text patterns fast, like spotting every phone number hidden in a messy page.