Python Finite Automata — Deep Dive
DFA Implementation from Scratch
from typing import Set, Dict, Tuple, Optional
class DFA:
def __init__(
self,
states: Set[str],
alphabet: Set[str],
transitions: Dict[Tuple[str, str], str],
start: str,
accepting: Set[str],
):
self.states = states
self.alphabet = alphabet
self.transitions = transitions
self.start = start
self.accepting = accepting
self._validate()
def _validate(self):
"""Ensure the DFA is well-formed."""
assert self.start in self.states, f"Start state {self.start} not in states"
assert self.accepting.issubset(self.states), "Accepting states must be subset of states"
for state in self.states:
for symbol in self.alphabet:
key = (state, symbol)
assert key in self.transitions, (
f"Missing transition for ({state}, {symbol})"
)
assert self.transitions[key] in self.states, (
f"Transition ({state}, {symbol}) → {self.transitions[key]} "
f"leads to unknown state"
)
def accepts(self, input_string: str) -> bool:
"""Process input and return whether it's accepted."""
current = self.start
for symbol in input_string:
if symbol not in self.alphabet:
return False
current = self.transitions[(current, symbol)]
return current in self.accepting
def trace(self, input_string: str) -> list[str]:
"""Return the sequence of states visited."""
path = [self.start]
current = self.start
for symbol in input_string:
if symbol not in self.alphabet:
break
current = self.transitions[(current, symbol)]
path.append(current)
return path
# Example: DFA that accepts strings ending in "ab"
dfa = DFA(
states={"q0", "q1", "q2"},
alphabet={"a", "b"},
transitions={
("q0", "a"): "q1", ("q0", "b"): "q0",
("q1", "a"): "q1", ("q1", "b"): "q2",
("q2", "a"): "q1", ("q2", "b"): "q0",
},
start="q0",
accepting={"q2"},
)
assert dfa.accepts("ab") is True
assert dfa.accepts("aab") is True
assert dfa.accepts("ba") is False
assert dfa.trace("aab") == ["q0", "q1", "q1", "q2"]
NFA Implementation
NFAs require tracking multiple possible states simultaneously:
class NFA:
def __init__(
self,
states: Set[str],
alphabet: Set[str],
transitions: Dict[Tuple[str, str], Set[str]],
start: str,
accepting: Set[str],
):
self.states = states
self.alphabet = alphabet
self.transitions = transitions
self.start = start
self.accepting = accepting
def _epsilon_closure(self, states: Set[str]) -> Set[str]:
"""Find all states reachable via epsilon transitions."""
closure = set(states)
stack = list(states)
while stack:
state = stack.pop()
for next_state in self.transitions.get((state, ""), set()):
if next_state not in closure:
closure.add(next_state)
stack.append(next_state)
return closure
def accepts(self, input_string: str) -> bool:
"""Process input, tracking all possible states."""
current_states = self._epsilon_closure({self.start})
for symbol in input_string:
next_states = set()
for state in current_states:
next_states |= self.transitions.get((state, symbol), set())
current_states = self._epsilon_closure(next_states)
if not current_states:
return False
return bool(current_states & self.accepting)
# Example: NFA that accepts strings containing "ab" anywhere
nfa = NFA(
states={"q0", "q1", "q2"},
alphabet={"a", "b"},
transitions={
("q0", "a"): {"q0", "q1"}, # nondeterministic: stay or advance
("q0", "b"): {"q0"},
("q1", "b"): {"q2"},
("q2", "a"): {"q2"},
("q2", "b"): {"q2"},
},
start="q0",
accepting={"q2"},
)
assert nfa.accepts("ab") is True
assert nfa.accepts("bab") is True
assert nfa.accepts("bba") is False
NFA-to-DFA Conversion (Subset Construction)
The powerset/subset construction algorithm converts any NFA to an equivalent DFA:
def nfa_to_dfa(nfa: NFA) -> DFA:
"""Convert NFA to equivalent DFA using subset construction."""
dfa_start = frozenset(nfa._epsilon_closure({nfa.start}))
dfa_states = set()
dfa_transitions = {}
worklist = [dfa_start]
visited = set()
while worklist:
current = worklist.pop()
if current in visited:
continue
visited.add(current)
dfa_states.add(current)
for symbol in nfa.alphabet:
# Find all NFA states reachable via this symbol
next_nfa_states = set()
for nfa_state in current:
next_nfa_states |= nfa.transitions.get((nfa_state, symbol), set())
next_dfa_state = frozenset(nfa._epsilon_closure(next_nfa_states))
if not next_dfa_state:
# Dead state — add explicit trap state
next_dfa_state = frozenset()
dfa_transitions[(current, symbol)] = next_dfa_state
if next_dfa_state not in visited:
worklist.append(next_dfa_state)
dfa_states.add(next_dfa_state)
# DFA accepting states: any DFA state containing an NFA accepting state
dfa_accepting = {s for s in dfa_states if s & nfa.accepting}
# Convert frozensets to string names for readability
state_names = {s: f"S{i}" for i, s in enumerate(sorted(dfa_states, key=str))}
return DFA(
states=set(state_names.values()),
alphabet=nfa.alphabet,
transitions={
(state_names[s], sym): state_names[dfa_transitions[(s, sym)]]
for s in dfa_states
for sym in nfa.alphabet
if (s, sym) in dfa_transitions
},
start=state_names[dfa_start],
accepting={state_names[s] for s in dfa_accepting},
)
DFA Minimization (Hopcroft’s Algorithm)
Minimization produces the DFA with the fewest states that accepts the same language:
def minimize_dfa(dfa: DFA) -> DFA:
"""Hopcroft's algorithm for DFA minimization."""
# Initial partition: accepting vs non-accepting
accepting = frozenset(dfa.accepting)
non_accepting = frozenset(dfa.states - dfa.accepting)
partitions = set()
if accepting:
partitions.add(accepting)
if non_accepting:
partitions.add(non_accepting)
worklist = list(partitions)
while worklist:
splitter = worklist.pop()
for symbol in dfa.alphabet:
# States that transition into the splitter on this symbol
predecessors = frozenset(
s for s in dfa.states
if dfa.transitions.get((s, symbol)) in splitter
)
if not predecessors:
continue
new_partitions = set()
for partition in partitions:
intersection = partition & predecessors
difference = partition - predecessors
if intersection and difference:
new_partitions.add(intersection)
new_partitions.add(difference)
if partition in worklist:
worklist.remove(partition)
worklist.append(intersection)
worklist.append(difference)
else:
if len(intersection) <= len(difference):
worklist.append(intersection)
else:
worklist.append(difference)
else:
new_partitions.add(partition)
partitions = new_partitions
# Build minimized DFA
state_map = {}
for i, partition in enumerate(sorted(partitions, key=str)):
name = f"M{i}"
for state in partition:
state_map[state] = name
new_start = state_map[dfa.start]
new_accepting = {state_map[s] for s in dfa.accepting}
new_states = set(state_map.values())
new_transitions = {}
for (state, symbol), target in dfa.transitions.items():
new_transitions[(state_map[state], symbol)] = state_map[target]
return DFA(
states=new_states,
alphabet=dfa.alphabet,
transitions=new_transitions,
start=new_start,
accepting=new_accepting,
)
Using automata-lib
The automata-lib library provides production-ready implementations:
from automata.fa.dfa import DFA
from automata.fa.nfa import NFA
# Create NFA
nfa = NFA(
states={"q0", "q1", "q2"},
input_symbols={"a", "b"},
transitions={
"q0": {"a": {"q0", "q1"}, "b": {"q0"}},
"q1": {"b": {"q2"}},
"q2": {"a": {"q2"}, "b": {"q2"}},
},
initial_state="q0",
final_states={"q2"},
)
# Convert to DFA
dfa = DFA.from_nfa(nfa)
# Minimize
minimal_dfa = dfa.minify()
# Test
assert minimal_dfa.accepts_input("xyzab"[3:]) # test "ab"
assert dfa.accepts_input("bab")
# Generate visual diagram (requires Graphviz)
dfa.show_diagram(path="dfa_diagram.png")
Regex to Finite Automaton
The standard algorithm chain: Regex → NFA (Thompson’s construction) → DFA (subset construction) → Minimized DFA.
def regex_char_to_nfa(char: str) -> NFA:
"""Build NFA for a single character."""
return NFA(
states={"s0", "s1"},
input_symbols={char},
transitions={
"s0": {char: {"s1"}},
"s1": {},
},
initial_state="s0",
final_states={"s1"},
)
# Full Thompson's construction handles:
# - Concatenation: connect end of NFA1 to start of NFA2
# - Union (|): epsilon from new start to both NFA starts
# - Kleene star (*): epsilon loop from accepting back to start
Python’s re module uses a more optimized variant, but the conceptual pipeline is the same.
Performance Analysis
Processing a string of length n:
| Operation | DFA | NFA (simulation) |
|---|---|---|
| Time per string | O(n) | O(n × |states|) |
| Space | O(|states| × |alphabet|) | O(|states|²) |
| Construction time | — | O(|states|² × |alphabet|) for NFA→DFA |
For repeated matching (e.g., scanning a large file), converting to DFA first pays off. For one-shot matching, NFA simulation avoids the potentially exponential DFA construction.
Practical Application: Log Line Classifier
from automata.fa.dfa import DFA
# Build DFAs for different log levels
error_dfa = DFA(
states={"q0", "q1", "q2", "q3", "q4", "q5", "dead"},
input_symbols=set("ERORerro \t[]"),
transitions={
"q0": {c: "dead" for c in "ERORerro \t[]"} | {"E": "q1"},
"q1": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q2"},
"q2": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q3"},
"q3": {c: "dead" for c in "ERORerro \t[]"} | {"O": "q4"},
"q4": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q5"},
"q5": {c: "q5" for c in "ERORerro \t[]"},
"dead": {c: "dead" for c in "ERORerro \t[]"},
},
initial_state="q0",
final_states={"q5"},
)
# In practice, use regex for this — but understanding the automaton
# underneath helps you write better patterns and predict performance.
When Finite Automata Aren’t Enough
Finite automata can’t:
- Count unboundedly — matching
aⁿbⁿ(equal a’s and b’s) is impossible - Match nested structures — balanced parentheses, HTML tags
- Remember arbitrary past input — only the current state encodes history
For these, you need pushdown automata (add a stack) or Turing machines (add arbitrary memory). In Python, that means switching from regex to a parser library like lark or pyparsing.
One thing to remember: Finite automata are the fastest possible string acceptors — O(n) time, constant memory per character — and understanding their construction (NFA → DFA → minimized DFA) explains why regex works, when it fails, and how to optimize it.
See Also
- Python Petri Nets How Petri nets model things happening at the same time — like marbles rolling through a maze of gates that only open when enough marbles arrive.
- Ci Cd Why big apps can ship updates every day without turning your phone into a glitchy mess — CI/CD is the behind-the-scenes quality gate and delivery truck.
- Containerization Why does software that works on your computer break on everyone else's? Containers fix that — and they're why Netflix can deploy 100 updates a day without the site going down.
- Python 310 New Features Python 3.10 gave programmers a shape-sorting machine, friendlier error messages, and cleaner ways to say 'this or that' in type hints.
- Python 311 New Features Python 3.11 made everything faster, error messages smarter, and let you catch several mistakes at once instead of stopping at the first one.