Python Finite Automata — Deep Dive

DFA Implementation from Scratch

from typing import Set, Dict, Tuple, Optional

class DFA:
    def __init__(
        self,
        states: Set[str],
        alphabet: Set[str],
        transitions: Dict[Tuple[str, str], str],
        start: str,
        accepting: Set[str],
    ):
        self.states = states
        self.alphabet = alphabet
        self.transitions = transitions
        self.start = start
        self.accepting = accepting
        self._validate()

    def _validate(self):
        """Ensure the DFA is well-formed."""
        assert self.start in self.states, f"Start state {self.start} not in states"
        assert self.accepting.issubset(self.states), "Accepting states must be subset of states"

        for state in self.states:
            for symbol in self.alphabet:
                key = (state, symbol)
                assert key in self.transitions, (
                    f"Missing transition for ({state}, {symbol})"
                )
                assert self.transitions[key] in self.states, (
                    f"Transition ({state}, {symbol}) → {self.transitions[key]} "
                    f"leads to unknown state"
                )

    def accepts(self, input_string: str) -> bool:
        """Process input and return whether it's accepted."""
        current = self.start
        for symbol in input_string:
            if symbol not in self.alphabet:
                return False
            current = self.transitions[(current, symbol)]
        return current in self.accepting

    def trace(self, input_string: str) -> list[str]:
        """Return the sequence of states visited."""
        path = [self.start]
        current = self.start
        for symbol in input_string:
            if symbol not in self.alphabet:
                break
            current = self.transitions[(current, symbol)]
            path.append(current)
        return path

# Example: DFA that accepts strings ending in "ab"
dfa = DFA(
    states={"q0", "q1", "q2"},
    alphabet={"a", "b"},
    transitions={
        ("q0", "a"): "q1", ("q0", "b"): "q0",
        ("q1", "a"): "q1", ("q1", "b"): "q2",
        ("q2", "a"): "q1", ("q2", "b"): "q0",
    },
    start="q0",
    accepting={"q2"},
)

assert dfa.accepts("ab") is True
assert dfa.accepts("aab") is True
assert dfa.accepts("ba") is False
assert dfa.trace("aab") == ["q0", "q1", "q1", "q2"]

NFA Implementation

NFAs require tracking multiple possible states simultaneously:

class NFA:
    def __init__(
        self,
        states: Set[str],
        alphabet: Set[str],
        transitions: Dict[Tuple[str, str], Set[str]],
        start: str,
        accepting: Set[str],
    ):
        self.states = states
        self.alphabet = alphabet
        self.transitions = transitions
        self.start = start
        self.accepting = accepting

    def _epsilon_closure(self, states: Set[str]) -> Set[str]:
        """Find all states reachable via epsilon transitions."""
        closure = set(states)
        stack = list(states)
        while stack:
            state = stack.pop()
            for next_state in self.transitions.get((state, ""), set()):
                if next_state not in closure:
                    closure.add(next_state)
                    stack.append(next_state)
        return closure

    def accepts(self, input_string: str) -> bool:
        """Process input, tracking all possible states."""
        current_states = self._epsilon_closure({self.start})

        for symbol in input_string:
            next_states = set()
            for state in current_states:
                next_states |= self.transitions.get((state, symbol), set())
            current_states = self._epsilon_closure(next_states)

            if not current_states:
                return False

        return bool(current_states & self.accepting)

# Example: NFA that accepts strings containing "ab" anywhere
nfa = NFA(
    states={"q0", "q1", "q2"},
    alphabet={"a", "b"},
    transitions={
        ("q0", "a"): {"q0", "q1"},  # nondeterministic: stay or advance
        ("q0", "b"): {"q0"},
        ("q1", "b"): {"q2"},
        ("q2", "a"): {"q2"},
        ("q2", "b"): {"q2"},
    },
    start="q0",
    accepting={"q2"},
)

assert nfa.accepts("ab") is True
assert nfa.accepts("bab") is True
assert nfa.accepts("bba") is False

NFA-to-DFA Conversion (Subset Construction)

The powerset/subset construction algorithm converts any NFA to an equivalent DFA:

def nfa_to_dfa(nfa: NFA) -> DFA:
    """Convert NFA to equivalent DFA using subset construction."""
    dfa_start = frozenset(nfa._epsilon_closure({nfa.start}))
    dfa_states = set()
    dfa_transitions = {}
    worklist = [dfa_start]
    visited = set()

    while worklist:
        current = worklist.pop()
        if current in visited:
            continue
        visited.add(current)
        dfa_states.add(current)

        for symbol in nfa.alphabet:
            # Find all NFA states reachable via this symbol
            next_nfa_states = set()
            for nfa_state in current:
                next_nfa_states |= nfa.transitions.get((nfa_state, symbol), set())

            next_dfa_state = frozenset(nfa._epsilon_closure(next_nfa_states))

            if not next_dfa_state:
                # Dead state — add explicit trap state
                next_dfa_state = frozenset()

            dfa_transitions[(current, symbol)] = next_dfa_state

            if next_dfa_state not in visited:
                worklist.append(next_dfa_state)
                dfa_states.add(next_dfa_state)

    # DFA accepting states: any DFA state containing an NFA accepting state
    dfa_accepting = {s for s in dfa_states if s & nfa.accepting}

    # Convert frozensets to string names for readability
    state_names = {s: f"S{i}" for i, s in enumerate(sorted(dfa_states, key=str))}

    return DFA(
        states=set(state_names.values()),
        alphabet=nfa.alphabet,
        transitions={
            (state_names[s], sym): state_names[dfa_transitions[(s, sym)]]
            for s in dfa_states
            for sym in nfa.alphabet
            if (s, sym) in dfa_transitions
        },
        start=state_names[dfa_start],
        accepting={state_names[s] for s in dfa_accepting},
    )

DFA Minimization (Hopcroft’s Algorithm)

Minimization produces the DFA with the fewest states that accepts the same language:

def minimize_dfa(dfa: DFA) -> DFA:
    """Hopcroft's algorithm for DFA minimization."""
    # Initial partition: accepting vs non-accepting
    accepting = frozenset(dfa.accepting)
    non_accepting = frozenset(dfa.states - dfa.accepting)

    partitions = set()
    if accepting:
        partitions.add(accepting)
    if non_accepting:
        partitions.add(non_accepting)

    worklist = list(partitions)

    while worklist:
        splitter = worklist.pop()
        for symbol in dfa.alphabet:
            # States that transition into the splitter on this symbol
            predecessors = frozenset(
                s for s in dfa.states
                if dfa.transitions.get((s, symbol)) in splitter
            )
            if not predecessors:
                continue

            new_partitions = set()
            for partition in partitions:
                intersection = partition & predecessors
                difference = partition - predecessors
                if intersection and difference:
                    new_partitions.add(intersection)
                    new_partitions.add(difference)
                    if partition in worklist:
                        worklist.remove(partition)
                        worklist.append(intersection)
                        worklist.append(difference)
                    else:
                        if len(intersection) <= len(difference):
                            worklist.append(intersection)
                        else:
                            worklist.append(difference)
                else:
                    new_partitions.add(partition)
            partitions = new_partitions

    # Build minimized DFA
    state_map = {}
    for i, partition in enumerate(sorted(partitions, key=str)):
        name = f"M{i}"
        for state in partition:
            state_map[state] = name

    new_start = state_map[dfa.start]
    new_accepting = {state_map[s] for s in dfa.accepting}
    new_states = set(state_map.values())
    new_transitions = {}

    for (state, symbol), target in dfa.transitions.items():
        new_transitions[(state_map[state], symbol)] = state_map[target]

    return DFA(
        states=new_states,
        alphabet=dfa.alphabet,
        transitions=new_transitions,
        start=new_start,
        accepting=new_accepting,
    )

Using automata-lib

The automata-lib library provides production-ready implementations:

from automata.fa.dfa import DFA
from automata.fa.nfa import NFA

# Create NFA
nfa = NFA(
    states={"q0", "q1", "q2"},
    input_symbols={"a", "b"},
    transitions={
        "q0": {"a": {"q0", "q1"}, "b": {"q0"}},
        "q1": {"b": {"q2"}},
        "q2": {"a": {"q2"}, "b": {"q2"}},
    },
    initial_state="q0",
    final_states={"q2"},
)

# Convert to DFA
dfa = DFA.from_nfa(nfa)

# Minimize
minimal_dfa = dfa.minify()

# Test
assert minimal_dfa.accepts_input("xyzab"[3:])  # test "ab"
assert dfa.accepts_input("bab")

# Generate visual diagram (requires Graphviz)
dfa.show_diagram(path="dfa_diagram.png")

Regex to Finite Automaton

The standard algorithm chain: Regex → NFA (Thompson’s construction) → DFA (subset construction) → Minimized DFA.

def regex_char_to_nfa(char: str) -> NFA:
    """Build NFA for a single character."""
    return NFA(
        states={"s0", "s1"},
        input_symbols={char},
        transitions={
            "s0": {char: {"s1"}},
            "s1": {},
        },
        initial_state="s0",
        final_states={"s1"},
    )

# Full Thompson's construction handles:
# - Concatenation: connect end of NFA1 to start of NFA2
# - Union (|): epsilon from new start to both NFA starts
# - Kleene star (*): epsilon loop from accepting back to start

Python’s re module uses a more optimized variant, but the conceptual pipeline is the same.

Performance Analysis

Processing a string of length n:

OperationDFANFA (simulation)
Time per stringO(n)O(n × |states|)
SpaceO(|states| × |alphabet|)O(|states|²)
Construction timeO(|states|² × |alphabet|) for NFA→DFA

For repeated matching (e.g., scanning a large file), converting to DFA first pays off. For one-shot matching, NFA simulation avoids the potentially exponential DFA construction.

Practical Application: Log Line Classifier

from automata.fa.dfa import DFA

# Build DFAs for different log levels
error_dfa = DFA(
    states={"q0", "q1", "q2", "q3", "q4", "q5", "dead"},
    input_symbols=set("ERORerro \t[]"),
    transitions={
        "q0": {c: "dead" for c in "ERORerro \t[]"} | {"E": "q1"},
        "q1": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q2"},
        "q2": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q3"},
        "q3": {c: "dead" for c in "ERORerro \t[]"} | {"O": "q4"},
        "q4": {c: "dead" for c in "ERORerro \t[]"} | {"R": "q5"},
        "q5": {c: "q5" for c in "ERORerro \t[]"},
        "dead": {c: "dead" for c in "ERORerro \t[]"},
    },
    initial_state="q0",
    final_states={"q5"},
)

# In practice, use regex for this — but understanding the automaton
# underneath helps you write better patterns and predict performance.

When Finite Automata Aren’t Enough

Finite automata can’t:

  • Count unboundedly — matching aⁿbⁿ (equal a’s and b’s) is impossible
  • Match nested structures — balanced parentheses, HTML tags
  • Remember arbitrary past input — only the current state encodes history

For these, you need pushdown automata (add a stack) or Turing machines (add arbitrary memory). In Python, that means switching from regex to a parser library like lark or pyparsing.

One thing to remember: Finite automata are the fastest possible string acceptors — O(n) time, constant memory per character — and understanding their construction (NFA → DFA → minimized DFA) explains why regex works, when it fails, and how to optimize it.

pythonautomatacomputer-science

See Also

  • Python Petri Nets How Petri nets model things happening at the same time — like marbles rolling through a maze of gates that only open when enough marbles arrive.
  • Ci Cd Why big apps can ship updates every day without turning your phone into a glitchy mess — CI/CD is the behind-the-scenes quality gate and delivery truck.
  • Containerization Why does software that works on your computer break on everyone else's? Containers fix that — and they're why Netflix can deploy 100 updates a day without the site going down.
  • Python 310 New Features Python 3.10 gave programmers a shape-sorting machine, friendlier error messages, and cleaner ways to say 'this or that' in type hints.
  • Python 311 New Features Python 3.11 made everything faster, error messages smarter, and let you catch several mistakes at once instead of stopping at the first one.