Python Bloom Filters — Deep Dive

Build Bloom filters from scratch in Python, tune false-positive rates, and integrate with Redis for distributed membership testing.

Building a Bloom filter from scratch

Understanding the internals makes it easier to tune performance and debug false-positive issues.

import math
import mmh3  # MurmurHash3 — fast, non-cryptographic hash


class BloomFilter:
    """Space-efficient probabilistic set membership testing."""

    def __init__(self, expected_items: int, fp_rate: float = 0.01):
        self.expected_items = expected_items
        self.fp_rate = fp_rate
        self.size = self._optimal_size(expected_items, fp_rate)
        self.hash_count = self._optimal_hashes(self.size, expected_items)
        self.bit_array = bytearray(math.ceil(self.size / 8))
        self._count = 0

    @staticmethod
    def _optimal_size(n: int, p: float) -> int:
        """Calculate optimal bit array size: m = -(n * ln(p)) / (ln(2)^2)"""
        return int(-n * math.log(p) / (math.log(2) ** 2))

    @staticmethod
    def _optimal_hashes(m: int, n: int) -> int:
        """Calculate optimal hash count: k = (m/n) * ln(2)"""
        return max(1, int((m / n) * math.log(2)))

    def _get_positions(self, item: str) -> list[int]:
        """Generate k hash positions using double hashing."""
        h1 = mmh3.hash(item, seed=0) % self.size
        h2 = mmh3.hash(item, seed=42) % self.size
        return [(h1 + i * h2) % self.size for i in range(self.hash_count)]

    def _set_bit(self, pos: int) -> None:
        byte_idx, bit_idx = divmod(pos, 8)
        self.bit_array[byte_idx] |= (1 << bit_idx)

    def _get_bit(self, pos: int) -> bool:
        byte_idx, bit_idx = divmod(pos, 8)
        return bool(self.bit_array[byte_idx] & (1 << bit_idx))

    def add(self, item: str) -> None:
        """Add an element to the filter."""
        for pos in self._get_positions(item):
            self._set_bit(pos)
        self._count += 1

    def __contains__(self, item: str) -> bool:
        """Check if an element is possibly in the filter."""
        return all(self._get_bit(pos) for pos in self._get_positions(item))

    @property
    def estimated_fp_rate(self) -> float:
        """Current estimated false-positive rate based on fill ratio."""
        bits_set = sum(bin(byte).count("1") for byte in self.bit_array)
        fill_ratio = bits_set / self.size
        return fill_ratio ** self.hash_count

    def __len__(self) -> int:
        return self._count

    def __repr__(self) -> str:
        return (
            f"BloomFilter(items={self._count}, size={self.size} bits, "
            f"hashes={self.hash_count}, est_fp={self.estimated_fp_rate:.4%})"
        )

Usage and verification

bf = BloomFilter(expected_items=100_000, fp_rate=0.01)
print(bf)
# BloomFilter(items=0, size=958506 bits, hashes=6, est_fp=0.0000%)

# Add elements
for i in range(100_000):
    bf.add(f"user:{i}")

# Membership checks
assert "user:42" in bf          # True positive — added
assert "user:99999" in bf       # True positive — added

# Check false positive rate empirically
false_positives = sum(
    1 for i in range(100_000, 200_000)
    if f"user:{i}" in bf
)
print(f"Empirical FP rate: {false_positives / 100_000:.2%}")
# Typically: ~1.00% (close to configured 0.01)

# Memory comparison
bf_memory = len(bf.bit_array)  # ~120 KB
set_memory = 100_000 * 50     # ~5 MB for a set of strings
print(f"Bloom filter: {bf_memory:,} bytes vs Set: {set_memory:,} bytes")

Counting Bloom filter (supports deletion)

Standard Bloom filters can’t delete elements because clearing a bit might affect other elements. A counting Bloom filter replaces each bit with a counter:

import array


class CountingBloomFilter:
    """Bloom filter variant that supports deletion using counters."""

    def __init__(self, expected_items: int, fp_rate: float = 0.01):
        size = int(-expected_items * math.log(fp_rate) / (math.log(2) ** 2))
        self.size = size
        self.hash_count = max(1, int((size / expected_items) * math.log(2)))
        # Use 4-bit counters packed in bytes (max count: 15)
        self.counters = array.array("B", [0] * size)

    def _positions(self, item: str) -> list[int]:
        h1 = mmh3.hash(item, seed=0) % self.size
        h2 = mmh3.hash(item, seed=42) % self.size
        return [(h1 + i * h2) % self.size for i in range(self.hash_count)]

    def add(self, item: str) -> None:
        for pos in self._positions(item):
            if self.counters[pos] < 255:
                self.counters[pos] += 1

    def remove(self, item: str) -> bool:
        """Remove an element. Returns False if element wasn't present."""
        positions = self._positions(item)
        if not all(self.counters[pos] > 0 for pos in positions):
            return False
        for pos in positions:
            self.counters[pos] -= 1
        return True

    def __contains__(self, item: str) -> bool:
        return all(self.counters[pos] > 0 for pos in self._positions(item))

The trade-off: counting filters use 4–8x more memory than standard Bloom filters (one byte per counter vs one bit per position).

Redis-backed Bloom filters

For distributed systems, RedisBloom provides server-side Bloom filters accessible from any application instance:

import redis


def setup_redis_bloom(
    r: redis.Redis,
    key: str,
    capacity: int = 1_000_000,
    error_rate: float = 0.01,
) -> None:
    """Create a Bloom filter in Redis using the RedisBloom module."""
    try:
        r.execute_command("BF.RESERVE", key, error_rate, capacity)
    except redis.ResponseError as e:
        if "item exists" not in str(e):
            raise


def bloom_add(r: redis.Redis, key: str, item: str) -> bool:
    """Add item to Redis Bloom filter. Returns True if newly added."""
    return bool(r.execute_command("BF.ADD", key, item))


def bloom_check(r: redis.Redis, key: str, item: str) -> bool:
    """Check membership in Redis Bloom filter."""
    return bool(r.execute_command("BF.EXISTS", key, item))


def bloom_bulk_add(r: redis.Redis, key: str, items: list[str]) -> list[bool]:
    """Add multiple items in one round-trip."""
    return [bool(x) for x in r.execute_command("BF.MADD", key, *items)]


# Usage
r = redis.Redis()
setup_redis_bloom(r, "seen_urls", capacity=10_000_000, error_rate=0.001)

# Web crawler deduplication
url = "https://example.com/page/123"
if not bloom_check(r, "seen_urls", url):
    # URL not seen — crawl it
    crawl(url)
    bloom_add(r, "seen_urls", url)

Real-world application: cache pollution prevention

A common problem with caches: one-time requests pollute the cache, evicting frequently-accessed entries. A Bloom filter prevents this by requiring an item to be requested at least twice before caching:

class AntiPollutionCache:
    """Cache that only stores items requested more than once."""

    def __init__(self, redis_client: redis.Redis, bf_capacity: int = 1_000_000):
        self.redis = redis_client
        self.bf_key = "cache:seen"
        setup_redis_bloom(redis_client, self.bf_key, bf_capacity)

    def get_or_load(self, key: str, loader) -> any:
        # Check cache
        cached = self.redis.get(f"data:{key}")
        if cached is not None:
            return cached

        # Load from source
        value = loader()

        # Only cache if we've seen this key before
        already_seen = bloom_check(self.redis, self.bf_key, key)
        bloom_add(self.redis, self.bf_key, key)

        if already_seen:
            self.redis.setex(f"data:{key}", 300, value)

        return value

This technique is used by databases like LevelDB and RocksDB to avoid unnecessary disk reads for non-existent keys.

Performance characteristics

Operation	Time complexity	Notes
Add	O(k)	k hash computations + k bit sets
Check	O(k)	k hash computations + k bit reads
Memory	O(m)	Fixed regardless of items added

With k=7 and MurmurHash3, a single add/check takes ~200 nanoseconds in Python. The bottleneck is usually the hash function, not the bit operations.

When Bloom filters aren’t the answer

Need exact membership — use a set or database.
Need to list elements — Bloom filters are opaque; you can’t enumerate contents.
Small datasets — if your set fits in memory as a Python set, the complexity isn’t worth it.
Need deletion — use a counting Bloom filter or Cuckoo filter (which also supports deletion with better space efficiency).

The one thing to remember: Bloom filters give Python applications a memory-efficient way to say “definitely not here” — use them as a pre-filter before expensive operations, and size them based on your acceptable false-positive rate.

pythondata-structuresalgorithms