Python Bloom Filters — Deep Dive
Building a Bloom filter from scratch
Understanding the internals makes it easier to tune performance and debug false-positive issues.
import math
import mmh3 # MurmurHash3 — fast, non-cryptographic hash
class BloomFilter:
"""Space-efficient probabilistic set membership testing."""
def __init__(self, expected_items: int, fp_rate: float = 0.01):
self.expected_items = expected_items
self.fp_rate = fp_rate
self.size = self._optimal_size(expected_items, fp_rate)
self.hash_count = self._optimal_hashes(self.size, expected_items)
self.bit_array = bytearray(math.ceil(self.size / 8))
self._count = 0
@staticmethod
def _optimal_size(n: int, p: float) -> int:
"""Calculate optimal bit array size: m = -(n * ln(p)) / (ln(2)^2)"""
return int(-n * math.log(p) / (math.log(2) ** 2))
@staticmethod
def _optimal_hashes(m: int, n: int) -> int:
"""Calculate optimal hash count: k = (m/n) * ln(2)"""
return max(1, int((m / n) * math.log(2)))
def _get_positions(self, item: str) -> list[int]:
"""Generate k hash positions using double hashing."""
h1 = mmh3.hash(item, seed=0) % self.size
h2 = mmh3.hash(item, seed=42) % self.size
return [(h1 + i * h2) % self.size for i in range(self.hash_count)]
def _set_bit(self, pos: int) -> None:
byte_idx, bit_idx = divmod(pos, 8)
self.bit_array[byte_idx] |= (1 << bit_idx)
def _get_bit(self, pos: int) -> bool:
byte_idx, bit_idx = divmod(pos, 8)
return bool(self.bit_array[byte_idx] & (1 << bit_idx))
def add(self, item: str) -> None:
"""Add an element to the filter."""
for pos in self._get_positions(item):
self._set_bit(pos)
self._count += 1
def __contains__(self, item: str) -> bool:
"""Check if an element is possibly in the filter."""
return all(self._get_bit(pos) for pos in self._get_positions(item))
@property
def estimated_fp_rate(self) -> float:
"""Current estimated false-positive rate based on fill ratio."""
bits_set = sum(bin(byte).count("1") for byte in self.bit_array)
fill_ratio = bits_set / self.size
return fill_ratio ** self.hash_count
def __len__(self) -> int:
return self._count
def __repr__(self) -> str:
return (
f"BloomFilter(items={self._count}, size={self.size} bits, "
f"hashes={self.hash_count}, est_fp={self.estimated_fp_rate:.4%})"
)
Usage and verification
bf = BloomFilter(expected_items=100_000, fp_rate=0.01)
print(bf)
# BloomFilter(items=0, size=958506 bits, hashes=6, est_fp=0.0000%)
# Add elements
for i in range(100_000):
bf.add(f"user:{i}")
# Membership checks
assert "user:42" in bf # True positive — added
assert "user:99999" in bf # True positive — added
# Check false positive rate empirically
false_positives = sum(
1 for i in range(100_000, 200_000)
if f"user:{i}" in bf
)
print(f"Empirical FP rate: {false_positives / 100_000:.2%}")
# Typically: ~1.00% (close to configured 0.01)
# Memory comparison
bf_memory = len(bf.bit_array) # ~120 KB
set_memory = 100_000 * 50 # ~5 MB for a set of strings
print(f"Bloom filter: {bf_memory:,} bytes vs Set: {set_memory:,} bytes")
Counting Bloom filter (supports deletion)
Standard Bloom filters can’t delete elements because clearing a bit might affect other elements. A counting Bloom filter replaces each bit with a counter:
import array
class CountingBloomFilter:
"""Bloom filter variant that supports deletion using counters."""
def __init__(self, expected_items: int, fp_rate: float = 0.01):
size = int(-expected_items * math.log(fp_rate) / (math.log(2) ** 2))
self.size = size
self.hash_count = max(1, int((size / expected_items) * math.log(2)))
# Use 4-bit counters packed in bytes (max count: 15)
self.counters = array.array("B", [0] * size)
def _positions(self, item: str) -> list[int]:
h1 = mmh3.hash(item, seed=0) % self.size
h2 = mmh3.hash(item, seed=42) % self.size
return [(h1 + i * h2) % self.size for i in range(self.hash_count)]
def add(self, item: str) -> None:
for pos in self._positions(item):
if self.counters[pos] < 255:
self.counters[pos] += 1
def remove(self, item: str) -> bool:
"""Remove an element. Returns False if element wasn't present."""
positions = self._positions(item)
if not all(self.counters[pos] > 0 for pos in positions):
return False
for pos in positions:
self.counters[pos] -= 1
return True
def __contains__(self, item: str) -> bool:
return all(self.counters[pos] > 0 for pos in self._positions(item))
The trade-off: counting filters use 4–8x more memory than standard Bloom filters (one byte per counter vs one bit per position).
Redis-backed Bloom filters
For distributed systems, RedisBloom provides server-side Bloom filters accessible from any application instance:
import redis
def setup_redis_bloom(
r: redis.Redis,
key: str,
capacity: int = 1_000_000,
error_rate: float = 0.01,
) -> None:
"""Create a Bloom filter in Redis using the RedisBloom module."""
try:
r.execute_command("BF.RESERVE", key, error_rate, capacity)
except redis.ResponseError as e:
if "item exists" not in str(e):
raise
def bloom_add(r: redis.Redis, key: str, item: str) -> bool:
"""Add item to Redis Bloom filter. Returns True if newly added."""
return bool(r.execute_command("BF.ADD", key, item))
def bloom_check(r: redis.Redis, key: str, item: str) -> bool:
"""Check membership in Redis Bloom filter."""
return bool(r.execute_command("BF.EXISTS", key, item))
def bloom_bulk_add(r: redis.Redis, key: str, items: list[str]) -> list[bool]:
"""Add multiple items in one round-trip."""
return [bool(x) for x in r.execute_command("BF.MADD", key, *items)]
# Usage
r = redis.Redis()
setup_redis_bloom(r, "seen_urls", capacity=10_000_000, error_rate=0.001)
# Web crawler deduplication
url = "https://example.com/page/123"
if not bloom_check(r, "seen_urls", url):
# URL not seen — crawl it
crawl(url)
bloom_add(r, "seen_urls", url)
Real-world application: cache pollution prevention
A common problem with caches: one-time requests pollute the cache, evicting frequently-accessed entries. A Bloom filter prevents this by requiring an item to be requested at least twice before caching:
class AntiPollutionCache:
"""Cache that only stores items requested more than once."""
def __init__(self, redis_client: redis.Redis, bf_capacity: int = 1_000_000):
self.redis = redis_client
self.bf_key = "cache:seen"
setup_redis_bloom(redis_client, self.bf_key, bf_capacity)
def get_or_load(self, key: str, loader) -> any:
# Check cache
cached = self.redis.get(f"data:{key}")
if cached is not None:
return cached
# Load from source
value = loader()
# Only cache if we've seen this key before
already_seen = bloom_check(self.redis, self.bf_key, key)
bloom_add(self.redis, self.bf_key, key)
if already_seen:
self.redis.setex(f"data:{key}", 300, value)
return value
This technique is used by databases like LevelDB and RocksDB to avoid unnecessary disk reads for non-existent keys.
Performance characteristics
| Operation | Time complexity | Notes |
|---|---|---|
| Add | O(k) | k hash computations + k bit sets |
| Check | O(k) | k hash computations + k bit reads |
| Memory | O(m) | Fixed regardless of items added |
With k=7 and MurmurHash3, a single add/check takes ~200 nanoseconds in Python. The bottleneck is usually the hash function, not the bit operations.
When Bloom filters aren’t the answer
- Need exact membership — use a set or database.
- Need to list elements — Bloom filters are opaque; you can’t enumerate contents.
- Small datasets — if your set fits in memory as a Python
set, the complexity isn’t worth it. - Need deletion — use a counting Bloom filter or Cuckoo filter (which also supports deletion with better space efficiency).
The one thing to remember: Bloom filters give Python applications a memory-efficient way to say “definitely not here” — use them as a pre-filter before expensive operations, and size them based on your acceptable false-positive rate.
See Also
- Ci Cd Why big apps can ship updates every day without turning your phone into a glitchy mess — CI/CD is the behind-the-scenes quality gate and delivery truck.
- Containerization Why does software that works on your computer break on everyone else's? Containers fix that — and they're why Netflix can deploy 100 updates a day without the site going down.
- Python 310 New Features Python 3.10 gave programmers a shape-sorting machine, friendlier error messages, and cleaner ways to say 'this or that' in type hints.
- Python 311 New Features Python 3.11 made everything faster, error messages smarter, and let you catch several mistakes at once instead of stopping at the first one.
- Python 312 New Features Python 3.12 made type hints shorter, f-strings more powerful, and started preparing Python's engine for a world without the GIL.