A Bloom Filter is a probabilistic data structure that answers “is this element in the set?” with two possible answers: “definitely not” or “probably yes”. False positives are possible, false negatives are not. This makes it perfect for negative caching, token blacklists, and spam detection – cases where you need to quickly eliminate non-members before doing an expensive lookup.
How a Bloom Filter works
<?php
declare(strict_types=1);
class BloomFilter
{
private array $bitArray;
private int $size;
private int $hashCount;
public function __construct(int $expectedItems = 10000, float $falsePositiveRate = 0.01)
{
// Calculate optimal bit array size and hash count
$this->size = $this->calculateSize($expectedItems, $falsePositiveRate);
$this->hashCount = $this->calculateHashCount($this->size, $expectedItems);
$this->bitArray = array_fill(0, $this->size, 0);
}
// Add an item to the filter - O(k) where k is hash count
public function add(string $item): void
{
foreach ($this->getPositions($item) as $position) {
$this->bitArray[$position] = 1;
}
}
// Check if item might be in the set - O(k)
// Returns false = definitely not in the set
// Returns true = probably in the set (might be a false positive)
public function mightContain(string $item): bool
{
foreach ($this->getPositions($item) as $position) {
if ($this->bitArray[$position] === 0) {
return false; // definite miss
}
}
return true; // probable hit
}
private function getPositions(string $item): array
{
$positions = [];
for ($i = 0; $i < $this->hashCount; $i++) {
// Use different seeds to create k independent hash functions
$positions[] = abs(crc32($item . ':' . $i)) % $this->size;
}
return $positions;
}
private function calculateSize(int $n, float $p): int
{
return (int) ceil(-($n * log($p)) / (log(2) ** 2));
}
private function calculateHashCount(int $m, int $n): int
{
return max(1, (int) round(($m / $n) * log(2)));
}
public function getSize(): int { return $this->size; }
public function getHashCount(): int { return $this->hashCount; }
public function serialize(): string
{
return base64_encode(pack('N*', ...$this->bitArray));
}
public static function deserialize(string $data, int $expectedItems = 10000, float $fpr = 0.01): static
{
$filter = new static($expectedItems, $fpr);
$filter->bitArray = array_values(unpack('N*', base64_decode($data)));
return $filter;
}
}
// Demonstration
$filter = new BloomFilter(expectedItems: 1000, falsePositiveRate: 0.01);
$filter->add('user@example.com');
$filter->add('spam@badactor.com');
$filter->add('test@test.com');
echo $filter->mightContain('user@example.com'); // true (definitely in set)
echo $filter->mightContain('unknown@new.com'); // false (definitely not in set)
echo $filter->mightContain('spam@badactor.com'); // true (definitely in set)
Practical application: JWT token blacklist
<?php
declare(strict_types=1);
// Problem: When a user logs out, we want to invalidate their JWT token
// Checking every request against a DB table of revoked tokens = expensive
// Bloom Filter: fast in-memory check, only hit DB when filter says "probably yes"
class TokenBlacklist
{
private BloomFilter $filter;
public function __construct(
private \Magento\Framework\App\CacheInterface $cache,
private \Magento\Framework\App\ResourceConnection $resourceConnection
) {
$this->filter = $this->loadOrCreateFilter();
}
public function revoke(string $tokenJti): void
{
// Add to Bloom Filter (in-memory, fast)
$this->filter->add($tokenJti);
$this->saveFilter();
// Also persist to DB for accuracy (handles false positives on restart)
$this->resourceConnection->getConnection()->insert(
$this->resourceConnection->getTableName('vendor_revoked_tokens'),
['jti' => $tokenJti, 'revoked_at' => date('Y-m-d H:i:s')]
);
}
public function isRevoked(string $tokenJti): bool
{
// Fast path: Bloom Filter says "definitely not revoked"
if (!$this->filter->mightContain($tokenJti)) {
return false; // no false negatives - this is accurate
}
// Slow path: Bloom Filter says "might be revoked"
// Check the actual DB to eliminate false positives
$count = $this->resourceConnection->getConnection()->fetchOne(
'SELECT COUNT(*) FROM ' .
$this->resourceConnection->getTableName('vendor_revoked_tokens') .
' WHERE jti = ?',
[$tokenJti]
);
return $count > 0;
}
private function loadOrCreateFilter(): BloomFilter
{
$cached = $this->cache->load('token_blacklist_filter');
if ($cached) {
return BloomFilter::deserialize($cached, 100000);
}
// Bootstrap filter from DB on cold start
$filter = new BloomFilter(100000, 0.001);
$jtis = $this->resourceConnection->getConnection()->fetchCol(
'SELECT jti FROM ' .
$this->resourceConnection->getTableName('vendor_revoked_tokens')
);
foreach ($jtis as $jti) { $filter->add($jti); }
$this->cache->save($filter->serialize(), 'token_blacklist_filter', [], 3600);
return $filter;
}
private function saveFilter(): void
{
$this->cache->save($this->filter->serialize(), 'token_blacklist_filter', [], 3600);
}
}
Negative cache for product lookups
<?php
// Prevent "thundering herd" on non-existent SKU lookups
// Without filter: 1000 concurrent requests for non-existent SKU = 1000 DB queries
// With Bloom Filter: 1000 requests -> filter says "not in set" -> 0 DB queries
class ProductLookupService
{
private BloomFilter $existingSkuFilter;
public function __construct(
private \Magento\Catalog\Api\ProductRepositoryInterface $productRepository,
private BloomFilter $filter
) {
$this->existingSkuFilter = $filter;
}
public function getProductBySku(string $sku): ?\Magento\Catalog\Api\Data\ProductInterface
{
// Fast negative check
if (!$this->existingSkuFilter->mightContain($sku)) {
return null; // SKU definitely does not exist
}
// SKU might exist - do the actual lookup
try {
return $this->productRepository->get($sku);
} catch (\Magento\Framework\Exception\NoSuchEntityException $e) {
return null; // False positive from the filter - that's OK
}
}
}
Space efficiency
<?php
// Memory comparison for 1 million items at 1% false positive rate
$filter = new BloomFilter(1_000_000, 0.01);
echo "Bloom Filter bits: {$filter->getSize()}\n"; // ~9,585,059 bits = ~1.1 MB
echo "Hash functions: {$filter->getHashCount()}\n"; // 7
// Storing 1M strings of average 20 chars in a PHP array:
// 1,000,000 × ~80 bytes overhead = ~76 MB
// Bloom Filter: 1.1 MB vs plain array: 76 MB
// ~70x less memory for the existence check
Summary
Bloom Filter is a specialised tool for one specific problem: fast negative cache – definitively answering “this item is NOT in the set” without any storage or DB lookup. False positives are acceptable because they just fall through to a slower but accurate check. In PHP the pattern of Bloom Filter + DB fallback gives you near-zero DB load for non-members (the common case in token blacklisting and negative caching) while maintaining correctness.
