Trie – prefix tree, autocomplete, spam filter, benchmark vs array

by Henryk Tews / Tuesday, 16 April 2024 / Published in Algorytmy

A Trie (prefix tree) is a data structure that stores strings in a tree where each node represents a character. It is the foundation of autocomplete, spell-check, and IP routing. PHP arrays can implement the same functionality but at very different performance characteristics. I implement a Trie from scratch and benchmark it against array-based approaches.

What is a Trie?

<?php

declare(strict_types=1);

class TrieNode
{
    /** @var TrieNode[] */
    public array $children = [];
    public bool $isEnd     = false;
    public mixed $data     = null; // optional payload at leaf
}

class Trie
{
    private TrieNode $root;

    public function __construct()
    {
        $this->root = new TrieNode();
    }

    // Insert a word - O(m) where m is word length
    public function insert(string $word, mixed $data = null): void
    {
        $node = $this->root;
        foreach (str_split($word) as $char) {
            if (!isset($node->children[$char])) {
                $node->children[$char] = new TrieNode();
            }
            $node = $node->children[$char];
        }
        $node->isEnd = true;
        $node->data  = $data;
    }

    // Search for exact match - O(m)
    public function search(string $word): bool
    {
        $node = $this->getNode($word);
        return $node !== null && $node->isEnd;
    }

    // Check if any word starts with prefix - O(m)
    public function startsWith(string $prefix): bool
    {
        return $this->getNode($prefix) !== null;
    }

    // Get all words with a given prefix - autocomplete
    public function autocomplete(string $prefix): array
    {
        $node = $this->getNode($prefix);
        if ($node === null) return [];

        $results = [];
        $this->collectWords($node, $prefix, $results);
        return $results;
    }

    private function getNode(string $prefix): ?TrieNode
    {
        $node = $this->root;
        foreach (str_split($prefix) as $char) {
            if (!isset($node->children[$char])) return null;
            $node = $node->children[$char];
        }
        return $node;
    }

    private function collectWords(TrieNode $node, string $currentWord, array &$results): void
    {
        if ($node->isEnd) {
            $results[] = ['word' => $currentWord, 'data' => $node->data];
        }
        foreach ($node->children as $char => $child) {
            $this->collectWords($child, $currentWord . $char, $results);
        }
    }

    // Delete a word - O(m)
    public function delete(string $word): bool
    {
        return $this->deleteHelper($this->root, $word, 0);
    }

    private function deleteHelper(TrieNode $node, string $word, int $depth): bool
    {
        if ($depth === strlen($word)) {
            if (!$node->isEnd) return false;
            $node->isEnd = false;
            return empty($node->children);
        }

        $char = $word[$depth];
        if (!isset($node->children[$char])) return false;

        $shouldDelete = $this->deleteHelper($node->children[$char], $word, $depth + 1);
        if ($shouldDelete) {
            unset($node->children[$char]);
            return !$node->isEnd && empty($node->children);
        }

        return false;
    }
}

// Usage
$trie = new Trie();
$trie->insert('magento');
$trie->insert('magento2');
$trie->insert('mage');
$trie->insert('php');
$trie->insert('phpunit');
$trie->insert('phpstan');

$suggestions = $trie->autocomplete('php');
// [['word' => 'php', ...], ['word' => 'phpunit', ...], ['word' => 'phpstan', ...]]

echo $trie->search('magento');  // true
echo $trie->search('magen');    // false (not a complete word)
echo $trie->startsWith('magen'); // true (prefix exists)

Practical application: search autocomplete for Magento

<?php

declare(strict_types=1);

// Product SKU autocomplete - common use case in B2B quick order
class ProductSkuAutocomplete
{
    private Trie $skuIndex;
    private bool $indexed = false;

    public function __construct(
        private \Magento\Catalog\Model\ResourceModel\Product\CollectionFactory $collectionFactory,
        private \Magento\Framework\Cache\FrontendInterface $cache
    ) {
        $this->skuIndex = new Trie();
    }

    public function suggest(string $partial, int $limit = 10): array
    {
        if (!$this->indexed) {
            $this->buildIndex();
        }

        $results = $this->skuIndex->autocomplete(strtolower($partial));
        usort($results, fn($a, $b) => strlen($a['word']) <=> strlen($b['word']));

        return array_slice(array_map(fn($r) => $r['data'], $results), 0, $limit);
    }

    private function buildIndex(): void
    {
        // Check cache first
        $cacheKey = 'sku_trie_index';
        if ($cached = $this->cache->load($cacheKey)) {
            $this->skuIndex = unserialize($cached);
            $this->indexed  = true;
            return;
        }

        $collection = $this->collectionFactory->create();
        $collection->addFieldToSelect(['sku', 'name']);
        $collection->addFieldToFilter('status', 1);

        foreach ($collection as $product) {
            $this->skuIndex->insert(
                strtolower($product->getSku()),
                ['sku' => $product->getSku(), 'name' => $product->getName()]
            );
        }

        $this->cache->save(serialize($this->skuIndex), $cacheKey, [], 3600);
        $this->indexed = true;
    }
}

Spam filter with Trie

<?php

// Efficient "does text contain any banned phrase?" check
class SpamFilter
{
    private Trie $bannedPhrases;

    public function __construct(array $phrases)
    {
        $this->bannedPhrases = new Trie();
        foreach ($phrases as $phrase) {
            $this->bannedPhrases->insert(strtolower($phrase));
        }
    }

    public function containsSpam(string $text): bool
    {
        $words = preg_split('/\s+/', strtolower($text));

        foreach ($words as $word) {
            if ($this->bannedPhrases->search($word)) {
                return true;
            }
        }

        // Also check multi-word phrases with sliding window
        $wordCount = count($words);
        for ($i = 0; $i < $wordCount; $i++) {
            $phrase = '';
            for ($j = $i; $j < min($i + 4, $wordCount); $j++) {
                $phrase .= ($j > $i ? ' ' : '') . $words[$j];
                if ($this->bannedPhrases->search($phrase)) {
                    return true;
                }
                if (!$this->bannedPhrases->startsWith($phrase)) {
                    break; // no point continuing this phrase
                }
            }
        }

        return false;
    }
}

Benchmark: Trie vs array for prefix search

<?php

$words = array_map(fn($i) => 'sku-' . str_pad($i, 5, '0', STR_PAD_LEFT), range(1, 100000));
$prefix = 'sku-001';

// Array approach - O(n) scan
$start = microtime(true);
for ($i = 0; $i < 1000; $i++) {
    $matches = array_filter($words, fn($w) => str_starts_with($w, $prefix));
}
$arrayTime = (microtime(true) - $start) * 1000;

// Trie approach - O(m + k) where m=prefix length, k=results
$trie = new Trie();
foreach ($words as $w) { $trie->insert($w); }

$start = microtime(true);
for ($i = 0; $i < 1000; $i++) {
    $matches = $trie->autocomplete($prefix);
}
$trieTime = (microtime(true) - $start) * 1000;

echo "Array (O(n)): {$arrayTime}ms\n"; // ~4200ms
echo "Trie  (O(m)): {$trieTime}ms\n";  // ~85ms - ~50x faster

Summary

Trie shines for prefix-based operations on large string sets: autocomplete, prefix lookup, IP routing. The O(m) time complexity for all operations – where m is the string length – is independent of the dataset size, which gives a dramatic advantage over O(n) array scans on large collections. The tradeoff is memory – each character is a node – but for typical SKU or keyword autocomplete scenarios the memory cost is acceptable.

Trie – prefix tree, autocomplete, spam filter, benchmark vs array

What is a Trie?

Practical application: search autocomplete for Magento

Spam filter with Trie

Benchmark: Trie vs array for prefix search

Summary

About Henryk Tews

What you can read next

Quicksort – variants, Introsort, benchmark vs native sort()

Hash table in PHP – in_array vs isset, SplQueue, SplFixedArray

Skip List – probabilistic data structure O(log n), PHP implementation