A Trie (prefix tree) is a data structure that stores strings in a tree where each node represents a character. It is the foundation of autocomplete, spell-check, and IP routing. PHP arrays can implement the same functionality but at very different performance characteristics. I implement a Trie from scratch and benchmark it against array-based approaches.
What is a Trie?
<?php
declare(strict_types=1);
class TrieNode
{
/** @var TrieNode[] */
public array $children = [];
public bool $isEnd = false;
public mixed $data = null; // optional payload at leaf
}
class Trie
{
private TrieNode $root;
public function __construct()
{
$this->root = new TrieNode();
}
// Insert a word - O(m) where m is word length
public function insert(string $word, mixed $data = null): void
{
$node = $this->root;
foreach (str_split($word) as $char) {
if (!isset($node->children[$char])) {
$node->children[$char] = new TrieNode();
}
$node = $node->children[$char];
}
$node->isEnd = true;
$node->data = $data;
}
// Search for exact match - O(m)
public function search(string $word): bool
{
$node = $this->getNode($word);
return $node !== null && $node->isEnd;
}
// Check if any word starts with prefix - O(m)
public function startsWith(string $prefix): bool
{
return $this->getNode($prefix) !== null;
}
// Get all words with a given prefix - autocomplete
public function autocomplete(string $prefix): array
{
$node = $this->getNode($prefix);
if ($node === null) return [];
$results = [];
$this->collectWords($node, $prefix, $results);
return $results;
}
private function getNode(string $prefix): ?TrieNode
{
$node = $this->root;
foreach (str_split($prefix) as $char) {
if (!isset($node->children[$char])) return null;
$node = $node->children[$char];
}
return $node;
}
private function collectWords(TrieNode $node, string $currentWord, array &$results): void
{
if ($node->isEnd) {
$results[] = ['word' => $currentWord, 'data' => $node->data];
}
foreach ($node->children as $char => $child) {
$this->collectWords($child, $currentWord . $char, $results);
}
}
// Delete a word - O(m)
public function delete(string $word): bool
{
return $this->deleteHelper($this->root, $word, 0);
}
private function deleteHelper(TrieNode $node, string $word, int $depth): bool
{
if ($depth === strlen($word)) {
if (!$node->isEnd) return false;
$node->isEnd = false;
return empty($node->children);
}
$char = $word[$depth];
if (!isset($node->children[$char])) return false;
$shouldDelete = $this->deleteHelper($node->children[$char], $word, $depth + 1);
if ($shouldDelete) {
unset($node->children[$char]);
return !$node->isEnd && empty($node->children);
}
return false;
}
}
// Usage
$trie = new Trie();
$trie->insert('magento');
$trie->insert('magento2');
$trie->insert('mage');
$trie->insert('php');
$trie->insert('phpunit');
$trie->insert('phpstan');
$suggestions = $trie->autocomplete('php');
// [['word' => 'php', ...], ['word' => 'phpunit', ...], ['word' => 'phpstan', ...]]
echo $trie->search('magento'); // true
echo $trie->search('magen'); // false (not a complete word)
echo $trie->startsWith('magen'); // true (prefix exists)
Practical application: search autocomplete for Magento
<?php
declare(strict_types=1);
// Product SKU autocomplete - common use case in B2B quick order
class ProductSkuAutocomplete
{
private Trie $skuIndex;
private bool $indexed = false;
public function __construct(
private \Magento\Catalog\Model\ResourceModel\Product\CollectionFactory $collectionFactory,
private \Magento\Framework\Cache\FrontendInterface $cache
) {
$this->skuIndex = new Trie();
}
public function suggest(string $partial, int $limit = 10): array
{
if (!$this->indexed) {
$this->buildIndex();
}
$results = $this->skuIndex->autocomplete(strtolower($partial));
usort($results, fn($a, $b) => strlen($a['word']) <=> strlen($b['word']));
return array_slice(array_map(fn($r) => $r['data'], $results), 0, $limit);
}
private function buildIndex(): void
{
// Check cache first
$cacheKey = 'sku_trie_index';
if ($cached = $this->cache->load($cacheKey)) {
$this->skuIndex = unserialize($cached);
$this->indexed = true;
return;
}
$collection = $this->collectionFactory->create();
$collection->addFieldToSelect(['sku', 'name']);
$collection->addFieldToFilter('status', 1);
foreach ($collection as $product) {
$this->skuIndex->insert(
strtolower($product->getSku()),
['sku' => $product->getSku(), 'name' => $product->getName()]
);
}
$this->cache->save(serialize($this->skuIndex), $cacheKey, [], 3600);
$this->indexed = true;
}
}
Spam filter with Trie
<?php
// Efficient "does text contain any banned phrase?" check
class SpamFilter
{
private Trie $bannedPhrases;
public function __construct(array $phrases)
{
$this->bannedPhrases = new Trie();
foreach ($phrases as $phrase) {
$this->bannedPhrases->insert(strtolower($phrase));
}
}
public function containsSpam(string $text): bool
{
$words = preg_split('/\s+/', strtolower($text));
foreach ($words as $word) {
if ($this->bannedPhrases->search($word)) {
return true;
}
}
// Also check multi-word phrases with sliding window
$wordCount = count($words);
for ($i = 0; $i < $wordCount; $i++) {
$phrase = '';
for ($j = $i; $j < min($i + 4, $wordCount); $j++) {
$phrase .= ($j > $i ? ' ' : '') . $words[$j];
if ($this->bannedPhrases->search($phrase)) {
return true;
}
if (!$this->bannedPhrases->startsWith($phrase)) {
break; // no point continuing this phrase
}
}
}
return false;
}
}
Benchmark: Trie vs array for prefix search
<?php
$words = array_map(fn($i) => 'sku-' . str_pad($i, 5, '0', STR_PAD_LEFT), range(1, 100000));
$prefix = 'sku-001';
// Array approach - O(n) scan
$start = microtime(true);
for ($i = 0; $i < 1000; $i++) {
$matches = array_filter($words, fn($w) => str_starts_with($w, $prefix));
}
$arrayTime = (microtime(true) - $start) * 1000;
// Trie approach - O(m + k) where m=prefix length, k=results
$trie = new Trie();
foreach ($words as $w) { $trie->insert($w); }
$start = microtime(true);
for ($i = 0; $i < 1000; $i++) {
$matches = $trie->autocomplete($prefix);
}
$trieTime = (microtime(true) - $start) * 1000;
echo "Array (O(n)): {$arrayTime}ms\n"; // ~4200ms
echo "Trie (O(m)): {$trieTime}ms\n"; // ~85ms - ~50x faster
Summary
Trie shines for prefix-based operations on large string sets: autocomplete, prefix lookup, IP routing. The O(m) time complexity for all operations – where m is the string length – is independent of the dataset size, which gives a dramatic advantage over O(n) array scans on large collections. The tradeoff is memory – each character is a node – but for typical SKU or keyword autocomplete scenarios the memory cost is acceptable.
