Ever wondered how Google finds what you're looking for in milliseconds? Or how Wikipedia's search instantly surfaces the right article? It's all powered by full-text search — a technique that transforms messy, unstructured text into something computers can query efficiently. Let's build one from scratch.
How Search Actually Works
At its heart, every search engine does two things: it pre-processes documents once (indexing), then answers queries super fast using that pre-built index. The trick is doing the heavy lifting upfront so searches feel instant.
Turning Text Into Searchable Tokens
Try searching for "running cats" in a document that says "The cat runs fast." A simple string match would fail — "running" ≠ "runs" and "cats" ≠ "cat". We need to normalize text so semantically similar words match.
Here's the pipeline we use:
| Stage | What Happens | Example |
|---|---|---|
| Tokenization | Split into words | "The cat runs fast." → ["The", "cat", "runs", "fast"] |
| Lowercasing | Make it case-insensitive | ["the", "cat", "runs", "fast"] |
| Stopword Removal | Drop common words | ["cat", "runs", "fast"] |
| Stemming | Reduce to root form | ["cat", "run", "fast"] |
Now "running cats" and "The cat runs fast" both become ["run", "cat"] — they match!
The Magic: Inverted Index
Instead of storing documents as-is, we flip the relationship. Rather than recording "Document 1 contains [cat, dog]", we store:
"cat" → [1, 3, 7, 12] (documents containing "cat")
"dog" → [2, 3, 9] (documents containing "dog")
Now finding documents with "cat" is just a hash lookup — O(1) instead of scanning everything.
Multi-word searches? Easy. For "cat AND dog":
- Get both posting lists: [1, 3, 7, 12] and [2, 3, 9]
- Find the common IDs: [3]
- Document 3 contains both words
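In Python, that intersection is a one-liner — a toy sketch of the idea (the real index below uses a two-pointer merge over sorted lists instead of sets):

```python
cat_docs = [1, 3, 7, 12]
dog_docs = [2, 3, 9]
matches = sorted(set(cat_docs) & set(dog_docs))
print(matches)  # [3]
```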
Let's Build It
Data Model
```python
from dataclasses import dataclass

@dataclass
class Document:
    title: str
    url: str
    text: str
    id: int = 0
```
Text Analysis
```python
import string
from typing import List, Set
from collections import defaultdict

try:
    from nltk.stem import PorterStemmer
    nltk_available = True
except ImportError:
    nltk_available = False

class TextAnalyzer:
    STOP_WORDS: Set[str] = {
        'a', 'an', 'and', 'be', 'have', 'i', 'in', 'of', 'that', 'the', 'to',
        'is', 'it', 'for', 'with', 'as', 'on', 'at', 'by', 'from'
    }

    def __init__(self):
        self.stemmer = PorterStemmer() if nltk_available else None

    def tokenize(self, text: str) -> List[str]:
        # Lowercase and replace punctuation with spaces before splitting
        text = text.lower()
        for char in string.punctuation:
            text = text.replace(char, ' ')
        return [t for t in text.split() if t]

    def analyze(self, text: str) -> List[str]:
        # Full pipeline: tokenize -> drop stopwords -> stem (if NLTK is available)
        tokens = self.tokenize(text)
        tokens = [t for t in tokens if t not in self.STOP_WORDS]
        if self.stemmer:
            tokens = [self.stemmer.stem(t) for t in tokens]
        return tokens
```
The stemmer here is the classic Porter algorithm from 1980 — it strips suffixes like "-ing", "-ed", "-s" to get word roots.
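Putting the pipeline together on the earlier example (outputs assume NLTK is installed; without it the tokens are simply left unstemmed):

```python
analyzer = TextAnalyzer()
print(analyzer.analyze("The cat runs fast."))  # ['cat', 'run', 'fast']
print(analyzer.analyze("running cats"))        # ['run', 'cat']
```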
The Index Itself
```python
from typing import Dict, List, Optional, Set

class InvertedIndex:
    def __init__(self):
        self.index: Dict[str, List[int]] = defaultdict(list)
        self.analyzer = TextAnalyzer()
        self.documents: Dict[int, Document] = {}

    def add_documents(self, documents: List[Document]):
        for doc in documents:
            self.documents[doc.id] = doc
            tokens = self.analyzer.analyze(doc.text)
            for token in tokens:
                # Append each doc id only once per token; posting lists stay sorted
                # because documents are added in increasing id order
                if not self.index[token] or self.index[token][-1] != doc.id:
                    self.index[token].append(doc.id)

    def search(self, query: str) -> List[int]:
        """AND search - all terms must match"""
        tokens = self.analyzer.analyze(query)
        if not tokens:
            return []
        result = None
        for token in tokens:
            if token not in self.index:
                return []
            if result is None:
                result = self.index[token][:]
            else:
                result = self._intersection(result, self.index[token])
        return result if result else []

    def _intersection(self, a: List[int], b: List[int]) -> List[int]:
        """Two-pointer merge for O(n+m) intersection of sorted lists"""
        result = []
        i = j = 0
        while i < len(a) and j < len(b):
            if a[i] < b[j]:
                i += 1
            elif a[i] > b[j]:
                j += 1
            else:
                result.append(a[i])
                i += 1
                j += 1
        return result
```
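Here's a quick usage sketch with a few toy documents (illustrative only — the Wikipedia loader comes next; outputs assume the NLTK stemmer is available):

```python
docs = [
    Document(title="Cats", url="", text="The cat runs fast", id=0),
    Document(title="Dogs", url="", text="Dogs run in the park", id=1),
    Document(title="Fish", url="", text="Fish swim slowly", id=2),
]
index = InvertedIndex()
index.add_documents(docs)
print(index.search("running cats"))  # [0] - only document 0 contains both "run" and "cat"
print(index.search("run"))           # [0, 1]
```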
Loading Real Wikipedia Data
Wikipedia publishes "stub articles" — files with all 7+ million article titles and metadata, but without the full article text. Perfect for building a title search engine, and much smaller than full dumps (~600MB vs 50GB+).
Grab one of the 27 parts:
```bash
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-stub-articles14.xml.gz
gunzip enwiki-latest-stub-articles14.xml.gz
```
These files have a quirk: the `<text>` element is empty. The actual content lives in external storage. So we index article titles instead — which works great for finding articles by name.
```python
import xml.etree.ElementTree as ET
from typing import List

def load_documents(path: str) -> List[Document]:
    tree = ET.parse(path)
    root = tree.getroot()
    # Handle the XML namespace - required for Wikipedia dumps
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    pages = root.findall('mw:page', ns) or root.findall('.//page')
    documents = []
    for i, page in enumerate(pages):
        title = page.findtext('mw:title', '', ns) or page.findtext('title', '')
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        # Stub articles have no content text - use the title as the searchable content
        documents.append(Document(
            title=title,
            url=url,
            text=title,
            id=i
        ))
    return documents
```
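Loading the file from the wget step above then looks like this (the filename matches that download):

```python
docs = load_documents("enwiki-latest-stub-articles14.xml")
print(len(docs))                   # number of <page> entries in the dump
print(docs[0].title, docs[0].url)  # first article title and its Wikipedia URL
```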
Performance
Running this on enwiki-latest-stub-articles14.xml (~600MB, ~780K articles):
| Operation | Time |
|---|---|
| XML Parsing | ~28 seconds |
| Index Building | ~1.6 seconds |
| Search | <1 millisecond |
| Memory (index) | ~50MB |
That sub-millisecond search time holds up well as the collection grows — a query only walks the posting lists for its own terms, never the whole document set.
Full Working Code
```python
import xml.etree.ElementTree as ET
import time
import sys
from dataclasses import dataclass
from typing import List, Dict, Optional
from collections import defaultdict

try:
    from nltk.stem import PorterStemmer
    nltk_available = True
except ImportError:
    nltk_available = False


@dataclass
class Document:
    title: str
    url: str
    text: str
    id: int = 0


class TextAnalyzer:
    STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

    def __init__(self):
        self.stemmer = PorterStemmer() if nltk_available else None

    def analyze(self, text: str) -> List[str]:
        text = text.lower()
        for char in '.,!?;:"()[]{}':
            text = text.replace(char, ' ')
        tokens = [t for t in text.split() if t and t not in self.STOP_WORDS]
        if self.stemmer:
            tokens = [self.stemmer.stem(t) for t in tokens]
        return tokens


class InvertedIndex:
    def __init__(self):
        self.index: Dict[str, List[int]] = defaultdict(list)
        self.analyzer = TextAnalyzer()
        self.documents: Dict[int, Document] = {}

    def add_documents(self, documents: List[Document]):
        for doc in documents:
            self.documents[doc.id] = doc
            for token in self.analyzer.analyze(doc.text):
                if not self.index[token] or self.index[token][-1] != doc.id:
                    self.index[token].append(doc.id)

    def search(self, query: str) -> List[int]:
        tokens = self.analyzer.analyze(query)
        if not tokens:
            return []
        result = None
        for token in tokens:
            if token not in self.index:
                return []
            if result is None:
                result = self.index[token][:]
            else:
                token_docs = set(self.index[token])  # build the set once, not per element
                result = [d for d in result if d in token_docs]
        return result if result else []

    def get_document(self, doc_id: int) -> Optional[Document]:
        return self.documents.get(doc_id)


def load_documents(path: str) -> List[Document]:
    tree = ET.parse(path)
    root = tree.getroot()
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.11/'}
    pages = root.findall('mw:page', ns) or root.findall('.//page')
    documents = []
    for i, page in enumerate(pages):
        title = page.findtext('mw:title', '', ns) or page.findtext('title', '')
        url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        documents.append(Document(title=title, url=url, text=title, id=i))
    return documents


def main():
    if len(sys.argv) != 2:
        print("Usage: python search.py <path-to-wikipedia-xml>")
        print("\nDownload Wikipedia stub articles:")
        print("  wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-stub-articles14.xml.gz")
        print("  gunzip enwiki-latest-stub-articles14.xml.gz")
        sys.exit(1)

    xml_path = sys.argv[1]
    print(f"Loading documents from: {xml_path}")
    start = time.time()
    documents = load_documents(xml_path)
    print(f"Loaded {len(documents)} documents in {time.time()-start:.2f}s")

    print("Building index...")
    start = time.time()
    index = InvertedIndex()
    index.add_documents(documents)
    print(f"Index built in {time.time()-start:.2f}s")
    print(f"Index contains {len(index.index)} unique tokens")

    while True:
        query = input("\nSearch (or 'quit'): ").strip()
        if query.lower() == 'quit':
            break
        start = time.time()
        results = index.search(query)
        elapsed = (time.time() - start) * 1000
        print(f"Found {len(results)} results in {elapsed:.2f}ms")
        for doc_id in results[:10]:
            doc = index.get_document(doc_id)
            print(f"  [{doc_id}] {doc.title}")


if __name__ == "__main__":
    main()
```
Running It
```bash
pip install nltk
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-stub-articles14.xml.gz
gunzip enwiki-latest-stub-articles14.xml.gz
python search.py enwiki-latest-stub-articles14.xml
```
Where to Go From Here
This is Boolean search — results either match or they don't. Real search engines add ranking:
- TF-IDF: Score by how rare/important words are (see the sketch below)
- BM25: Probabilistic scoring used by Elasticsearch
- Vector search: Semantic similarity using embeddings
But the inverted index remains the foundation everything else builds on.
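To give a flavor of ranking, here's a minimal TF-IDF sketch layered on top of the index above. The helper name `rank_tf_idf` is hypothetical, not part of the code you just ran, and a real implementation would precompute term frequencies at index time rather than re-analyzing each matching document per query:

```python
import math
from collections import Counter
from typing import List

def rank_tf_idf(index: InvertedIndex, query: str) -> List[int]:
    """Rank the Boolean matches by a simple TF-IDF score (toy version)."""
    tokens = index.analyzer.analyze(query)
    matches = index.search(query)          # Boolean AND matches from the index above
    n_docs = len(index.documents)
    scores = {}
    for doc_id in matches:
        doc_tokens = index.analyzer.analyze(index.documents[doc_id].text)
        counts = Counter(doc_tokens)
        score = 0.0
        for t in tokens:
            tf = counts[t] / max(len(doc_tokens), 1)   # term frequency in this document
            df = len(index.index.get(t, []))           # documents containing the term
            idf = math.log(1 + n_docs / (1 + df))      # rarer terms weigh more
            score += tf * idf
        scores[doc_id] = score
    return sorted(matches, key=lambda d: scores[d], reverse=True)
```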

