Introduction
Keyword density analysis is a fundamental SEO technique that reveals how often specific words and phrases appear in a piece of content. In this technical guide, we'll build a keyword density checker from scratch, exploring the algorithms, data structures, and optimization techniques involved.
Understanding Keyword Density
Keyword density is calculated as:
Keyword Density = (Number of keyword occurrences / Total words) × 100
For example, if "machine learning" appears 15 times in a 1,000-word article, the keyword density is 1.5%.
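The same calculation in code, using the numbers from that example:
const density = (15 / 1000) * 100;
console.log(`${density}%`); // 1.5%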
But modern keyword density checkers do much more:
- Analyze 1-word, 2-word, and 3-word phrases
- Filter stop words
- Calculate TF-IDF scores
- Identify semantic variations
The Algorithm: Step-by-Step
Step 1: Text Preprocessing
function preprocessText(text) {
  // Convert to lowercase for case-insensitive analysis
  text = text.toLowerCase();
  // Remove HTML tags
  text = text.replace(/<[^>]*>/g, ' ');
  // Remove special characters but keep spaces and hyphens
  text = text.replace(/[^\w\s-]/g, ' ');
  // Normalize whitespace
  text = text.replace(/\s+/g, ' ').trim();
  return text;
}
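For instance, running the function on a small hypothetical HTML snippet strips the markup and punctuation:
const raw = '<p>Machine Learning, in practice, is powerful!</p>';
console.log(preprocessText(raw));
// "machine learning in practice is powerful"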
Step 2: Tokenization
function tokenize(text) {
  // Split text into words
  const words = text.split(/\s+/);
  // Filter empty strings
  return words.filter(word => word.length > 0);
}
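Chaining the two steps on the same snippet:
const tokens = tokenize(preprocessText('<p>Machine Learning, in practice, is powerful!</p>'));
// ['machine', 'learning', 'in', 'practice', 'is', 'powerful']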
Step 3: Stop Word Filtering
const STOP_WORDS = new Set([
  'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
  'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
  'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
  'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
]);
function filterStopWords(words, removeStopWords = true) {
  if (!removeStopWords) {
    return words;
  }
  return words.filter(word => !STOP_WORDS.has(word));
}
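Applied to the tokens from the previous step:
const filtered = filterStopWords(['machine', 'learning', 'in', 'practice', 'is', 'powerful']);
// ['machine', 'learning', 'practice', 'powerful']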
Step 4: N-Gram Generation
function generateNGrams(words, n) {
  const ngrams = [];
  for (let i = 0; i <= words.length - n; i++) {
    const ngram = words.slice(i, i + n).join(' ');
    ngrams.push(ngram);
  }
  return ngrams;
}
// Usage
const words = ['machine', 'learning', 'is', 'powerful'];
const bigrams = generateNGrams(words, 2);
// Result: ['machine learning', 'learning is', 'is powerful']
const trigrams = generateNGrams(words, 3);
// Result: ['machine learning is', 'learning is powerful']
Step 5: Frequency Counting
function countFrequency(items) {
  const frequency = new Map();
  for (const item of items) {
    frequency.set(item, (frequency.get(item) || 0) + 1);
  }
  return frequency;
}
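A quick example with a made-up token list:
const freq = countFrequency(['seo', 'content', 'seo', 'keywords', 'seo']);
// Map { 'seo' => 3, 'content' => 1, 'keywords' => 1 }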
Step 6: Density Calculation
function calculateDensity(frequency, totalWords) {
  const densityMap = new Map();
  for (const [word, count] of frequency) {
    const density = (count / totalWords) * 100;
    densityMap.set(word, {
      count,
      density: parseFloat(density.toFixed(2))
    });
  }
  return densityMap;
}
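Using a small frequency map and a hypothetical total of 50 words:
const densities = calculateDensity(new Map([['seo', 3], ['content', 1]]), 50);
// Map { 'seo' => { count: 3, density: 6 }, 'content' => { count: 1, density: 2 } }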
Step 7: Sorting and Filtering
function sortByFrequency(densityMap, minOccurrences = 2) {
  // Convert Map to array of [word, data] pairs
  const entries = Array.from(densityMap.entries());
  // Filter by minimum occurrences
  const filtered = entries.filter(([word, data]) => data.count >= minOccurrences);
  // Sort by frequency (descending)
  filtered.sort((a, b) => b[1].count - a[1].count);
  return filtered;
}
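With a made-up density map, entries below the minimum occurrence threshold are dropped and the rest come back sorted:
const densityMap = new Map([
  ['seo', { count: 3, density: 6 }],
  ['content', { count: 1, density: 2 }],
  ['keywords', { count: 2, density: 4 }]
]);
console.log(sortByFrequency(densityMap));
// [ ['seo', { count: 3, density: 6 }], ['keywords', { count: 2, density: 4 }] ]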
Complete Implementation
class KeywordDensityChecker {
  constructor(options = {}) {
    this.options = {
      removeStopWords: options.removeStopWords !== false,
      minWordLength: options.minWordLength || 2,
      minOccurrences: options.minOccurrences || 2,
      maxResults: options.maxResults || 50
    };
    this.stopWords = new Set([
      'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
      'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
      'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
      'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
    ]);
  }
  analyze(text) {
    // Preprocess
    const processed = this.preprocessText(text);
    // Tokenize
    const allWords = this.tokenize(processed);
    // Filter stop words
    const words = this.filterStopWords(allWords);
    // Calculate total words (for density calculation)
    const totalWords = words.length;
    // Analyze 1-word keywords
    const oneWordKeywords = this.analyzeNGrams(words, 1, totalWords);
    // Analyze 2-word phrases
    const twoWordKeywords = this.analyzeNGrams(words, 2, totalWords);
    // Analyze 3-word phrases
    const threeWordKeywords = this.analyzeNGrams(words, 3, totalWords);
    return {
      totalWords: allWords.length,
      totalWordsExcludingStopWords: totalWords,
      oneWordKeywords,
      twoWordKeywords,
      threeWordKeywords,
      timestamp: new Date().toISOString()
    };
  }
  preprocessText(text) {
    text = text.toLowerCase();
    text = text.replace(/<[^>]*>/g, ' ');
    text = text.replace(/[^\w\s-]/g, ' ');
    text = text.replace(/\s+/g, ' ').trim();
    return text;
  }
  tokenize(text) {
    return text.split(/\s+/).filter(word => word.length >= this.options.minWordLength);
  }
  filterStopWords(words) {
    if (!this.options.removeStopWords) {
      return words;
    }
    return words.filter(word => !this.stopWords.has(word));
  }
  analyzeNGrams(words, n, totalWords) {
    // Generate n-grams
    const ngrams = this.generateNGrams(words, n);
    // Count frequency
    const frequency = this.countFrequency(ngrams);
    // Calculate density
    const density = this.calculateDensity(frequency, totalWords);
    // Sort and filter
    const sorted = this.sortByFrequency(density);
    // Limit results
    return sorted.slice(0, this.options.maxResults);
  }
  generateNGrams(words, n) {
    const ngrams = [];
    for (let i = 0; i <= words.length - n; i++) {
      ngrams.push(words.slice(i, i + n).join(' '));
    }
    return ngrams;
  }
  countFrequency(items) {
    const frequency = new Map();
    for (const item of items) {
      frequency.set(item, (frequency.get(item) || 0) + 1);
    }
    return frequency;
  }
  calculateDensity(frequency, totalWords) {
    const densityMap = new Map();
    for (const [word, count] of frequency) {
      if (count >= this.options.minOccurrences) {
        const density = (count / totalWords) * 100;
        densityMap.set(word, {
          count,
          density: parseFloat(density.toFixed(2))
        });
      }
    }
    return densityMap;
  }
  sortByFrequency(densityMap) {
    const entries = Array.from(densityMap.entries());
    entries.sort((a, b) => b[1].count - a[1].count);
    return entries.map(([word, data]) => ({ word, ...data }));
  }
}
// Usage
const checker = new KeywordDensityChecker({
  removeStopWords: true,
  minWordLength: 2,
  minOccurrences: 2,
  maxResults: 50
});
const text = `
Machine learning is a subset of artificial intelligence.
Machine learning algorithms learn from data.
The more data machine learning models have, the better they perform.
`;
const results = checker.analyze(text);
console.log('Total Words:', results.totalWords);
console.log('Top 1-Word Keywords:', results.oneWordKeywords.slice(0, 5));
console.log('Top 2-Word Keywords:', results.twoWordKeywords.slice(0, 5));
console.log('Top 3-Word Keywords:', results.threeWordKeywords.slice(0, 5));
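Each keyword entry has the shape { word, count, density } produced by sortByFrequency. For the sample text above, Total Words should log 24, and the top single-word entries should look roughly like this (density is relative to the 17 words left after stop-word filtering):
// { word: 'machine', count: 3, density: 17.65 }
// { word: 'learning', count: 3, density: 17.65 }
// { word: 'data', count: 2, density: 11.76 }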
Advanced Features
TF-IDF Scoring
TF-IDF (Term Frequency-Inverse Document Frequency) reveals which terms are important to a document relative to a corpus.
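For a term t in document d, with N documents in the corpus and DF(t) the number of documents containing t, the score computed below is:
TF-IDF(t, d) = (occurrences of t in d / total words in d) × ln(N / DF(t))
This is exactly what the calculator below computes (it uses the natural log via Math.log).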
class TFIDFCalculator {
  constructor(documents) {
    this.documents = documents;
    this.idfCache = new Map();
    this.calculateIDF();
  }
  calculateIDF() {
    const N = this.documents.length;
    const termDocCount = new Map();
    // Count how many documents contain each term
    for (const doc of this.documents) {
      const uniqueTerms = new Set(doc.toLowerCase().split(/\s+/));
      for (const term of uniqueTerms) {
        termDocCount.set(term, (termDocCount.get(term) || 0) + 1);
      }
    }
    // Calculate IDF for each term
    for (const [term, docCount] of termDocCount) {
      const idf = Math.log(N / docCount);
      this.idfCache.set(term, idf);
    }
  }
  calculateTFIDF(document) {
    const words = document.toLowerCase().split(/\s+/);
    const totalWords = words.length;
    const termFreq = new Map();
    // Calculate TF
    for (const word of words) {
      termFreq.set(word, (termFreq.get(word) || 0) + 1);
    }
    // Calculate TF-IDF
    const tfidf = new Map();
    for (const [term, freq] of termFreq) {
      const tf = freq / totalWords;
      const idf = this.idfCache.get(term) || 0;
      tfidf.set(term, tf * idf);
    }
    // Sort by TF-IDF score
    const sorted = Array.from(tfidf.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([term, score]) => ({
        term,
        score: parseFloat(score.toFixed(4))
      }));
    return sorted;
  }
}
// Usage
const documents = [
  "Machine learning is a subset of artificial intelligence",
  "Deep learning is a subset of machine learning",
  "Natural language processing uses machine learning"
];
const tfidf = new TFIDFCalculator(documents);
const scores = tfidf.calculateTFIDF(documents[0]);
console.log('TF-IDF Scores:', scores.slice(0, 5));
Keyword Prominence
Keyword prominence measures where keywords appear in the content (title, headers, body, etc.).
function calculateKeywordProminence(text, keyword) {
  const lowerText = text.toLowerCase();
  const lowerKeyword = keyword.toLowerCase();
  // Escape regex metacharacters so keywords like "c++" don't break the match
  const escapedKeyword = lowerKeyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Extract sections
  const title = extractTitle(text);
  const headers = extractHeaders(text);
  const firstParagraph = extractFirstParagraph(text);
  const lastParagraph = extractLastParagraph(text);
  const prominence = {
    inTitle: title.toLowerCase().includes(lowerKeyword),
    inHeaders: headers.some(h => h.toLowerCase().includes(lowerKeyword)),
    inFirstParagraph: firstParagraph.toLowerCase().includes(lowerKeyword),
    inLastParagraph: lastParagraph.toLowerCase().includes(lowerKeyword),
    totalOccurrences: (lowerText.match(new RegExp(escapedKeyword, 'g')) || []).length
  };
  // Calculate prominence score (weighted)
  let score = 0;
  if (prominence.inTitle) score += 40;
  if (prominence.inHeaders) score += 20;
  if (prominence.inFirstParagraph) score += 20;
  if (prominence.inLastParagraph) score += 10;
  if (prominence.totalOccurrences > 0) score += 10;
  prominence.score = score;
  return prominence;
}
function extractTitle(text) {
  const match = text.match(/^#\s+(.+)$/m);
  return match ? match[1] : '';
}
function extractHeaders(text) {
  const matches = text.match(/^#{2,6}\s+(.+)$/gm);
  return matches ? matches.map(m => m.replace(/^#+\s+/, '')) : [];
}
function extractFirstParagraph(text) {
  const paragraphs = text.split(/\n\n+/);
  return paragraphs[0] || '';
}
function extractLastParagraph(text) {
  const paragraphs = text.split(/\n\n+/);
  return paragraphs[paragraphs.length - 1] || '';
}
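A quick example against a small, made-up markdown document (note the extractors assume markdown-style # headings):
const article = [
  '# Machine Learning Basics',
  'Machine learning lets systems learn from data.',
  '## Why it matters',
  'Machine learning is worth learning.'
].join('\n\n');
console.log(calculateKeywordProminence(article, 'machine learning'));
// { inTitle: true, inHeaders: false, inFirstParagraph: true,
//   inLastParagraph: true, totalOccurrences: 3, score: 80 }
// (the "first paragraph" here is the title line itself, since paragraphs are split on blank lines)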
Semantic Similarity
Detect semantic variations using cosine similarity:
function cosineSimilarity(vec1, vec2) {
  let dotProduct = 0;
  let mag1 = 0;
  let mag2 = 0;
  const allKeys = new Set([...Object.keys(vec1), ...Object.keys(vec2)]);
  for (const key of allKeys) {
    const val1 = vec1[key] || 0;
    const val2 = vec2[key] || 0;
    dotProduct += val1 * val2;
    mag1 += val1 * val1;
    mag2 += val2 * val2;
  }
  mag1 = Math.sqrt(mag1);
  mag2 = Math.sqrt(mag2);
  if (mag1 === 0 || mag2 === 0) return 0;
  return dotProduct / (mag1 * mag2);
}
function findSemanticVariations(targetKeyword, allKeywords, threshold = 0.5) {
  const targetVec = createWordVector(targetKeyword);
  const variations = [];
  for (const keyword of allKeywords) {
    if (keyword === targetKeyword) continue;
    const keywordVec = createWordVector(keyword);
    const similarity = cosineSimilarity(targetVec, keywordVec);
    if (similarity >= threshold) {
      variations.push({ keyword, similarity });
    }
  }
  return variations.sort((a, b) => b.similarity - a.similarity);
}
function createWordVector(text) {
  const words = text.toLowerCase().split(/\s+/);
  const vector = {};
  for (const word of words) {
    vector[word] = (vector[word] || 0) + 1;
  }
  return vector;
}
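For example, with these unigram bag-of-words vectors, phrases that share words score above the threshold (a rough lexical proxy for semantic closeness rather than true semantics). The candidate phrases here are made up:
const variations = findSemanticVariations('machine learning', [
  'machine learning models',
  'deep learning',
  'keyword density'
]);
console.log(variations);
// [ { keyword: 'machine learning models', similarity: ≈0.82 },
//   { keyword: 'deep learning', similarity: 0.5 } ]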
Optimization Techniques
1. Memoization
Cache results for repeated analyses:
class CachedKeywordDensityChecker extends KeywordDensityChecker {
  constructor(options) {
    super(options);
    this.cache = new Map();
  }
  analyze(text) {
    const hash = this.hashText(text);
    if (this.cache.has(hash)) {
      return this.cache.get(hash);
    }
    const results = super.analyze(text);
    this.cache.set(hash, results);
    return results;
  }
  hashText(text) {
    // Simple hash function
    let hash = 0;
    for (let i = 0; i < text.length; i++) {
      const char = text.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return hash;
  }
}
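Calling analyze twice with the same input (reusing the sample text from earlier) returns the exact same cached object:
const cached = new CachedKeywordDensityChecker({ removeStopWords: true });
const first = cached.analyze(text);
const second = cached.analyze(text);
console.log(first === second); // true, the second call is served from the cache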
2. Web Workers
Offload heavy processing to Web Workers:
// worker.js
// Note: a worker has its own global scope, so KeywordDensityChecker must be
// loaded here as well, e.g. via importScripts('keyword-density-checker.js')
// (the filename is whatever your build produces).
self.onmessage = function(e) {
  const { text, options } = e.data;
  const checker = new KeywordDensityChecker(options);
  const results = checker.analyze(text);
  self.postMessage(results);
};
// main.js
const worker = new Worker('worker.js');
worker.onmessage = function(e) {
const results = e.data;
console.log('Analysis complete:', results);
};
worker.postMessage({
text: document.body.innerText,
options: { removeStopWords: true }
});
3. Streaming Analysis
For very large texts, use streaming:
async function* streamAnalyze(text, chunkSize = 1000) {
  const words = text.split(/\s+/);
  for (let i = 0; i < words.length; i += chunkSize) {
    const chunk = words.slice(i, i + chunkSize);
    const chunkText = chunk.join(' ');
    const checker = new KeywordDensityChecker();
    const results = checker.analyze(chunkText);
    yield {
      progress: ((i + chunk.length) / words.length) * 100,
      results
    };
  }
}
// Usage
// (run inside an async function, or use top-level await in an ES module)
for await (const { progress, results } of streamAnalyze(largeText)) {
  console.log(`Progress: ${progress.toFixed(2)}%`);
  console.log('Chunk results:', results);
}
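One caveat: each yielded result is relative to its own chunk, so per-chunk densities can't simply be averaged, and phrases that span a chunk boundary are missed. A minimal sketch of one way to combine chunks, assuming you only need single-word totals and accepting that each chunk's minOccurrences and maxResults settings make the merged counts approximate:
function mergeChunkResults(chunkResults) {
  const totals = new Map();
  let totalWords = 0;
  for (const results of chunkResults) {
    totalWords += results.totalWordsExcludingStopWords;
    for (const { word, count } of results.oneWordKeywords) {
      totals.set(word, (totals.get(word) || 0) + count);
    }
  }
  // Recompute density against the combined word count
  return Array.from(totals.entries())
    .map(([word, count]) => ({
      word,
      count,
      density: parseFloat(((count / totalWords) * 100).toFixed(2))
    }))
    .sort((a, b) => b.count - a.count);
}
// Collect per-chunk results, then merge
const chunkResults = [];
for await (const { results } of streamAnalyze(largeText)) {
  chunkResults.push(results);
}
console.log(mergeChunkResults(chunkResults).slice(0, 10));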
Best Practices
1. Handle Edge Cases
function safeAnalyze(text) {
  // Handle empty text
  if (!text || text.trim().length === 0) {
    return {
      error: 'Text is empty',
      totalWords: 0
    };
  }
  // Handle very short text
  if (text.split(/\s+/).length < 10) {
    return {
      warning: 'Text is too short for meaningful analysis',
      totalWords: text.split(/\s+/).length
    };
  }
  // Proceed with analysis
  const checker = new KeywordDensityChecker();
  return checker.analyze(text);
}
2. Validate Input
function validateOptions(options) {
  const defaults = {
    removeStopWords: true,
    minWordLength: 2,
    minOccurrences: 2,
    maxResults: 50
  };
  const validated = { ...defaults, ...options };
  // Validate ranges
  validated.minWordLength = Math.max(1, Math.min(validated.minWordLength, 10));
  validated.minOccurrences = Math.max(1, Math.min(validated.minOccurrences, 100));
  validated.maxResults = Math.max(1, Math.min(validated.maxResults, 1000));
  return validated;
}
3. Performance Monitoring
function analyzeWithTiming(text) {
  const start = performance.now();
  const checker = new KeywordDensityChecker();
  const results = checker.analyze(text);
  const end = performance.now();
  const duration = end - start;
  return {
    ...results,
    performance: {
      duration: `${duration.toFixed(2)}ms`,
      wordsPerSecond: Math.round(results.totalWords / (duration / 1000))
    }
  };
}
Conclusion
Building a keyword density checker involves:
- Text preprocessing and tokenization
- N-gram generation for phrase analysis
- Frequency counting and density calculation
- Advanced features like TF-IDF and prominence scoring
- Optimization for performance
The tool we've built can analyze content at scale, providing insights into keyword usage, semantic variations, and optimization opportunities.
Try it yourself: Build your own checker or use our free tool at TechBelievers.com/tools/keyword-density-checker