Building a Keyword Density Checker: Algorithm and Implementation

Introduction

Keyword density analysis is a fundamental SEO technique that reveals how often specific words and phrases appear in content. In this technical guide, we'll build a keyword density checker from scratch, exploring the algorithms, data structures, and optimizations involved.

Understanding Keyword Density

Keyword density is calculated as:

Keyword Density = (Number of keyword occurrences / Total words) × 100

For example, if "machine learning" appears 15 times in a 1,000-word article, the keyword density is 1.5%.
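
As a function, the formula is a one-liner (a trivial sketch of the calculation above):

function keywordDensity(occurrences, totalWords) {
    return (occurrences / totalWords) * 100;
}

keywordDensity(15, 1000); // 1.5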

But modern keyword density checkers do much more:

  • Analyze 1-word, 2-word, and 3-word phrases
  • Filter stop words
  • Calculate TF-IDF scores
  • Identify semantic variations

The Algorithm: Step-by-Step

Step 1: Text Preprocessing

function preprocessText(text) {
    // Convert to lowercase for case-insensitive analysis
    text = text.toLowerCase();

    // Remove HTML tags
    text = text.replace(/<[^>]*>/g, ' ');

    // Remove special characters but keep spaces and hyphens
    text = text.replace(/[^\w\s-]/g, ' ');

    // Normalize whitespace
    text = text.replace(/\s+/g, ' ').trim();

    return text;
}
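
Tracing the function on a small HTML fragment:

// Usage
preprocessText('<p>Hello, World! Foo-bar.</p>');
// Result: 'hello world foo-bar'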

Step 2: Tokenization

function tokenize(text) {
    // Split text into words
    const words = text.split(/\s+/);

    // Filter empty strings
    return words.filter(word => word.length > 0);
}

Step 3: Stop Word Filtering

const STOP_WORDS = new Set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
    'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
]);

function filterStopWords(words, removeStopWords = true) {
    if (!removeStopWords) {
        return words;
    }

    return words.filter(word => !STOP_WORDS.has(word));
}
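
For example, with the stop word 'is' removed:

// Usage
filterStopWords(['machine', 'learning', 'is', 'powerful']);
// Result: ['machine', 'learning', 'powerful']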

Step 4: N-Gram Generation

function generateNGrams(words, n) {
    const ngrams = [];

    for (let i = 0; i <= words.length - n; i++) {
        const ngram = words.slice(i, i + n).join(' ');
        ngrams.push(ngram);
    }

    return ngrams;
}

// Usage
const words = ['machine', 'learning', 'is', 'powerful'];
const bigrams = generateNGrams(words, 2);
// Result: ['machine learning', 'learning is', 'is powerful']

const trigrams = generateNGrams(words, 3);
// Result: ['machine learning is', 'learning is powerful']

Step 5: Frequency Counting

function countFrequency(items) {
    const frequency = new Map();

    for (const item of items) {
        frequency.set(item, (frequency.get(item) || 0) + 1);
    }

    return frequency;
}
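
For example, counting bigrams from Step 4:

// Usage
countFrequency(['machine learning', 'learning is', 'machine learning']);
// Result: Map { 'machine learning' => 2, 'learning is' => 1 }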

Step 6: Density Calculation

function calculateDensity(frequency, totalWords) {
    const densityMap = new Map();

    for (const [word, count] of frequency) {
        const density = (count / totalWords) * 100;
        densityMap.set(word, {
            count,
            density: parseFloat(density.toFixed(2))
        });
    }

    return densityMap;
}
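
For example, a phrase counted twice among 20 words:

// Usage
calculateDensity(new Map([['machine learning', 2]]), 20);
// Result: Map { 'machine learning' => { count: 2, density: 10 } }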

Step 7: Sorting and Filtering

function sortByFrequency(densityMap, minOccurrences = 2) {
    // Convert Map to array of [word, data] pairs
    const entries = Array.from(densityMap.entries());

    // Filter by minimum occurrences
    const filtered = entries.filter(([word, data]) => data.count >= minOccurrences);

    // Sort by frequency (descending)
    filtered.sort((a, b) => b[1].count - a[1].count);

    return filtered;
}
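
Chaining all seven steps together (using the standalone functions defined above):

// End-to-end pipeline for 2-word phrases
const tokens = filterStopWords(
    tokenize(preprocessText('Machine learning is powerful. Machine learning scales.'))
);
const bigramDensity = calculateDensity(
    countFrequency(generateNGrams(tokens, 2)),
    tokens.length
);
sortByFrequency(bigramDensity);
// Result: [['machine learning', { count: 2, density: 33.33 }]]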

Complete Implementation

class KeywordDensityChecker {
    constructor(options = {}) {
        this.options = {
            removeStopWords: options.removeStopWords !== false,
            minWordLength: options.minWordLength || 2,
            minOccurrences: options.minOccurrences || 2,
            maxResults: options.maxResults || 50
        };

        this.stopWords = new Set([
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'this', 'but', 'they', 'have',
            'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
        ]);
    }

    analyze(text) {
        // Preprocess
        const processed = this.preprocessText(text);

        // Tokenize
        const allWords = this.tokenize(processed);

        // Filter stop words. Note: n-grams are built from the filtered
        // stream, so a phrase can bridge a removed stop word
        // (e.g. "subset of AI" yields the bigram "subset ai")
        const words = this.filterStopWords(allWords);

        // Density is measured against the filtered word count,
        // so stop words don't inflate the denominator
        const totalWords = words.length;

        // Analyze 1-word keywords
        const oneWordKeywords = this.analyzeNGrams(words, 1, totalWords);

        // Analyze 2-word phrases
        const twoWordKeywords = this.analyzeNGrams(words, 2, totalWords);

        // Analyze 3-word phrases
        const threeWordKeywords = this.analyzeNGrams(words, 3, totalWords);

        return {
            totalWords: allWords.length,
            totalWordsExcludingStopWords: totalWords,
            oneWordKeywords,
            twoWordKeywords,
            threeWordKeywords,
            timestamp: new Date().toISOString()
        };
    }

    preprocessText(text) {
        text = text.toLowerCase();
        text = text.replace(/<[^>]*>/g, ' ');
        text = text.replace(/[^\w\s-]/g, ' ');
        text = text.replace(/\s+/g, ' ').trim();
        return text;
    }

    tokenize(text) {
        return text.split(/\s+/).filter(word => word.length >= this.options.minWordLength);
    }

    filterStopWords(words) {
        if (!this.options.removeStopWords) {
            return words;
        }
        return words.filter(word => !this.stopWords.has(word));
    }

    analyzeNGrams(words, n, totalWords) {
        // Generate n-grams
        const ngrams = this.generateNGrams(words, n);

        // Count frequency
        const frequency = this.countFrequency(ngrams);

        // Calculate density
        const density = this.calculateDensity(frequency, totalWords);

        // Sort and filter
        const sorted = this.sortByFrequency(density);

        // Limit results
        return sorted.slice(0, this.options.maxResults);
    }

    generateNGrams(words, n) {
        const ngrams = [];
        for (let i = 0; i <= words.length - n; i++) {
            ngrams.push(words.slice(i, i + n).join(' '));
        }
        return ngrams;
    }

    countFrequency(items) {
        const frequency = new Map();
        for (const item of items) {
            frequency.set(item, (frequency.get(item) || 0) + 1);
        }
        return frequency;
    }

    calculateDensity(frequency, totalWords) {
        const densityMap = new Map();
        for (const [word, count] of frequency) {
            if (count >= this.options.minOccurrences) {
                const density = (count / totalWords) * 100;
                densityMap.set(word, {
                    count,
                    density: parseFloat(density.toFixed(2))
                });
            }
        }
        return densityMap;
    }

    sortByFrequency(densityMap) {
        const entries = Array.from(densityMap.entries());
        entries.sort((a, b) => b[1].count - a[1].count);
        return entries.map(([word, data]) => ({ word, ...data }));
    }
}

// Usage
const checker = new KeywordDensityChecker({
    removeStopWords: true,
    minWordLength: 2,
    minOccurrences: 2,
    maxResults: 50
});

const text = `
Machine learning is a subset of artificial intelligence. 
Machine learning algorithms learn from data. 
The more data machine learning models have, the better they perform.
`;

const results = checker.analyze(text);

console.log('Total Words:', results.totalWords);
console.log('Top 1-Word Keywords:', results.oneWordKeywords.slice(0, 5));
console.log('Top 2-Word Keywords:', results.twoWordKeywords.slice(0, 5));
console.log('Top 3-Word Keywords:', results.threeWordKeywords.slice(0, 5));
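
// Expected output for the sample text above (hand-traced):
// Total Words: 24
// 1-word: machine (3, 17.65%), learning (3, 17.65%), data (2, 11.76%)
// 2-word: machine learning (3, 17.65%)
// 3-word: [] (no three-word phrase occurs at least twice)
// (densities are relative to the 17 words left after stop word filtering)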

Advanced Features

TF-IDF Scoring

TF-IDF (Term Frequency-Inverse Document Frequency) reveals which terms are important to a document relative to a corpus.
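
The two factors multiply, using the natural logarithm (matching Math.log in the code below):

TF(t, d) = occurrences of t in d / total terms in d
IDF(t) = ln(N / number of documents containing t)
TF-IDF(t, d) = TF(t, d) × IDF(t)

where N is the total number of documents in the corpus.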

class TFIDFCalculator {
    constructor(documents) {
        this.documents = documents;
        this.idfCache = new Map();
        this.calculateIDF();
    }

    calculateIDF() {
        const N = this.documents.length;
        const termDocCount = new Map();

        // Count how many documents contain each term
        for (const doc of this.documents) {
            const uniqueTerms = new Set(doc.toLowerCase().split(/\s+/));
            for (const term of uniqueTerms) {
                termDocCount.set(term, (termDocCount.get(term) || 0) + 1);
            }
        }

        // Calculate IDF for each term
        for (const [term, docCount] of termDocCount) {
            const idf = Math.log(N / docCount);
            this.idfCache.set(term, idf);
        }
    }

    calculateTFIDF(document) {
        const words = document.toLowerCase().split(/\s+/);
        const totalWords = words.length;
        const termFreq = new Map();

        // Calculate TF
        for (const word of words) {
            termFreq.set(word, (termFreq.get(word) || 0) + 1);
        }

        // Calculate TF-IDF
        const tfidf = new Map();
        for (const [term, freq] of termFreq) {
            const tf = freq / totalWords;
            const idf = this.idfCache.get(term) || 0;
            tfidf.set(term, tf * idf);
        }

        // Sort by TF-IDF score
        const sorted = Array.from(tfidf.entries())
            .sort((a, b) => b[1] - a[1])
            .map(([term, score]) => ({
                term,
                score: parseFloat(score.toFixed(4))
            }));

        return sorted;
    }
}

// Usage
const documents = [
    "Machine learning is a subset of artificial intelligence",
    "Deep learning is a subset of machine learning",
    "Natural language processing uses machine learning"
];

const tfidf = new TFIDFCalculator(documents);
const scores = tfidf.calculateTFIDF(documents[0]);
console.log('TF-IDF Scores:', scores.slice(0, 5));
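
Here, 'artificial' and 'intelligence' score highest at (1/8) × ln(3) ≈ 0.1373: each appears once among the first document's eight words but in no other document. Meanwhile 'machine' and 'learning' score exactly 0 because they appear in every document, giving an IDF of ln(3/3) = 0. That is TF-IDF working as intended: terms the whole corpus shares carry no distinguishing weight.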

Keyword Prominence

Keyword prominence measures where keywords appear in the content (title, headers, body, etc.).

function calculateKeywordProminence(text, keyword) {
    const lowerText = text.toLowerCase();
    const lowerKeyword = keyword.toLowerCase();
    // Escape regex metacharacters so keywords like "c++" don't break the match
    const escapedKeyword = lowerKeyword.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Extract sections
    const title = extractTitle(text);
    const headers = extractHeaders(text);
    const firstParagraph = extractFirstParagraph(text);
    const lastParagraph = extractLastParagraph(text);

    const prominence = {
        inTitle: title.toLowerCase().includes(lowerKeyword),
        inHeaders: headers.some(h => h.toLowerCase().includes(lowerKeyword)),
        inFirstParagraph: firstParagraph.toLowerCase().includes(lowerKeyword),
        inLastParagraph: lastParagraph.toLowerCase().includes(lowerKeyword),
        totalOccurrences: (lowerText.match(new RegExp(escapedKeyword, 'g')) || []).length
    };

    // Calculate prominence score (weighted)
    let score = 0;
    if (prominence.inTitle) score += 40;
    if (prominence.inHeaders) score += 20;
    if (prominence.inFirstParagraph) score += 20;
    if (prominence.inLastParagraph) score += 10;
    if (prominence.totalOccurrences > 0) score += 10;

    prominence.score = score;

    return prominence;
}

function extractTitle(text) {
    const match = text.match(/^#\s+(.+)$/m);
    return match ? match[1] : '';
}

function extractHeaders(text) {
    const matches = text.match(/^#{2,6}\s+(.+)$/gm);
    return matches ? matches.map(m => m.replace(/^#+\s+/, '')) : [];
}

function extractFirstParagraph(text) {
    const paragraphs = text.split(/\n\n+/);
    return paragraphs[0] || '';
}

function extractLastParagraph(text) {
    const paragraphs = text.split(/\n\n+/);
    return paragraphs[paragraphs.length - 1] || '';
}
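
A quick trace, assuming markdown input as the extractors expect:

// Usage
const article = `# Machine Learning Basics

Machine learning turns data into predictions.

## Applications

It powers search, spam filters, and recommendations.`;

calculateKeywordProminence(article, 'machine learning');
// → { inTitle: true, inHeaders: false, inFirstParagraph: true,
//     inLastParagraph: false, totalOccurrences: 2, score: 70 }
// Note: with this simple splitter, the title line also counts
// as the "first paragraph"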

Semantic Similarity

Detect related keyword variations with cosine similarity over word-frequency vectors (a lightweight, purely lexical stand-in for true semantic analysis):
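
cosine(A, B) = (A · B) / (|A| × |B|)

A score of 1 means the two phrases have identical word distributions; 0 means they share no words.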

function cosineSimilarity(vec1, vec2) {
    let dotProduct = 0;
    let mag1 = 0;
    let mag2 = 0;

    const allKeys = new Set([...Object.keys(vec1), ...Object.keys(vec2)]);

    for (const key of allKeys) {
        const val1 = vec1[key] || 0;
        const val2 = vec2[key] || 0;

        dotProduct += val1 * val2;
        mag1 += val1 * val1;
        mag2 += val2 * val2;
    }

    mag1 = Math.sqrt(mag1);
    mag2 = Math.sqrt(mag2);

    if (mag1 === 0 || mag2 === 0) return 0;

    return dotProduct / (mag1 * mag2);
}

function findSemanticVariations(targetKeyword, allKeywords, threshold = 0.5) {
    const targetVec = createWordVector(targetKeyword);
    const variations = [];

    for (const keyword of allKeywords) {
        if (keyword === targetKeyword) continue;

        const keywordVec = createWordVector(keyword);
        const similarity = cosineSimilarity(targetVec, keywordVec);

        if (similarity >= threshold) {
            variations.push({ keyword, similarity });
        }
    }

    return variations.sort((a, b) => b.similarity - a.similarity);
}

function createWordVector(text) {
    const words = text.toLowerCase().split(/\s+/);
    const vector = {};

    for (const word of words) {
        vector[word] = (vector[word] || 0) + 1;
    }

    return vector;
}
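
For example:

// Usage
findSemanticVariations('machine learning', [
    'machine learning models',
    'deep learning',
    'data science'
]);
// → [ { keyword: 'machine learning models', similarity: ≈0.816 },
//     { keyword: 'deep learning', similarity: 0.5 } ]
// 'data science' shares no words with the target, so it is excluded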

Optimization Techniques

1. Memoization

Cache results for repeated analyses:

class CachedKeywordDensityChecker extends KeywordDensityChecker {
    constructor(options) {
        super(options);
        this.cache = new Map();
    }

    analyze(text) {
        const hash = this.hashText(text);

        if (this.cache.has(hash)) {
            return this.cache.get(hash);
        }

        const results = super.analyze(text);
        this.cache.set(hash, results);

        return results;
    }

    hashText(text) {
        // Simple 32-bit string hash (hash * 31 + char); collisions are
        // possible but acceptable for a cache key
        let hash = 0;
        for (let i = 0; i < text.length; i++) {
            const char = text.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash; // Convert to 32-bit integer
        }
        return hash;
    }
}

2. Web Workers

Offload heavy processing to Web Workers:

// worker.js
// Assumes KeywordDensityChecker is loaded into the worker scope; the
// script name below is illustrative
importScripts('keyword-density-checker.js');
self.onmessage = function(e) {
    const { text, options } = e.data;
    const checker = new KeywordDensityChecker(options);
    const results = checker.analyze(text);
    self.postMessage(results);
};

// main.js
const worker = new Worker('worker.js');

worker.onmessage = function(e) {
    const results = e.data;
    console.log('Analysis complete:', results);
};

worker.postMessage({
    text: document.body.innerText,
    options: { removeStopWords: true }
});

3. Streaming Analysis

For very large texts, analyze in chunks so you can report progress. Note that each chunk below is scored independently, so phrases spanning a chunk boundary are missed and densities are per chunk rather than global:

async function* streamAnalyze(text, chunkSize = 1000) {
    const words = text.split(/\s+/);

    for (let i = 0; i < words.length; i += chunkSize) {
        const chunk = words.slice(i, i + chunkSize);
        const chunkText = chunk.join(' ');

        const checker = new KeywordDensityChecker();
        const results = checker.analyze(chunkText);

        yield {
            progress: ((i + chunk.length) / words.length) * 100,
            results
        };
    }
}

// Usage
for await (const { progress, results } of streamAnalyze(largeText)) {
    console.log(`Progress: ${progress.toFixed(2)}%`);
    console.log('Chunk results:', results);
}

Best Practices

1. Handle Edge Cases

function safeAnalyze(text) {
    // Handle empty text
    if (!text || text.trim().length === 0) {
        return {
            error: 'Text is empty',
            totalWords: 0
        };
    }

    // Handle very short text (trim first so stray whitespace
    // doesn't inflate the word count)
    const wordCount = text.trim().split(/\s+/).length;
    if (wordCount < 10) {
        return {
            warning: 'Text is too short for meaningful analysis',
            totalWords: wordCount
        };
    }

    // Proceed with analysis
    const checker = new KeywordDensityChecker();
    return checker.analyze(text);
}

2. Validate Input

function validateOptions(options) {
    const defaults = {
        removeStopWords: true,
        minWordLength: 2,
        minOccurrences: 2,
        maxResults: 50
    };

    const validated = { ...defaults, ...options };

    // Validate ranges
    validated.minWordLength = Math.max(1, Math.min(validated.minWordLength, 10));
    validated.minOccurrences = Math.max(1, Math.min(validated.minOccurrences, 100));
    validated.maxResults = Math.max(1, Math.min(validated.maxResults, 1000));

    return validated;
}

3. Performance Monitoring

function analyzeWithTiming(text) {
    const start = performance.now();

    const checker = new KeywordDensityChecker();
    const results = checker.analyze(text);

    const end = performance.now();
    const duration = end - start;

    return {
        ...results,
        performance: {
            duration: `${duration.toFixed(2)}ms`,
            wordsPerSecond: Math.round(results.totalWords / (duration / 1000))
        }
    };
}

Conclusion

Building a keyword density checker involves:

  • Text preprocessing and tokenization
  • N-gram generation for phrase analysis
  • Frequency counting and density calculation
  • Advanced features like TF-IDF and prominence scoring
  • Optimization for performance

The tool we've built can analyze content at scale, providing insights into keyword usage, semantic variations, and optimization opportunities.

Try it yourself: Build your own checker or use our free tool at TechBelievers.com/tools/keyword-density-checker
