
Tech Believers

Building a Broken Link Checker: Web Crawling and HTTP Status Validation

Introduction

Broken links are a common problem that damages SEO, user experience, and site credibility. In this technical guide, we'll build a comprehensive broken link checker that crawls websites, validates HTTP status codes, and identifies issues.

Understanding Broken Links

A broken link is one that fails to return a usable resource when accessed — most often an HTTP error status code such as 404, but timeouts and DNS failures count too. Common types of broken links:

  • 404 Not Found: Page doesn't exist
  • 500 Internal Server Error: Server-side error
  • 503 Service Unavailable: Server temporarily unavailable
  • Timeout: Server didn't respond in time
  • DNS Error: Domain doesn't exist
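These categories can be derived from a check result's status code and error message. A small classifier sketch (the `{ status, error }` result shape is an assumption, mirroring the checker built later in this article):

```javascript
// Sketch: classify a link-check result into the categories above.
// The { status, error } shape is assumed for illustration.
function classifyLinkResult(result) {
    if (result.error === 'Request timed out') return 'Timeout';
    if (result.error === 'Domain not found') return 'DNS Error';
    if (result.status === 404) return '404 Not Found';
    if (result.status === 500) return '500 Internal Server Error';
    if (result.status === 503) return '503 Service Unavailable';
    if (result.status >= 400) return `HTTP ${result.status}`;
    return 'OK';
}
```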

The Crawling Algorithm

Step 1: URL Normalization

function normalizeURL(url, baseURL) {
    try {
        // Handle relative URLs
        const absoluteURL = new URL(url, baseURL);

        // Remove fragment (#section)
        absoluteURL.hash = '';

        // Normalize trailing slash
        let normalized = absoluteURL.href;
        if (normalized.endsWith('/') && absoluteURL.pathname !== '/') {
            normalized = normalized.slice(0, -1);
        }

        return normalized;
    } catch (error) {
        console.error('Invalid URL:', url, error);
        return null;
    }
}

// Usage
normalizeURL('/about', 'https://example.com');
// Result: 'https://example.com/about'

normalizeURL('../contact', 'https://example.com/about/');
// Result: 'https://example.com/contact'
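Normalization can go further. For example, stripping common utm_* tracking parameters keeps the same page from being queued twice under different URLs (a sketch; whether dropping query parameters is safe depends on the site):

```javascript
// Sketch: drop utm_* query parameters during normalization so that
// otherwise-identical URLs dedupe to a single entry.
function stripTrackingParams(url) {
    const parsed = new URL(url);
    for (const key of [...parsed.searchParams.keys()]) {
        if (key.startsWith('utm_')) {
            parsed.searchParams.delete(key);
        }
    }
    return parsed.href;
}
```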

Step 2: Link Extraction

function extractLinks(html, baseURL) {
    const links = new Set();

    // Create DOM parser
    const parser = new DOMParser();
    const doc = parser.parseFromString(html, 'text/html');

    // Extract <a href="...">
    const anchorTags = doc.querySelectorAll('a[href]');
    anchorTags.forEach(tag => {
        const href = tag.getAttribute('href');
        const normalized = normalizeURL(href, baseURL);
        if (normalized) {
            links.add(normalized);
        }
    });

    // Extract <img src="...">
    const imgTags = doc.querySelectorAll('img[src]');
    imgTags.forEach(tag => {
        const src = tag.getAttribute('src');
        const normalized = normalizeURL(src, baseURL);
        if (normalized) {
            links.add(normalized);
        }
    });

    // Extract <link href="..."> (CSS, etc.)
    const linkTags = doc.querySelectorAll('link[href]');
    linkTags.forEach(tag => {
        const href = tag.getAttribute('href');
        const normalized = normalizeURL(href, baseURL);
        if (normalized) {
            links.add(normalized);
        }
    });

    // Extract <script src="...">
    const scriptTags = doc.querySelectorAll('script[src]');
    scriptTags.forEach(tag => {
        const src = tag.getAttribute('src');
        const normalized = normalizeURL(src, baseURL);
        if (normalized) {
            links.add(normalized);
        }
    });

    return Array.from(links);
}

Step 3: HTTP Status Checking

async function checkURL(url, timeout = 10000) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);

    try {
        const response = await fetch(url, {
            method: 'HEAD', // faster than GET, though not every server supports it
            signal: controller.signal,
            redirect: 'follow'
        });

        return {
            url,
            status: response.status,
            statusText: response.statusText,
            ok: response.ok,
            redirected: response.redirected,
            finalURL: response.url,
            error: null
        };
    } catch (error) {
        if (error.name === 'AbortError') {
            return {
                url,
                status: 0,
                statusText: 'Timeout',
                ok: false,
                error: 'Request timed out'
            };
        }

        return {
            url,
            status: 0,
            statusText: 'Error',
            ok: false,
            error: error.message
        };
    } finally {
        clearTimeout(timeoutId); // clear the timer on success and failure alike
    }
}
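One caveat with HEAD: some servers answer 405 Method Not Allowed (or 501) even for URLs that exist, which would be reported as broken. A hedged sketch of a GET fallback (simplified, not a drop-in replacement for checkURL above):

```javascript
// Sketch: HEAD with a GET fallback for servers that reject HEAD.
// Simplified result shape for illustration.
async function fetchStatus(url, timeout = 10000) {
    const attempt = async (method) => {
        const controller = new AbortController();
        const timeoutId = setTimeout(() => controller.abort(), timeout);
        try {
            return await fetch(url, { method, signal: controller.signal, redirect: 'follow' });
        } finally {
            clearTimeout(timeoutId);
        }
    };

    let response = await attempt('HEAD');
    if (response.status === 405 || response.status === 501) {
        response = await attempt('GET'); // server rejected HEAD; retry with GET
    }
    return { url, status: response.status, ok: response.ok };
}
```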

Step 4: Crawling Logic

class BrokenLinkChecker {
    constructor(options = {}) {
        this.options = {
            maxDepth: options.maxDepth || 3,
            maxPages: options.maxPages || 100,
            timeout: options.timeout || 10000,
            respectRobotsTxt: options.respectRobotsTxt !== false,
            followExternal: options.followExternal || false,
            concurrency: options.concurrency || 5
        };

        this.visited = new Set();
        this.queue = [];
        this.results = [];
        this.baseURL = null;
    }

    async crawl(startURL) {
        this.baseURL = new URL(startURL).origin;
        this.queue.push({ url: startURL, depth: 0, source: null });

        while (this.queue.length > 0 && this.visited.size < this.options.maxPages) {
            // Process URLs in batches for concurrency
            const batch = this.queue.splice(0, this.options.concurrency);
            await Promise.all(batch.map(item => this.processURL(item)));
        }

        return this.results;
    }

    async processURL({ url, depth, source }) {
        // Skip if already visited
        if (this.visited.has(url)) {
            return;
        }

        this.visited.add(url);

        // Check if URL is valid
        const status = await checkURL(url, this.options.timeout);

        // Record result
        this.results.push({
            url,
            source,
            depth,
            ...status
        });

        // If URL is broken, don't crawl further
        if (!status.ok) {
            return;
        }

        // If max depth reached, don't crawl further
        if (depth >= this.options.maxDepth) {
            return;
        }

        // Only crawl HTML pages
        if (!this.isHTML(url)) {
            return;
        }

        // Fetch page content
        try {
            const response = await fetch(url);
            const html = await response.text();

            // Extract links
            const links = extractLinks(html, url);

            // Add links to queue
            for (const link of links) {
                // Skip external links unless followExternal is true
                if (!this.options.followExternal && !link.startsWith(this.baseURL)) {
                    continue;
                }

                // Skip if already visited
                if (this.visited.has(link)) {
                    continue;
                }

                // Add to queue
                this.queue.push({
                    url: link,
                    depth: depth + 1,
                    source: url
                });
            }
        } catch (error) {
            console.error('Error fetching page:', url, error);
        }
    }

    isHTML(url) {
        // Use the pathname so query strings and fragments don't confuse the check
        const pathname = new URL(url).pathname;
        const ext = pathname.split('.').pop().toLowerCase();
        const nonHTMLExtensions = ['jpg', 'jpeg', 'png', 'gif', 'pdf', 'zip', 'css', 'js'];
        return !nonHTMLExtensions.includes(ext);
    }

    getBrokenLinks() {
        return this.results.filter(result => !result.ok);
    }

    getRedirectChains() {
        return this.results.filter(result => result.redirected);
    }

    getStatistics() {
        const total = this.results.length;
        const broken = this.getBrokenLinks().length;
        const redirects = this.getRedirectChains().length;

        const statusCodes = {};
        this.results.forEach(result => {
            statusCodes[result.status] = (statusCodes[result.status] || 0) + 1;
        });

        return {
            total,
            broken,
            redirects,
            brokenPercentage: total ? ((broken / total) * 100).toFixed(2) : '0.00',
            statusCodes
        };
    }
}

// Usage
const checker = new BrokenLinkChecker({
    maxDepth: 3,
    maxPages: 100,
    timeout: 10000,
    followExternal: false,
    concurrency: 5
});

const results = await checker.crawl('https://example.com');
const brokenLinks = checker.getBrokenLinks();
const stats = checker.getStatistics();

console.log('Total links checked:', stats.total);
console.log('Broken links found:', stats.broken);
console.log('Broken links:', brokenLinks);

Server-Side Implementation (Node.js)

Browser fetch calls are subject to CORS, so the client-side checker above works reliably only for same-origin links. For robust cross-site crawling, run the checker in Node.js with libraries like axios and cheerio:

const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');

class ServerSideBrokenLinkChecker {
    constructor(options = {}) {
        this.options = {
            maxDepth: options.maxDepth || 3,
            maxPages: options.maxPages || 100,
            timeout: options.timeout || 10000,
            userAgent: options.userAgent || 'Mozilla/5.0 (compatible; BrokenLinkChecker/1.0)',
            concurrency: options.concurrency || 5,
            delay: options.delay || 100 // Delay between requests (ms)
        };

        this.visited = new Set();
        this.queue = [];
        this.results = [];
        this.baseURL = null;
    }

    async crawl(startURL) {
        this.baseURL = new URL(startURL).origin;
        this.queue.push({ url: startURL, depth: 0, source: null });

        while (this.queue.length > 0 && this.visited.size < this.options.maxPages) {
            const batch = this.queue.splice(0, this.options.concurrency);
            await Promise.all(batch.map(item => this.processURL(item)));

            // Delay to avoid overwhelming the server
            await this.sleep(this.options.delay);
        }

        return this.results;
    }

    async processURL({ url, depth, source }) {
        if (this.visited.has(url)) {
            return;
        }

        this.visited.add(url);

        // Check URL status
        const status = await this.checkURL(url);

        // Record result
        this.results.push({
            url,
            source,
            depth,
            ...status
        });

        // If broken or max depth, don't crawl further
        if (!status.ok || depth >= this.options.maxDepth) {
            return;
        }

        // Only crawl HTML pages
        if (!this.isHTML(url)) {
            return;
        }

        // Fetch and parse page
        try {
            const response = await axios.get(url, {
                timeout: this.options.timeout,
                headers: {
                    'User-Agent': this.options.userAgent
                }
            });

            const $ = cheerio.load(response.data);

            // Extract links
            const links = this.extractLinks($, url);

            // Add to queue
            for (const link of links) {
                if (!this.visited.has(link) && link.startsWith(this.baseURL)) {
                    this.queue.push({
                        url: link,
                        depth: depth + 1,
                        source: url
                    });
                }
            }
        } catch (error) {
            console.error('Error fetching page:', url, error.message);
        }
    }

    async checkURL(url) {
        try {
            const response = await axios.head(url, {
                timeout: this.options.timeout,
                maxRedirects: 5,
                validateStatus: () => true, // Don't throw on any status
                headers: {
                    'User-Agent': this.options.userAgent
                }
            });

            return {
                status: response.status,
                statusText: response.statusText,
                ok: response.status >= 200 && response.status < 400,
                // Note: these two fields rely on axios/follow-redirects
                // internals and may change between versions
                redirected: response.request._redirectable._redirectCount > 0,
                finalURL: response.request.res.responseUrl,
                responseTime: response.headers['x-response-time'] || null,
                error: null
            };
        } catch (error) {
            if (error.code === 'ECONNABORTED') {
                return {
                    status: 0,
                    statusText: 'Timeout',
                    ok: false,
                    error: 'Request timed out'
                };
            }

            if (error.code === 'ENOTFOUND') {
                return {
                    status: 0,
                    statusText: 'DNS Error',
                    ok: false,
                    error: 'Domain not found'
                };
            }

            return {
                status: 0,
                statusText: 'Error',
                ok: false,
                error: error.message
            };
        }
    }

    extractLinks($, baseURL) {
        const links = new Set();

        // Extract from <a href>
        $('a[href]').each((i, elem) => {
            const href = $(elem).attr('href');
            const normalized = this.normalizeURL(href, baseURL);
            if (normalized) links.add(normalized);
        });

        // Extract from <img src>
        $('img[src]').each((i, elem) => {
            const src = $(elem).attr('src');
            const normalized = this.normalizeURL(src, baseURL);
            if (normalized) links.add(normalized);
        });

        // Extract from <link href>
        $('link[href]').each((i, elem) => {
            const href = $(elem).attr('href');
            const normalized = this.normalizeURL(href, baseURL);
            if (normalized) links.add(normalized);
        });

        // Extract from <script src>
        $('script[src]').each((i, elem) => {
            const src = $(elem).attr('src');
            const normalized = this.normalizeURL(src, baseURL);
            if (normalized) links.add(normalized);
        });

        return Array.from(links);
    }

    normalizeURL(url, baseURL) {
        try {
            const absoluteURL = new URL(url, baseURL);
            absoluteURL.hash = '';
            return absoluteURL.href;
        } catch (error) {
            return null;
        }
    }

    isHTML(url) {
        // Use the pathname so query strings don't confuse the extension check
        const pathname = new URL(url).pathname;
        const ext = pathname.split('.').pop().toLowerCase();
        const nonHTMLExtensions = ['jpg', 'jpeg', 'png', 'gif', 'pdf', 'zip', 'css', 'js', 'svg', 'ico'];
        return !nonHTMLExtensions.includes(ext);
    }

    sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }

    getBrokenLinks() {
        return this.results.filter(result => !result.ok);
    }

    getByStatus(status) {
        return this.results.filter(result => result.status === status);
    }

    getStatistics() {
        const total = this.results.length;
        const broken = this.getBrokenLinks().length;
        const redirects = this.results.filter(r => r.redirected).length;

        const statusCodes = {};
        this.results.forEach(result => {
            statusCodes[result.status] = (statusCodes[result.status] || 0) + 1;
        });

        return {
            total,
            broken,
            redirects,
            brokenPercentage: total ? ((broken / total) * 100).toFixed(2) : '0.00',
            statusCodes,
            pagesVisited: this.visited.size
        };
    }

    exportResults(format = 'json') {
        if (format === 'json') {
            return JSON.stringify(this.results, null, 2);
        }

        if (format === 'csv') {
            // Quote every field and double embedded quotes so commas in URLs
            // or error messages don't break the CSV structure
            const escape = value => `"${String(value).replace(/"/g, '""')}"`;
            const headers = ['URL', 'Source', 'Status', 'Status Text', 'Error'];
            const rows = this.results.map(r => [
                r.url,
                r.source || 'N/A',
                r.status,
                r.statusText,
                r.error || 'N/A'
            ]);

            return [headers, ...rows]
                .map(row => row.map(escape).join(','))
                .join('\n');
        }

        return this.results;
    }
}

// Usage
const checker = new ServerSideBrokenLinkChecker({
    maxDepth: 3,
    maxPages: 100,
    timeout: 10000,
    concurrency: 5,
    delay: 100
});

checker.crawl('https://example.com')
    .then(() => {
        const brokenLinks = checker.getBrokenLinks();
        const stats = checker.getStatistics();

        console.log('Statistics:', stats);
        console.log('Broken Links:', brokenLinks);

        // Export to CSV
        const csv = checker.exportResults('csv');
        require('fs').writeFileSync('broken-links.csv', csv);
    })
    .catch(error => {
        console.error('Crawl failed:', error);
    });

Advanced Features

Redirect Chain Detection

function detectRedirectChains(results) {
    const chains = [];

    for (const result of results) {
        if (result.redirected && result.finalURL !== result.url) {
            chains.push({
                original: result.url,
                final: result.finalURL,
                source: result.source,
                hops: result.redirectCount || 1
            });
        }
    }

    // Find chains with multiple hops
    const multiHopChains = chains.filter(chain => chain.hops > 1);

    return {
        all: chains,
        multiHop: multiHopChains
    };
}

Robots.txt Respect

const robotsParser = require('robots-parser');

async function getRobotsTxt(baseURL) {
    try {
        const robotsURL = new URL('/robots.txt', baseURL).href;
        const response = await axios.get(robotsURL);
        return robotsParser(robotsURL, response.data);
    } catch (error) {
        return null;
    }
}

async function isAllowedByRobots(url, userAgent, robots) {
    if (!robots) return true;
    return robots.isAllowed(url, userAgent);
}
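For a sense of what the parser does under the hood, here is a minimal Disallow-prefix check. It ignores wildcards, Allow precedence, and per-agent groups — exactly the cases robots-parser handles for you:

```javascript
// Sketch: minimal Disallow-prefix matching. Real robots.txt semantics
// (wildcards, Allow rules, user-agent groups) need a proper parser.
function isPathAllowed(robotsTxt, path) {
    const disallows = robotsTxt
        .split('\n')
        .map(line => line.trim())
        .filter(line => line.toLowerCase().startsWith('disallow:'))
        .map(line => line.slice('disallow:'.length).trim())
        .filter(rule => rule.length > 0); // an empty Disallow allows everything
    return !disallows.some(rule => path.startsWith(rule));
}
```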

Sitemap Integration

const xml2js = require('xml2js');

async function parseSitemap(sitemapURL) {
    try {
        const response = await axios.get(sitemapURL);
        const parser = new xml2js.Parser();
        const result = await parser.parseStringPromise(response.data);

        const urls = [];

        if (result.urlset && result.urlset.url) {
            result.urlset.url.forEach(entry => {
                if (entry.loc && entry.loc[0]) {
                    urls.push(entry.loc[0]);
                }
            });
        }

        return urls;
    } catch (error) {
        console.error('Error parsing sitemap:', error);
        return [];
    }
}
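If pulling in xml2js isn't an option, a minimal loc extractor can be sketched with a regex. This is a simplification: real sitemaps may use namespaces and sitemap-index files, which a proper XML parser handles more robustly:

```javascript
// Sketch: dependency-free <loc> extraction from sitemap XML.
// Not a substitute for real XML parsing; shown for illustration only.
function extractSitemapURLs(xml) {
    const urls = [];
    const locPattern = /<loc>\s*([^<]+?)\s*<\/loc>/g;
    let match;
    while ((match = locPattern.exec(xml)) !== null) {
        urls.push(match[1]); // captured URL with surrounding whitespace trimmed
    }
    return urls;
}
```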

Progress Tracking

class ProgressTracker {
    constructor(total) {
        this.total = total;
        this.current = 0;
        this.startTime = Date.now();
    }

    update(current) {
        this.current = current;
        const elapsed = Date.now() - this.startTime;
        const rate = current / (elapsed / 1000);
        const remaining = (this.total - current) / rate;

        console.log(`Progress: ${current}/${this.total} (${(current/this.total*100).toFixed(1)}%)`);
        console.log(`Estimated time remaining: ${Math.round(remaining)}s`);
    }
}

Best Practices

1. Rate Limiting

class RateLimiter {
    constructor(requestsPerSecond) {
        this.requestsPerSecond = requestsPerSecond;
        this.queue = [];
        this.processing = false;
    }

    async execute(fn) {
        return new Promise((resolve, reject) => {
            this.queue.push({ fn, resolve, reject });
            this.process();
        });
    }

    async process() {
        if (this.processing || this.queue.length === 0) {
            return;
        }

        this.processing = true;

        while (this.queue.length > 0) {
            const { fn, resolve, reject } = this.queue.shift();

            try {
                const result = await fn();
                resolve(result);
            } catch (error) {
                reject(error);
            }

            // Wait before next request
            await this.sleep(1000 / this.requestsPerSecond);
        }

        this.processing = false;
    }

    sleep(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }
}
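The queue-based limiter above serializes requests at a fixed interval. An alternative is a token bucket, which permits short bursts while bounding the average rate (a sketch, not part of the checker above):

```javascript
// Sketch: token-bucket rate limiting. Tokens refill continuously at
// `ratePerSecond`; a request proceeds only when a whole token is available.
class TokenBucket {
    constructor(ratePerSecond, burst = ratePerSecond) {
        this.rate = ratePerSecond;
        this.capacity = burst;
        this.tokens = burst;
        this.lastRefill = Date.now();
    }

    tryRemove() {
        const now = Date.now();
        // Refill proportionally to elapsed time, capped at capacity
        this.tokens = Math.min(
            this.capacity,
            this.tokens + ((now - this.lastRefill) / 1000) * this.rate
        );
        this.lastRefill = now;
        if (this.tokens >= 1) {
            this.tokens -= 1;
            return true;
        }
        return false;
    }
}
```

A caller would poll `tryRemove()` (or sleep and retry) before issuing each request.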

2. Error Handling

const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));

async function safeCheckURL(url, retries = 3) {
    for (let i = 0; i < retries; i++) {
        try {
            return await checkURL(url);
        } catch (error) {
            if (i === retries - 1) {
                return {
                    url,
                    status: 0,
                    ok: false,
                    error: `Failed after ${retries} retries: ${error.message}`
                };
            }

            // Exponential backoff: 1s, 2s, 4s, ...
            await sleep(Math.pow(2, i) * 1000);
        }
    }
}
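Fixed exponential delays can synchronize retries across many URLs hitting the same host. Adding jitter spreads them out (an assumed policy, shown here as a standalone helper):

```javascript
// Sketch: exponential backoff with full jitter. Returns the delay in ms
// before retry attempt `attempt` (0-based). The cap and base are assumptions.
function backoffDelay(attempt, baseMs = 1000, maxMs = 30000) {
    const exp = Math.min(maxMs, baseMs * Math.pow(2, attempt));
    return Math.floor(Math.random() * exp); // "full jitter": uniform in [0, exp)
}
```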

3. Memory Management

class MemoryEfficientChecker extends BrokenLinkChecker {
    constructor(options) {
        super(options);
        this.maxResultsInMemory = options.maxResultsInMemory || 1000;
        this.resultStream = options.resultStream || null;
    }

    async processURL(item) {
        await super.processURL(item);

        // Flush results to stream if memory limit reached
        if (this.results.length >= this.maxResultsInMemory && this.resultStream) {
            await this.flushResults();
        }
    }

    async flushResults() {
        if (this.resultStream) {
            for (const result of this.results) {
                this.resultStream.write(JSON.stringify(result) + '\n');
            }
            this.results = [];
        }
    }
}

Conclusion

Building a broken link checker involves:

  • URL normalization and validation
  • Web crawling with depth and concurrency control
  • HTTP status code checking
  • Link extraction from HTML
  • Redirect chain detection
  • Performance optimization and rate limiting

The tool we've built can crawl entire websites, identify broken links, detect redirect chains, and provide detailed reports.

Try it yourself: Build your own checker or use our free tool at TechBelievers.com/tools/broken-link-checker
