Introduction
Broken links are a common problem that damages SEO, user experience, and site credibility. In this technical guide, we'll build a comprehensive broken link checker that crawls a website, checks the HTTP status of every link it finds, and reports the failures.
Understanding Broken Links
A broken link is one that fails to resolve to a working resource. Most return an HTTP error status code (typically 404), but network-level failures count too. Common types (a small classification sketch follows this list):
- 404 Not Found: Page doesn't exist
- 500 Internal Server Error: Server-side error
- 503 Service Unavailable: Server temporarily unavailable
- Timeout: Server didn't respond in time
- DNS Error: Domain doesn't exist
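These categories map directly onto the result objects produced by the checkURL functions later in this guide. A minimal sketch, assuming that result shape (classifyResult is an illustrative helper, not part of the checker itself):

// Illustrative helper: bucket a check result into the categories above.
// Assumes the { status, error } shape returned by checkURL below; note that
// browser fetch reports DNS failures as a generic network error.
function classifyResult(result) {
  if (result.error === 'Request timed out') return 'Timeout';
  if (result.status === 0) return 'DNS/Network Error';
  if (result.status === 404) return '404 Not Found';
  if (result.status >= 500) return `${result.status} Server Error`;
  if (result.status >= 400) return `${result.status} Client Error`;
  return 'OK';
}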
The Crawling Algorithm
Step 1: URL Normalization
function normalizeURL(url, baseURL) {
try {
// Handle relative URLs
const absoluteURL = new URL(url, baseURL);
// Remove fragment (#section)
absoluteURL.hash = '';
// Normalize trailing slash
let normalized = absoluteURL.href;
if (normalized.endsWith('/') && absoluteURL.pathname !== '/') {
normalized = normalized.slice(0, -1);
}
return normalized;
} catch (error) {
console.error('Invalid URL:', url, error);
return null;
}
}
// Usage
normalizeURL('/about', 'https://example.com');
// Result: 'https://example.com/about'
normalizeURL('../contact', 'https://example.com/about/');
// Result: 'https://example.com/contact'
Step 2: Link Extraction
function extractLinks(html, baseURL) {
  const links = new Set();
  // Parse the HTML (DOMParser is a browser API; use jsdom or cheerio in Node)
  const parser = new DOMParser();
  const doc = parser.parseFromString(html, 'text/html');
  // Selector/attribute pairs to harvest URLs from
  const sources = [
    ['a[href]', 'href'],     // anchors
    ['img[src]', 'src'],     // images
    ['link[href]', 'href'],  // stylesheets, icons, etc.
    ['script[src]', 'src']   // external scripts
  ];
  for (const [selector, attr] of sources) {
    doc.querySelectorAll(selector).forEach(tag => {
      const normalized = normalizeURL(tag.getAttribute(attr), baseURL);
      if (normalized) {
        links.add(normalized);
      }
    });
  }
  return Array.from(links);
}
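A quick usage sketch, assuming a browser (or jsdom) environment, an async context, and a page you're allowed to fetch; the URL is illustrative:

// Fetch a page and list every unique URL it references
const response = await fetch('https://example.com');
const html = await response.text();
const links = extractLinks(html, 'https://example.com');
console.log(`Found ${links.length} unique links`);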
Step 3: HTTP Status Checking
async function checkURL(url, timeout = 10000) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      method: 'HEAD', // Faster than GET, though some servers reject HEAD with 405
      signal: controller.signal,
      redirect: 'follow'
    });
    return {
      url,
      status: response.status,
      statusText: response.statusText,
      ok: response.ok,
      redirected: response.redirected,
      finalURL: response.url,
      error: null
    };
  } catch (error) {
    if (error.name === 'AbortError') {
      return {
        url,
        status: 0,
        statusText: 'Timeout',
        ok: false,
        error: 'Request timed out'
      };
    }
    return {
      url,
      status: 0,
      statusText: 'Error',
      ok: false,
      error: error.message
    };
  } finally {
    clearTimeout(timeoutId); // Always clear the timer, success or failure
  }
}
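Some servers don't implement HEAD and answer 405 Method Not Allowed even though the resource exists. A hedged wrapper (checkURLWithFallback is an illustrative name) can retry those cases with GET before reporting a problem:

// Re-check with GET when HEAD is rejected, so a 405 isn't counted as broken
async function checkURLWithFallback(url, timeout = 10000) {
  const result = await checkURL(url, timeout);
  if (result.status !== 405) return result;
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      method: 'GET',
      signal: controller.signal,
      redirect: 'follow'
    });
    return {
      url,
      status: response.status,
      statusText: response.statusText,
      ok: response.ok,
      redirected: response.redirected,
      finalURL: response.url,
      error: null
    };
  } catch (error) {
    return { url, status: 0, statusText: 'Error', ok: false, error: error.message };
  } finally {
    clearTimeout(timeoutId);
  }
}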
Step 4: Crawling Logic
class BrokenLinkChecker {
constructor(options = {}) {
this.options = {
maxDepth: options.maxDepth || 3,
maxPages: options.maxPages || 100,
timeout: options.timeout || 10000,
respectRobotsTxt: options.respectRobotsTxt !== false, // honored via the robots.txt helpers shown later
followExternal: options.followExternal || false,
concurrency: options.concurrency || 5
};
this.visited = new Set();
this.queue = [];
this.results = [];
this.baseURL = null;
}
async crawl(startURL) {
this.baseURL = new URL(startURL).origin;
this.queue.push({ url: startURL, depth: 0, source: null });
while (this.queue.length > 0 && this.visited.size < this.options.maxPages) {
// Process URLs in batches for concurrency
const batch = this.queue.splice(0, this.options.concurrency);
await Promise.all(batch.map(item => this.processURL(item)));
}
return this.results;
}
async processURL({ url, depth, source }) {
// Skip if already visited
if (this.visited.has(url)) {
return;
}
this.visited.add(url);
// Check if URL is valid
const status = await checkURL(url, this.options.timeout);
// Record result
this.results.push({
url,
source,
depth,
...status
});
// If URL is broken, don't crawl further
if (!status.ok) {
return;
}
// If max depth reached, don't crawl further
if (depth >= this.options.maxDepth) {
return;
}
// Only crawl HTML pages
if (!this.isHTML(url)) {
return;
}
// Fetch page content
try {
const response = await fetch(url);
const html = await response.text();
// Extract links
const links = extractLinks(html, url);
// Add links to queue
for (const link of links) {
// Skip external links unless followExternal is true. Compare origins
// rather than string prefixes, so https://example.com.evil.com is not
// mistaken for a page on https://example.com.
if (!this.options.followExternal && new URL(link).origin !== this.baseURL) {
  continue;
}
// Skip if already visited
if (this.visited.has(link)) {
continue;
}
// Add to queue
this.queue.push({
url: link,
depth: depth + 1,
source: url
});
}
} catch (error) {
console.error('Error fetching page:', url, error);
}
}
isHTML(url) {
  // Check the pathname's extension so query strings like ?img=photo.jpg
  // can't masquerade as the file extension
  const ext = new URL(url).pathname.split('.').pop().toLowerCase();
  const nonHTMLExtensions = ['jpg', 'jpeg', 'png', 'gif', 'pdf', 'zip', 'css', 'js'];
  return !nonHTMLExtensions.includes(ext);
}
getBrokenLinks() {
return this.results.filter(result => !result.ok);
}
getRedirectChains() {
return this.results.filter(result => result.redirected);
}
getStatistics() {
const total = this.results.length;
const broken = this.getBrokenLinks().length;
const redirects = this.getRedirectChains().length;
const statusCodes = {};
this.results.forEach(result => {
statusCodes[result.status] = (statusCodes[result.status] || 0) + 1;
});
return {
total,
broken,
redirects,
brokenPercentage: total > 0 ? ((broken / total) * 100).toFixed(2) : '0.00', // avoid NaN on empty runs
statusCodes
};
}
}
// Usage
const checker = new BrokenLinkChecker({
maxDepth: 3,
maxPages: 100,
timeout: 10000,
followExternal: false,
concurrency: 5
});
const results = await checker.crawl('https://example.com');
const brokenLinks = checker.getBrokenLinks();
const stats = checker.getStatistics();
console.log('Total links checked:', stats.total);
console.log('Broken links found:', stats.broken);
console.log('Broken links:', brokenLinks);
Server-Side Implementation (Node.js)
The browser version above is also constrained by CORS, which blocks cross-origin requests that target servers haven't opted into. For more robust crawling, use Node.js with libraries like axios and cheerio:
const axios = require('axios');
const cheerio = require('cheerio');
const { URL } = require('url');
class ServerSideBrokenLinkChecker {
constructor(options = {}) {
this.options = {
maxDepth: options.maxDepth || 3,
maxPages: options.maxPages || 100,
timeout: options.timeout || 10000,
userAgent: options.userAgent || 'Mozilla/5.0 (compatible; BrokenLinkChecker/1.0)',
concurrency: options.concurrency || 5,
delay: options.delay || 100 // Delay between requests (ms)
};
this.visited = new Set();
this.queue = [];
this.results = [];
this.baseURL = null;
}
async crawl(startURL) {
this.baseURL = new URL(startURL).origin;
this.queue.push({ url: startURL, depth: 0, source: null });
while (this.queue.length > 0 && this.visited.size < this.options.maxPages) {
const batch = this.queue.splice(0, this.options.concurrency);
await Promise.all(batch.map(item => this.processURL(item)));
// Delay to avoid overwhelming the server
await this.sleep(this.options.delay);
}
return this.results;
}
async processURL({ url, depth, source }) {
if (this.visited.has(url)) {
return;
}
this.visited.add(url);
// Check URL status
const status = await this.checkURL(url);
// Record result
this.results.push({
url,
source,
depth,
...status
});
// If broken or max depth, don't crawl further
if (!status.ok || depth >= this.options.maxDepth) {
return;
}
// Only crawl HTML pages
if (!this.isHTML(url)) {
return;
}
// Fetch and parse page
try {
const response = await axios.get(url, {
timeout: this.options.timeout,
headers: {
'User-Agent': this.options.userAgent
}
});
const $ = cheerio.load(response.data);
// Extract links
const links = this.extractLinks($, url);
// Add to queue
for (const link of links) {
// Queue only same-origin links (origin comparison, not a string prefix) that haven't been visited
if (!this.visited.has(link) && new URL(link).origin === this.baseURL) {
this.queue.push({
url: link,
depth: depth + 1,
source: url
});
}
}
} catch (error) {
console.error('Error fetching page:', url, error.message);
}
}
async checkURL(url) {
try {
const response = await axios.head(url, {
timeout: this.options.timeout,
maxRedirects: 5,
validateStatus: () => true, // Don't throw on any status
headers: {
'User-Agent': this.options.userAgent
}
});
return {
  status: response.status,
  statusText: response.statusText,
  ok: response.status >= 200 && response.status < 400,
  // _redirectable and responseUrl are follow-redirects internals that
  // axios exposes; they work today but are not a stable public API
  redirected: response.request._redirectable._redirectCount > 0,
  redirectCount: response.request._redirectable._redirectCount,
  finalURL: response.request.res.responseUrl,
  responseTime: response.headers['x-response-time'] || null,
  error: null
};
} catch (error) {
if (error.code === 'ECONNABORTED') {
return {
status: 0,
statusText: 'Timeout',
ok: false,
error: 'Request timed out'
};
}
if (error.code === 'ENOTFOUND') {
return {
status: 0,
statusText: 'DNS Error',
ok: false,
error: 'Domain not found'
};
}
return {
status: 0,
statusText: 'Error',
ok: false,
error: error.message
};
}
}
extractLinks($, baseURL) {
  const links = new Set();
  // Same selector/attribute pairs as the browser version
  const sources = [
    ['a[href]', 'href'],
    ['img[src]', 'src'],
    ['link[href]', 'href'],
    ['script[src]', 'src']
  ];
  for (const [selector, attr] of sources) {
    $(selector).each((i, elem) => {
      const normalized = this.normalizeURL($(elem).attr(attr), baseURL);
      if (normalized) links.add(normalized);
    });
  }
  return Array.from(links);
}
normalizeURL(url, baseURL) {
try {
const absoluteURL = new URL(url, baseURL);
absoluteURL.hash = '';
return absoluteURL.href;
} catch (error) {
return null;
}
}
isHTML(url) {
  // Use the pathname so query strings can't masquerade as the extension
  const ext = new URL(url).pathname.split('.').pop().toLowerCase();
  const nonHTMLExtensions = ['jpg', 'jpeg', 'png', 'gif', 'pdf', 'zip', 'css', 'js', 'svg', 'ico'];
  return !nonHTMLExtensions.includes(ext);
}
sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
getBrokenLinks() {
return this.results.filter(result => !result.ok);
}
getByStatus(status) {
return this.results.filter(result => result.status === status);
}
getStatistics() {
const total = this.results.length;
const broken = this.getBrokenLinks().length;
const redirects = this.results.filter(r => r.redirected).length;
const statusCodes = {};
this.results.forEach(result => {
statusCodes[result.status] = (statusCodes[result.status] || 0) + 1;
});
return {
total,
broken,
redirects,
brokenPercentage: total > 0 ? ((broken / total) * 100).toFixed(2) : '0.00', // avoid NaN on empty runs
statusCodes,
pagesVisited: this.visited.size
};
}
exportResults(format = 'json') {
if (format === 'json') {
return JSON.stringify(this.results, null, 2);
}
if (format === 'csv') {
  // Quote every field so commas or quotes inside URLs can't break the CSV
  const escape = value => `"${String(value).replace(/"/g, '""')}"`;
  const headers = ['URL', 'Source', 'Status', 'Status Text', 'Error'];
  const rows = this.results.map(r => [
    r.url,
    r.source || 'N/A',
    r.status,
    r.statusText,
    r.error || 'N/A'
  ]);
  return [headers, ...rows]
    .map(row => row.map(escape).join(','))
    .join('\n');
}
return this.results;
}
}
// Usage
const checker = new ServerSideBrokenLinkChecker({
maxDepth: 3,
maxPages: 100,
timeout: 10000,
concurrency: 5,
delay: 100
});
checker.crawl('https://example.com')
.then(() => {
const brokenLinks = checker.getBrokenLinks();
const stats = checker.getStatistics();
console.log('Statistics:', stats);
console.log('Broken Links:', brokenLinks);
// Export to CSV
const csv = checker.exportResults('csv');
require('fs').writeFileSync('broken-links.csv', csv);
})
.catch(error => {
console.error('Crawl failed:', error);
});
Advanced Features
Redirect Chain Detection
function detectRedirectChains(results) {
const chains = [];
for (const result of results) {
if (result.redirected && result.finalURL !== result.url) {
chains.push({
original: result.url,
final: result.finalURL,
source: result.source,
hops: result.redirectCount || 1 // redirectCount is recorded by the server-side checkURL above
});
}
}
// Find chains with multiple hops
const multiHopChains = chains.filter(chain => chain.hops > 1);
return {
all: chains,
multiHop: multiHopChains
};
}
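Usage on a finished crawl, where results is the array returned by crawl():

// Report every redirect, then flag chains with more than one hop
const { all, multiHop } = detectRedirectChains(results);
console.log(`${all.length} redirects, ${multiHop.length} multi-hop chains`);
multiHop.forEach(chain =>
  console.log(`${chain.original} -> ${chain.final} (${chain.hops} hops)`)
);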
Robots.txt Compliance
const robotsParser = require('robots-parser');
async function getRobotsTxt(baseURL) {
try {
const robotsURL = new URL('/robots.txt', baseURL).href;
const response = await axios.get(robotsURL);
return robotsParser(robotsURL, response.data);
} catch (error) {
return null;
}
}
async function isAllowedByRobots(url, userAgent, robots) {
if (!robots) return true;
return robots.isAllowed(url, userAgent);
}
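One way to honor the respectRobotsTxt option from earlier is to load robots.txt once before crawling and skip disallowed URLs. A sketch under those assumptions (PoliteChecker is an illustrative name, not part of the code above):

// Sketch: a crawler subclass that skips URLs disallowed by robots.txt
class PoliteChecker extends ServerSideBrokenLinkChecker {
  async crawl(startURL) {
    // Fetch and parse robots.txt once for the whole crawl
    this.robots = await getRobotsTxt(new URL(startURL).origin);
    return super.crawl(startURL);
  }
  async processURL(item) {
    // isAllowed returns false only for explicitly disallowed paths
    if (this.robots && this.robots.isAllowed(item.url, this.options.userAgent) === false) {
      return;
    }
    return super.processURL(item);
  }
}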
Sitemap Integration
const xml2js = require('xml2js');
async function parseSitemap(sitemapURL) {
try {
const response = await axios.get(sitemapURL);
const parser = new xml2js.Parser();
const result = await parser.parseStringPromise(response.data);
const urls = [];
if (result.urlset && result.urlset.url) {
result.urlset.url.forEach(entry => {
if (entry.loc && entry.loc[0]) {
urls.push(entry.loc[0]);
}
});
}
return urls;
} catch (error) {
console.error('Error parsing sitemap:', error);
return [];
}
}
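Sitemap URLs make a good seed list: rather than relying on crawl discovery alone, you can enqueue everything the site claims to publish. A sketch (the sitemap URL is illustrative; note that parseSitemap above handles a plain urlset, so a sitemap index file would need an extra pass):

// Seed every sitemap entry at depth 0 so each indexed page gets verified
const checker = new ServerSideBrokenLinkChecker({ maxPages: 500 });
const sitemapURLs = await parseSitemap('https://example.com/sitemap.xml');
for (const url of sitemapURLs) {
  checker.queue.push({ url, depth: 0, source: 'sitemap.xml' });
}
await checker.crawl('https://example.com');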
Progress Tracking
class ProgressTracker {
constructor(total) {
this.total = total;
this.current = 0;
this.startTime = Date.now();
}
update(current) {
this.current = current;
const elapsed = Date.now() - this.startTime;
const rate = current / (elapsed / 1000);
const remaining = (this.total - current) / rate;
console.log(`Progress: ${current}/${this.total} (${(current/this.total*100).toFixed(1)}%)`);
console.log(`Estimated time remaining: ${Math.round(remaining)}s`);
}
}
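Hooking the tracker into a crawl is a one-liner per batch; a sketch, measuring progress against the maxPages budget:

const tracker = new ProgressTracker(checker.options.maxPages);
// Call after each batch completes, e.g. at the end of the while loop in crawl()
tracker.update(checker.visited.size);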
Best Practices
1. Rate Limiting
class RateLimiter {
constructor(requestsPerSecond) {
this.requestsPerSecond = requestsPerSecond;
this.queue = [];
this.processing = false;
}
async execute(fn) {
return new Promise((resolve, reject) => {
this.queue.push({ fn, resolve, reject });
this.process();
});
}
async process() {
if (this.processing || this.queue.length === 0) {
return;
}
this.processing = true;
while (this.queue.length > 0) {
const { fn, resolve, reject } = this.queue.shift();
try {
const result = await fn();
resolve(result);
} catch (error) {
reject(error);
}
// Wait before next request
await this.sleep(1000 / this.requestsPerSecond);
}
this.processing = false;
}
sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
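Usage, wrapping the status check so requests never exceed the configured rate:

const limiter = new RateLimiter(10); // At most ~10 requests per second
const result = await limiter.execute(() => checkURL('https://example.com/page'));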
2. Error Handling
async function safeCheckURL(url, retries = 3) {
  const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
  for (let i = 0; i < retries; i++) {
    const result = await checkURL(url);
    // checkURL never throws: network-level failures come back as status 0.
    // Retry only those; HTTP errors like 404 are definitive answers.
    if (result.status !== 0 || i === retries - 1) {
      return result;
    }
    // Exponential backoff: 1s, 2s, 4s, ...
    await sleep(Math.pow(2, i) * 1000);
  }
}
3. Memory Management
class MemoryEfficientChecker extends BrokenLinkChecker {
constructor(options = {}) {
super(options);
this.maxResultsInMemory = options.maxResultsInMemory || 1000;
this.resultStream = options.resultStream || null;
}
async processURL(item) {
await super.processURL(item);
// Flush results to stream if memory limit reached
if (this.results.length >= this.maxResultsInMemory && this.resultStream) {
await this.flushResults();
}
}
async flushResults() {
if (this.resultStream) {
for (const result of this.results) {
this.resultStream.write(JSON.stringify(result) + '\n');
}
this.results = [];
}
}
}
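Usage with a newline-delimited JSON file as the sink, assuming a Node 18+ runtime (global fetch is available, though extractLinks would need jsdom in place of DOMParser); the file name is illustrative:

const fs = require('fs');
const stream = fs.createWriteStream('results.ndjson');
const checker = new MemoryEfficientChecker({
  maxResultsInMemory: 1000,
  resultStream: stream
});
await checker.crawl('https://example.com');
await checker.flushResults(); // Write any remaining buffered results
stream.end();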
Conclusion
Building a broken link checker involves:
- URL normalization and validation
- Web crawling with depth and concurrency control
- HTTP status code checking
- Link extraction from HTML
- Redirect chain detection
- Performance optimization and rate limiting
The tool we've built can crawl entire websites, identify broken links, detect redirect chains, and provide detailed reports.
Try it yourself: Build your own checker or use our free tool at TechBelievers.com/tools/broken-link-checker