DEV Community

Custodia-Admin
Custodia-Admin

Posted on • Originally published at pagebolt.dev

How to screenshot every page in your sitemap automatically

How to Screenshot Every Page in Your Sitemap Automatically

Visual audits, site archives, client deliverables, pre-migration snapshots — all require screenshots of every page on a site. Doing it manually takes hours. Doing it with Puppeteer means managing a browser pool and handling concurrency yourself.

Here's a script that parses your sitemap.xml and captures every URL with controlled concurrency and automatic retry.

Basic sitemap crawler

import fs from "fs/promises";
import path from "path";

// PageBolt API key — required; read from the environment (never hard-code it).
const PAGEBOLT_API_KEY = process.env.PAGEBOLT_API_KEY;
// Sitemap to crawl; override via SITEMAP_URL env var.
const SITEMAP_URL = process.env.SITEMAP_URL || "https://yoursite.com/sitemap.xml";
// Directory where PNGs and the _results.json summary are written.
const OUTPUT_DIR = "screenshots";
const CONCURRENCY = 3; // parallel requests
const DELAY_MS = 500;  // between batches

// Decode the XML character entities that the sitemap protocol requires
// inside <loc> values (e.g. "&amp;" for "&"). "&amp;" is decoded last so
// a literal "&amp;lt;" does not turn into "<".
const decodeXmlEntities = (s) =>
  s
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&amp;/g, "&");

/**
 * Fetch a sitemap and return every page URL it lists.
 *
 * Handles both a plain <urlset> sitemap and a <sitemapindex> (sitemap of
 * sitemaps), recursing into each nested sitemap in parallel.
 *
 * @param {string} url - Absolute URL of the sitemap.xml to fetch.
 * @returns {Promise<string[]>} Flat list of page URLs (entity-decoded).
 * @throws {Error} If any sitemap request returns a non-2xx status.
 */
async function fetchSitemap(url) {
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(`Failed to fetch sitemap ${url}: ${res.status}`);
  }
  const xml = await res.text();

  // Extract all <loc> URLs. Sitemaps escape "&" as "&amp;", so decode
  // entities or URLs with query strings come out mangled.
  const urls = [...xml.matchAll(/<loc>([^<]+)<\/loc>/g)].map((m) =>
    decodeXmlEntities(m[1].trim())
  );

  // Handle sitemap index (sitemap of sitemaps)
  if (xml.includes("<sitemapindex")) {
    const nested = await Promise.all(urls.map(fetchSitemap));
    return nested.flat();
  }

  return urls;
}

/**
 * Capture a full-page screenshot of `url` through the PageBolt API.
 *
 * @param {string} url - Page to capture.
 * @returns {Promise<Buffer>} The PNG image bytes.
 * @throws {Error} With the HTTP status and response body on non-2xx replies.
 */
async function screenshot(url) {
  // Request options: full-page capture with banners/ads/trackers blocked.
  const payload = {
    url,
    fullPage: true,
    blockBanners: true,
    blockAds: true,
    blockTrackers: true,
  };

  const res = await fetch("https://pagebolt.dev/api/v1/screenshot", {
    method: "POST",
    headers: {
      "x-api-key": PAGEBOLT_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!res.ok) {
    throw new Error(`${res.status}: ${await res.text()}`);
  }

  const bytes = await res.arrayBuffer();
  return Buffer.from(bytes);
}

/**
 * Map a page URL to a flat PNG filename: strip the outer slashes from the
 * path, replace interior slashes with "__", and use "index" for the root.
 * E.g. "https://a.com/blog/post/" -> "blog__post.png".
 *
 * @param {string} url - Absolute page URL.
 * @returns {string} Filesystem-safe filename ending in ".png".
 */
function urlToFilename(url) {
  const { pathname } = new URL(url);
  const trimmed = pathname.replace(/^\/|\/$/g, "");
  const stem = trimmed === "" ? "index" : trimmed.replace(/\//g, "__");
  return `${stem}.png`;
}

/**
 * Screenshot and save one batch of URLs in parallel.
 *
 * Each worker catches its own error and returns a result record, so the
 * settled results are always fulfilled and one bad page never aborts the
 * rest of the batch.
 *
 * @param {string[]} urls - Page URLs to capture.
 * @returns {Promise<PromiseSettledResult<{url: string, filename: string, ok: boolean, error?: string}>[]>}
 */
async function processBatch(urls) {
  return Promise.allSettled(
    urls.map(async (url) => {
      const filename = urlToFilename(url);
      const filepath = path.join(OUTPUT_DIR, filename);

      try {
        const image = await screenshot(url);
        await fs.writeFile(filepath, image);
        // Fixed: the success log previously printed a stray "$(unknown)"
        // artifact after the URL.
        console.log(`✓ ${url}`);
        return { url, filename, ok: true };
      } catch (err) {
        console.error(`✗ ${url}: ${err.message}`);
        return { url, filename, ok: false, error: err.message };
      }
    })
  );
}

/**
 * Entry point: fetch the sitemap, capture every URL in fixed-size batches
 * (CONCURRENCY at a time, DELAY_MS pause between batches), then write a
 * JSON summary of successes/failures next to the images.
 */
async function main() {
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  console.log(`Fetching sitemap: ${SITEMAP_URL}`);
  const urls = await fetchSitemap(SITEMAP_URL);
  console.log(`Found ${urls.length} URLs\n`);

  const results = [];

  // Process in batches to control concurrency
  for (let i = 0; i < urls.length; i += CONCURRENCY) {
    const batch = urls.slice(i, i + CONCURRENCY);
    const batchResults = await processBatch(batch);

    // processBatch workers catch their own errors, so these normally all
    // fulfill. Still, normalize any unexpected rejection into the same
    // record shape — previously `r.value ?? r.reason` could push a raw
    // Error into `results`, breaking the summary counts and the failed-URL
    // report below.
    results.push(
      ...batchResults.map((r, j) =>
        r.status === "fulfilled"
          ? r.value
          : { url: batch[j], ok: false, error: String(r.reason) }
      )
    );

    // Brief pause between batches so we don't hammer the API.
    if (i + CONCURRENCY < urls.length) {
      await new Promise((r) => setTimeout(r, DELAY_MS));
    }
  }

  // Write summary
  const summary = {
    total: results.length,
    succeeded: results.filter((r) => r.ok).length,
    failed: results.filter((r) => !r.ok).length,
    urls: results,
  };

  await fs.writeFile(
    path.join(OUTPUT_DIR, "_results.json"),
    JSON.stringify(summary, null, 2)
  );

  console.log(`\nDone: ${summary.succeeded}/${summary.total} succeeded`);
  if (summary.failed > 0) {
    console.log("Failed URLs:");
    results
      .filter((r) => !r.ok)
      .forEach((r) => console.log(`  ${r.url}: ${r.error}`));
  }
}

main().catch(console.error);
Enter fullscreen mode Exit fullscreen mode
PAGEBOLT_API_KEY=your_key SITEMAP_URL=https://yoursite.com/sitemap.xml node crawl.js
Enter fullscreen mode Exit fullscreen mode

With retry on failure

/**
 * Screenshot with retry and linear backoff.
 *
 * @param {string} url - Page to capture.
 * @param {number} [retries=2] - Extra attempts after the first failure.
 * @param {number} [baseDelayMs=2000] - Backoff unit; attempt k waits
 *   k * baseDelayMs before retrying. (Was a hard-coded 2000 ms; now a
 *   backward-compatible parameter.)
 * @returns {Promise<Buffer>} Image bytes from the first successful attempt.
 * @throws The last capture error once all attempts are exhausted.
 */
async function screenshotWithRetry(url, retries = 2, baseDelayMs = 2000) {
  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      return await screenshot(url);
    } catch (err) {
      if (attempt === retries) throw err; // out of attempts — surface the error
      console.log(`  Retry ${attempt + 1}/${retries} for ${url}`);
      await new Promise((r) => setTimeout(r, baseDelayMs * (attempt + 1)));
    }
  }
}
Enter fullscreen mode Exit fullscreen mode

Mobile + desktop side by side

/**
 * Capture desktop and mobile screenshots of `url` in parallel and save
 * them to OUTPUT_DIR as <name>-desktop.png and <name>-mobile.png.
 *
 * @param {string} url - Page to capture.
 * @throws {Error} If either capture fails (HTTP status + body in message).
 */
async function screenshotBoth(url) {
  // Desktop reuses screenshot(); the mobile capture is inlined here with
  // an iPhone 14 Pro viewport.
  const captureMobile = async () => {
    const res = await fetch("https://pagebolt.dev/api/v1/screenshot", {
      method: "POST",
      headers: { "x-api-key": PAGEBOLT_API_KEY, "Content-Type": "application/json" },
      body: JSON.stringify({ url, viewportDevice: "iphone_14_pro", blockBanners: true }),
    });
    // Bug fix: the original never checked res.ok, so a failed API call
    // silently saved the JSON error body as a .png file.
    if (!res.ok) throw new Error(`${res.status}: ${await res.text()}`);
    return Buffer.from(await res.arrayBuffer());
  };

  const [desktop, mobile] = await Promise.all([screenshot(url), captureMobile()]);

  const base = urlToFilename(url).replace(".png", "");
  await fs.writeFile(path.join(OUTPUT_DIR, `${base}-desktop.png`), desktop);
  await fs.writeFile(path.join(OUTPUT_DIR, `${base}-mobile.png`), mobile);
}
Enter fullscreen mode Exit fullscreen mode

GitHub Actions — scheduled weekly archive

# GitHub Actions workflow: runs the sitemap screenshot script on a weekly
# schedule and keeps the output as a downloadable artifact.
name: Weekly site archive

on:
  schedule:
    - cron: "0 3 * * 1"  # Every Monday at 3am
  workflow_dispatch:  # also allow manual runs from the Actions tab

jobs:
  screenshot-sitemap:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Screenshot all pages
        env:
          # API key comes from repository secrets — never commit it.
          PAGEBOLT_API_KEY: ${{ secrets.PAGEBOLT_API_KEY }}
          SITEMAP_URL: https://yoursite.com/sitemap.xml
        run: node scripts/screenshot-sitemap.js

      - name: Upload archive
        uses: actions/upload-artifact@v4
        with:
          # run_id keeps each week's archive under a unique artifact name.
          name: site-archive-${{ github.run_id }}
          path: screenshots/
          retention-days: 90
Enter fullscreen mode Exit fullscreen mode

Skip already-captured URLs

/**
 * Incremental variant of main(): list the PNGs already on disk and only
 * queue sitemap URLs whose target filename has not been captured yet.
 */
async function main() {
  // A missing output directory simply means nothing is captured yet.
  let files = [];
  try {
    files = await fs.readdir(OUTPUT_DIR);
  } catch {
    // ignore — treat as an empty directory
  }
  const existing = new Set(files.filter((f) => f.endsWith(".png")));

  const urls = await fetchSitemap(SITEMAP_URL);
  const pending = urls.filter((url) => !existing.has(urlToFilename(url)));

  console.log(`${urls.length} total, ${pending.length} to capture (${existing.size} already done)`);
  // process only pending...
}
Enter fullscreen mode Exit fullscreen mode

For a 500-page site at 3 concurrent requests: roughly 3–4 minutes, zero browser setup, no memory management.


Try it free — 100 requests/month, no credit card. → Get started in 2 minutes

Top comments (0)