
Custodia-Admin

Posted on • Originally published at pagebolt.dev


How to Build a Website Archiver That Saves Daily Snapshots

Some industries require proof of what a website showed on a given date: financial services disclosures, legal disclaimers, terms of service changes, advertising compliance. Even outside regulated industries, having a timestamped visual history is useful — for dispute resolution, rollback reference, or understanding what changed after a traffic drop.

Here's a complete archiver: screenshots on a schedule, stored with timestamps in S3, queryable by date.

Core archiver

import { S3Client, PutObjectCommand, ListObjectsV2Command } from "@aws-sdk/client-s3";

const s3 = new S3Client({ region: process.env.AWS_REGION || "us-east-1" });

const PAGES = [
  { name: "homepage",       url: "https://yoursite.com" },
  { name: "pricing",        url: "https://yoursite.com/pricing" },
  { name: "terms",          url: "https://yoursite.com/terms" },
  { name: "privacy",        url: "https://yoursite.com/privacy" },
  { name: "product",        url: "https://yoursite.com/product" },
];

async function captureAndStore(page, date) {
  const dateStr = date.toISOString().split("T")[0]; // YYYY-MM-DD
  const timestamp = date.toISOString().replace(/[:.]/g, "-");
  const key = `archive/${page.name}/${dateStr}/${timestamp}.png`;

  // Screenshot the page
  const res = await fetch("https://pagebolt.dev/api/v1/screenshot", {
    method: "POST",
    headers: {
      "x-api-key": process.env.PAGEBOLT_API_KEY,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: page.url,
      fullPage: true,
      blockBanners: false, // keep banners — they're part of the historical record
      blockAds: false,
    }),
  });

  if (!res.ok) throw new Error(`Screenshot failed: ${res.status}`);
  const image = Buffer.from(await res.arrayBuffer());

  // Store in S3 with metadata
  await s3.send(new PutObjectCommand({
    Bucket: process.env.S3_BUCKET,
    Key: key,
    Body: image,
    ContentType: "image/png",
    Metadata: {
      url: page.url,
      "page-name": page.name,
      "captured-at": date.toISOString(),
    },
    // Keep archives for 7 years (compliance default)
    // Set via S3 lifecycle policy, not per-object
  }));

  console.log(`✓ ${page.name} → s3://${process.env.S3_BUCKET}/${key}`);
  return key;
}

async function runArchive() {
  const now = new Date();
  console.log(`[${now.toISOString()}] Running archive for ${PAGES.length} pages...`);

  const results = await Promise.allSettled(
    PAGES.map((page) => captureAndStore(page, now))
  );

  const succeeded = results.filter((r) => r.status === "fulfilled").length;
  const failed = results.filter((r) => r.status === "rejected");

  console.log(`Archive complete: ${succeeded}/${PAGES.length} succeeded`);

  if (failed.length > 0) {
    console.error("Failed:");
    failed.forEach((r) => console.error(` - ${r.reason}`));
  }

  return { succeeded, failed: failed.length, timestamp: now.toISOString() };
}
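Screenshot APIs fail transiently now and then (timeouts, upstream 5xx), and a daily archive run shouldn't lose a page over a one-off hiccup. One way to harden the capture step is a small retry wrapper with exponential backoff. `withRetry` below is a helper not in the original script, and the attempt count and delays are illustrative:

```javascript
// Retry an async operation with exponential backoff before giving up.
// With baseDelayMs = 1000, waits are 1s, then 2s, between attempts.
async function withRetry(fn, { attempts = 3, baseDelayMs = 1000 } = {}) {
  let lastErr;
  for (let i = 0; i < attempts; i++) {
    try {
      return await fn();
    } catch (err) {
      lastErr = err;
      if (i < attempts - 1) {
        await new Promise((resolve) => setTimeout(resolve, baseDelayMs * 2 ** i));
      }
    }
  }
  throw lastErr;
}
```

To use it, wrap each capture inside `Promise.allSettled`: `PAGES.map((page) => withRetry(() => captureAndStore(page, now)))`.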

Schedule daily via node-cron

import cron from "node-cron";

// Every day at midnight UTC. Without the timezone option, node-cron
// uses the server's local time zone, not UTC.
cron.schedule("0 0 * * *", async () => {
  await runArchive();
}, { timezone: "Etc/UTC" });

console.log("Website archiver running. Daily snapshots at midnight UTC.");

GitHub Actions schedule (no server required)

# .github/workflows/website-archive.yml
name: Daily website archive

on:
  schedule:
    - cron: "0 0 * * *"   # midnight UTC daily
  workflow_dispatch:        # allow manual trigger

jobs:
  archive:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: 20

      - name: Install dependencies
        run: npm ci

      - name: Run archiver
        env:
          PAGEBOLT_API_KEY: ${{ secrets.PAGEBOLT_API_KEY }}
          AWS_REGION: us-east-1
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          S3_BUCKET: ${{ vars.ARCHIVE_BUCKET }}
        # In CI, the script should call runArchive() once and exit,
        # rather than starting the node-cron scheduler
        run: node scripts/archiver.js

Query archives by date

async function getArchiveForDate(pageName, date) {
  const dateStr = date.toISOString().split("T")[0];
  const prefix = `archive/${pageName}/${dateStr}/`;

  const res = await s3.send(new ListObjectsV2Command({
    Bucket: process.env.S3_BUCKET,
    Prefix: prefix,
  }));

  return res.Contents?.map((obj) => ({
    key: obj.Key,
    capturedAt: obj.LastModified,
    // Direct URL — assumes the bucket allows public reads. For a private
    // archive bucket, generate presigned URLs instead.
    url: `https://${process.env.S3_BUCKET}.s3.amazonaws.com/${obj.Key}`,
  })) ?? [];
}
}

// Get all snapshots of "pricing" from last 30 days
async function getPricingHistory() {
  const results = [];
  for (let i = 0; i < 30; i++) {
    const date = new Date();
    date.setDate(date.getDate() - i);
    const snapshots = await getArchiveForDate("pricing", date);
    results.push(...snapshots);
  }
  return results.sort((a, b) => b.capturedAt - a.capturedAt);
}
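Because every object follows the `archive/<page>/<date>/<timestamp>.png` layout, a listed key can be decomposed without any extra S3 lookups. `parseArchiveKey` below is a hypothetical helper that assumes that exact layout:

```javascript
// Parse an archive key like
// "archive/pricing/2025-11-01/2025-11-01T00-00-00-000Z.png"
// back into its components. Returns null for keys outside the layout.
function parseArchiveKey(key) {
  const match = key.match(/^archive\/([^/]+)\/(\d{4}-\d{2}-\d{2})\/(.+)\.png$/);
  if (!match) return null;
  return { pageName: match[1], date: match[2], timestamp: match[3] };
}
```

This is handy for building an index page across all pages from a single `ListObjectsV2` sweep over the `archive/` prefix.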

REST API to browse the archive

import express from "express";

const app = express();

// GET /archive/pricing?date=2025-11-01
app.get("/archive/:page", async (req, res) => {
  const { page } = req.params;
  const date = req.query.date ? new Date(req.query.date) : new Date();
  if (Number.isNaN(date.getTime())) {
    return res.status(400).json({ error: "Invalid date; expected YYYY-MM-DD" });
  }

  const snapshots = await getArchiveForDate(page, date);

  if (snapshots.length === 0) {
    return res.status(404).json({ error: "No archive for this date" });
  }

  res.json({ page, date: date.toISOString().split("T")[0], snapshots });
});

// GET /archive/pricing/latest — redirect to most recent snapshot
app.get("/archive/:page/latest", async (req, res) => {
  const snapshots = await getArchiveForDate(req.params.page, new Date());
  if (!snapshots.length) return res.status(404).send("No snapshots today");
  // Keys embed ISO timestamps, so S3's lexicographic listing puts the
  // newest snapshot last
  res.redirect(snapshots[snapshots.length - 1].url);
});

app.listen(process.env.PORT || 3000);

S3 lifecycle policy for retention (Terraform)

resource "aws_s3_bucket_lifecycle_configuration" "archive_retention" {
  bucket = aws_s3_bucket.archive.id

  rule {
    id     = "archive-retention"
    status = "Enabled"

    expiration {
      days = 2555  # 7 years — standard compliance requirement
    }

    noncurrent_version_expiration {
      noncurrent_days = 90
    }
  }
}
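For stricter regimes, retention alone may not satisfy an auditor: nothing in the lifecycle rule stops someone with bucket access from deleting a snapshot early. S3 Object Lock in COMPLIANCE mode makes objects write-once for the retention period. A sketch, reusing the `archive` resource name from the lifecycle configuration above (note that `object_lock_enabled` can only be set when the bucket is created, and Object Lock requires versioning):

```hcl
resource "aws_s3_bucket" "archive" {
  bucket              = "your-archive-bucket"
  object_lock_enabled = true # must be set at bucket creation
}

resource "aws_s3_bucket_versioning" "archive" {
  bucket = aws_s3_bucket.archive.id
  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_object_lock_configuration" "archive" {
  bucket = aws_s3_bucket.archive.id
  rule {
    default_retention {
      mode = "COMPLIANCE" # not removable before expiry, even by the root user
      days = 2555
    }
  }
}
```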

What to archive

Beyond your own pages, the archiver pattern applies to:

  • Competitor pages — pricing, homepage, feature announcements
  • Regulatory pages — rate cards, fee disclosures, terms of service
  • Third-party embeds — a widget you depend on but don't control
  • Advertising landing pages — proof of what an ad sent users to on a given date
  • Your own deploys — visual history correlated with deployment timestamps
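For evidentiary use, the pixels alone may not be enough: you may also want to show that a snapshot hasn't been altered since capture. One lightweight approach is to store a SHA-256 digest of each screenshot alongside it. `sha256Hex` is a small helper not in the original script; you could add its output to the `Metadata` map in `captureAndStore`:

```javascript
import { createHash } from "node:crypto";

// Compute a hex SHA-256 digest of a screenshot buffer. Stored in the S3
// object metadata (or a separate index), it lets you later demonstrate
// that an archived snapshot is byte-for-byte unchanged.
function sha256Hex(buffer) {
  return createHash("sha256").update(buffer).digest("hex");
}
```

For example, add `"sha256": sha256Hex(image)` next to `"captured-at"` in the `PutObjectCommand` metadata.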

One daily cron job, with snapshots kept in S3 for nothing beyond storage cost. The screenshots themselves run roughly $0.10/day for 5 pages — far cheaper than dedicated compliance tooling.


Try it free — 100 requests/month, no credit card. → Get started in 2 minutes
