How to Scrape Wayback Machine: Historical Web Data with Python

The Wayback Machine stores over 800 billion web pages dating back to 1996. This data is invaluable for research, competitive analysis, content recovery, and tracking website evolution.

CDX API: The Power Tool

The Wayback Machine provides a CDX API that returns structured data about archived URLs, so you never need to scrape the index pages themselves.

import requests
import time
from datetime import datetime
from bs4 import BeautifulSoup
import difflib

class WaybackScraper:
    CDX_API = "https://web.archive.org/cdx/search/cdx"
    WEB_BASE = "https://web.archive.org/web"

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'WaybackResearch/1.0'})

    def get_snapshots(self, url, from_date=None, to_date=None, limit=1000):
        """Query the CDX index for archived snapshots of a URL."""
        params = {'url': url, 'output': 'json', 'limit': limit,
                  'fl': 'timestamp,original,statuscode,mimetype,length'}
        if from_date: params['from'] = from_date
        if to_date: params['to'] = to_date
        resp = self.session.get(self.CDX_API, params=params)
        resp.raise_for_status()
        data = resp.json()
        if len(data) <= 1:  # first row is the header; nothing beyond it means no results
            return []
        headers = data[0]
        return [dict(zip(headers, row)) for row in data[1:]]

    def get_page(self, url, timestamp):
        # The id_ modifier returns the original page without the Wayback toolbar.
        return self.session.get(f"{self.WEB_BASE}/{timestamp}id_/{url}").text
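A minimal usage sketch, listing a year of snapshots and fetching the earliest one returned (the example.com URL is a placeholder):

# List 2020 snapshots of a page, then fetch the first one returned.
wb = WaybackScraper()
snaps = wb.get_snapshots('https://example.com', from_date='20200101', to_date='20201231')
for s in snaps[:5]:
    print(s['timestamp'], s['statuscode'], s['original'])
if snaps:
    html = wb.get_page('https://example.com', snaps[0]['timestamp'])
    print(f"Fetched {len(html)} characters of archived HTML")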

Tracking Changes Over Time

class EvolutionTracker:
    def __init__(self, scraper):
        self.scraper = scraper

    def track_changes(self, url, selector, samples=10):
        """Sample snapshots evenly across the archive and extract one element."""
        snaps = self.scraper.get_snapshots(url)
        if not snaps:
            return []
        step = max(1, len(snaps) // samples)  # even spacing across the history
        versions = []
        for snap in snaps[::step]:
            try:
                html = self.scraper.get_page(url, snap['timestamp'])
                soup = BeautifulSoup(html, 'html.parser')
                el = soup.select_one(selector)
                if el:
                    versions.append({
                        'timestamp': snap['timestamp'],
                        'date': datetime.strptime(snap['timestamp'][:8], '%Y%m%d').isoformat(),
                        'content': el.get_text(strip=True)[:500]
                    })
                time.sleep(1)  # be polite to the Internet Archive
            except Exception as e:
                print(f"Error at {snap['timestamp']}: {e}")
        return versions

    def compare(self, versions):
        """Diff consecutive versions and report where content changed."""
        changes = []
        for i in range(1, len(versions)):
            old = versions[i-1]['content'].splitlines()
            new = versions[i]['content'].splitlines()
            diff = list(difflib.unified_diff(old, new, lineterm=''))
            if diff:
                changes.append({'from': versions[i-1]['date'], 'to': versions[i]['date'],
                                'diff_lines': len(diff)})
        return changes

Bulk Collection

import os

class BulkCollector:
    def __init__(self, scraper):
        self.scraper = scraper

    def domain_history(self, domain):
        """List every unique archived URL under a domain (HTTP 200 only)."""
        params = {'url': domain, 'output': 'json', 'matchType': 'domain',
                  'fl': 'timestamp,original,statuscode', 'collapse': 'urlkey',
                  'filter': 'statuscode:200'}
        data = self.scraper.session.get(self.scraper.CDX_API, params=params).json()
        if len(data) <= 1:
            return []
        return [dict(zip(data[0], row)) for row in data[1:]]

    def download_snapshot(self, domain, timestamp, output_dir='wayback_data'):
        """Download up to 100 archived pages closest to the given timestamp."""
        os.makedirs(output_dir, exist_ok=True)
        urls = self.domain_history(domain)
        count = 0
        for entry in urls[:100]:
            try:
                content = self.scraper.get_page(entry['original'], timestamp)
                # Turn the archived URL into a relative file path.
                path = entry['original'].replace(f'https://{domain}', '').replace(f'http://{domain}', '')
                path = path.split('?')[0].strip('/') or 'index.html'
                fp = os.path.join(output_dir, path)
                os.makedirs(os.path.dirname(fp), exist_ok=True)
                with open(fp, 'w', encoding='utf-8') as f:
                    f.write(content)
                count += 1
                time.sleep(1)
            except Exception as e:
                print(f"Skipping {entry['original']}: {e}")
        return count
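A quick usage sketch of the bulk collector; the domain and timestamp here are placeholders:

# Mirror up to 100 unique pages of example.com, as archived near mid-2021.
wb = WaybackScraper()
collector = BulkCollector(wb)
saved = collector.download_snapshot('example.com', '20210601000000')
print(f"Saved {saved} pages to wayback_data/")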

Competitor Pricing History

def track_pricing(scraper, url, selector):
    tracker = EvolutionTracker(scraper)
    versions = tracker.track_changes(url, selector, samples=20)
    print(f"Found {len(versions)} snapshots")
    for v in versions:
        print(f"  {v['date']}: {v['content'][:100]}")
    changes = tracker.compare(versions)
    print(f"Detected {len(changes)} changes")
    return versions, changes

wb = WaybackScraper()
track_pricing(wb, 'https://example.com/pricing', '.pricing-table')

Tips

  1. Query the CDX API first: it returns the full index quickly, with no page loads.
  2. Add 1-2 second delays between requests; the Internet Archive is a nonprofit.
  3. Use the collapse parameter to deduplicate snapshots (see the sketch below).
  4. Use the id_ modifier to fetch original pages without the Wayback toolbar.
  5. Cache results locally so repeated runs do not re-hit the archive.
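A minimal sketch combining tips 3 and 5, assuming a simple JSON file cache; CACHE_DIR and cached_snapshots are illustrative names, not part of the Wayback API:

import json, os

CACHE_DIR = 'cdx_cache'  # hypothetical local cache directory

def cached_snapshots(scraper, url):
    """Fetch CDX results once, then serve repeat calls from a local JSON file."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    key = url.replace('/', '_').replace(':', '')
    fp = os.path.join(CACHE_DIR, f'{key}.json')
    if os.path.exists(fp):
        with open(fp) as f:
            return json.load(f)
    # collapse=digest keeps only snapshots whose content actually changed
    params = {'url': url, 'output': 'json', 'collapse': 'digest',
              'fl': 'timestamp,original,statuscode'}
    data = scraper.session.get(scraper.CDX_API, params=params).json()
    snaps = [dict(zip(data[0], row)) for row in data[1:]] if len(data) > 1 else []
    with open(fp, 'w') as f:
        json.dump(snaps, f)
    return snaps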

For large-scale collection, services like ScraperAPI can handle proxy rotation, ThorData can provide IP diversity, and ScrapeOps can monitor your jobs.


Follow for more Python data collection tutorials.
