DEV Community

agenthustler
agenthustler

Posted on

How to Monitor Software License Changes with Web Scraping

Software license changes can break your product overnight. Remember when HashiCorp moved from MPL to BSL? Build a monitoring system that catches license changes before they catch you.

Why This Matters

Several major open-source projects have changed licenses in recent years — Elasticsearch (2021), Terraform (2023), and Redis (2024). Each time, companies scrambled to assess the impact. Automated monitoring gives you early warning.

Tracking GitHub License Changes

import json
import os
from datetime import datetime

import pandas as pd
import requests

# Credentials are read from the environment so real secrets never end up in
# source control; export SCRAPERAPI_KEY / GITHUB_TOKEN before running.
API_KEY = os.environ.get("SCRAPERAPI_KEY", "YOUR_SCRAPERAPI_KEY")  # Get one at https://www.scraperapi.com?fp_ref=the52
# Optional, for higher rate limits. Defaults to an empty string: a non-empty
# placeholder would be sent as a bogus token on every GitHub request instead
# of falling back to unauthenticated access.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")

def check_github_license(owner, repo):
    """Fetch the license GitHub reports for ``owner/repo``.

    Args:
        owner: GitHub user or organisation name.
        repo: Repository name.

    Returns:
        A dict with ``repo``, ``license_key`` (SPDX id), ``license_name`` and
        ``checked_at`` (local-time ISO timestamp), or ``None`` when the API
        does not answer with HTTP 200.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/license"
    # Only send the Authorization header when a token is actually configured.
    headers = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}

    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        return None

    # ``license`` can be present but null; ``.get('license', {})`` would then
    # return None and the chained ``.get`` would raise AttributeError.
    license_data = response.json().get('license') or {}
    return {
        'repo': f'{owner}/{repo}',
        # ``or`` also maps null field values to 'Unknown'.
        'license_key': license_data.get('spdx_id') or 'Unknown',
        'license_name': license_data.get('name') or 'Unknown',
        'checked_at': datetime.now().isoformat()
    }

# Repositories to watch, as (owner, repo) pairs on GitHub.
critical_deps = [
    ('redis', 'redis'),
    ('hashicorp', 'terraform'),
    ('elastic', 'elasticsearch'),
    ('docker', 'compose'),
    ('grafana', 'grafana'),
    ('apache', 'kafka'),
    ('mongodb', 'mongo'),
    ('cockroachdb', 'cockroach'),
]

# Collect one snapshot row per repository whose lookup succeeded.
results = []
for owner, repo in critical_deps:
    snapshot_row = check_github_license(owner, repo)
    if not snapshot_row:
        # Lookup failed (non-200 response); leave this repo out of the snapshot.
        continue
    results.append(snapshot_row)
    print(f"{owner}/{repo}: {snapshot_row['license_key']}")

# Persist the snapshot so later runs have something to compare against.
df = pd.DataFrame(results)
df.to_csv('license_snapshot.csv', index=False)
Enter fullscreen mode Exit fullscreen mode

Detecting Changes

Compare snapshots to detect license modifications:

import sqlite3

# Shared SQLite connection holding the history of license snapshots.
conn = sqlite3.connect(database='license_monitor.db')

def detect_changes(current_snapshot, connection=None):
    """Compare a snapshot with the stored history and record it.

    For every row in ``current_snapshot`` the most recent stored
    ``license_key`` for the same ``repo`` is looked up; a differing value
    produces an alert. The snapshot is then appended to the ``licenses``
    table so it becomes the baseline for the next run.

    Args:
        current_snapshot: DataFrame with at least ``repo``, ``license_key``
            and ``checked_at`` columns.
        connection: Optional SQLite connection to use. Defaults to the
            module-level ``conn``.

    Returns:
        A list of dicts with ``repo``, ``old_license``, ``new_license`` and
        ``detected_at`` keys; empty when nothing changed.
    """
    db = conn if connection is None else connection

    # Nothing to compare or store; also avoids writing a column-less table.
    if current_snapshot.empty:
        return []

    # On the very first run the table does not exist yet, so there is no
    # history to compare against and querying it would raise.
    table_exists = db.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name='licenses'"
    ).fetchone() is not None

    alerts = []
    if table_exists:
        for _, row in current_snapshot.iterrows():
            # Bound parameter instead of string interpolation: repo names
            # containing quotes can neither break nor alter the query.
            previous = db.execute(
                "SELECT license_key FROM licenses WHERE repo = ? "
                "ORDER BY checked_at DESC LIMIT 1",
                (row['repo'],),
            ).fetchone()

            if previous is not None and previous[0] != row['license_key']:
                alerts.append({
                    'repo': row['repo'],
                    'old_license': previous[0],
                    'new_license': row['license_key'],
                    'detected_at': datetime.now().isoformat()
                })

    current_snapshot.to_sql('licenses', db, if_exists='append', index=False)
    return alerts

# Compare the fresh snapshot against stored history and report any differences.
changes = detect_changes(df)
for alert in changes:
    repo_name = alert['repo']
    before, after = alert['old_license'], alert['new_license']
    print(f"ALERT: {repo_name} changed from {before} to {after}")
Enter fullscreen mode Exit fullscreen mode

Monitoring Package Registries

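Check npm and PyPI for license metadata too. Both registries expose public JSON APIs, so the example below calls them directly; a rotating-proxy service such as ThorData only becomes relevant at high request volumes: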

def check_npm_license(package_name):
    """Look up the license of the latest published version of an npm package.

    Args:
        package_name: Package name as published on the npm registry.

    Returns:
        A dict with ``package``, ``registry`` (always ``'npm'``), ``license``
        and ``version``, or ``None`` when the registry does not answer with
        HTTP 200.
    """
    url = f"https://registry.npmjs.org/{package_name}/latest"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None

    data = response.json()
    license_value = data.get('license')

    # Deprecated object form: {"type": "MIT", "url": "..."}.
    if isinstance(license_value, dict):
        license_value = license_value.get('type')

    # Deprecated array form: "licenses": [{"type": "MIT", ...}, ...].
    # Joined with " OR " to mirror the SPDX expression that replaced it.
    if not license_value:
        legacy_entries = data.get('licenses') or []
        legacy_types = [
            entry['type']
            for entry in legacy_entries
            if isinstance(entry, dict) and entry.get('type')
        ]
        license_value = ' OR '.join(legacy_types)

    return {
        'package': package_name,
        'registry': 'npm',
        # ``or`` also covers null / empty values, not just a missing key.
        'license': license_value or 'Unknown',
        'version': data.get('version') or 'Unknown'
    }

def check_pypi_license(package_name):
    """Look up the license PyPI reports for a package's latest release.

    The ``license`` field is tried first, then ``license_expression``, then
    any ``License ::`` trove classifiers, because many projects leave the
    ``license`` field null or empty.

    Args:
        package_name: Project name on PyPI.

    Returns:
        A dict with ``package``, ``registry`` (always ``'pypi'``), ``license``
        and ``version``, or ``None`` when PyPI does not answer with HTTP 200.
    """
    url = f"https://pypi.org/pypi/{package_name}/json"
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return None

    info = response.json()['info']

    # ``.get('license', 'Unknown')`` only covers a missing key; a null or
    # empty value needs the ``or`` chain to fall through.
    license_value = info.get('license') or info.get('license_expression')
    if not license_value:
        # e.g. "License :: OSI Approved :: MIT License" -> "MIT License"
        license_names = [
            classifier.split(' :: ')[-1]
            for classifier in (info.get('classifiers') or [])
            if classifier.startswith('License ::')
        ]
        license_value = ', '.join(license_names)

    return {
        'package': package_name,
        'registry': 'pypi',
        'license': license_value or 'Unknown',
        'version': info.get('version') or 'Unknown'
    }

# Packages to watch on each registry.
npm_packages = ['express', 'react', 'next', 'fastify', 'prisma']
pypi_packages = ['django', 'flask', 'fastapi', 'celery', 'sqlalchemy']

# (label, lookup function, packages) — processed in order: npm first, then PyPI.
registry_checks = (
    ('npm', check_npm_license, npm_packages),
    ('pypi', check_pypi_license, pypi_packages),
)

for registry, fetch_license, packages in registry_checks:
    for pkg in packages:
        info = fetch_license(pkg)
        if not info:
            # Lookup failed; nothing to print for this package.
            continue
        print(f"{registry}/{pkg}: {info['license']}")
Enter fullscreen mode Exit fullscreen mode

Scraping License Change Announcements

Monitor blog posts and changelogs for license announcements. The example below fetches pages through ScraperAPI; a monitoring tool such as ScrapeOps can additionally track scraper health:

from bs4 import BeautifulSoup

def scrape_changelog(repo_url):
    """Report whether any changelog-like page of a repo mentions "license".

    The CHANGELOG, LICENSE and releases pages are fetched through ScraperAPI
    and scanned case-insensitively for the word "license".

    Args:
        repo_url: Base URL of the repository (a trailing slash is tolerated).

    Returns:
        ``True`` as soon as one successfully fetched page contains the
        keyword, ``False`` otherwise.
    """
    # NOTE(review): this is a plain keyword match on the full page text, so
    # it presumably also fires on page chrome — confirm before alerting on it.
    base_url = repo_url.rstrip('/')
    changelog_urls = [
        f"{base_url}/blob/main/CHANGELOG.md",
        f"{base_url}/blob/main/LICENSE",
        f"{base_url}/releases"
    ]

    for url in changelog_urls:
        # ``params`` URL-encodes the target, so characters such as '&' or '?'
        # in it cannot corrupt the proxy request; https keeps the API key out
        # of cleartext traffic.
        response = requests.get(
            "https://api.scraperapi.com/",
            params={'api_key': API_KEY, 'url': url},
            timeout=60,
        )
        # Skip error responses rather than keyword-matching an error page.
        if response.status_code != 200:
            continue
        if 'license' in response.text.lower():
            return True
    return False
Enter fullscreen mode Exit fullscreen mode

Key Takeaways

  • License changes often appear in LICENSE file commits weeks before announcements
  • GitHub API provides real-time license detection for any public repo
  • Cross-reference GitHub, npm, and PyPI — license metadata can be inconsistent
  • ScraperAPI handles rate limiting and IP rotation for high-volume monitoring

Top comments (0)