How to Monitor Open Source License Compliance with Scraping

#python #tutorial #webdev #programming

How to Monitor Open Source License Compliance with Scraping

Open source license violations can lead to lawsuits and forced code releases. Let's build a Python tool that monitors your dependencies and their license obligations automatically.

The Problem

Modern projects have hundreds of transitive dependencies. Each one carries its own license, and obligations such as GPL propagation, attribution requirements, and patent clauses can catch teams off guard.

Setting Up

pip install requests beautifulsoup4 pip-licenses pipdeptree pandas

Step 1: Audit Current Dependencies

import subprocess
import json
import pandas as pd

def get_installed_licenses():
    """Return license metadata for every installed package.

    Runs the ``pip-licenses`` command-line tool with JSON output (including
    project URLs and authors) and parses what it prints.

    Returns:
        The decoded JSON document produced by ``pip-licenses``.

    Raises:
        FileNotFoundError: If the ``pip-licenses`` executable is not on PATH.
        subprocess.CalledProcessError: If ``pip-licenses`` exits with a
            non-zero status; its stderr is available on the exception.
        json.JSONDecodeError: If the tool's output is not valid JSON.
    """
    result = subprocess.run(
        ["pip-licenses", "--format=json", "--with-urls", "--with-authors"],
        capture_output=True,
        text=True,
        # Fail loudly here instead of letting an empty stdout surface later
        # as an unrelated-looking JSON parsing error.
        check=True,
    )
    return json.loads(result.stdout)

# Collect the audit once, then summarise it: the package count followed by
# the ten most frequent license strings.
licenses = get_installed_licenses()
df = pd.DataFrame(licenses)
package_total = len(df)
print(f"Total packages: {package_total}")
most_common_licenses = df["License"].value_counts().head(10)
print(most_common_licenses)

Step 2: Classify License Risk

# Substrings that mark a license family. Matching is case-insensitive, so
# the spelling used here does not matter.
COPYLEFT = {"GPL", "LGPL", "AGPL", "MPL", "EUPL", "CPAL"}
PERMISSIVE = {"MIT", "BSD", "Apache", "ISC", "Unlicense", "WTFPL"}


def classify_license(license_str):
    """Bucket a free-form license string into a coarse risk category.

    Copyleft markers are checked first, so a string mentioning both families
    (for example a dual "MIT OR GPL" license) is reported as copyleft.

    Args:
        license_str: License text as reported by the packaging metadata.
            Non-string values (``None``, NaN) are treated as missing.

    Returns:
        One of ``"copyleft"``, ``"permissive"``, ``"unknown"`` or
        ``"review_needed"``.
    """
    # Missing metadata can arrive as None/NaN once it has been through pandas.
    if not isinstance(license_str, str):
        return "unknown"

    upper = license_str.upper()
    # Upper-case the markers as well: comparing an upper-cased license string
    # against "Apache" or "Unlicense" as written would never match.
    if any(marker.upper() in upper for marker in COPYLEFT):
        return "copyleft"
    if any(marker.upper() in upper for marker in PERMISSIVE):
        return "permissive"
    if upper.strip() in {"UNKNOWN", ""}:
        return "unknown"
    return "review_needed"

# Tag every package with its risk bucket, then list everything that is not
# plainly permissive so a human can look at it.
df["risk"] = df["License"].apply(classify_license)
needs_attention = df["risk"].isin(["copyleft", "unknown", "review_needed"])
risky = df[needs_attention]
print(f"\nPackages needing review: {len(risky)}")
for row in risky.to_dict("records"):
    print(f"  {row['Name']} ({row['License']}) - {row['risk']}")

Step 3: Scrape PyPI for License Details

import requests
import time

def get_pypi_license(package_name, timeout=10):
    """Fetch license metadata for a package from the PyPI JSON API.

    Args:
        package_name: Project name as published on PyPI.
        timeout: Seconds to wait for PyPI before giving up.

    Returns:
        A dict with ``license_field`` (the raw ``license`` metadata, ``""``
        when absent), ``classifiers`` (the last segment of every
        OSI-approved license classifier) and ``home_page`` (``""`` when
        absent). ``None`` if the package cannot be fetched: non-200
        response, network failure, timeout, or a body that is not JSON.
    """
    url = f"https://pypi.org/pypi/{package_name}/json"
    try:
        # Without a timeout a single stalled connection would hang the audit.
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (requests.RequestException, ValueError):
        # Treat transport and decoding problems like any other failed
        # lookup so one bad package does not abort the whole run.
        return None

    # PyPI sends explicit nulls for missing fields, so `or` is needed on top
    # of the .get() default.
    info = data.get("info") or {}
    classifiers = info.get("classifiers") or []
    license_classifiers = [
        c.split(" :: ")[-1] for c in classifiers
        if "License" in c and "OSI Approved" in c
    ]
    return {
        "license_field": info.get("license") or "",
        "classifiers": license_classifiers,
        "home_page": info.get("home_page") or "",
    }

# Ask PyPI about every package whose license could not be determined
# locally, and report the ones that PyPI classifiers resolve.
unknown_packages = df[df["risk"] == "unknown"]
for row in unknown_packages.to_dict("records"):
    details = get_pypi_license(row["Name"])
    resolved = bool(details) and bool(details["classifiers"])
    if resolved:
        print(f"Resolved {row['Name']}: {details['classifiers']}")
    # Pause between lookups to stay polite towards PyPI.
    time.sleep(0.5)

Step 4: Monitor GitHub for License Changes

def check_github_license(repo_url, timeout=10):
    """Look up a repository's license through the GitHub REST API.

    Args:
        repo_url: URL of a GitHub repository. Deep links (``/tree/main``,
            ``/issues``), a trailing ``.git`` and a missing scheme are
            tolerated; anything that is not a string yields ``None``.
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        A dict with ``spdx_id`` and ``name`` (either may be ``None``), or
        ``None`` when the URL does not identify a GitHub repository or the
        lookup fails (non-200 response, network error, invalid JSON).
    """
    from urllib.parse import urlparse

    if not isinstance(repo_url, str) or "github.com" not in repo_url:
        return None

    candidate = repo_url.strip()
    if "://" not in candidate:
        # urlparse only recognises the host when a scheme is present.
        candidate = f"https://{candidate}"
    try:
        parsed = urlparse(candidate)
    except ValueError:
        return None

    # Compare the real host rather than a substring so look-alike domains
    # and URLs that merely mention github.com in their path are rejected.
    if (parsed.hostname or "") not in {"github.com", "www.github.com"}:
        return None

    # The owner and repository are always the first two path segments;
    # taking the last two would break on links into a repository.
    segments = [segment for segment in parsed.path.split("/") if segment]
    if len(segments) < 2:
        return None
    owner, repo = segments[0], segments[1]
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    if not repo:
        return None

    try:
        resp = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/license",
            timeout=timeout,
        )
        if resp.status_code != 200:
            return None
        data = resp.json()
    except (requests.RequestException, ValueError):
        return None

    # Guard against an explicit null for "license" in the response body.
    license_info = data.get("license") or {}
    return {
        "spdx_id": license_info.get("spdx_id"),
        "name": license_info.get("name"),
    }

Step 5: Generate Compliance Report

def generate_report(df, output_path="license_audit.csv"):
    """Write the audit to CSV and print a per-category summary.

    Args:
        df: DataFrame of audited packages with a ``risk`` column holding
            the category assigned to each package.
        output_path: Where to write the full audit as CSV.

    Returns:
        The summary dict that was printed: ``total_packages`` plus one
        count for each of ``permissive``, ``copyleft``, ``unknown`` and
        ``review_needed``.
    """
    risk_counts = df["risk"].value_counts()
    report = {"total_packages": len(df)}
    # Include review_needed as well, otherwise the per-category counts do
    # not add up to the total and those packages vanish from the summary.
    for category in ("permissive", "copyleft", "unknown", "review_needed"):
        report[category] = int(risk_counts.get(category, 0))

    df.to_csv(output_path, index=False)

    print("\n=== License Compliance Report ===")
    for key, val in report.items():
        print(f"  {key}: {val}")
    if report["copyleft"] > 0:
        print("\n  WARNING: Copyleft licenses detected!")
    return report

# Write license_audit.csv and print the summary for the classified packages.
generate_report(df)

For private registries use ScraperAPI, scale with ThorData proxies, and monitor with ScrapeOps.

Key Takeaways

Automated license auditing catches compliance issues early
PyPI metadata often has more detail than pip-licenses alone
The GitHub API reports the license GitHub detects in each repository
Regular monitoring catches license changes in updates

License compliance is a legal matter. This tool assists but does not replace legal review.

DEV Community

How to Monitor Open Source License Compliance with Scraping