How to Monitor Open Source License Compliance with Scraping
Open source license violations can lead to lawsuits and forced code releases. Let's build a Python tool that monitors your dependencies and their license obligations automatically.
The Problem
Modern projects have hundreds of transitive dependencies. Each one carries its own license, and obligations such as GPL copyleft propagation, attribution requirements, and patent clauses can catch teams off guard.
Setting Up
pip install requests beautifulsoup4 pip-licenses pipdeptree pandas
Step 1: Audit Current Dependencies
import subprocess
import json
import pandas as pd
def get_installed_licenses():
    """Return license metadata for the packages in the current environment.

    Runs the ``pip-licenses`` command-line tool with JSON output (including
    URLs and authors) and decodes what it prints on stdout.

    Returns:
        The decoded JSON document. The rest of this script loads it into a
        DataFrame and reads the ``Name`` and ``License`` fields.

    Raises:
        FileNotFoundError: If the ``pip-licenses`` executable is not on PATH.
        subprocess.CalledProcessError: If ``pip-licenses`` exits with a
            non-zero status; the captured stderr is attached to the exception.
        json.JSONDecodeError: If stdout is not valid JSON.
    """
    result = subprocess.run(
        ["pip-licenses", "--format=json", "--with-urls", "--with-authors"],
        capture_output=True,
        text=True,
        # Fail loudly here instead of letting json.loads choke on empty output.
        check=True,
    )
    return json.loads(result.stdout)
# Load the audit into a DataFrame built from the pip-licenses records.
licenses = get_installed_licenses()
df = pd.DataFrame(licenses)

package_total = len(df)
print(f"Total packages: {package_total}")

# Show the ten most frequent license strings and their package counts.
license_counts = df["License"].value_counts()
print(license_counts.head(10))
Step 2: Classify License Risk
# Substrings that identify a license family. classify_license() compares them
# case-insensitively, so the spelling used here does not affect matching.
COPYLEFT = {"GPL", "LGPL", "AGPL", "MPL", "EUPL", "CPAL"}
PERMISSIVE = {"MIT", "BSD", "Apache", "ISC", "Unlicense", "WTFPL"}


def classify_license(license_str):
    """Map a free-form license string to a coarse risk bucket.

    Args:
        license_str: License text as reported for a package. Values that are
            not strings (for example ``None`` or NaN from a DataFrame) are
            treated as missing.

    Returns:
        ``"copyleft"`` if any COPYLEFT marker occurs in the string,
        ``"permissive"`` if any PERMISSIVE marker occurs,
        ``"unknown"`` if the value is missing, blank, or ``"UNKNOWN"``,
        otherwise ``"review_needed"``.
    """
    if not isinstance(license_str, str):
        return "unknown"

    upper = license_str.upper()
    # Copyleft is checked first so a string naming both families is flagged
    # with the stricter bucket.
    if any(marker.upper() in upper for marker in COPYLEFT):
        return "copyleft"
    # Markers are upper-cased too; otherwise mixed-case entries such as
    # "Apache" could never match the upper-cased input.
    if any(marker.upper() in upper for marker in PERMISSIVE):
        return "permissive"
    if license_str.strip() in {"UNKNOWN", ""}:
        return "unknown"
    return "review_needed"
# Tag every package with the bucket returned by classify_license.
df["risk"] = df["License"].apply(classify_license)

# Keep the rows whose bucket is copyleft, unknown, or review_needed.
needs_review = df["risk"].isin(["copyleft", "unknown", "review_needed"])
risky = df[needs_review]

print(f"\nPackages needing review: {len(risky)}")
for name, license_name, risk in zip(risky["Name"], risky["License"], risky["risk"]):
    print(f" {name} ({license_name}) - {risk}")
Step 3: Scrape PyPI for License Details
import requests
import time
def get_pypi_license(package_name, timeout=10):
    """Fetch license-related metadata for a package from the PyPI JSON API.

    Args:
        package_name: Project name to look up on pypi.org.
        timeout: Seconds to wait for PyPI before giving up, so one stalled
            connection cannot hang the whole audit.

    Returns:
        A dict with keys ``license_field`` (the ``info.license`` value),
        ``classifiers`` (the last segment of each trove classifier that
        mentions both "License" and "OSI Approved"), and ``home_page``.
        Returns ``None`` when the request fails, the status is not 200, or
        the body is not valid JSON.
    """
    url = f"https://pypi.org/pypi/{package_name}/json"
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best-effort lookup: a network failure is reported like "not found".
        return None
    if resp.status_code != 200:
        return None

    try:
        data = resp.json()
    except ValueError:
        return None

    # `or` guards against explicit nulls in the payload, not just absent keys.
    info = data.get("info") or {}
    classifiers = info.get("classifiers") or []
    license_classifiers = [
        c.split(" :: ")[-1]
        for c in classifiers
        if "License" in c and "OSI Approved" in c
    ]
    return {
        "license_field": info.get("license") or "",
        "classifiers": license_classifiers,
        "home_page": info.get("home_page") or "",
    }
# Ask PyPI about every package whose license could not be classified.
unknown_names = df.loc[df["risk"] == "unknown", "Name"]
for package_name in unknown_names:
    details = get_pypi_license(package_name)
    if details and details["classifiers"]:
        print(f"Resolved {package_name}: {details['classifiers']}")
    # Pause between lookups so requests to PyPI are spaced out.
    time.sleep(0.5)
Step 4: Monitor GitHub for License Changes
def check_github_license(repo_url, timeout=10):
    """Query the GitHub API for the license of the repository behind a URL.

    Args:
        repo_url: URL (or other value) recorded for a package. Anything that
            is not a string containing "github.com" is ignored.
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        A dict with keys ``spdx_id`` and ``name`` taken from the response's
        ``license`` object, or ``None`` when the URL does not name a GitHub
        owner/repository, the request fails, the status is not 200, or the
        body is not valid JSON.
    """
    if not isinstance(repo_url, str) or "github.com" not in repo_url:
        return None

    # Owner and repo are the FIRST two path segments after the host, so deep
    # links such as .../owner/repo/tree/main still resolve to owner/repo.
    _, _, tail = repo_url.partition("github.com")
    tail = tail.split("?", 1)[0].split("#", 1)[0]
    segments = [part for part in tail.lstrip(":/").split("/") if part]
    if len(segments) < 2:
        return None
    owner, repo = segments[0], segments[1]
    # Clone URLs end in ".git", which is not part of the repository name.
    if repo.endswith(".git"):
        repo = repo[:-4]
    if not repo:
        return None

    try:
        resp = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/license",
            timeout=timeout,
        )
    except requests.RequestException:
        return None
    if resp.status_code != 200:
        return None

    try:
        data = resp.json()
    except ValueError:
        return None

    # `or {}` covers a null "license" value as well as a missing key.
    license_info = data.get("license") or {}
    return {
        "spdx_id": license_info.get("spdx_id"),
        "name": license_info.get("name"),
    }
Step 5: Generate Compliance Report
def generate_report(df, output_path="license_audit.csv"):
    """Write the audit to CSV, print a per-bucket summary, and return it.

    Args:
        df: DataFrame with a ``risk`` column holding the bucket assigned to
            each package.
        output_path: Destination of the CSV export of ``df``.

    Returns:
        A dict with ``total_packages`` plus one count per risk bucket
        (``permissive``, ``copyleft``, ``unknown``, ``review_needed``).
    """
    report = {"total_packages": len(df)}
    # Include review_needed so the bucket counts add up to the total.
    for bucket in ("permissive", "copyleft", "unknown", "review_needed"):
        report[bucket] = len(df[df["risk"] == bucket])

    df.to_csv(output_path, index=False)

    print("\n=== License Compliance Report ===")
    for key, val in report.items():
        print(f" {key}: {val}")
    if report["copyleft"] > 0:
        print("\n WARNING: Copyleft licenses detected!")
    return report
# Print the summary for the audited DataFrame and export it to license_audit.csv.
generate_report(df)
For private registries use ScraperAPI, scale with ThorData proxies, and monitor with ScrapeOps.
Key Takeaways
- Automated license auditing catches compliance issues early
- PyPI metadata often has more detail than pip-licenses alone
- GitHub API provides canonical license information
- Regular monitoring catches license changes in updates
License compliance is a legal matter. This tool assists but does not replace legal review.
Top comments (0)