GitHub repository health signals — issue response times, PR merge rates, contributor activity — reveal whether open source projects are thriving or dying. Here's how to build a monitor that tracks these metrics.
## Why Monitor Repo Health?
Before depending on a library, you should know: Are issues being addressed? Are PRs reviewed? Is the project maintained by one person or a team? These signals predict whether you'll get stuck with an abandoned dependency.
## Setup

```bash
pip install requests pandas
```
## Collecting Repository Metrics
GitHub's API provides rich data. Here's a comprehensive collector:
```python
import statistics
import requests
from datetime import datetime, timedelta, timezone

class RepoHealthMonitor:
    def __init__(self, token=None):
        self.base_url = "https://api.github.com"
        self.headers = {"Accept": "application/vnd.github.v3+json"}
        # A token is optional but raises the rate limit substantially.
        if token:
            self.headers["Authorization"] = f"token {token}"

    def get_repo_info(self, owner, repo):
        """Fetch headline stats from the main repository endpoint."""
        url = f"{self.base_url}/repos/{owner}/{repo}"
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        data = response.json()
        return {
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "open_issues": data.get("open_issues_count", 0),
            "watchers": data.get("subscribers_count", 0),
            "created": data.get("created_at", ""),
            "updated": data.get("updated_at", ""),
            "pushed": data.get("pushed_at", ""),
            "language": data.get("language", ""),
            "archived": data.get("archived", False),
        }

    def get_issue_metrics(self, owner, repo, days=90):
        """Summarize issue activity over the last `days` days."""
        since = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat()
        url = f"{self.base_url}/repos/{owner}/{repo}/issues"
        # 'since' filters by last update; one page of 100 is enough
        # for a health snapshot.
        params = {"state": "all", "since": since, "per_page": 100}
        response = requests.get(url, headers=self.headers, params=params)
        response.raise_for_status()
        # The issues endpoint also returns PRs; filter them out.
        issues = [i for i in response.json() if "pull_request" not in i]
        if not issues:
            return {"total": 0}
        response_times = []
        for issue in issues:
            created = datetime.fromisoformat(issue["created_at"].replace("Z", "+00:00"))
            if issue.get("comments", 0) > 0:
                # Time to first comment approximates maintainer responsiveness.
                comments_url = issue["comments_url"]
                comments = requests.get(comments_url, headers=self.headers).json()
                if comments:
                    first_response = datetime.fromisoformat(
                        comments[0]["created_at"].replace("Z", "+00:00"))
                    hours = (first_response - created).total_seconds() / 3600
                    response_times.append(hours)
        closed = [i for i in issues if i["state"] == "closed"]
        return {
            "total": len(issues),
            "closed": len(closed),
            "close_rate": len(closed) / len(issues) * 100,
            "avg_response_hours": (sum(response_times) / len(response_times)
                                   if response_times else None),
            "median_response_hours": (statistics.median(response_times)
                                      if response_times else None),
        }

    def get_contributor_health(self, owner, repo):
        """Estimate how concentrated the commit history is."""
        url = f"{self.base_url}/repos/{owner}/{repo}/contributors"
        params = {"per_page": 100}
        response = requests.get(url, headers=self.headers, params=params)
        contributors = response.json()
        if not contributors or not isinstance(contributors, list):
            return {"bus_factor": 0}
        total_commits = sum(c.get("contributions", 0) for c in contributors)
        sorted_contribs = sorted(contributors,
                                 key=lambda c: c.get("contributions", 0),
                                 reverse=True)
        # Bus factor here: how many top contributors it takes to
        # account for 80% of all commits.
        cumulative = 0
        bus_factor = 0
        for c in sorted_contribs:
            cumulative += c.get("contributions", 0)
            bus_factor += 1
            if cumulative >= total_commits * 0.8:
                break
        return {
            "total_contributors": len(contributors),
            "bus_factor": bus_factor,
            "top_contributor_pct": (sorted_contribs[0].get("contributions", 0) /
                                    total_commits * 100) if total_commits else 0,
        }

monitor = RepoHealthMonitor()
```
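Run unauthenticated like this, the GitHub API caps you at 60 requests per hour; with a personal access token the limit rises to 5,000. Here's a minimal sketch for wiring one in, assuming you've exported it as a GITHUB_TOKEN environment variable (the variable name is my convention, not something the API requires):

```python
import os

# Assumption: a personal access token lives in GITHUB_TOKEN,
# so it never lands in source control.
monitor = RepoHealthMonitor(token=os.environ.get("GITHUB_TOKEN"))
```

Note that get_issue_metrics makes one extra request per commented issue, so the higher limit matters quickly on active repos.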
## Generating Health Reports
```python
def health_report(owner, repo, monitor):
    info = monitor.get_repo_info(owner, repo)
    issues = monitor.get_issue_metrics(owner, repo)
    contributors = monitor.get_contributor_health(owner, repo)
    # Weighted score out of 100: activity, responsiveness,
    # contributor diversity, popularity, and recency.
    score = 0
    if not info["archived"]:
        score += 10
    if issues.get("close_rate", 0) > 70:
        score += 20
    elif issues.get("close_rate", 0) > 40:
        score += 10
    avg = issues.get("avg_response_hours")
    if avg and avg < 48:      # first response within two days
        score += 20
    elif avg and avg < 168:   # within a week
        score += 10
    if contributors.get("bus_factor", 0) >= 3:
        score += 20
    elif contributors.get("bus_factor", 0) >= 2:
        score += 10
    if info["stars"] > 1000:
        score += 15
    elif info["stars"] > 100:
        score += 10
    # Compare timezone-aware datetimes to avoid naive/aware mismatches.
    days_since_push = (datetime.now(timezone.utc) - datetime.fromisoformat(
        info["pushed"].replace("Z", "+00:00"))).days
    if days_since_push < 7:
        score += 15
    elif days_since_push < 30:
        score += 10
print(f"\nHealth Report: {owner}/{repo}")
print(f"{'=' * 50}")
print(f"Score: {score}/100")
print(f"Stars: {info['stars']:,} | Forks: {info['forks']:,}")
print(f"Last push: {days_since_push} days ago")
print(f"Issue close rate: {issues.get('close_rate', 0):.1f}%")
print(f"Avg response time: {issues.get('avg_response_hours', 'N/A')} hours")
print(f"Bus factor: {contributors.get('bus_factor', 'N/A')}")
return score
repos = [("pallets", "flask"), ("django", "django"), ("fastapi", "fastapi")]
for owner, repo in repos:
health_report(owner, repo, monitor)
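A single score is a snapshot; the real signal is the trend. Since pandas is already installed, here's a minimal sketch that appends each run to a CSV for later comparison (the file name and column layout are my own choices, not part of the monitor above):

```python
import os
from datetime import date

import pandas as pd

def log_scores(repos, monitor, path="repo_health.csv"):
    """Append one row per repo per run so scores can be compared over time."""
    rows = [{"date": date.today().isoformat(),
             "repo": f"{owner}/{name}",
             "score": health_report(owner, name, monitor)}
            for owner, name in repos]
    # Write the header only when the file is first created.
    pd.DataFrame(rows).to_csv(path, mode="a",
                              header=not os.path.exists(path), index=False)
```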
## Tools for Scaling

- ScraperAPI to handle rate limits and proxy rotation
- ThorData for high-volume data collection
- ScrapeOps to monitor your pipeline's performance
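Before reaching for an external service, note that GitHub's responses already carry X-RateLimit-Remaining and X-RateLimit-Reset headers you can honor directly. A minimal sketch of a backoff wrapper (the helper name is mine; drop it in wherever the monitor calls requests.get):

```python
import time
import requests

def get_with_backoff(url, headers=None, params=None):
    """GET that sleeps until the rate-limit window resets, then retries once."""
    response = requests.get(url, headers=headers, params=params)
    if (response.status_code == 403
            and response.headers.get("X-RateLimit-Remaining") == "0"):
        # X-RateLimit-Reset is a Unix timestamp for when the window reopens.
        reset = int(response.headers.get("X-RateLimit-Reset", time.time() + 60))
        time.sleep(max(reset - time.time(), 0) + 1)
        response = requests.get(url, headers=headers, params=params)
    return response
```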
## Conclusion
Automated repo health monitoring protects you from depending on dying projects and helps you discover thriving ones. Start with your critical dependencies, then expand to track the ecosystem. The data is all public — you just need to collect and analyze it systematically.