How to Monitor GitHub Repository Health Metrics with Python

#python #tutorial #webdev #programming

GitHub repository health signals — issue response times, PR merge rates, contributor activity — reveal whether open source projects are thriving or dying. Here's how to build a monitor that tracks these metrics.

Why Monitor Repo Health?

Before depending on a library, you should know: Are issues being addressed? Are PRs reviewed? Is the project maintained by one person or a team? These signals predict whether you'll get stuck with an abandoned dependency.

Setup

pip install requests pandas

Collecting Repository Metrics

GitHub's API provides rich data. Here's a comprehensive collector:

import requests
import pandas as pd
from datetime import datetime, timedelta

class RepoHealthMonitor:
    def __init__(self, token=None):
        self.base_url = "https://api.github.com"
        self.headers = {"Accept": "application/vnd.github.v3+json"}
        if token:
            self.headers["Authorization"] = f"token {token}"

    def get_repo_info(self, owner, repo):
        url = f"{self.base_url}/repos/{owner}/{repo}"
        response = requests.get(url, headers=self.headers)
        data = response.json()

        return {
            "stars": data.get("stargazers_count", 0),
            "forks": data.get("forks_count", 0),
            "open_issues": data.get("open_issues_count", 0),
            "watchers": data.get("subscribers_count", 0),
            "created": data.get("created_at", ""),
            "updated": data.get("updated_at", ""),
            "pushed": data.get("pushed_at", ""),
            "language": data.get("language", ""),
            "archived": data.get("archived", False)
        }

    def get_issue_metrics(self, owner, repo, days=90):
        since = (datetime.now() - timedelta(days=days)).isoformat()

        url = f"{self.base_url}/repos/{owner}/{repo}/issues"
        params = {"state": "all", "since": since, "per_page": 100}
        response = requests.get(url, headers=self.headers, params=params)
        issues = [i for i in response.json() if "pull_request" not in i]

        if not issues:
            return {"total": 0}

        response_times = []
        for issue in issues:
            created = datetime.fromisoformat(issue["created_at"].replace("Z", "+00:00"))

            if issue.get("comments", 0) > 0:
                comments_url = issue["comments_url"]
                comments = requests.get(comments_url, headers=self.headers).json()
                if comments:
                    first_response = datetime.fromisoformat(
                        comments[0]["created_at"].replace("Z", "+00:00"))
                    hours = (first_response - created).total_seconds() / 3600
                    response_times.append(hours)

        closed = [i for i in issues if i["state"] == "closed"]

        return {
            "total": len(issues),
            "closed": len(closed),
            "close_rate": len(closed) / len(issues) * 100 if issues else 0,
            "avg_response_hours": sum(response_times) / len(response_times) if response_times else None,
            "median_response_hours": sorted(response_times)[len(response_times)//2] if response_times else None
        }

    def get_contributor_health(self, owner, repo):
        url = f"{self.base_url}/repos/{owner}/{repo}/contributors"
        params = {"per_page": 100}
        response = requests.get(url, headers=self.headers, params=params)
        contributors = response.json()

        if not contributors or not isinstance(contributors, list):
            return {"bus_factor": 0}

        total_commits = sum(c.get("contributions", 0) for c in contributors)

        sorted_contribs = sorted(contributors, 
                                  key=lambda c: c.get("contributions", 0), 
                                  reverse=True)
        cumulative = 0
        bus_factor = 0
        for c in sorted_contribs:
            cumulative += c.get("contributions", 0)
            bus_factor += 1
            if cumulative >= total_commits * 0.8:
                break

        return {
            "total_contributors": len(contributors),
            "bus_factor": bus_factor,
            "top_contributor_pct": (sorted_contribs[0].get("contributions", 0) / 
                                    total_commits * 100) if total_commits else 0
        }

monitor = RepoHealthMonitor()

Generating Health Reports

def health_report(owner, repo, monitor):
    info = monitor.get_repo_info(owner, repo)
    issues = monitor.get_issue_metrics(owner, repo)
    contributors = monitor.get_contributor_health(owner, repo)

    score = 0
    if not info["archived"]:
        score += 10
    if issues.get("close_rate", 0) > 70:
        score += 20
    elif issues.get("close_rate", 0) > 40:
        score += 10
    if issues.get("avg_response_hours") and issues["avg_response_hours"] < 48:
        score += 20
    elif issues.get("avg_response_hours") and issues["avg_response_hours"] < 168:
        score += 10
    if contributors.get("bus_factor", 0) >= 3:
        score += 20
    elif contributors.get("bus_factor", 0) >= 2:
        score += 10
    if info["stars"] > 1000:
        score += 15
    elif info["stars"] > 100:
        score += 10

    days_since_push = (datetime.now() - datetime.fromisoformat(
        info["pushed"].replace("Z", "+00:00")).replace(tzinfo=None)).days
    if days_since_push < 7:
        score += 15
    elif days_since_push < 30:
        score += 10

    print(f"\nHealth Report: {owner}/{repo}")
    print(f"{'=' * 50}")
    print(f"Score: {score}/100")
    print(f"Stars: {info['stars']:,} | Forks: {info['forks']:,}")
    print(f"Last push: {days_since_push} days ago")
    print(f"Issue close rate: {issues.get('close_rate', 0):.1f}%")
    print(f"Avg response time: {issues.get('avg_response_hours', 'N/A')} hours")
    print(f"Bus factor: {contributors.get('bus_factor', 'N/A')}")

    return score

repos = [("pallets", "flask"), ("django", "django"), ("fastapi", "fastapi")]
for owner, repo in repos:
    health_report(owner, repo, monitor)

Tools for Scaling

ScraperAPI handles API rate limits and proxy rotation
ThorData for high-volume data collection
ScrapeOps to monitor your pipeline performance

Conclusion

Automated repo health monitoring protects you from depending on dying projects and helps you discover thriving ones. Start with your critical dependencies, then expand to track the ecosystem. The data is all public — you just need to collect and analyze it systematically.

DEV Community