DEV Community

agenthustler
agenthustler

Posted on

Scraping GitHub: Stars, Issues, and Developer Trends at Scale

GitHub is the world's largest developer platform, and its public data reveals technology trends, popular tools, and developer sentiment. Here's how to scrape GitHub effectively for trend analysis.

What We'll Track

  • Repository star counts and growth rates
  • Issue volume and response times
  • Language and topic trends
  • Developer activity patterns

Setup

pip install requests pandas matplotlib
Enter fullscreen mode Exit fullscreen mode

GitHub API + Scraping Hybrid

GitHub has a generous API, so we'll use it where possible and scrape for data the API doesn't expose:

import requests
import time
from datetime import datetime

class GitHubTracker:
    """Thin wrapper around the GitHub REST v3 API for trend analysis.

    Supplying a personal access token raises the search rate limit
    substantially versus anonymous access.
    """

    def __init__(self, token=None):
        self.session = requests.Session()
        if token:
            # Classic "token" scheme works for both PATs and OAuth tokens.
            self.session.headers["Authorization"] = f"token {token}"
        self.session.headers["Accept"] = "application/vnd.github.v3+json"

    def search_repos(self, query, sort="stars", per_page=100):
        """Search repositories with the GitHub API.

        Args:
            query: GitHub search syntax, e.g. "language:rust stars:>100".
            sort: search sort field ("stars", "forks", "updated", ...).
            per_page: total number of results to collect; fetched in
                pages of at most 100 (the API's per-request maximum).

        Returns:
            list[dict]: one flattened summary dict per repository.

        Raises:
            requests.HTTPError: on a non-retryable HTTP error, or once
                rate-limit retries are exhausted.
        """
        results = []
        page = 1
        # Bound 403 retries: 403 also means "forbidden" (e.g. bad token),
        # and the original unbounded retry would loop forever in that case.
        retries_left = 3

        while len(results) < per_page:
            resp = self.session.get(
                "https://api.github.com/search/repositories",
                params={
                    "q": query,
                    "sort": sort,
                    "per_page": min(100, per_page - len(results)),
                    "page": page,
                },
                timeout=30,
            )

            if resp.status_code == 403:
                if retries_left == 0:
                    resp.raise_for_status()
                retries_left -= 1
                print("Rate limited, waiting 60s...")
                time.sleep(60)
                continue

            # Fail loudly on other 4xx/5xx instead of a KeyError below.
            resp.raise_for_status()

            items = resp.json().get("items", [])
            if not items:
                break  # no more pages of results

            for repo in items:
                results.append({
                    "name": repo["full_name"],
                    "stars": repo["stargazers_count"],
                    "forks": repo["forks_count"],
                    "open_issues": repo["open_issues_count"],
                    "language": repo["language"],
                    "created": repo["created_at"],
                    "updated": repo["updated_at"],
                    "description": repo["description"] or "",
                    "topics": repo.get("topics", []),
                    "license": (repo.get("license") or {}).get("spdx_id", "None"),
                })

            page += 1
            time.sleep(2)  # polite pause between pages

        return results
Enter fullscreen mode Exit fullscreen mode

Trending Analysis

import pandas as pd

def analyze_language_trends(tracker, languages=None, *, window_days=30, pause=3.0):
    """Analyze which languages are trending on GitHub.

    Args:
        tracker: object exposing ``search_repos(query, sort=..., per_page=...)``
            that returns a list of repo dicts with "stars" and "name" keys.
        languages: iterable of language slugs to compare; defaults to a
            fixed cross-section of popular languages.
        window_days: only count repos created within this many days; keeps
            the printed "Last N Days" heading truthful (the previous
            version hard-coded a calendar date).
        pause: seconds to sleep between languages (rate-limit courtesy).

    Returns:
        pandas.DataFrame sorted by avg_stars descending, one row per
        language that returned at least one repository.
    """
    import time
    from datetime import datetime, timedelta, timezone

    if languages is None:
        languages = [
            "python", "javascript", "typescript", "rust",
            "go", "java", "kotlin", "swift",
        ]

    # Compute the cutoff dynamically instead of hard-coding a date.
    cutoff = (datetime.now(timezone.utc) - timedelta(days=window_days)).date()

    trends = []
    for lang in languages:
        query = f"language:{lang} created:>{cutoff.isoformat()}"
        repos = tracker.search_repos(query, sort="stars", per_page=50)

        if repos:
            df = pd.DataFrame(repos)
            trends.append({
                "language": lang,
                "new_repos": len(repos),
                "avg_stars": df["stars"].mean(),
                "max_stars": df["stars"].max(),
                "top_repo": df.loc[df["stars"].idxmax(), "name"],
            })

        time.sleep(pause)

    if not trends:
        # sort_values("avg_stars") would raise KeyError on an empty frame.
        print("\n=== Language Trends: no data ===\n")
        return pd.DataFrame(
            columns=["language", "new_repos", "avg_stars", "max_stars", "top_repo"]
        )

    trends_df = pd.DataFrame(trends).sort_values("avg_stars", ascending=False)
    print(f"\n=== Language Trends (Last {window_days} Days) ===\n")
    print(trends_df.to_string(index=False))
    return trends_df
Enter fullscreen mode Exit fullscreen mode

Issue Velocity Tracking

def track_issue_velocity(tracker, repos, *, pause=2.0):
    """Track issue creation and closure rates for each repository.

    Args:
        tracker: GitHubTracker (or any object exposing a requests-style
            ``.session`` with ``.get``).
        repos: iterable of "owner/name" repository identifiers.
        pause: seconds to sleep between repositories.

    Returns:
        pandas.DataFrame with open/closed counts and the mean
        time-to-close (hours) over the 100 most recent issues.

    Raises:
        requests.HTTPError: if any repository request fails.
    """
    results = []

    for repo_name in repos:
        resp = tracker.session.get(
            f"https://api.github.com/repos/{repo_name}/issues",
            params={"state": "all", "per_page": 100, "sort": "created"},
            timeout=30,
        )
        # Error payloads are dicts, not lists — fail early rather than
        # iterating over dict keys below.
        resp.raise_for_status()

        # The /issues endpoint also returns pull requests (they carry a
        # "pull_request" key); exclude them so velocity reflects issues only.
        issues = [i for i in resp.json() if "pull_request" not in i]

        open_count = sum(1 for i in issues if i["state"] == "open")
        closed_count = sum(1 for i in issues if i["state"] == "closed")

        close_times = []
        for issue in issues:
            if issue["state"] == "closed" and issue.get("closed_at"):
                # Timestamps are ISO-8601 with a trailing "Z" (UTC);
                # fromisoformat on older Pythons needs "+00:00".
                created = datetime.fromisoformat(
                    issue["created_at"].replace("Z", "+00:00")
                )
                closed = datetime.fromisoformat(
                    issue["closed_at"].replace("Z", "+00:00")
                )
                close_times.append((closed - created).total_seconds() / 3600)

        results.append({
            "repo": repo_name,
            "total_recent": len(issues),
            "open": open_count,
            "closed": closed_count,
            "avg_close_hours": (
                sum(close_times) / len(close_times) if close_times else 0
            ),
        })

        time.sleep(pause)

    df = pd.DataFrame(results)
    print("\n=== Issue Velocity ===\n")
    print(df.to_string(index=False))
    return df

# Example usage: run this module directly to print issue-velocity stats
# for a handful of popular AI/ML repositories.
if __name__ == "__main__":  # guard so importing the module fires no live API calls
    tracker = GitHubTracker()  # unauthenticated: low anonymous rate limit

    repos_to_track = [
        "langchain-ai/langchain",
        "anthropics/anthropic-sdk-python",
        "openai/openai-python",
        "huggingface/transformers",
    ]

    issue_data = track_issue_velocity(tracker, repos_to_track)
Enter fullscreen mode Exit fullscreen mode

Scraping GitHub Trending Page

For data not available via the API, use ScraperAPI:

from bs4 import BeautifulSoup

SCRAPER_API_KEY = "YOUR_KEY"

def scrape_github_trending(language=None, since="daily"):
    """Scrape github.com/trending through ScraperAPI.

    Args:
        language: optional language slug (e.g. "python") to filter by.
        since: trending window — "daily", "weekly", or "monthly".

    Returns:
        list[dict]: repo full name, human-readable "stars today" text,
        and the window it was scraped for.

    Raises:
        requests.HTTPError: if the proxy request fails.
    """
    url = "https://github.com/trending"
    if language:
        url += f"/{language}"
    url += f"?since={since}"

    # Pass the target URL via `params` so requests percent-encodes it.
    # Embedding it raw in the query string splits on its own "?"/"&",
    # so the `since` parameter would be lost to the proxy.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": SCRAPER_API_KEY, "url": url},
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    trending = []
    for row in soup.select("article.Box-row"):
        name_el = row.select_one("h2 a")
        stars_el = row.select_one(".float-sm-right")

        if name_el:
            trending.append({
                # Rendered as "owner / repo" across lines; collapse whitespace.
                "repo": name_el.text.strip().replace("\n", "").replace(" ", ""),
                "stars_today": stars_el.text.strip() if stars_el else "N/A",
                "since": since,
            })

    return trending
Enter fullscreen mode Exit fullscreen mode

Proxy Tips

The GitHub API is generous with rate limits, but for heavy HTML scraping, use ScraperAPI. ThorData works well for GitHub's web interface. Monitor everything with ScrapeOps.

Conclusion

GitHub data reveals where the developer ecosystem is heading. Combine API access with targeted scraping, and you'll have insights that most trend reports take weeks to compile.

Top comments (0)