GitHub is the world's largest developer platform, and its public data reveals technology trends, popular tools, and developer sentiment. Here's how to scrape GitHub effectively for trend analysis.
What We'll Track
- Repository star counts and growth rates
- Issue volume and response times
- Language and topic trends
- Developer activity patterns
Setup
pip install requests pandas matplotlib beautifulsoup4
GitHub API + Scraping Hybrid
GitHub's API has generous rate limits (5,000 requests per hour with a personal access token), so we'll use it wherever possible and scrape only for data the API doesn't expose:
import requests
import time
from datetime import datetime, timedelta

class GitHubTracker:
    def __init__(self, token=None):
        self.session = requests.Session()
        if token:
            self.session.headers["Authorization"] = f"token {token}"
        self.session.headers["Accept"] = "application/vnd.github.v3+json"

    def search_repos(self, query, sort="stars", per_page=100):
        """Search repositories with the GitHub API, collecting up to per_page results."""
        results = []
        page = 1
        while len(results) < per_page:
            resp = self.session.get(
                "https://api.github.com/search/repositories",
                params={
                    "q": query,
                    "sort": sort,
                    "per_page": min(100, per_page - len(results)),
                    "page": page,
                },
            )
            if resp.status_code == 403:
                # Rate limited: back off, then retry the same page
                print("Rate limited, waiting 60s...")
                time.sleep(60)
                continue
            data = resp.json()
            items = data.get("items", [])
            if not items:
                break
            for repo in items:
                results.append({
                    "name": repo["full_name"],
                    "stars": repo["stargazers_count"],
                    "forks": repo["forks_count"],
                    "open_issues": repo["open_issues_count"],
                    "language": repo["language"],
                    "created": repo["created_at"],
                    "updated": repo["updated_at"],
                    "description": repo["description"] or "",
                    "topics": repo.get("topics", []),
                    "license": (repo.get("license") or {}).get("spdx_id", "None"),
                })
            page += 1
            time.sleep(2)
        return results
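A quick smoke test before launching larger searches. The /rate_limit endpoint is part of the official API and doesn't count against your quota; the search query below is just an illustrative example:

tracker = GitHubTracker()  # pass a personal access token for 5,000 requests/hour

# How many search API calls are left in the current window?
quota = tracker.session.get("https://api.github.com/rate_limit").json()
print("Search requests remaining:", quota["resources"]["search"]["remaining"])

repos = tracker.search_repos("topic:llm stars:>500", per_page=20)
for repo in repos[:5]:
    print(f'{repo["stars"]:>7}  {repo["name"]}')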
Trending Analysis
import pandas as pd

def analyze_language_trends(tracker):
    """Analyze which languages are trending on GitHub."""
    languages = [
        "python", "javascript", "typescript", "rust",
        "go", "java", "kotlin", "swift",
    ]
    # Rolling 30-day window so the query stays current on every run
    since = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    trends = []
    for lang in languages:
        query = f"language:{lang} created:>{since}"
        repos = tracker.search_repos(query, sort="stars", per_page=50)
        if repos:
            df = pd.DataFrame(repos)
            trends.append({
                "language": lang,
                "new_repos": len(repos),
                "avg_stars": df["stars"].mean(),
                "max_stars": df["stars"].max(),
                "top_repo": df.loc[df["stars"].idxmax(), "name"],
            })
        time.sleep(3)
    trends_df = pd.DataFrame(trends).sort_values("avg_stars", ascending=False)
    print("\n=== Language Trends (Last 30 Days) ===\n")
    print(trends_df.to_string(index=False))
    return trends_df
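Since matplotlib is in the setup step but hasn't been used yet, here's a minimal sketch that turns the returned trends_df into a bar chart; the output filename is arbitrary:

import matplotlib.pyplot as plt

def plot_language_trends(trends_df, outfile="language_trends.png"):
    """Bar chart of average stars per newly created repo, by language."""
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(trends_df["language"], trends_df["avg_stars"], color="steelblue")
    ax.set_ylabel("Average stars per new repo")
    ax.set_title("Language trends on GitHub (last 30 days)")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(outfile, dpi=150)
    print(f"Saved chart to {outfile}")

# plot_language_trends(analyze_language_trends(GitHubTracker()))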
Issue Velocity Tracking
def track_issue_velocity(tracker, repos):
    """Track issue creation and closure rates."""
    results = []
    for repo_name in repos:
        resp = tracker.session.get(
            f"https://api.github.com/repos/{repo_name}/issues",
            params={"state": "all", "per_page": 100, "sort": "created"},
        )
        # The issues endpoint also returns pull requests; keep only real issues
        issues = [i for i in resp.json() if "pull_request" not in i]
        open_count = sum(1 for i in issues if i["state"] == "open")
        closed_count = sum(1 for i in issues if i["state"] == "closed")
        close_times = []
        for issue in issues:
            if issue["state"] == "closed" and issue.get("closed_at"):
                created = datetime.fromisoformat(
                    issue["created_at"].replace("Z", "+00:00")
                )
                closed = datetime.fromisoformat(
                    issue["closed_at"].replace("Z", "+00:00")
                )
                close_times.append((closed - created).total_seconds() / 3600)
        results.append({
            "repo": repo_name,
            "total_recent": len(issues),
            "open": open_count,
            "closed": closed_count,
            "avg_close_hours": (
                sum(close_times) / len(close_times) if close_times else 0
            ),
        })
        time.sleep(2)
    df = pd.DataFrame(results)
    print("\n=== Issue Velocity ===\n")
    print(df.to_string(index=False))
    return df
# Example usage
tracker = GitHubTracker()
repos_to_track = [
    "langchain-ai/langchain",
    "anthropics/anthropic-sdk-python",
    "openai/openai-python",
    "huggingface/transformers",
]
issue_data = track_issue_velocity(tracker, repos_to_track)
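Star growth rates (from the tracking list at the top of this post) need more than one snapshot, so persist each run and diff against the previous one. A minimal sketch assuming a daily scheduled run; star_history.csv is just a placeholder filename:

import os

def snapshot_stars(tracker, repo_names, path="star_history.csv"):
    """Append today's star counts to a CSV so growth can be computed across runs."""
    rows = []
    for name in repo_names:
        resp = tracker.session.get(f"https://api.github.com/repos/{name}")
        rows.append({
            "date": datetime.now().strftime("%Y-%m-%d"),
            "repo": name,
            "stars": resp.json()["stargazers_count"],
        })
        time.sleep(1)
    pd.DataFrame(rows).to_csv(
        path, mode="a", header=not os.path.exists(path), index=False
    )

def star_growth(path="star_history.csv"):
    """Star change per repo between consecutive snapshots."""
    history = pd.read_csv(path).drop_duplicates(["date", "repo"])
    history["growth"] = history.sort_values("date").groupby("repo")["stars"].diff()
    return history.dropna(subset=["growth"])

# snapshot_stars(tracker, repos_to_track)   # run this daily, then:
# print(star_growth())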
Scraping the GitHub Trending Page
The trending page isn't exposed through the API, so fetch the HTML through ScraperAPI:
from bs4 import BeautifulSoup

SCRAPER_API_KEY = "YOUR_KEY"

def scrape_github_trending(language=None, since="daily"):
    """Scrape github.com/trending, optionally filtered by language and time period."""
    url = "https://github.com/trending"
    if language:
        url += f"/{language}"
    url += f"?since={since}"
    # Pass the target URL as a query parameter so requests URL-encodes it properly
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": SCRAPER_API_KEY, "url": url},
    )
    soup = BeautifulSoup(response.text, "html.parser")
    trending = []
    for row in soup.select("article.Box-row"):
        name = row.select_one("h2 a")
        stars_today = row.select_one(".float-sm-right")
        if name:
            trending.append({
                "repo": name.text.strip().replace("\n", "").replace(" ", ""),
                "stars_today": stars_today.text.strip() if stars_today else "N/A",
                "since": since,
            })
    return trending
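A quick usage example, loading the scraped rows into a DataFrame (the language and period values below are just examples):

trending_python = scrape_github_trending(language="python", since="daily")
trending_df = pd.DataFrame(trending_python)
print(trending_df.head(10).to_string(index=False))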
Proxy Tips
The GitHub API is generous with rate limits, so lean on it first. For heavy HTML scraping, route requests through ScraperAPI; ThorData's proxies also work well against GitHub's web interface, and ScrapeOps is useful for monitoring request success rates across your jobs.
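If you'd rather send the HTML requests through a general-purpose proxy instead of ScraperAPI's endpoint, requests supports that directly via its proxies argument. A minimal sketch; the host, port, and credentials below are placeholders, not real values:

# Placeholder proxy endpoint; substitute your provider's host, port, and credentials
PROXY = "http://USERNAME:PASSWORD@proxy.example.com:8000"

def fetch_via_proxy(url):
    """Fetch a GitHub page through a forward proxy instead of a scraping API."""
    return requests.get(
        url,
        proxies={"http": PROXY, "https": PROXY},
        headers={"User-Agent": "Mozilla/5.0 (compatible; trend-tracker)"},
        timeout=30,
    )

# html = fetch_via_proxy("https://github.com/trending/rust?since=weekly").text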
Conclusion
GitHub data reveals where the developer ecosystem is heading. Combine API access with targeted scraping, and you'll have insights that most trend reports take weeks to compile.