Why Mine GitHub Data?
GitHub hosts over 200 million repositories and 100 million developers. Mining this data enables powerful use cases:
- Developer analytics — Track trending technologies and skill demand
- Competitive intelligence — Monitor competitor open-source activity
- Talent sourcing — Find developers by contribution patterns
- Technology trends — Identify rising frameworks and tools
- Open source health — Evaluate project sustainability
This guide covers both GitHub's API and web scraping techniques for large-scale data extraction.
GitHub REST API: The Foundation
GitHub's API is well-documented and generous — 5,000 requests/hour with authentication.
Setup
import requests
import time
from datetime import datetime, timedelta

class GitHubClient:
    BASE_URL = 'https://api.github.com'

    def __init__(self, token=None):
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'DataMiner/1.0'
        })
        if token:
            self.session.headers['Authorization'] = f'token {token}'

    def _get(self, endpoint, params=None):
        url = f'{self.BASE_URL}/{endpoint}'
        response = self.session.get(url, params=params)
        # Handle rate limiting
        remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
        if remaining < 10:
            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            wait = max(reset_time - time.time(), 0) + 1
            print(f'Rate limit approaching, waiting {wait:.0f}s')
            time.sleep(wait)
        return response.json() if response.status_code == 200 else None

    def _get_paginated(self, endpoint, params=None, max_pages=10):
        all_items = []
        params = params or {}
        params['per_page'] = 100
        for page in range(1, max_pages + 1):
            params['page'] = page
            data = self._get(endpoint, params)
            if not data:
                break
            all_items.extend(data)
            if len(data) < 100:
                break
        return all_items

client = GitHubClient('YOUR_GITHUB_TOKEN')  # Optional but recommended
Search Repositories
def search_repos(query, sort='stars', max_results=200):
    all_repos = []
    per_page = 100
    for page in range(1, (max_results // per_page) + 2):
        params = {
            'q': query,
            'sort': sort,
            'order': 'desc',
            'per_page': per_page,
            'page': page,
        }
        data = client._get('search/repositories', params)
        if not data or 'items' not in data:
            break
        for repo in data['items']:
            all_repos.append({
                'full_name': repo['full_name'],
                # description can be null in the API response, so default to ''
                'description': (repo.get('description') or '')[:200],
                'stars': repo['stargazers_count'],
                'forks': repo['forks_count'],
                'language': repo.get('language'),
                'created': repo['created_at'],
                'updated': repo['updated_at'],
                'topics': repo.get('topics', []),
                'open_issues': repo['open_issues_count'],
                'license': (repo.get('license') or {}).get('spdx_id'),
                'url': repo['html_url'],
            })
        if len(data['items']) < per_page:
            break
        time.sleep(2)  # Respect the search API's per-minute limit
    return all_repos[:max_results]

# Find trending Python web scraping repos
repos = search_repos('web scraping language:python', max_results=50)
for r in repos[:10]:
    print(f"{r['full_name']}: {r['stars']:,} stars - {r['description'][:60]}")
Extract Contributors
def get_contributors(owner, repo, max_contributors=100):
    contributors = client._get_paginated(
        f'repos/{owner}/{repo}/contributors',
        max_pages=max_contributors // 100 + 1
    )
    return [{
        'username': c['login'],
        'contributions': c['contributions'],
        'profile_url': c['html_url'],
        'avatar': c['avatar_url'],
    } for c in (contributors or [])][:max_contributors]

def get_contributor_details(username):
    user = client._get(f'users/{username}')
    if not user:
        return None
    return {
        'username': user['login'],
        'name': user.get('name'),
        'company': user.get('company'),
        'location': user.get('location'),
        'bio': user.get('bio'),
        'public_repos': user['public_repos'],
        'followers': user['followers'],
        'following': user['following'],
        'created': user['created_at'],
        'blog': user.get('blog'),
    }

# Top contributors to a popular repo
contribs = get_contributors('microsoft', 'playwright')
for c in contribs[:10]:
    print(f"  {c['username']}: {c['contributions']} contributions")
Track Star History
def get_stargazers_over_time(owner, repo, max_pages=10):
    """Get star history with timestamps"""
    session = requests.Session()
    session.headers.update({
        'Accept': 'application/vnd.github.star+json',  # Special media type for timestamps
        'Authorization': 'token YOUR_TOKEN'
    })
    stars = []
    for page in range(1, max_pages + 1):
        url = f'https://api.github.com/repos/{owner}/{repo}/stargazers'
        response = session.get(url, params={'page': page, 'per_page': 100})
        if response.status_code != 200:
            break
        data = response.json()
        if not data:
            break
        for star in data:
            stars.append({
                'user': star['user']['login'],
                'starred_at': star['starred_at'],
            })
        time.sleep(1)
    return stars
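The raw timestamps become useful once aggregated. Here is a minimal sketch that turns the function's output into a monthly star count and a running total, using pandas (which also appears later in this guide). It only covers the pages actually fetched, and the repository name is just an example.
import pandas as pd

stars = get_stargazers_over_time('microsoft', 'playwright', max_pages=5)
df = pd.DataFrame(stars)
df['starred_at'] = pd.to_datetime(df['starred_at'])

# Stars gained per month, then the cumulative total over time
monthly = df.groupby(df['starred_at'].dt.to_period('M')).size()
print(monthly.cumsum().tail(12))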
Handling API Rate Limits
GitHub's limits:
- Unauthenticated: 60 requests/hour
- Authenticated: 5,000 requests/hour
- Search API: 30 requests/minute
def check_rate_limit():
    data = client._get('rate_limit')
    if data:
        core = data['resources']['core']
        search = data['resources']['search']
        print(f"Core: {core['remaining']}/{core['limit']} (resets {datetime.fromtimestamp(core['reset'])})")
        print(f"Search: {search['remaining']}/{search['limit']} (resets {datetime.fromtimestamp(search['reset'])})")
When the API isn't enough, scraping fills the gap. Using ScrapeOps for proxy rotation helps when you need to supplement API data with web-scraped content from GitHub pages.
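As a rough illustration, a fallback like that might look as follows: route a GitHub topic page through the ScrapeOps proxy and pull repository names out of the HTML. The endpoint URL and parameters reflect ScrapeOps' documented proxy API, and the CSS selector is an assumption about GitHub's current topic-page markup, so verify both before relying on them.
import requests
from bs4 import BeautifulSoup

SCRAPEOPS_API_KEY = 'YOUR_SCRAPEOPS_KEY'

def scrape_topic_page(topic):
    # Send the request through the ScrapeOps proxy aggregator
    # (endpoint and params assumed from their docs -- check before use)
    response = requests.get(
        'https://proxy.scrapeops.io/v1/',
        params={
            'api_key': SCRAPEOPS_API_KEY,
            'url': f'https://github.com/topics/{topic}',
        },
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Selector is an assumption about GitHub's topic-page HTML
    return [a.get_text(strip=True) for a in soup.select('article h3 a')]

print(scrape_topic_page('web-scraping')[:10])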
Technology Trend Analysis
import pandas as pd
from collections import Counter

def analyze_tech_trends(language, since_days=365):
    since = (datetime.now() - timedelta(days=since_days)).strftime('%Y-%m-%d')
    repos = search_repos(
        f'language:{language} created:>{since} stars:>100',
        sort='stars',
        max_results=200
    )

    # Topic analysis
    all_topics = []
    for repo in repos:
        all_topics.extend(repo.get('topics', []))
    topic_counts = Counter(all_topics).most_common(20)

    # Language distribution (mostly the query language, plus any mixed results)
    languages = Counter(r['language'] for r in repos if r['language'])

    # Growth metrics
    df = pd.DataFrame(repos)
    df['created'] = pd.to_datetime(df['created'])
    monthly = df.groupby(df['created'].dt.to_period('M')).size()

    return {
        'total_repos': len(repos),
        'total_stars': sum(r['stars'] for r in repos),
        'top_topics': topic_counts,
        'language_distribution': dict(languages),
        'monthly_creation': monthly.to_dict(),
        'avg_stars': df['stars'].mean(),
        'median_stars': df['stars'].median(),
    }

trends = analyze_tech_trends('python')
print(f"New Python repos (>100 stars): {trends['total_repos']}")
print(f"Top topics: {trends['top_topics'][:10]}")
Building a Developer Profile Database
import sqlite3

class GitHubDatabase:
    def __init__(self, db_path='github_data.db'):
        self.conn = sqlite3.connect(db_path)
        self.setup()

    def setup(self):
        self.conn.executescript('''
            CREATE TABLE IF NOT EXISTS repos (
                full_name TEXT PRIMARY KEY, stars INTEGER,
                forks INTEGER, language TEXT, description TEXT,
                created TEXT, updated TEXT, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS developers (
                username TEXT PRIMARY KEY, name TEXT,
                company TEXT, location TEXT, followers INTEGER,
                public_repos INTEGER, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        ''')

    def save_repos(self, repos):
        for r in repos:
            self.conn.execute('''
                INSERT OR REPLACE INTO repos
                (full_name, stars, forks, language, description, created, updated)
                VALUES (?, ?, ?, ?, ?, ?, ?)
            ''', (r['full_name'], r['stars'], r['forks'], r['language'],
                  r.get('description', '')[:200], r['created'], r['updated']))
        self.conn.commit()

db = GitHubDatabase()
repos = search_repos('web scraping', max_results=100)
db.save_repos(repos)
print(f'Saved {len(repos)} repos to database')
Managed GitHub Scraping
For large-scale GitHub data collection that goes beyond API rate limits, the GitHub Scraper on Apify handles pagination, rate limiting, and structured data extraction automatically.
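If you go that route, the Apify Python client follows the usual actor-run pattern. The sketch below uses a placeholder actor ID and input fields, so consult the actor's documentation for the real identifier and input schema.
from apify_client import ApifyClient

apify = ApifyClient('YOUR_APIFY_TOKEN')

# Actor ID and input schema are placeholders -- check the actor's docs
run = apify.actor('example/github-scraper').call(run_input={
    'queries': ['web scraping language:python'],
    'maxItems': 100,
})

# Results land in the run's default dataset
for item in apify.dataset(run['defaultDatasetId']).iterate_items():
    print(item)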
Best Practices
- Always authenticate — 5,000 vs 60 requests/hour
- Cache responses — Store data locally to avoid redundant calls
- Use conditional requests — Send the If-None-Match header to check whether data has changed (see the sketch after this list)
- Paginate efficiently — Use per_page=100 to minimize API calls
- Handle rate limits gracefully — Check the X-RateLimit-Remaining header
- Supplement with scraping — Use ScrapeOps when you need data the API doesn't provide
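To make the conditional-request point concrete, here is a small sketch built on the GitHubClient from earlier: it stores the ETag from a previous response and sends it back via If-None-Match. A 304 response means the cached copy is still current, and GitHub documents such 304s as not consuming your rate limit.
etag_cache = {}

def get_if_changed(endpoint):
    """Fetch an endpoint only if it changed since the last call (ETag-based)."""
    headers = {}
    if endpoint in etag_cache:
        headers['If-None-Match'] = etag_cache[endpoint]['etag']
    response = client.session.get(f'{GitHubClient.BASE_URL}/{endpoint}', headers=headers)
    if response.status_code == 304:
        # Unchanged -- reuse the cached body without re-downloading it
        return etag_cache[endpoint]['data']
    if response.status_code == 200:
        data = response.json()
        etag = response.headers.get('ETag')
        if etag:
            etag_cache[endpoint] = {'etag': etag, 'data': data}
        return data
    return None

repo = get_if_changed('repos/microsoft/playwright')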
Conclusion
GitHub's API is one of the best public APIs available, but for large-scale data mining, you'll need to combine API access with smart rate limiting and supplemental scraping. Use the GitHub Scraper on Apify for production workloads, and ScrapeOps for proxy infrastructure when scraping beyond API limits.