Web scraping GitHub is one of the most common data collection tasks in 2026. Whether you're building a developer talent pipeline, analyzing open-source trends, or tracking competitor activity, GitHub's 100M+ repositories and 40M+ developers represent an unmatched dataset.
But GitHub doesn't make it easy. Rate limits are tight, anti-bot measures are real, and their API has quirks that trip up even experienced developers.
In this guide, I'll walk you through everything you need to know about scraping GitHub in 2026 — from profiles and repositories to trending pages — with working code examples, rate limit strategies, and tips to avoid getting banned.
Why Scrape GitHub?
Before diving into the how, let's talk about the why. Common use cases include:
- Recruiting: Finding developers with specific skills, contribution patterns, and project experience
- Market research: Tracking which technologies are gaining traction
- Competitive intelligence: Monitoring competitor open-source activity
- Academic research: Analyzing collaboration patterns and code quality trends
- Lead generation: Identifying companies building with specific tech stacks
Method 1: GitHub's REST API (The Official Way)
GitHub provides a comprehensive REST API. It's the most reliable way to get structured data, but it comes with strict rate limits.
Authentication
You'll need a Personal Access Token (PAT). Without one, you're limited to 60 requests per hour. With a token, you get 5,000.
import requests
import time
# Personal Access Token — replace with your own (never commit real tokens).
GITHUB_TOKEN = "ghp_your_token_here"
# Headers sent on every API call: bearer auth, GitHub's recommended media
# type, and an explicit API version pin.
HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
}
def get_user_profile(username: str, max_retries: int = 3) -> dict | None:
    """Fetch a GitHub user profile.

    Args:
        username: GitHub login to look up.
        max_retries: How many times to wait and retry after a rate-limit 403
            before giving up. The original recursed unconditionally on 403, so
            a persistent 403 (e.g. a revoked token) looped forever.

    Returns:
        A dict of selected profile fields, or None on error.
    """
    url = f"https://api.github.com/users/{username}"
    for attempt in range(max_retries + 1):
        response = requests.get(url, headers=HEADERS)
        if response.status_code == 200:
            data = response.json()
            return {
                "login": data["login"],
                # Optional fields are frequently null on GitHub profiles.
                "name": data.get("name"),
                "company": data.get("company"),
                "location": data.get("location"),
                "bio": data.get("bio"),
                "public_repos": data["public_repos"],
                "followers": data["followers"],
                "following": data["following"],
                "created_at": data["created_at"],
            }
        if response.status_code == 403:
            # 403 from the API usually means rate limiting; wait and retry a
            # bounded number of times instead of recursing indefinitely.
            if attempt < max_retries:
                print("Rate limited! Waiting...")
                time.sleep(60)
                continue
            print(f"Giving up after {max_retries} rate-limit retries")
            return None
        print(f"Error {response.status_code}: {response.text}")
        return None
    return None
profile = get_user_profile("torvalds")
print(profile)
Rate Limit Management
This is the single most important thing to get right. Here's a robust rate limit handler:
import time
def check_rate_limit(response):
    """Inspect GitHub's rate-limit headers and pause until the window resets
    when fewer than 10 requests remain; returns the remaining request count."""
    headers = response.headers
    remaining = int(headers.get("X-RateLimit-Remaining", 0))
    if remaining >= 10:
        return remaining
    # Quota is nearly exhausted — sleep through to the reset timestamp
    # (plus one second of slack for clock skew).
    reset_at = int(headers.get("X-RateLimit-Reset", 0))
    wait_time = reset_at - int(time.time()) + 1
    if wait_time > 0:
        print(f"Rate limit low ({remaining} left). Sleeping {wait_time}s...")
        time.sleep(wait_time)
    return remaining
def safe_request(url: str, headers: dict, max_retries: int = 3) -> dict | None:
    """Make a GET request with automatic rate limit handling.

    Retries 403/429 responses that look like rate limiting. A 403 arriving
    while quota still remains is a genuine permission error (private
    resource, missing scopes) — the original slept through every retry in
    that case, wasting minutes on a request that can never succeed.

    Returns the parsed JSON body, or None after errors/exhausted retries.
    """
    for attempt in range(max_retries):
        response = requests.get(url, headers=headers)
        check_rate_limit(response)
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 403:
            remaining = int(response.headers.get("X-RateLimit-Remaining", 0))
            if remaining > 0:
                # Quota left but still forbidden: waiting won't help.
                print(f"403 Forbidden (not rate-limited): {url}")
                return None
            reset = int(response.headers.get("X-RateLimit-Reset", 0))
            wait = max(reset - int(time.time()), 60)
            print(f"403 Forbidden. Waiting {wait}s (attempt {attempt + 1})")
            time.sleep(wait)
        elif response.status_code == 429:
            # Secondary rate limit: honor the server-suggested pause.
            retry_after = int(response.headers.get("Retry-After", 60))
            print(f"429 Too Many Requests. Waiting {retry_after}s")
            time.sleep(retry_after)
        else:
            print(f"Unexpected status {response.status_code}")
            return None
    return None
Pagination
GitHub paginates all list endpoints. Here's how to handle it properly:
def get_all_repos(username: str, per_page: int = 100) -> list:
    """Fetch every repository for a user, walking all pages of results."""
    collected = []
    page = 1
    while True:
        endpoint = (
            f"https://api.github.com/users/{username}/repos"
            f"?per_page={per_page}&page={page}&sort=updated"
        )
        batch = safe_request(endpoint, HEADERS)
        # An error (None) or an empty page both mean we're done.
        if not batch:
            break
        collected.extend(
            {
                "name": repo["name"],
                "full_name": repo["full_name"],
                "description": repo.get("description"),
                "language": repo.get("language"),
                "stars": repo["stargazers_count"],
                "forks": repo["forks_count"],
                "open_issues": repo["open_issues_count"],
                "created_at": repo["created_at"],
                "updated_at": repo["updated_at"],
                "topics": repo.get("topics", []),
            }
            for repo in batch
        )
        page += 1
        time.sleep(0.5)  # Be polite
    return collected
all_repos = get_all_repos("facebook")
print(f"Found {len(all_repos)} repositories")
Search API
The Search API is incredibly powerful but has its own rate limit: 30 requests per minute (authenticated).
def search_repos(query: str, sort: str = "stars", max_results: int = 200) -> list:
    """Search repositories with pagination.

    The query is URL-encoded before being placed in the URL, so multi-word
    queries and qualifiers (e.g. "artificial intelligence language:python")
    no longer ship literal spaces and colons in the request line.

    Returns at most max_results raw item dicts from the Search API.
    """
    from urllib.parse import quote_plus

    results = []
    page = 1
    per_page = 100
    while len(results) < max_results:
        url = (
            f"https://api.github.com/search/repositories"
            f"?q={quote_plus(query)}&sort={sort}&per_page={per_page}&page={page}"
        )
        data = safe_request(url, HEADERS)
        if not data or "items" not in data:
            break
        results.extend(data["items"])
        # A short page means we've reached the last page of results.
        if len(data["items"]) < per_page:
            break
        page += 1
        time.sleep(2)  # Search API has stricter limits
    return results[:max_results]
# Find popular Python AI projects
ai_repos = search_repos("artificial intelligence language:python", max_results=500)
Pro tip: The Search API caps results at 1,000 items. To get more, split your query by date ranges:
def search_repos_by_date_range(query: str, start_date: str, end_date: str) -> list:
    """Scope a repo search to a creation-date window to work around the
    Search API's 1,000-result cap per query."""
    scoped_query = " ".join([query, f"created:{start_date}..{end_date}"])
    return search_repos(scoped_query)
# Split into monthly chunks
months = [
    ("2025-01-01", "2025-01-31"),
    ("2025-02-01", "2025-02-28"),
    ("2025-03-01", "2025-03-31"),
    # ... continue for each month you need
]
all_results = []
# Each window yields at most 1,000 items, so monthly slices keep every
# individual query under the Search API's result cap.
for start, end in months:
    batch = search_repos_by_date_range("machine learning", start, end)
    all_results.extend(batch)
    print(f"{start} to {end}: {len(batch)} repos")
Method 2: Scraping GitHub's Web Pages
Sometimes the API doesn't expose what you need. GitHub Trending, contribution graphs, and certain profile details require HTML scraping.
Scraping Trending Repositories
from bs4 import BeautifulSoup
import requests
def scrape_trending(language: str = "", since: str = "daily") -> list:
    """Scrape GitHub's trending page.

    Args:
        language: Language slug to filter by (e.g. "python"); "" for all.
        since: Trending window — "daily", "weekly", or "monthly".

    Returns:
        A list of dicts: full_name, description, language, stars_today
        (stars_today is a string count, or None if the badge is absent).

    Raises:
        requests.HTTPError: on a non-2xx response (e.g. when blocked) —
            previously an error page was silently parsed into an empty list,
            indistinguishable from "nothing trending".
    """
    url = f"https://github.com/trending/{language}?since={since}"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36"
        )
    }
    response = requests.get(url, headers=headers)
    # Fail loudly when blocked or rate-limited instead of returning [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    repos = []
    for article in soup.select("article.Box-row"):
        # Repository name lives in the card's heading link.
        link = article.select_one("h2 a")
        if not link:
            continue
        full_name = link.get("href", "").strip("/")
        # Description
        p = article.select_one("p")
        description = p.text.strip() if p else None
        # Language
        lang_span = article.select_one("[itemprop='programmingLanguage']")
        language_name = lang_span.text.strip() if lang_span else None
        # "N stars today" / "N stars this week|month" badge, if present.
        stars_today = None
        for span in article.select("span.d-inline-block"):
            text = span.text.strip()
            if "stars today" in text or "stars this" in text:
                stars_today = text.split()[0].replace(",", "")
                break
        repos.append({
            "full_name": full_name,
            "description": description,
            "language": language_name,
            "stars_today": stars_today,
        })
    return repos
# Print the top five weekly-trending Python repositories.
trending = scrape_trending("python", "weekly")
for repo in trending[:5]:
    print(f"{repo['full_name']}: {repo['stars_today']} stars")
Important: Avoiding Bans When Scraping
GitHub actively detects and blocks scrapers. Here are the rules to follow:
- Use realistic User-Agent strings — rotate them if making many requests
- Add delays between requests — minimum 2-3 seconds for web scraping
- Don't run parallel requests — sequential only
- Use a session to maintain cookies and appear more browser-like
- Respect robots.txt — GitHub allows most paths but monitor for changes
- Consider using residential proxies for large-scale scraping
import random
# Desktop browser User-Agent strings to rotate between requests, so the
# traffic doesn't present a single static fingerprint.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
]
# Shared session: reuses TCP connections and keeps cookies across requests.
session = requests.Session()
def polite_request(url: str) -> requests.Response:
    """Fetch a URL with browser-like headers and a randomized delay."""
    browser_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    }
    session.headers.update(browser_headers)
    # A 2-5 second random pause mimics human browsing pace.
    time.sleep(random.uniform(2, 5))
    return session.get(url)
Method 3: GraphQL API
GitHub's GraphQL API (v4) is more efficient for complex queries because you can fetch exactly what you need in a single request.
def graphql_query(query: str, variables: dict = None) -> dict:
    """POST a query to GitHub's GraphQL endpoint and return the decoded JSON."""
    endpoint = "https://api.github.com/graphql"
    # Only attach "variables" when a non-empty mapping was supplied.
    body = {"query": query, **({"variables": variables} if variables else {})}
    response = requests.post(endpoint, headers=HEADERS, json=body)
    return response.json()
# Get user profile + their top repos in ONE request
# (a single GraphQL call replaces several REST round-trips).
query = """
query($username: String!) {
user(login: $username) {
name
bio
company
location
followers { totalCount }
repositories(first: 10, orderBy: {field: STARGAZERS, direction: DESC}) {
nodes {
name
stargazerCount
forkCount
primaryLanguage { name }
description
}
}
contributionsCollection {
totalCommitContributions
totalPullRequestContributions
totalIssueContributions
}
}
}
"""
result = graphql_query(query, {"username": "torvalds"})
user = result["data"]["user"]
print(f"{user['name']}: {user['followers']['totalCount']} followers")
print(f"Commits: {user['contributionsCollection']['totalCommitContributions']}")
The GraphQL API shares the same 5,000 requests/hour limit but each request can fetch much more data.
Method 4: Pre-Built Solutions
If you don't want to build and maintain your own scraper, there are ready-made tools that handle all the complexity.
Apify GitHub Scraper
Apify's GitHub Scraper handles all the complexity — rate limits, pagination, proxy rotation, and data formatting — out of the box. You configure what you want (profiles, repos, search results) and it delivers clean JSON.
It's particularly useful when you need to scrape at scale without managing infrastructure, and it handles GitHub's anti-bot measures automatically.
The Data Collector API
For broader developer data needs, The Data Collector API offers a REST API that provides GitHub-adjacent data along with data from other platforms. It comes with 100 free API calls (no credit card required), which is great for prototyping and testing data pipelines.
GitHub Archive (BigQuery)
For historical analysis, GH Archive records all public GitHub events and makes them queryable via Google BigQuery. It's free for small queries and perfect for trend analysis.
Best Practices Checklist
Here's a summary of everything you need to remember:
Authentication and Rate Limits
- Always use a Personal Access Token (5,000 req/hr vs 60 without)
- Monitor `X-RateLimit-Remaining` headers on every response
- For the Search API, respect the 30 req/min limit
- Consider using multiple tokens for large-scale operations (within GitHub's ToS)
- Use conditional requests with `If-None-Match`/`If-Modified-Since` headers to save quota
Avoiding Bans
- Space requests 1-2 seconds apart for API, 3-5 seconds for web scraping
- Use realistic User-Agent strings when scraping HTML
- Don't hammer the same endpoint repeatedly in a short period
- Implement exponential backoff on 403/429 responses
- Never scrape from a single IP at high volume — use proxies for large-scale operations
Data Quality
- GitHub profiles can have null fields (company, bio, location) — handle gracefully
- Repository counts include forks — filter with `fork:false` if needed
- Star counts can be stale in search results — verify on the repo endpoint if precision matters
- Contribution graphs only show public activity unless you have the right token scopes
Legal Considerations
- GitHub's Terms of Service prohibit scraping for spam or selling personal data
- Respect `robots.txt` when scraping web pages
- GDPR applies if you're collecting data on EU residents
- Consider whether your use case falls under legitimate interest or requires consent
Putting It All Together: A Complete Scraper
Here's a production-ready example that combines API calls with proper error handling:
import requests
import time
import json
import csv
from datetime import datetime
class GitHubScraper:
    """Minimal GitHub REST scraper with rate-limit-aware requests.

    Combines authenticated requests, pagination, proactive rate-limit
    back-off, and CSV export in one reusable class.
    """

    def __init__(self, token: str):
        self.token = token
        # A Session reuses connections and carries the auth headers along
        # on every request.
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {token}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28"
        })
        self.request_count = 0

    def _request(self, url: str, _attempt: int = 0):
        """Make a rate-limit-aware GET request; return parsed JSON or None.

        _attempt is internal: the original retried via unbounded recursion
        (its "Retry once" comment was wrong), so a persistent 403 — e.g. a
        revoked token — looped forever. Retries are now capped.
        """
        self.request_count += 1
        response = self.session.get(url)
        remaining = int(response.headers.get("X-RateLimit-Remaining", 100))
        if remaining < 50:
            # Sleep proactively until the quota window resets.
            reset = int(response.headers.get("X-RateLimit-Reset", 0))
            wait = max(reset - int(time.time()), 0) + 1
            print(f"[{self.request_count}] Rate limit: {remaining} left, waiting {wait}s")
            time.sleep(wait)
        if response.status_code == 200:
            return response.json()
        elif response.status_code in (403, 429):
            if _attempt < 2:
                time.sleep(60)
                return self._request(url, _attempt + 1)
        return None

    def search_users(self, query: str, max_results: int = 100) -> list:
        """Search for users matching a query and hydrate each full profile."""
        users = []
        page = 1
        while len(users) < max_results:
            url = (
                f"https://api.github.com/search/users"
                f"?q={query}&per_page=100&page={page}"
            )
            data = self._request(url)
            if not data or not data.get("items"):
                break
            for item in data["items"]:
                # Search results are skeletal; fetch the full profile record.
                profile = self._request(item["url"])
                if profile:
                    users.append({
                        "login": profile["login"],
                        "name": profile.get("name"),
                        "email": profile.get("email"),
                        "company": profile.get("company"),
                        "location": profile.get("location"),
                        "bio": profile.get("bio"),
                        "public_repos": profile["public_repos"],
                        "followers": profile["followers"],
                        "created_at": profile["created_at"],
                    })
                time.sleep(1)
            page += 1
        return users[:max_results]

    def export_csv(self, data: list, filename: str):
        """Export a list of dicts to a CSV file; no-op on empty input."""
        if not data:
            return
        with open(filename, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=data[0].keys())
            writer.writeheader()
            writer.writerows(data)
        # Bug fix: this message previously printed the literal text
        # "(unknown)" instead of the target filename.
        print(f"Exported {len(data)} records to {filename}")
# Usage
# Replace the placeholder with a real Personal Access Token before running.
scraper = GitHubScraper("ghp_your_token_here")
developers = scraper.search_users("location:Berlin language:python", max_results=50)
scraper.export_csv(developers, "berlin_python_devs.csv")
Conclusion
Scraping GitHub in 2026 comes down to choosing the right method for your scale:
- Small scale (under 5,000 requests): Use the REST API with a single token
- Medium scale: Use GraphQL to reduce request count, combine with rate limit management
- Large scale: Use pre-built tools like Apify's GitHub Scraper or the Data Collector API to avoid infrastructure headaches
- Historical analysis: Use GH Archive + BigQuery
The key is respecting GitHub's limits. The platform is generous with its API (5,000 req/hr is a lot), and if you're patient and polite with your requests, you can collect enormous amounts of data without any issues.
Happy scraping — and remember, always use the data responsibly.
Top comments (0)