DEV Community

agenthustler
agenthustler

Posted on

GitHub Data Mining: Extract Repos, Stars, and Contributors with Python

Why Mine GitHub Data?

GitHub hosts over 200 million repositories and 100 million developers. Mining this data enables powerful use cases:

  • Developer analytics — Track trending technologies and skill demand
  • Competitive intelligence — Monitor competitor open-source activity
  • Talent sourcing — Find developers by contribution patterns
  • Technology trends — Identify rising frameworks and tools
  • Open source health — Evaluate project sustainability

This guide covers both GitHub's API and web scraping techniques for large-scale data extraction.

GitHub REST API: The Foundation

GitHub's API is well-documented and generous — 5,000 requests/hour with authentication.

Setup

import requests
import time
from datetime import datetime, timedelta

class GitHubClient:
    """Minimal GitHub REST API v3 client with rate-limit back-off."""

    BASE_URL = 'https://api.github.com'

    def __init__(self, token=None):
        """Create a session; pass a personal access token for the 5,000 req/h limit."""
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'DataMiner/1.0'
        })
        if token:
            self.session.headers['Authorization'] = f'token {token}'

    def _get(self, endpoint, params=None):
        """GET a single endpoint; return parsed JSON, or None on a non-200 status.

        When fewer than 10 requests remain (read from the X-RateLimit-*
        response headers), sleeps until the window resets before returning.
        """
        url = f'{self.BASE_URL}/{endpoint}'
        response = self.session.get(url, params=params)

        # Handle rate limiting: back off until the reset timestamp passes.
        remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
        if remaining < 10:
            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            wait = max(reset_time - time.time(), 0) + 1
            print(f'Rate limit approaching, waiting {wait:.0f}s')
            time.sleep(wait)

        return response.json() if response.status_code == 200 else None

    def _get_paginated(self, endpoint, params=None, max_pages=10):
        """Collect a list-returning endpoint across pages (100 items/page).

        Stops early on a failed/empty page or a short page. The caller's
        params dict is copied rather than mutated.
        """
        all_items = []
        # Copy: the original implementation mutated the caller's dict by
        # injecting 'per_page' and 'page' keys into it.
        params = dict(params or {})
        params['per_page'] = 100

        for page in range(1, max_pages + 1):
            params['page'] = page
            data = self._get(endpoint, params)
            if not data:
                break
            all_items.extend(data)
            if len(data) < 100:  # short page => last page
                break

        return all_items

client = GitHubClient('YOUR_GITHUB_TOKEN')  # Optional but recommended
Enter fullscreen mode Exit fullscreen mode

Search Repositories

def search_repos(query, sort='stars', max_results=200):
    """Search GitHub repositories and return up to max_results summary dicts.

    Args:
        query: GitHub search query string (e.g. 'web scraping language:python').
        sort: Search sort field ('stars', 'forks', 'updated', ...).
        max_results: Cap on the number of repos returned.

    Returns:
        List of dicts with name, stars, forks, language, dates, topics, etc.
    """
    all_repos = []
    per_page = 100

    for page in range(1, (max_results // per_page) + 2):
        params = {
            'q': query,
            'sort': sort,
            'order': 'desc',
            'per_page': per_page,
            'page': page,
        }

        data = client._get('search/repositories', params)
        if not data or 'items' not in data:
            break

        for repo in data['items']:
            all_repos.append({
                'full_name': repo['full_name'],
                # The API returns an explicit null for repos without a
                # description; `.get(key, '')` does NOT cover that case
                # (the key is present), so `or ''` prevents None[:200].
                'description': (repo.get('description') or '')[:200],
                'stars': repo['stargazers_count'],
                'forks': repo['forks_count'],
                'language': repo.get('language'),
                'created': repo['created_at'],
                'updated': repo['updated_at'],
                'topics': repo.get('topics', []),
                'open_issues': repo['open_issues_count'],
                'license': repo.get('license', {}).get('spdx_id') if repo.get('license') else None,
                'url': repo['html_url'],
            })

        if len(data['items']) < per_page:
            break

        time.sleep(2)  # Search API is limited to 30 requests/minute

    return all_repos[:max_results]

# Find trending Python web scraping repos
repos = search_repos('web scraping language:python', max_results=50)
# Print the ten most-starred hits with a truncated description.
for r in repos[:10]:
    print(f"{r['full_name']}: {r['stars']:,} stars - {r['description'][:60]}")
Enter fullscreen mode Exit fullscreen mode

Extract Contributors

def get_contributors(owner, repo, max_contributors=100):
    """List a repository's contributors ordered by contribution count.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        max_contributors: Cap on contributors returned.

    Returns:
        List of dicts (username, contributions, profile_url, avatar).
    """
    # Ceiling division: `max_contributors // 100 + 1` over-fetched a whole
    # extra page whenever max_contributors was an exact multiple of 100.
    pages_needed = -(-max_contributors // 100)
    contributors = client._get_paginated(
        f'repos/{owner}/{repo}/contributors',
        max_pages=pages_needed
    )

    return [{
        'username': c['login'],
        'contributions': c['contributions'],
        'profile_url': c['html_url'],
        'avatar': c['avatar_url'],
    } for c in (contributors or [])][:max_contributors]

def get_contributor_details(username):
    """Fetch public profile fields for a single GitHub user.

    Returns None when the lookup fails; otherwise a flat dict of
    username, name, company, location, bio, repo/follower counts,
    account creation date, and blog URL.
    """
    profile = client._get(f'users/{username}')
    if not profile:
        return None

    # Optional fields may be absent/null; required fields always exist.
    details = {key: profile.get(key)
               for key in ('name', 'company', 'location', 'bio')}
    details.update(
        username=profile['login'],
        public_repos=profile['public_repos'],
        followers=profile['followers'],
        following=profile['following'],
        created=profile['created_at'],
        blog=profile.get('blog'),
    )
    return details

# Top contributors to a popular repo
contribs = get_contributors('microsoft', 'playwright')
# Contributors are returned in descending contribution order by the API.
for c in contribs[:10]:
    print(f"  {c['username']}: {c['contributions']} contributions")
Enter fullscreen mode Exit fullscreen mode

Track Star History

def get_stargazers_over_time(owner, repo, max_pages=10, token=None):
    """Get star history with timestamps.

    Uses the application/vnd.github.star+json media type, which adds a
    'starred_at' timestamp to each stargazer entry.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        max_pages: Maximum pages fetched (100 stargazers per page).
        token: Optional personal access token. Previously this was a
            hard-coded 'YOUR_TOKEN' placeholder that sent broken auth.

    Returns:
        List of {'user': login, 'starred_at': ISO timestamp} dicts.
    """
    session = requests.Session()
    session.headers['Accept'] = 'application/vnd.github.star+json'  # Special media type for timestamps
    if token:
        session.headers['Authorization'] = f'token {token}'

    # Hoisted out of the loop: the URL never changes between pages.
    url = f'https://api.github.com/repos/{owner}/{repo}/stargazers'

    stars = []
    for page in range(1, max_pages + 1):
        response = session.get(url, params={'page': page, 'per_page': 100})

        if response.status_code != 200:
            break

        data = response.json()
        if not data:
            break

        for star in data:
            stars.append({
                'user': star['user']['login'],
                'starred_at': star['starred_at'],
            })

        time.sleep(1)

    return stars
Enter fullscreen mode Exit fullscreen mode

Handling API Rate Limits

GitHub's limits:

  • Unauthenticated: 60 requests/hour
  • Authenticated: 5,000 requests/hour
  • Search API: 30 requests/minute
def check_rate_limit():
    """Print remaining core and search API quota with their reset times."""
    data = client._get('rate_limit')
    if not data:
        return

    resources = data['resources']
    for label, bucket in (('Core', resources['core']),
                          ('Search', resources['search'])):
        reset_at = datetime.fromtimestamp(bucket['reset'])
        print(f"{label}: {bucket['remaining']}/{bucket['limit']} (resets {reset_at})")
Enter fullscreen mode Exit fullscreen mode

When the API isn't enough, scraping fills the gap. Using ScrapeOps for proxy rotation helps when you need to supplement API data with web-scraped content from GitHub pages.

Technology Trend Analysis

import pandas as pd
from collections import Counter

def analyze_tech_trends(language, since_days=365):
    """Summarize popular repos created recently for a given language.

    Args:
        language: GitHub language qualifier (e.g. 'python').
        since_days: Only repos created within this many days are counted.

    Returns:
        Dict with repo/star totals, top topics, per-language counts,
        monthly creation counts, and average/median stars.
    """
    since = (datetime.now() - timedelta(days=since_days)).strftime('%Y-%m-%d')

    repos = search_repos(
        f'language:{language} created:>{since} stars:>100',
        sort='stars',
        max_results=200
    )

    # Guard: with no results, df['created'] below would raise KeyError on
    # an empty DataFrame.
    if not repos:
        return {
            'total_repos': 0,
            'total_stars': 0,
            'top_topics': [],
            'languages': {},
            'monthly_creation': {},
            'avg_stars': 0.0,
            'median_stars': 0.0,
        }

    # Topic analysis
    all_topics = []
    for repo in repos:
        all_topics.extend(repo.get('topics', []))

    topic_counts = Counter(all_topics).most_common(20)

    # Language distribution — previously computed but never returned.
    languages = Counter(r['language'] for r in repos if r['language'])

    # Growth metrics
    df = pd.DataFrame(repos)
    df['created'] = pd.to_datetime(df['created'])
    monthly = df.groupby(df['created'].dt.to_period('M')).size()

    return {
        'total_repos': len(repos),
        'total_stars': sum(r['stars'] for r in repos),
        'top_topics': topic_counts,
        'languages': dict(languages),
        'monthly_creation': monthly.to_dict(),
        'avg_stars': df['stars'].mean(),
        'median_stars': df['stars'].median(),
    }

# One year of new Python repos that crossed the 100-star bar.
trends = analyze_tech_trends('python')
print(f"New Python repos (>100 stars): {trends['total_repos']}")
print(f"Top topics: {trends['top_topics'][:10]}")
Enter fullscreen mode Exit fullscreen mode

Building a Developer Profile Database

import sqlite3

class GitHubDatabase:
    """SQLite-backed store for repository and developer snapshots."""

    def __init__(self, db_path='github_data.db'):
        """Open (or create) the database file and ensure tables exist."""
        self.conn = sqlite3.connect(db_path)
        self.setup()

    def setup(self):
        """Create the repos/developers tables if missing (idempotent)."""
        self.conn.executescript('''
            CREATE TABLE IF NOT EXISTS repos (
                full_name TEXT PRIMARY KEY, stars INTEGER,
                forks INTEGER, language TEXT, description TEXT,
                created TEXT, updated TEXT, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS developers (
                username TEXT PRIMARY KEY, name TEXT,
                company TEXT, location TEXT, followers INTEGER,
                public_repos INTEGER, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        ''')

    def save_repos(self, repos):
        """Upsert repo dicts (as produced by search_repos) in one batch.

        Args:
            repos: Iterable of dicts with full_name, stars, forks,
                language, description, created, updated keys.
        """
        rows = [
            (r['full_name'], r['stars'], r['forks'], r['language'],
             # `or ''` guards against a null/missing description; the old
             # `r.get('description', '')[:200]` crashed on an explicit None.
             (r.get('description') or '')[:200], r['created'], r['updated'])
            for r in repos
        ]
        # One executemany call instead of a per-row execute loop.
        self.conn.executemany('''
            INSERT OR REPLACE INTO repos
            (full_name, stars, forks, language, description, created, updated)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        self.conn.commit()

    def close(self):
        """Close the underlying connection (additive; was previously leaked)."""
        self.conn.close()

# Collect 100 repos matching 'web scraping' and persist them locally.
db = GitHubDatabase()
repos = search_repos('web scraping', max_results=100)
db.save_repos(repos)
print(f'Saved {len(repos)} repos to database')
Enter fullscreen mode Exit fullscreen mode

Managed GitHub Scraping

For large-scale GitHub data collection that goes beyond API rate limits, the GitHub Scraper on Apify handles pagination, rate limiting, and structured data extraction automatically.

Best Practices

  1. Always authenticate — 5,000 vs 60 requests/hour
  2. Cache responses — Store data locally to avoid redundant calls
  3. Use conditional requests — send the If-None-Match header to check if data changed
  4. Paginate efficiently — Use per_page=100 to minimize API calls
  5. Handle rate limits gracefully — Check X-RateLimit-Remaining headers
  6. Supplement with scraping — Use ScrapeOps when you need data the API doesn't provide

Conclusion

GitHub's API is one of the best public APIs available, but for large-scale data mining, you'll need to combine API access with smart rate limiting and supplemental scraping. Use the GitHub Scraper on Apify for production workloads, and ScrapeOps for proxy infrastructure when scraping beyond API limits.

Top comments (0)