DEV Community

agenthustler
agenthustler

Posted on

GitHub Data Mining: Extract Repos, Stars, and Contributors with Python

Why Mine GitHub Data?

GitHub hosts over 200 million repositories and 100 million developers. Mining this data enables powerful use cases:

  • Developer analytics — Track trending technologies and skill demand
  • Competitive intelligence — Monitor competitor open-source activity
  • Talent sourcing — Find developers by contribution patterns
  • Technology trends — Identify rising frameworks and tools
  • Open source health — Evaluate project sustainability

This guide covers both GitHub's API and web scraping techniques for large-scale data extraction.

GitHub REST API: The Foundation

GitHub's API is well-documented and generous — 5,000 requests/hour with authentication.

Setup

import requests
import time
from datetime import datetime, timedelta

class GitHubClient:
    """Minimal GitHub REST API v3 client with rate-limit back-off."""

    BASE_URL = 'https://api.github.com'

    def __init__(self, token=None):
        """Create a session; pass a personal access token for the 5,000 req/h limit."""
        self.session = requests.Session()
        self.session.headers.update({
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'DataMiner/1.0'
        })
        if token:
            self.session.headers['Authorization'] = f'token {token}'

    def _get(self, endpoint, params=None):
        """GET a single endpoint; return parsed JSON, or None on a non-200 status.

        When fewer than 10 requests remain (read from the X-RateLimit-*
        response headers), sleeps until the window resets before returning.
        """
        url = f'{self.BASE_URL}/{endpoint}'
        response = self.session.get(url, params=params)

        # Handle rate limiting: back off until the reset timestamp passes.
        remaining = int(response.headers.get('X-RateLimit-Remaining', 1))
        if remaining < 10:
            reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
            wait = max(reset_time - time.time(), 0) + 1
            print(f'Rate limit approaching, waiting {wait:.0f}s')
            time.sleep(wait)

        return response.json() if response.status_code == 200 else None

    def _get_paginated(self, endpoint, params=None, max_pages=10):
        """Collect a list-returning endpoint across pages (100 items/page).

        Stops early on a failed/empty page or a short page. The caller's
        params dict is copied rather than mutated.
        """
        all_items = []
        # Copy: the original implementation mutated the caller's dict by
        # injecting 'per_page' and 'page' keys into it.
        params = dict(params or {})
        params['per_page'] = 100

        for page in range(1, max_pages + 1):
            params['page'] = page
            data = self._get(endpoint, params)
            if not data:
                break
            all_items.extend(data)
            if len(data) < 100:  # short page => last page
                break

        return all_items

client = GitHubClient('YOUR_GITHUB_TOKEN')  # Optional but recommended
Enter fullscreen mode Exit fullscreen mode

Search Repositories

def search_repos(query, sort='stars', max_results=200):
    """Search GitHub repositories and return up to max_results summary dicts.

    Args:
        query: GitHub search query string (e.g. 'web scraping language:python').
        sort: Search sort field ('stars', 'forks', 'updated', ...).
        max_results: Cap on the number of repos returned.

    Returns:
        List of dicts with name, stars, forks, language, dates, topics, etc.
    """
    all_repos = []
    per_page = 100

    for page in range(1, (max_results // per_page) + 2):
        params = {
            'q': query,
            'sort': sort,
            'order': 'desc',
            'per_page': per_page,
            'page': page,
        }

        data = client._get('search/repositories', params)
        if not data or 'items' not in data:
            break

        for repo in data['items']:
            all_repos.append({
                'full_name': repo['full_name'],
                # The API returns an explicit null for repos without a
                # description; `.get(key, '')` does NOT cover that case
                # (the key is present), so `or ''` prevents None[:200].
                'description': (repo.get('description') or '')[:200],
                'stars': repo['stargazers_count'],
                'forks': repo['forks_count'],
                'language': repo.get('language'),
                'created': repo['created_at'],
                'updated': repo['updated_at'],
                'topics': repo.get('topics', []),
                'open_issues': repo['open_issues_count'],
                'license': repo.get('license', {}).get('spdx_id') if repo.get('license') else None,
                'url': repo['html_url'],
            })

        if len(data['items']) < per_page:
            break

        time.sleep(2)  # Search API is limited to 30 requests/minute

    return all_repos[:max_results]

# Find trending Python web scraping repos
repos = search_repos('web scraping language:python', max_results=50)
# Print the ten most-starred hits with a truncated description.
for r in repos[:10]:
    print(f"{r['full_name']}: {r['stars']:,} stars - {r['description'][:60]}")
Enter fullscreen mode Exit fullscreen mode

Extract Contributors

def get_contributors(owner, repo, max_contributors=100):
    """List a repository's contributors ordered by contribution count.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        max_contributors: Cap on contributors returned.

    Returns:
        List of dicts (username, contributions, profile_url, avatar).
    """
    # Ceiling division: `max_contributors // 100 + 1` over-fetched a whole
    # extra page whenever max_contributors was an exact multiple of 100.
    pages_needed = -(-max_contributors // 100)
    contributors = client._get_paginated(
        f'repos/{owner}/{repo}/contributors',
        max_pages=pages_needed
    )

    return [{
        'username': c['login'],
        'contributions': c['contributions'],
        'profile_url': c['html_url'],
        'avatar': c['avatar_url'],
    } for c in (contributors or [])][:max_contributors]

def get_contributor_details(username):
    """Fetch public profile fields for a single GitHub user.

    Returns None when the lookup fails; otherwise a flat dict of
    username, name, company, location, bio, repo/follower counts,
    account creation date, and blog URL.
    """
    profile = client._get(f'users/{username}')
    if not profile:
        return None

    # Optional fields may be absent/null; required fields always exist.
    details = {key: profile.get(key)
               for key in ('name', 'company', 'location', 'bio')}
    details.update(
        username=profile['login'],
        public_repos=profile['public_repos'],
        followers=profile['followers'],
        following=profile['following'],
        created=profile['created_at'],
        blog=profile.get('blog'),
    )
    return details

# Top contributors to a popular repo
contribs = get_contributors('microsoft', 'playwright')
# Contributors are returned in descending contribution order by the API.
for c in contribs[:10]:
    print(f"  {c['username']}: {c['contributions']} contributions")
Enter fullscreen mode Exit fullscreen mode

Track Star History

def get_stargazers_over_time(owner, repo, max_pages=10, token=None):
    """Get star history with timestamps.

    Uses the application/vnd.github.star+json media type, which adds a
    'starred_at' timestamp to each stargazer entry.

    Args:
        owner: Repository owner login.
        repo: Repository name.
        max_pages: Maximum pages fetched (100 stargazers per page).
        token: Optional personal access token. Previously this was a
            hard-coded 'YOUR_TOKEN' placeholder that sent broken auth.

    Returns:
        List of {'user': login, 'starred_at': ISO timestamp} dicts.
    """
    session = requests.Session()
    session.headers['Accept'] = 'application/vnd.github.star+json'  # Special media type for timestamps
    if token:
        session.headers['Authorization'] = f'token {token}'

    # Hoisted out of the loop: the URL never changes between pages.
    url = f'https://api.github.com/repos/{owner}/{repo}/stargazers'

    stars = []
    for page in range(1, max_pages + 1):
        response = session.get(url, params={'page': page, 'per_page': 100})

        if response.status_code != 200:
            break

        data = response.json()
        if not data:
            break

        for star in data:
            stars.append({
                'user': star['user']['login'],
                'starred_at': star['starred_at'],
            })

        time.sleep(1)

    return stars
Enter fullscreen mode Exit fullscreen mode

Handling API Rate Limits

GitHub's limits:

  • Unauthenticated: 60 requests/hour
  • Authenticated: 5,000 requests/hour
  • Search API: 30 requests/minute
def check_rate_limit():
    """Print remaining core and search API quota with their reset times."""
    data = client._get('rate_limit')
    if not data:
        return

    resources = data['resources']
    for label, bucket in (('Core', resources['core']),
                          ('Search', resources['search'])):
        reset_at = datetime.fromtimestamp(bucket['reset'])
        print(f"{label}: {bucket['remaining']}/{bucket['limit']} (resets {reset_at})")
Enter fullscreen mode Exit fullscreen mode

When the API isn't enough, scraping fills the gap. Using ScrapeOps for proxy rotation helps when you need to supplement API data with web-scraped content from GitHub pages.

Technology Trend Analysis

import pandas as pd
from collections import Counter

def analyze_tech_trends(language, since_days=365):
    """Summarize popular repos created recently for a given language.

    Args:
        language: GitHub language qualifier (e.g. 'python').
        since_days: Only repos created within this many days are counted.

    Returns:
        Dict with repo/star totals, top topics, per-language counts,
        monthly creation counts, and average/median stars.
    """
    since = (datetime.now() - timedelta(days=since_days)).strftime('%Y-%m-%d')

    repos = search_repos(
        f'language:{language} created:>{since} stars:>100',
        sort='stars',
        max_results=200
    )

    # Guard: with no results, df['created'] below would raise KeyError on
    # an empty DataFrame.
    if not repos:
        return {
            'total_repos': 0,
            'total_stars': 0,
            'top_topics': [],
            'languages': {},
            'monthly_creation': {},
            'avg_stars': 0.0,
            'median_stars': 0.0,
        }

    # Topic analysis
    all_topics = []
    for repo in repos:
        all_topics.extend(repo.get('topics', []))

    topic_counts = Counter(all_topics).most_common(20)

    # Language distribution — previously computed but never returned.
    languages = Counter(r['language'] for r in repos if r['language'])

    # Growth metrics
    df = pd.DataFrame(repos)
    df['created'] = pd.to_datetime(df['created'])
    monthly = df.groupby(df['created'].dt.to_period('M')).size()

    return {
        'total_repos': len(repos),
        'total_stars': sum(r['stars'] for r in repos),
        'top_topics': topic_counts,
        'languages': dict(languages),
        'monthly_creation': monthly.to_dict(),
        'avg_stars': df['stars'].mean(),
        'median_stars': df['stars'].median(),
    }

# One year of new Python repos that crossed the 100-star bar.
trends = analyze_tech_trends('python')
print(f"New Python repos (>100 stars): {trends['total_repos']}")
print(f"Top topics: {trends['top_topics'][:10]}")
Enter fullscreen mode Exit fullscreen mode

Building a Developer Profile Database

import sqlite3

class GitHubDatabase:
    """SQLite-backed store for repository and developer snapshots."""

    def __init__(self, db_path='github_data.db'):
        """Open (or create) the database file and ensure tables exist."""
        self.conn = sqlite3.connect(db_path)
        self.setup()

    def setup(self):
        """Create the repos/developers tables if missing (idempotent)."""
        self.conn.executescript('''
            CREATE TABLE IF NOT EXISTS repos (
                full_name TEXT PRIMARY KEY, stars INTEGER,
                forks INTEGER, language TEXT, description TEXT,
                created TEXT, updated TEXT, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS developers (
                username TEXT PRIMARY KEY, name TEXT,
                company TEXT, location TEXT, followers INTEGER,
                public_repos INTEGER, collected_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        ''')

    def save_repos(self, repos):
        """Upsert repo dicts (as produced by search_repos) in one batch.

        Args:
            repos: Iterable of dicts with full_name, stars, forks,
                language, description, created, updated keys.
        """
        rows = [
            (r['full_name'], r['stars'], r['forks'], r['language'],
             # `or ''` guards against a null/missing description; the old
             # `r.get('description', '')[:200]` crashed on an explicit None.
             (r.get('description') or '')[:200], r['created'], r['updated'])
            for r in repos
        ]
        # One executemany call instead of a per-row execute loop.
        self.conn.executemany('''
            INSERT OR REPLACE INTO repos
            (full_name, stars, forks, language, description, created, updated)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', rows)
        self.conn.commit()

    def close(self):
        """Close the underlying connection (additive; was previously leaked)."""
        self.conn.close()

# Collect 100 repos matching 'web scraping' and persist them locally.
db = GitHubDatabase()
repos = search_repos('web scraping', max_results=100)
db.save_repos(repos)
print(f'Saved {len(repos)} repos to database')
Enter fullscreen mode Exit fullscreen mode

Managed GitHub Scraping

For large-scale GitHub data collection that goes beyond API rate limits, the GitHub Scraper on Apify handles pagination, rate limiting, and structured data extraction automatically.

Best Practices

  1. Always authenticate — 5,000 vs 60 requests/hour
  2. Cache responses — Store data locally to avoid redundant calls
  3. Use conditional requests — send the If-None-Match header to check if data changed
  4. Paginate efficiently — Use per_page=100 to minimize API calls
  5. Handle rate limits gracefully — Check X-RateLimit-Remaining headers
  6. Supplement with scraping — Use ScrapeOps when you need data the API doesn't provide

Conclusion

GitHub's API is one of the best public APIs available, but for large-scale data mining, you'll need to combine API access with smart rate limiting and supplemental scraping. Use the GitHub Scraper on Apify for production workloads, and ScrapeOps for proxy infrastructure when scraping beyond API limits.

Top comments (0)