DEV Community

agenthustler
agenthustler

Posted on

Scraping Academic Conference Acceptance Rates with Python

Academic conference acceptance rates are a key metric for researchers choosing where to submit. This data is scattered across individual conference sites — let's build a comprehensive tracker.

Why Acceptance Rates Matter

A conference's acceptance rate signals prestige, competition level, and review rigor. Tracking over time reveals which fields are growing and where new researchers have the best odds.

Scraping Conference Data

import re
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

API_KEY = "YOUR_SCRAPERAPI_KEY"  # Get one at https://www.scraperapi.com?fp_ref=the52

def scrape_conference_stats(conference_url):
    """Fetch a conference page through ScraperAPI and extract acceptance stats.

    Args:
        conference_url: Full URL of the conference page to scrape.

    Returns:
        dict containing whichever of 'submissions', 'accepted' and
        'acceptance_rate' (percent) could be extracted from the page text;
        an empty dict when nothing matched.

    Raises:
        requests.HTTPError: if the proxy request fails.
    """
    # Pass the target URL via `params` so requests percent-encodes it.
    # Hand-interpolating it into the query string would let any '&' or '?'
    # inside conference_url be consumed as ScraperAPI parameters.
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": conference_url},
        timeout=60,
    )
    response.raise_for_status()  # fail loudly instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')

    text = soup.get_text()
    stats = {}

    # Loose, case-insensitive patterns: conference sites phrase these
    # numbers inconsistently.
    submissions_match = re.search(r'(\d+)\s*submissions?', text, re.IGNORECASE)
    accepted_match = re.search(r'(\d+)\s*(?:accepted|papers?\s*accepted)', text, re.IGNORECASE)
    rate_match = re.search(r'acceptance\s*rate[:\s]*(\d+\.?\d*)\s*%', text, re.IGNORECASE)

    if submissions_match:
        stats['submissions'] = int(submissions_match.group(1))
    if accepted_match:
        stats['accepted'] = int(accepted_match.group(1))
    if rate_match:
        stats['acceptance_rate'] = float(rate_match.group(1))
    elif stats.get('submissions') and 'accepted' in stats:
        # Derive the rate only when submissions is non-zero, so a page that
        # happens to match "0 submissions" cannot raise ZeroDivisionError.
        stats['acceptance_rate'] = round(stats['accepted'] / stats['submissions'] * 100, 1)

    return stats

# Conferences to track; extend this list as needed.
conferences = [
    {'name': 'NeurIPS 2025', 'url': 'https://neurips.cc/Conferences/2025'},
    {'name': 'ICML 2025', 'url': 'https://icml.cc/Conferences/2025'},
    {'name': 'CVPR 2025', 'url': 'https://cvpr.thecvf.com/Conferences/2025'},
    {'name': 'ACL 2025', 'url': 'https://2025.aclweb.org/'},
]

results = []
for conf in conferences:
    # Best-effort: one unreachable or malformed site should not abort
    # the whole run — report the failure and move on.
    try:
        stats = scrape_conference_stats(conf['url'])
    except Exception as exc:
        print(f"{conf['name']}: failed ({exc})")
        continue
    stats['conference'] = conf['name']
    results.append(stats)
    print(f"{conf['name']}: {stats}")

# Persist whatever was collected; columns are the union of extracted keys.
df = pd.DataFrame(results)
df.to_csv('conference_acceptance_rates.csv', index=False)
Enter fullscreen mode Exit fullscreen mode

Scraping WikiCFP for Historical Data

def scrape_wikicfp(search_term):
    """Search WikiCFP for calls-for-papers matching *search_term*.

    Args:
        search_term: Free-text query (spaces allowed; it is URL-encoded).

    Returns:
        list of dicts with 'conference', 'full_name', 'deadline' and
        'location' keys, one per result row.

    Raises:
        requests.HTTPError: if the proxy request fails.
    """
    # Build the WikiCFP URL with urlencode, then hand the WHOLE thing to
    # requests via `params` so it gets percent-encoded. The original
    # hand-built string left '&year=a' to be parsed as a ScraperAPI
    # parameter, silently dropping WikiCFP's year filter.
    target = "http://www.wikicfp.com/cfp/servlet/tool.search?" + urlencode(
        {"q": search_term, "year": "a"}
    )
    response = requests.get(
        "http://api.scraperapi.com",
        params={"api_key": API_KEY, "url": target},
        timeout=60,
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    cfps = []
    for row in soup.select('table.contsec tr'):
        cells = row.select('td')
        # Result rows have at least 4 cells; header/spacer rows do not.
        if len(cells) >= 4:
            link = cells[0].select_one('a')
            cfps.append({
                'conference': link.text.strip() if link else '',
                'full_name': cells[1].text.strip(),
                'deadline': cells[2].text.strip(),
                'location': cells[3].text.strip()
            })
    return cfps

# Example: pull every machine-learning CFP that WikiCFP knows about.
ml_conferences = scrape_wikicfp('machine learning')
cfp_count = len(ml_conferences)
print(f"Found {cfp_count} ML conference CFPs")
Enter fullscreen mode Exit fullscreen mode

Trend Analysis

If you scrape many academic sites at scale, a rotating-proxy service such as ThorData can help you avoid rate limits. Once you have historical data, plot how each conference's acceptance rate has changed over time:

import matplotlib.pyplot as plt

historical = pd.read_csv('conference_acceptance_rates.csv')

# The scraper above writes no 'year' column, so sort_values('year') would
# raise KeyError on its output. When the column is missing, derive it from
# conference names like "NeurIPS 2025"; rows without a 4-digit year get NA.
if 'year' not in historical.columns:
    historical['year'] = (
        historical['conference'].str.extract(r'(\d{4})', expand=False).astype('Int64')
    )

# One line per conference, points ordered chronologically.
for conf in historical['conference'].unique():
    data = historical[historical['conference'] == conf].sort_values('year')
    plt.plot(data['year'], data['acceptance_rate'], marker='o', label=conf)

plt.xlabel('Year')
plt.ylabel('Acceptance Rate (%)')
plt.title('Conference Acceptance Rates Over Time')
plt.legend(bbox_to_anchor=(1.05, 1))  # place legend outside the axes
plt.tight_layout()
plt.savefig('acceptance_trends.png', dpi=150)
plt.close()  # release the figure so repeated runs don't accumulate axes
Enter fullscreen mode Exit fullscreen mode

Insights

  • Top ML conferences dropped from ~25% to ~15% acceptance in 5 years
  • Workshop papers have 40-60% acceptance rates
  • Regional conferences offer 30-40% — great for early-career researchers
  • For ongoing monitoring, a scheduler such as ScrapeOps can run these scrapers regularly, while ScraperAPI handles the varied tech stacks of individual conference sites

Top comments (0)