DEV Community

agenthustler
agenthustler

Posted on

How to Build a University Ranking Tracker with Web Scraping

Introduction

University rankings from QS, Times Higher Education, and US News shift every year — influencing student decisions, institutional funding, and academic reputation. Building a ranking tracker lets you monitor changes, spot trends, and compare institutions systematically.

In this guide, we'll create a Python scraper that tracks university rankings across multiple sources.

Project Setup

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
from datetime import datetime
import time

# Handle anti-bot protection on ranking sites
# Get your API key at https://www.scraperapi.com?fp_ref=the52
SCRAPER_API_KEY = "your_key_here"  # NOTE: replace with a real ScraperAPI key before running
BASE_URL = "http://api.scraperapi.com"  # every scrape request is proxied through this endpoint
Enter fullscreen mode Exit fullscreen mode

Scraping QS World Rankings

QS publishes rankings with detailed methodology scores:

def scrape_qs_rankings(year=2026):
    """Scrape the QS World University Rankings for one edition year.

    Args:
        year: Edition year used in the topuniversities.com URL path.

    Returns:
        List of dicts with keys rank, name, country, overall_score,
        source, year, scraped_at — one per university row found.

    Raises:
        requests.HTTPError: if the proxy API returns an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    url = f"https://www.topuniversities.com/university-rankings/world-university-rankings/{year}"

    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true"  # table is built client-side, so JS rendering is needed
    }

    # Timeout keeps a stalled proxy from hanging the pipeline forever;
    # raise_for_status surfaces auth/quota errors instead of parsing an error page.
    response = requests.get(BASE_URL, params=params, timeout=120)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    def cell_text(row, selector):
        # select_one returns None for a missing cell; avoid AttributeError.
        node = row.select_one(selector)
        return node.text.strip() if node else ""

    universities = []
    for row in soup.select(".uni-row"):
        universities.append({
            "rank": cell_text(row, ".rank"),
            "name": cell_text(row, ".uni-name"),
            "country": cell_text(row, ".country"),
            "overall_score": cell_text(row, ".overall-score"),
            "source": "QS",
            "year": year,
            "scraped_at": datetime.now().isoformat()
        })

    return universities
Enter fullscreen mode Exit fullscreen mode

Scraping Times Higher Education Rankings

THE rankings use a different methodology and scoring:

def scrape_the_rankings(year=2026):
    """Scrape the Times Higher Education World University Rankings.

    Args:
        year: Edition year used in the timeshighereducation.com URL path.

    Returns:
        List of dicts with keys rank, name, country, teaching_score,
        research_score, source, year, scraped_at.

    Raises:
        requests.HTTPError: if the proxy API returns an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    url = f"https://www.timeshighereducation.com/world-university-rankings/{year}/world-ranking"

    # Residential proxies help with geo-restricted content
    params = {
        "api_key": SCRAPER_API_KEY,
        "url": url,
        "render": "true"  # rankings table is rendered by JavaScript
    }

    # Bound the request and fail loudly on HTTP errors rather than
    # silently parsing an error page into zero rows.
    response = requests.get(BASE_URL, params=params, timeout=120)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    universities = []
    for row in soup.select("table.ranking-table tbody tr"):
        cols = row.select("td")
        # Skip malformed/spacer rows that don't have all expected cells.
        if len(cols) >= 5:
            universities.append({
                "rank": cols[0].text.strip(),
                "name": cols[1].text.strip(),
                "country": cols[2].text.strip(),
                "teaching_score": cols[3].text.strip(),
                "research_score": cols[4].text.strip(),
                "source": "THE",
                "year": year,
                "scraped_at": datetime.now().isoformat()
            })

    return universities
Enter fullscreen mode Exit fullscreen mode

Subject-Specific Rankings

def scrape_subject_rankings(subject="computer-science", year=2026):
    """Scrape QS subject-specific rankings for one academic field.

    Args:
        subject: Friendly subject key (see mapping below) or a raw QS
            URL slug for subjects not in the mapping.
        year: Edition year used in the URL (new parameter; defaults to
            the previously hard-coded 2026 for backward compatibility).

    Returns:
        List of dicts with keys rank, university, subject, score.

    Raises:
        requests.HTTPError: if the proxy API returns an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    # Friendly names -> QS URL slugs; unknown keys pass through as slugs.
    subjects = {
        "computer-science": "computer-science-and-information-systems",
        "engineering": "engineering-and-technology",
        "business": "business-and-management-studies",
        "medicine": "medicine"
    }

    slug = subjects.get(subject, subject)
    url = f"https://www.topuniversities.com/university-rankings/university-subject-rankings/{year}/{slug}"

    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    # Timeout + status check: fail fast instead of hanging or parsing an error page.
    response = requests.get(BASE_URL, params=params, timeout=120)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    def cell_text(row, selector):
        # Missing cells yield None from select_one; return "" instead of crashing.
        node = row.select_one(selector)
        return node.text.strip() if node else ""

    results = []
    for row in soup.select(".uni-row"):
        results.append({
            "rank": cell_text(row, ".rank"),
            "university": cell_text(row, ".uni-name"),
            "subject": subject,
            "score": cell_text(row, ".overall-score")
        })

    return results
Enter fullscreen mode Exit fullscreen mode

Storing and Comparing Rankings

def init_database(db_path="rankings.db"):
    """Open (creating if necessary) the ranking-history SQLite database.

    Args:
        db_path: Filesystem path of the SQLite database file.

    Returns:
        An open sqlite3.Connection with the `rankings` table present.
    """
    schema = """
        CREATE TABLE IF NOT EXISTS rankings (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            rank INTEGER,
            university TEXT,
            country TEXT,
            score REAL,
            source TEXT,
            year INTEGER,
            scraped_at TEXT
        )
    """
    connection = sqlite3.connect(db_path)
    # CREATE TABLE IF NOT EXISTS makes repeated initialization a no-op.
    connection.execute(schema)
    connection.commit()
    return connection

def compare_year_over_year(university, db_path="rankings.db"):
    """Report how a university's rank moved between its first and last
    recorded year, per ranking source.

    Args:
        university: Substring matched (SQL LIKE) against the university column.
        db_path: Path to the SQLite database created by init_database().

    Returns:
        DataFrame with columns year, source, rank, score ordered by
        source then year (may be empty if no rows match).
    """
    conn = sqlite3.connect(db_path)
    try:
        query = """
            SELECT year, source, rank, score
            FROM rankings
            WHERE university LIKE ?
            ORDER BY source, year
        """
        df = pd.read_sql(query, conn, params=[f"%{university}%"])
    finally:
        # Close even if the query fails (original leaked the connection on error).
        conn.close()

    # BUG FIX: scraped ranks are often text like "=12" or "601-650";
    # extract the first numeric run so the subtraction below can't crash.
    numeric_rank = pd.to_numeric(
        df["rank"].astype(str).str.extract(r"(\d+)")[0], errors="coerce"
    )

    for source in df["source"].unique():
        ranks = numeric_rank[df["source"] == source].dropna()
        if len(ranks) < 2:
            # Need at least two data points to compare year-over-year.
            continue
        change = ranks.iloc[-1] - ranks.iloc[0]
        if change == 0:
            print(f"{source}: unchanged")
        else:
            # Lower rank number is better, so a negative change is an improvement.
            direction = "improved" if change < 0 else "declined"
            print(f"{source}: {direction} by {abs(change):.0f} positions")

    return df
Enter fullscreen mode Exit fullscreen mode

Automated Tracking Pipeline

def run_tracking_pipeline():
    """Collect QS and THE rankings and append them to the SQLite history.

    BUG FIX: the scrapers emit dict keys ("name", "overall_score",
    "teaching_score", ...) that do not match the columns created by
    init_database() ("university", "score", ...); to_sql with
    if_exists="append" raises on unknown columns. Rows are now renamed
    and projected onto the table schema before inserting.
    """
    conn = init_database()
    try:
        # Columns of the `rankings` table created by init_database()
        # (the autoincrement id is filled in by SQLite).
        table_columns = ["rank", "university", "country", "score", "source", "year", "scraped_at"]

        def store(records, rename):
            # Map scraper keys onto table columns and drop anything
            # the table has no column for (e.g. THE's pillar scores).
            df = pd.DataFrame(records).rename(columns=rename)
            df = df[[c for c in table_columns if c in df.columns]]
            df.to_sql("rankings", conn, if_exists="append", index=False)

        print("Collecting QS rankings...")
        qs_data = scrape_qs_rankings()
        store(qs_data, {"name": "university", "overall_score": "score"})
        time.sleep(5)  # polite pause between sources

        print("Collecting THE rankings...")
        the_data = scrape_the_rankings()
        store(the_data, {"name": "university"})

        print(f"Stored {len(qs_data) + len(the_data)} rankings")
    finally:
        # Close the connection even if a scrape or insert fails.
        conn.close()

# Run the full collection pipeline only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    run_tracking_pipeline()
Enter fullscreen mode Exit fullscreen mode

Conclusion

A university ranking tracker provides valuable data for students, researchers, and institutions. By collecting data from multiple sources and tracking changes over time, you can identify trends that single-year snapshots miss. Use ScraperAPI to handle the JavaScript rendering and anti-bot protections these sites employ.

Top comments (0)