DEV Community

agenthustler
agenthustler

Posted on

Scraping OpenAlex and Semantic Scholar for Research Intelligence

Academic research intelligence is a growing field. Whether you're tracking emerging technologies, monitoring competitors' R&D, or building citation networks — OpenAlex and Semantic Scholar are the two largest open databases of scholarly work. Here's how to extract intelligence from both.

OpenAlex vs Semantic Scholar

OpenAlex: 250M+ works, fully open API, no auth required, covers all disciplines. Run by the nonprofit OurResearch.

Semantic Scholar: 200M+ papers, AI2-backed, excellent AI/ML/CS coverage, free API with rate limits.

Querying OpenAlex

import requests
import time

class OpenAlexClient:
    """Thin client for the OpenAlex REST API (https://api.openalex.org).

    No authentication is required. Supplying an email routes requests into
    OpenAlex's "polite pool", which gets faster, more reliable service.
    """

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email=None):
        self.session = requests.Session()
        if email:
            # mailto on every request opts into the polite pool.
            self.session.params = {"mailto": email}

    def search_works(self, query, filters=None, per_page=50, pages=3):
        """Full-text search over works, fetching up to ``pages`` pages.

        Args:
            query: free-text search string.
            filters: optional OpenAlex filter expression,
                e.g. ``"publication_year:2024"``.
            per_page: results per page (OpenAlex caps this at 200).
            pages: maximum number of pages to request.

        Returns:
            A list of work records (dicts) across all fetched pages.

        Raises:
            requests.HTTPError: on any non-2xx response.
        """
        all_results = []
        for page in range(1, pages + 1):
            params = {
                "search": query,
                # OpenAlex documents the hyphenated parameter name "per-page".
                "per-page": per_page,
                "page": page,
            }
            if filters:
                params["filter"] = filters

            resp = self.session.get(f"{self.BASE_URL}/works", params=params)
            resp.raise_for_status()  # fail loudly instead of parsing an error body
            results = resp.json().get("results", [])
            all_results.extend(results)
            if len(results) < per_page:
                break  # short page means we've exhausted the result set
            time.sleep(0.2)  # stay well under OpenAlex's courtesy rate limit
        return all_results

    def get_author_works(self, author_id, since_year=2023):
        """Return an author's works since Jan 1 of ``since_year``, most-cited first.

        Raises:
            requests.HTTPError: on any non-2xx response.
        """
        params = {
            "filter": f"author.id:{author_id},from_publication_date:{since_year}-01-01",
            "sort": "cited_by_count:desc",
            "per-page": 50,
        }
        resp = self.session.get(f"{self.BASE_URL}/works", params=params)
        resp.raise_for_status()
        return resp.json().get("results", [])
Enter fullscreen mode Exit fullscreen mode

Querying Semantic Scholar

class SemanticScholarClient:
    """Client for the Semantic Scholar Academic Graph API (graph/v1).

    Works without a key, but unauthenticated traffic is tightly rate
    limited (HTTP 429); pass ``api_key`` for higher limits.
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1"

    def __init__(self, api_key=None):
        self.session = requests.Session()
        if api_key:
            self.session.headers["x-api-key"] = api_key

    def _get(self, url, params):
        """GET ``url`` and return parsed JSON, retrying once on 429.

        Raises:
            requests.HTTPError: on any non-2xx response after the retry.
        """
        resp = self.session.get(url, params=params)
        if resp.status_code == 429:
            time.sleep(1.0)  # back off once before retrying a rate-limited call
            resp = self.session.get(url, params=params)
        resp.raise_for_status()
        return resp.json()

    def search_papers(self, query, limit=100, fields=None):
        """Keyword search; returns a list of paper dicts.

        Args:
            query: search string.
            limit: max results (the API caps a single request at 100).
            fields: comma-separated field list; sensible default covers
                title, year, citation count, authors, abstract, and URL.
        """
        if fields is None:
            fields = "title,year,citationCount,authors,abstract,url"
        params = {
            "query": query,
            "limit": limit,
            "fields": fields,
        }
        return self._get(f"{self.BASE_URL}/paper/search", params).get("data", [])

    def get_citations(self, paper_id, fields="title,year,citationCount"):
        """Return up to 500 citing-paper records for ``paper_id``."""
        payload = self._get(
            f"{self.BASE_URL}/paper/{paper_id}/citations",
            {"fields": fields, "limit": 500},
        )
        return payload.get("data", [])
Enter fullscreen mode Exit fullscreen mode

Building a Research Trend Tracker

from collections import Counter
from datetime import datetime

def _classify_trend(counts):
    """Label a yearly count series: >50% growth over the window is
    "accelerating", any growth is "growing", otherwise "stable"."""
    if len(counts) >= 2 and counts[-1] > counts[0] * 1.5:
        return "accelerating"
    if len(counts) >= 2 and counts[-1] > counts[0]:
        return "growing"
    return "stable"


def track_research_trend(topic, years=5):
    """Count publications per year for ``topic`` and classify the trend.

    Reads OpenAlex's ``meta.count`` (the total number of matching works)
    instead of counting returned records — the original counted a single
    one-item page, so every yearly figure was 0 or 1 regardless of the
    topic's actual volume.

    Args:
        topic: free-text search query.
        years: how many years back from the current year to include.

    Returns:
        ``{"topic", "yearly_counts", "trend"}`` where trend is one of
        "accelerating", "growing", or "stable".

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    yearly_counts = {}
    current_year = datetime.now().year

    for year in range(current_year - years, current_year + 1):
        resp = requests.get(
            "https://api.openalex.org/works",
            params={
                "search": topic,
                "filter": f"publication_year:{year}",
                "per-page": 1,  # we only need meta.count, not the records
                "mailto": "research@example.com",  # polite pool
            },
        )
        resp.raise_for_status()
        # OpenAlex reports the total hit count in the response metadata.
        yearly_counts[year] = resp.json().get("meta", {}).get("count", 0)
        time.sleep(0.2)  # respect rate limits

    counts = list(yearly_counts.values())
    return {
        "topic": topic,
        "yearly_counts": yearly_counts,
        "trend": _classify_trend(counts),
    }

# Track multiple research areas and report each topic's trajectory.
topics = ["large language models", "quantum computing", "CRISPR", "fusion energy"]
for topic in topics:
    result = track_research_trend(topic)
    # Separator added: the original f-string ran the trend label straight
    # into the yearly-counts dict repr with no space between them.
    print(f"{result['topic']}: {result['trend']} {result['yearly_counts']}")
Enter fullscreen mode Exit fullscreen mode

Cross-Referencing Both Sources

def cross_reference_paper(title):
    """Look up ``title`` in both OpenAlex and Semantic Scholar and merge
    the key metadata (citation counts plus each source's identifier)."""
    oalex_client = OpenAlexClient(email="research@example.com")
    s2_client = SemanticScholarClient()

    oa_hits = oalex_client.search_works(title, per_page=3, pages=1)
    s2_hits = s2_client.search_papers(title, limit=3)

    # Best match from each source (both APIs return relevance-ordered lists).
    oa_top = oa_hits[0] if oa_hits else None
    s2_top = s2_hits[0] if s2_hits else None

    return {
        "title": title,
        "openalex": {
            "found": len(oa_hits) > 0,
            "citations": oa_top.get("cited_by_count") if oa_top else None,
            "doi": oa_top.get("doi") if oa_top else None,
        },
        "semantic_scholar": {
            "found": len(s2_hits) > 0,
            "citations": s2_top.get("citationCount") if s2_top else None,
            "paper_id": s2_top.get("paperId") if s2_top else None,
        },
    }
Enter fullscreen mode Exit fullscreen mode

Building a Citation Network

def build_citation_network(seed_paper_id, depth=2):
    """Depth-limited citation crawl starting from ``seed_paper_id``.

    Returns ``{"nodes": {paper_id: {"depth": d}}, "edges": [...]}`` where
    each edge points from a citing paper to the paper it cites.
    """
    client = SemanticScholarClient()
    graph = {"nodes": {}, "edges": []}

    def visit(pid, level):
        # Skip anything beyond the depth budget or already recorded.
        if level > depth or pid in graph["nodes"]:
            return
        citing_records = client.get_citations(pid)
        graph["nodes"][pid] = {"depth": level}

        # Cap fan-out at 10 citations per node to keep the crawl bounded.
        for record in citing_records[:10]:
            citing_id = record.get("citingPaper", {}).get("paperId")
            if not citing_id:
                continue
            graph["edges"].append({"from": citing_id, "to": pid})
            if level < depth:
                time.sleep(0.5)  # throttle before descending a level
                visit(citing_id, level + 1)

    visit(seed_paper_id, 0)
    return graph
Enter fullscreen mode Exit fullscreen mode

When You Need Web Scraping

Some academic sources don't have APIs. For scraping university pages, patent databases, or preprint servers, use ScraperAPI for reliable rendering. Scale with ThorData residential proxies for geo-restricted institutional content. Monitor scraper health with ScrapeOps.

Applications

  • Competitive R&D monitoring: Track what competitors are publishing
  • Technology scouting: Identify emerging research areas before they go mainstream
  • Hiring intelligence: Find top researchers by citation impact
  • Grant writing: Map the research landscape to position proposals

OpenAlex and Semantic Scholar together cover virtually all published research. Combining their APIs with web scraping gives you a comprehensive research intelligence platform.

Happy scraping!

Top comments (0)