DEV Community

agenthustler
agenthustler

Posted on

Scraping OpenAlex and Semantic Scholar for Research Intelligence

Academic research intelligence is a growing field. Whether you're tracking emerging technologies, monitoring competitors' R&D, or building citation networks — OpenAlex and Semantic Scholar are the two largest open databases of scholarly work. Here's how to extract intelligence from both.

OpenAlex vs Semantic Scholar

OpenAlex: 250M+ works, fully open API, no auth required, covers all disciplines. Run by the nonprofit OurResearch.

Semantic Scholar: 200M+ papers, AI2-backed, excellent AI/ML/CS coverage, free API with rate limits.

Querying OpenAlex

import requests
import time

class OpenAlexClient:
    """Thin client for the OpenAlex REST API (https://api.openalex.org).

    No authentication is required. Supplying an email routes requests into
    OpenAlex's "polite pool", which gets faster, more reliable service.
    """

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email=None):
        self.session = requests.Session()
        if email:
            # mailto on every request opts into the polite pool.
            self.session.params = {"mailto": email}

    def search_works(self, query, filters=None, per_page=50, pages=3):
        """Full-text search over works, fetching up to ``pages`` pages.

        Args:
            query: free-text search string.
            filters: optional OpenAlex filter expression,
                e.g. ``"publication_year:2024"``.
            per_page: results per page (OpenAlex caps this at 200).
            pages: maximum number of pages to request.

        Returns:
            A list of work records (dicts) across all fetched pages.

        Raises:
            requests.HTTPError: on any non-2xx response.
        """
        all_results = []
        for page in range(1, pages + 1):
            params = {
                "search": query,
                # OpenAlex documents the hyphenated parameter name "per-page".
                "per-page": per_page,
                "page": page,
            }
            if filters:
                params["filter"] = filters

            resp = self.session.get(f"{self.BASE_URL}/works", params=params)
            resp.raise_for_status()  # fail loudly instead of parsing an error body
            results = resp.json().get("results", [])
            all_results.extend(results)
            if len(results) < per_page:
                break  # short page means we've exhausted the result set
            time.sleep(0.2)  # stay well under OpenAlex's courtesy rate limit
        return all_results

    def get_author_works(self, author_id, since_year=2023):
        """Return an author's works since Jan 1 of ``since_year``, most-cited first.

        Raises:
            requests.HTTPError: on any non-2xx response.
        """
        params = {
            "filter": f"author.id:{author_id},from_publication_date:{since_year}-01-01",
            "sort": "cited_by_count:desc",
            "per-page": 50,
        }
        resp = self.session.get(f"{self.BASE_URL}/works", params=params)
        resp.raise_for_status()
        return resp.json().get("results", [])
Enter fullscreen mode Exit fullscreen mode

Querying Semantic Scholar

class SemanticScholarClient:
    """Client for the Semantic Scholar Academic Graph API (graph/v1).

    Works without a key, but unauthenticated traffic is tightly rate
    limited (HTTP 429); pass ``api_key`` for higher limits.
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1"

    def __init__(self, api_key=None):
        self.session = requests.Session()
        if api_key:
            self.session.headers["x-api-key"] = api_key

    def _get(self, url, params):
        """GET ``url`` and return parsed JSON, retrying once on 429.

        Raises:
            requests.HTTPError: on any non-2xx response after the retry.
        """
        resp = self.session.get(url, params=params)
        if resp.status_code == 429:
            time.sleep(1.0)  # back off once before retrying a rate-limited call
            resp = self.session.get(url, params=params)
        resp.raise_for_status()
        return resp.json()

    def search_papers(self, query, limit=100, fields=None):
        """Keyword search; returns a list of paper dicts.

        Args:
            query: search string.
            limit: max results (the API caps a single request at 100).
            fields: comma-separated field list; sensible default covers
                title, year, citation count, authors, abstract, and URL.
        """
        if fields is None:
            fields = "title,year,citationCount,authors,abstract,url"
        params = {
            "query": query,
            "limit": limit,
            "fields": fields,
        }
        return self._get(f"{self.BASE_URL}/paper/search", params).get("data", [])

    def get_citations(self, paper_id, fields="title,year,citationCount"):
        """Return up to 500 citing-paper records for ``paper_id``."""
        payload = self._get(
            f"{self.BASE_URL}/paper/{paper_id}/citations",
            {"fields": fields, "limit": 500},
        )
        return payload.get("data", [])
Enter fullscreen mode Exit fullscreen mode

Building a Research Trend Tracker

from collections import Counter
from datetime import datetime

def _classify_trend(counts):
    """Label a yearly count series: >50% growth over the window is
    "accelerating", any growth is "growing", otherwise "stable"."""
    if len(counts) >= 2 and counts[-1] > counts[0] * 1.5:
        return "accelerating"
    if len(counts) >= 2 and counts[-1] > counts[0]:
        return "growing"
    return "stable"


def track_research_trend(topic, years=5):
    """Count publications per year for ``topic`` and classify the trend.

    Reads OpenAlex's ``meta.count`` (the total number of matching works)
    instead of counting returned records — the original counted a single
    one-item page, so every yearly figure was 0 or 1 regardless of the
    topic's actual volume.

    Args:
        topic: free-text search query.
        years: how many years back from the current year to include.

    Returns:
        ``{"topic", "yearly_counts", "trend"}`` where trend is one of
        "accelerating", "growing", or "stable".

    Raises:
        requests.HTTPError: on any non-2xx response.
    """
    yearly_counts = {}
    current_year = datetime.now().year

    for year in range(current_year - years, current_year + 1):
        resp = requests.get(
            "https://api.openalex.org/works",
            params={
                "search": topic,
                "filter": f"publication_year:{year}",
                "per-page": 1,  # we only need meta.count, not the records
                "mailto": "research@example.com",  # polite pool
            },
        )
        resp.raise_for_status()
        # OpenAlex reports the total hit count in the response metadata.
        yearly_counts[year] = resp.json().get("meta", {}).get("count", 0)
        time.sleep(0.2)  # respect rate limits

    counts = list(yearly_counts.values())
    return {
        "topic": topic,
        "yearly_counts": yearly_counts,
        "trend": _classify_trend(counts),
    }

# Track multiple research areas and report each topic's trajectory.
topics = ["large language models", "quantum computing", "CRISPR", "fusion energy"]
for topic in topics:
    result = track_research_trend(topic)
    # Separator added: the original f-string ran the trend label straight
    # into the yearly-counts dict repr with no space between them.
    print(f"{result['topic']}: {result['trend']} {result['yearly_counts']}")
Enter fullscreen mode Exit fullscreen mode

Cross-Referencing Both Sources

def cross_reference_paper(title):
    """Look up ``title`` in both OpenAlex and Semantic Scholar and merge
    the key metadata (citation counts plus each source's identifier)."""
    oalex_client = OpenAlexClient(email="research@example.com")
    s2_client = SemanticScholarClient()

    oa_hits = oalex_client.search_works(title, per_page=3, pages=1)
    s2_hits = s2_client.search_papers(title, limit=3)

    # Best match from each source (both APIs return relevance-ordered lists).
    oa_top = oa_hits[0] if oa_hits else None
    s2_top = s2_hits[0] if s2_hits else None

    return {
        "title": title,
        "openalex": {
            "found": len(oa_hits) > 0,
            "citations": oa_top.get("cited_by_count") if oa_top else None,
            "doi": oa_top.get("doi") if oa_top else None,
        },
        "semantic_scholar": {
            "found": len(s2_hits) > 0,
            "citations": s2_top.get("citationCount") if s2_top else None,
            "paper_id": s2_top.get("paperId") if s2_top else None,
        },
    }
Enter fullscreen mode Exit fullscreen mode

Building a Citation Network

def build_citation_network(seed_paper_id, depth=2):
    """Depth-limited citation crawl starting from ``seed_paper_id``.

    Returns ``{"nodes": {paper_id: {"depth": d}}, "edges": [...]}`` where
    each edge points from a citing paper to the paper it cites.
    """
    client = SemanticScholarClient()
    graph = {"nodes": {}, "edges": []}

    def visit(pid, level):
        # Skip anything beyond the depth budget or already recorded.
        if level > depth or pid in graph["nodes"]:
            return
        citing_records = client.get_citations(pid)
        graph["nodes"][pid] = {"depth": level}

        # Cap fan-out at 10 citations per node to keep the crawl bounded.
        for record in citing_records[:10]:
            citing_id = record.get("citingPaper", {}).get("paperId")
            if not citing_id:
                continue
            graph["edges"].append({"from": citing_id, "to": pid})
            if level < depth:
                time.sleep(0.5)  # throttle before descending a level
                visit(citing_id, level + 1)

    visit(seed_paper_id, 0)
    return graph
Enter fullscreen mode Exit fullscreen mode

When You Need Web Scraping

Some academic sources don't have APIs. For scraping university pages, patent databases, or preprint servers, use ScraperAPI for reliable rendering. Scale with ThorData residential proxies for geo-restricted institutional content. Monitor scraper health with ScrapeOps.

Applications

  • Competitive R&D monitoring: Track what competitors are publishing
  • Technology scouting: Identify emerging research areas before they go mainstream
  • Hiring intelligence: Find top researchers by citation impact
  • Grant writing: Map the research landscape to position proposals

OpenAlex and Semantic Scholar together cover virtually all published research. Combining their APIs with web scraping gives you a comprehensive research intelligence platform.

Happy scraping!

Top comments (0)