Academic research intelligence is a growing field. Whether you're tracking emerging technologies, monitoring competitors' R&D, or building citation networks — OpenAlex and Semantic Scholar are the two largest open databases of scholarly work. Here's how to extract intelligence from both.
OpenAlex vs Semantic Scholar
OpenAlex: 250M+ works, fully open API, no auth required, covers all disciplines. Run by the nonprofit OurResearch.
Semantic Scholar: 200M+ papers, AI2-backed, excellent AI/ML/CS coverage, free API with rate limits.
Querying OpenAlex
import requests
import time
class OpenAlexClient:
    """Minimal client for the OpenAlex REST API (no authentication required).

    Supplying an ``email`` routes every request through OpenAlex's "polite
    pool", which gets faster and more reliable service.
    """

    BASE_URL = "https://api.openalex.org"

    def __init__(self, email=None):
        self.session = requests.Session()
        if email:
            # Session-level params are merged into every request.
            self.session.params = {"mailto": email}  # Polite pool

    def search_works(self, query, filters=None, per_page=50, pages=3):
        """Full-text search over works, collecting up to ``pages`` pages.

        Args:
            query: Free-text search string.
            filters: Optional OpenAlex filter expression,
                e.g. ``"publication_year:2024"``.
            per_page: Results per page (OpenAlex caps this at 200).
            pages: Number of pages to fetch.

        Returns:
            List of work records (dicts) across all fetched pages.
        """
        all_results = []
        for page in range(1, pages + 1):
            params = {
                "search": query,
                # BUG FIX: OpenAlex's paging parameter is hyphenated
                # ("per-page"); the underscore form is ignored and the API
                # silently falls back to its default page size of 25.
                "per-page": per_page,
                "page": page,
            }
            if filters:
                params["filter"] = filters
            resp = self.session.get(f"{self.BASE_URL}/works", params=params)
            resp.raise_for_status()  # fail loudly instead of parsing an error body
            all_results.extend(resp.json().get("results", []))
            time.sleep(0.2)  # Respect rate limits
        return all_results

    def get_author_works(self, author_id, since_year=2023):
        """Return an author's works published since ``since_year``, most-cited first."""
        params = {
            "filter": (
                f"author.id:{author_id},"
                f"from_publication_date:{since_year}-01-01"
            ),
            "sort": "cited_by_count:desc",
            "per-page": 50,  # hyphenated, per OpenAlex paging docs
        }
        resp = self.session.get(f"{self.BASE_URL}/works", params=params)
        resp.raise_for_status()
        return resp.json().get("results", [])
Querying Semantic Scholar
class SemanticScholarClient:
    """Small HTTP client for the Semantic Scholar Academic Graph API.

    An optional API key lifts the shared public rate limit; without one,
    requests go through the anonymous pool.
    """

    BASE_URL = "https://api.semanticscholar.org/graph/v1"

    def __init__(self, api_key=None):
        self.session = requests.Session()
        if api_key:
            # Authenticated requests get a dedicated rate-limit bucket.
            self.session.headers["x-api-key"] = api_key

    def search_papers(self, query, limit=100, fields=None):
        """Keyword search; returns a list of paper records (dicts)."""
        if fields is None:
            fields = "title,year,citationCount,authors,abstract,url"
        search_params = {"query": query, "limit": limit, "fields": fields}
        resp = self.session.get(
            f"{self.BASE_URL}/paper/search", params=search_params
        )
        return resp.json().get("data", [])

    def get_citations(self, paper_id, fields="title,year,citationCount"):
        """Return up to 500 records for papers that cite ``paper_id``."""
        citations_url = f"{self.BASE_URL}/paper/{paper_id}/citations"
        resp = self.session.get(
            citations_url, params={"fields": fields, "limit": 500}
        )
        return resp.json().get("data", [])
Building a Research Trend Tracker
from collections import Counter
from datetime import datetime
def track_research_trend(topic, years=5):
    """Count works per year matching *topic* and classify the growth trend.

    Queries OpenAlex once per year and reads the total hit count from the
    response's ``meta.count`` field.

    BUG FIX: the original used ``len(works)`` on a request with
    ``per_page=1, pages=1``, so every yearly count was 0 or 1 and the trend
    labels were meaningless. The total is only available in ``meta.count``,
    never via the length of one results page.

    Args:
        topic: Free-text search query.
        years: How many years back from the current year to cover.

    Returns:
        Dict with the topic, a ``{year: count}`` mapping, and a ``trend``
        label: "accelerating", "growing", or "stable".
    """
    oalex = OpenAlexClient(email="research@example.com")
    yearly_counts = {}
    current_year = datetime.now().year  # hoisted: one consistent "now"
    for year in range(current_year - years, current_year + 1):
        resp = oalex.session.get(
            f"{oalex.BASE_URL}/works",
            params={
                "search": topic,
                "filter": f"publication_year:{year}",
                "per-page": 1,  # we only need meta.count, not the results
            },
        )
        # OpenAlex returns the total number of matching works in meta.count.
        yearly_counts[year] = resp.json().get("meta", {}).get("count", 0)
        time.sleep(0.2)  # polite pacing between yearly queries
    # Identify growth trajectory from first vs. last year in the window.
    counts = list(yearly_counts.values())
    if len(counts) >= 2 and counts[-1] > counts[0] * 1.5:
        trend = "accelerating"  # >50% growth over the window
    elif len(counts) >= 2 and counts[-1] > counts[0]:
        trend = "growing"
    else:
        trend = "stable"
    return {"topic": topic, "yearly_counts": yearly_counts, "trend": trend}
# Survey several research areas and report each one's trajectory.
topics = [
    "large language models",
    "quantum computing",
    "CRISPR",
    "fusion energy",
]
for topic in topics:
    result = track_research_trend(topic)
    print(f"{result['topic']}: {result['trend']} — {result['yearly_counts']}")
Cross-Referencing Both Sources
def cross_reference_paper(title):
    """Look up *title* in OpenAlex and Semantic Scholar and merge the metadata.

    Returns a dict recording, per source, whether the paper was found plus
    its citation count and identifier (DOI for OpenAlex, paperId for S2).
    """
    oalex = OpenAlexClient(email="research@example.com")
    s2 = SemanticScholarClient()

    oa_results = oalex.search_works(title, per_page=3, pages=1)
    s2_results = s2.search_papers(title, limit=3)

    # Take the top hit from each source, if any, to avoid repeated indexing.
    oa_top = oa_results[0] if oa_results else None
    s2_top = s2_results[0] if s2_results else None

    return {
        "title": title,
        "openalex": {
            "found": oa_top is not None,
            "citations": oa_top.get("cited_by_count") if oa_top else None,
            "doi": oa_top.get("doi") if oa_top else None,
        },
        "semantic_scholar": {
            "found": s2_top is not None,
            "citations": s2_top.get("citationCount") if s2_top else None,
            "paper_id": s2_top.get("paperId") if s2_top else None,
        },
    }
Building a Citation Network
def build_citation_network(seed_paper_id, depth=2):
    """Expand a citation graph outward from one seed paper.

    Recursively follows "cited by" links up to ``depth`` hops, keeping at
    most 10 citing papers per node to bound the fan-out.

    Returns:
        ``{"nodes": {paper_id: {"depth": d}}, "edges": [{"from": citing,
        "to": cited}, ...]}``. Note that citing papers discovered at the
        depth limit appear in edges but are not added as nodes.
    """
    s2 = SemanticScholarClient()
    network = {"nodes": {}, "edges": []}

    def visit(pid, level):
        # Stop at the depth limit; never revisit an already-recorded node.
        if level > depth or pid in network["nodes"]:
            return
        citing_papers = s2.get_citations(pid)
        network["nodes"][pid] = {"depth": level}
        for entry in citing_papers[:10]:  # cap breadth per node
            citing_id = entry.get("citingPaper", {}).get("paperId")
            if not citing_id:
                continue
            network["edges"].append({"from": citing_id, "to": pid})
            if level < depth:
                time.sleep(0.5)  # throttle before the next API call
                visit(citing_id, level + 1)

    visit(seed_paper_id, 0)
    return network
When You Need Web Scraping
Some academic sources don't have APIs. For scraping university pages, patent databases, or preprint servers, use ScraperAPI for reliable rendering. Scale with ThorData residential proxies for geo-restricted institutional content. Monitor scraper health with ScrapeOps.
Applications
- Competitive R&D monitoring: Track what competitors are publishing
- Technology scouting: Identify emerging research areas before they go mainstream
- Hiring intelligence: Find top researchers by citation impact
- Grant writing: Map the research landscape to position proposals
OpenAlex and Semantic Scholar together cover virtually all published research. Combining their APIs with web scraping gives you a comprehensive research intelligence platform.
Happy scraping!
Top comments (0)