Most developers know Google Scholar for finding papers. But Google Scholar has no public API and blocks scraping aggressively.
Meanwhile, Crossref quietly maintains a free API with 130 million+ scholarly works — no API key, no OAuth, no rate-limit registration. Just send a GET request and parse JSON.
Here is how to use it for research automation, citation analysis, and building academic tools.
What Is Crossref?
Crossref is the DOI registration agency used by most academic publishers. When a paper gets a DOI (like 10.1038/nature12373), it gets registered in Crossref. Their API exposes metadata for every registered work.
130M+ works. Free. No key. JSON responses.
Basic Search
import requests
def search_papers(query, rows=5, timeout=30):
    """Search Crossref for works matching a free-text query.

    Args:
        query: Free-text search string.
        rows: Maximum number of results to return.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with keys: title, doi, year, citations, type, publisher.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    resp = requests.get(
        "https://api.crossref.org/works",
        params={"query": query, "rows": rows, "sort": "relevance"},
        timeout=timeout,  # never hang forever on a stalled connection
    )
    resp.raise_for_status()  # fail loudly instead of parsing an error body
    papers = []
    for item in resp.json()["message"]["items"]:
        # Crossref titles are lists and can be empty ([]), in which case
        # dict.get's default does NOT apply — guard with `or`.
        titles = item.get("title") or [""]
        # Prefer the print publication year; fall back to the DOI
        # registration ("created") year when print metadata is missing.
        year = (
            item.get("published-print", {}).get("date-parts", [[None]])[0][0]
            or item.get("created", {}).get("date-parts", [[None]])[0][0]
        )
        papers.append({
            "title": titles[0],
            "doi": item.get("DOI", ""),
            "year": year,
            "citations": item.get("is-referenced-by-count", 0),
            "type": item.get("type", ""),
            "publisher": item.get("publisher", ""),
        })
    return papers
# Demo: run a search and print a compact one-line summary per hit.
results = search_papers("retrieval augmented generation")
for paper in results:
    print(f"[{paper['year']}] {paper['title'][:60]}... ({paper['citations']} citations)")
Output:
[2024] Retrieval-Augmented Generation for Large Language Mo... (847 citations)
[2023] A Survey on Retrieval-Augmented Text Generation... (312 citations)
...
DOI Lookup — Get Full Metadata
Every DOI resolves to a complete metadata record:
def get_paper(doi, timeout=30):
    """Fetch the full Crossref metadata record for a single DOI.

    Args:
        doi: The DOI to resolve, e.g. "10.1038/nature12373".
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with title, authors, journal, year, citations, references,
        abstract, and url.

    Raises:
        requests.HTTPError: Non-2xx response; a 404 means the DOI is not
            registered with Crossref.
    """
    resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=timeout)
    resp.raise_for_status()
    item = resp.json()["message"]
    # "title" and "container-title" are lists that may be empty ([]),
    # so dict.get's default alone is not enough — guard with `or`.
    titles = item.get("title") or [""]
    journals = item.get("container-title") or [""]
    # Fall back to the DOI registration year when the print date is absent
    # (same convention as search_papers).
    year = (
        item.get("published-print", {}).get("date-parts", [[None]])[0][0]
        or item.get("created", {}).get("date-parts", [[None]])[0][0]
    )
    return {
        "title": titles[0],
        "authors": [
            # strip() drops the stray space left when given or family is missing
            f"{a.get('given', '')} {a.get('family', '')}".strip()
            for a in item.get("author", [])
        ],
        "journal": journals[0],
        "year": year,
        "citations": item.get("is-referenced-by-count", 0),
        "references": len(item.get("reference", [])),
        # NOTE: when present, Crossref abstracts are JATS-flavored XML,
        # not plain text — strip tags before displaying.
        "abstract": item.get("abstract", ""),
        "url": item.get("URL", ""),
    }
# Demo: look up one landmark DOI and print a short summary card.
record = get_paper("10.1038/nature12373")
summary = [
    f"Title: {record['title']}",
    f"Authors: {', '.join(record['authors'][:3])}",
    f"Journal: {record['journal']}",
    f"Citations: {record['citations']}",
]
print("\n".join(summary))
Find Most-Cited Papers in a Field
def top_cited(query, rows=10, timeout=30):
    """Return the most-cited Crossref works matching a query.

    Args:
        query: Free-text search string.
        rows: Maximum number of results to return.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with keys: title (truncated to 80 chars),
        citations, year — sorted by citation count, descending.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    resp = requests.get(
        "https://api.crossref.org/works",
        params={
            "query": query,
            "rows": rows,
            "sort": "is-referenced-by-count",  # incoming-citation count
            "order": "desc",
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    results = []
    for item in resp.json()["message"]["items"]:
        # Titles are lists and can be empty — guard before indexing.
        titles = item.get("title") or [""]
        results.append({
            "title": titles[0][:80],
            "citations": item.get("is-referenced-by-count", 0),
            # DOI registration year: a cheap proxy for publication year.
            "year": item.get("created", {}).get("date-parts", [[None]])[0][0],
        })
    return results
# Demo: rank RAG papers by how often they have been cited.
ranked = top_cited("retrieval augmented generation")
for entry in ranked:
    print(f" [{entry['year']}] {entry['title']} — {entry['citations']} citations")
Citation Network Analysis
Build a citation graph from any paper:
def get_references(doi, timeout=30):
    """List the works a paper cites, keeping only references with a DOI.

    Args:
        doi: DOI of the citing paper.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A list of dicts with keys: doi, title (truncated to 80 chars), year.
        References deposited without a DOI are skipped.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    resp = requests.get(f"https://api.crossref.org/works/{doi}", timeout=timeout)
    resp.raise_for_status()
    refs = resp.json()["message"].get("reference", [])
    return [
        {
            "doi": ref.get("DOI", ""),
            # Structured title when the publisher deposited one, otherwise
            # the raw unstructured citation string.
            "title": ref.get("article-title", ref.get("unstructured", ""))[:80],
            "year": ref.get("year", ""),
        }
        for ref in refs
        if ref.get("DOI")  # drop references that cannot be resolved
    ]
# Get what a paper cites
# NOTE(review): 10.48550/... DOIs are minted for arXiv via DataCite, not
# Crossref — confirm this lookup actually resolves before relying on it.
refs = get_references("10.48550/arXiv.2005.11401")  # RAG paper
print(f"This paper cites {len(refs)} works with DOIs")
for ref in refs[:5]:
    print(f" - {ref['title']}")
Filter by Date Range
def recent_papers(query, from_date="2024-01-01", rows=10, timeout=30):
    """Search Crossref for works published on or after a given date.

    Args:
        query: Free-text search string.
        from_date: Earliest publication date, "YYYY-MM-DD".
        rows: Maximum number of results to return.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The raw list of Crossref work items (newest first).

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
    """
    resp = requests.get(
        "https://api.crossref.org/works",
        params={
            "query": query,
            "rows": rows,
            "filter": f"from-pub-date:{from_date}",  # Crossref filter syntax
            "sort": "published",
            "order": "desc",
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()["message"]["items"]
# Papers about LLM evaluation from 2024+
recent = recent_papers("large language model evaluation", "2024-01-01")
print(f"Found {len(recent)} recent papers")
Journal Statistics
def journal_info(issn, timeout=30):
    """Fetch summary metadata for a journal by its ISSN.

    Args:
        issn: The journal's ISSN, e.g. "0028-0836" for Nature.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with keys: title, publisher, total_dois (number of works
        the journal has registered with Crossref).

    Raises:
        requests.HTTPError: Non-2xx response; a 404 means Crossref does
            not know this ISSN.
    """
    resp = requests.get(f"https://api.crossref.org/journals/{issn}", timeout=timeout)
    resp.raise_for_status()
    journal = resp.json()["message"]
    return {
        "title": journal.get("title", ""),
        "publisher": journal.get("publisher", ""),
        "total_dois": journal.get("counts", {}).get("total-dois", 0),
    }
# Demo: Nature's Crossref record, looked up by ISSN.
info = journal_info("0028-0836")
print(f"{info['title']}: {info['total_dois']:,} articles")
Polite API Usage
Crossref asks you to include your email in requests for the "polite pool" (faster responses):
# Option 1: identify yourself via the User-Agent header. The original
# snippet defined HEADERS but never sent it — pass headers= or the
# polite pool never sees your contact address.
HEADERS = {
    "User-Agent": "MyApp/1.0 (mailto:your@email.com)"
}
resp = requests.get(
    "https://api.crossref.org/works",
    params={"query": "machine learning"},
    headers=HEADERS,
    timeout=30,
)
# Or use the mailto parameter
resp = requests.get(
    "https://api.crossref.org/works",
    params={"query": "machine learning", "mailto": "your@email.com"},
    timeout=30,
)
Rate Limits
| Pool | Rate | How to Join |
|---|---|---|
| Public | ~50 req/sec | Default, no setup |
| Polite | Higher priority | Add mailto parameter |
| Plus | Highest priority | Paid subscription |
For most use cases, the public pool is more than enough.
Use Cases
- Literature reviews: Find all papers citing a specific work
- Trend analysis: Track publication volume in a field over time
- Research tools: Build citation managers, paper recommenders
- Data enrichment: Add citation counts and metadata to your datasets
- Academic RAG: Feed paper abstracts into vector databases for Q&A
Full Code
All examples work as-is — the only dependency is the `requests` library. Copy, paste, run. No signup needed.
Related
- arXiv API: Search 2M+ Research Papers — preprints and full text
- Open Library API: Search 20M+ Books
- 77 free web scrapers
Top comments (0)