The Problem
Every time I research a topic, I open 5 tabs: Google Scholar, arXiv, Semantic Scholar, OpenAlex, Crossref. Search the same query. Compare results. Copy DOIs back and forth.
So I built a single Python script that searches five scholarly APIs simultaneously — OpenAlex, Semantic Scholar, Crossref, arXiv, and Europe PMC (Google Scholar has no official API, so Europe PMC stands in) — and gives me a unified result.
100 lines. No frameworks. Just requests and concurrent.futures.
The Code
#!/usr/bin/env python3
"""Search 5 academic APIs at once — unified paper finder"""
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
def search_openalex(query, limit=3):
    """Search OpenAlex works for *query*, most-cited first.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to return.

    Returns:
        List of dicts with title/year/citations/doi/source keys.

    Raises:
        requests.HTTPError: On a non-2xx API response.
    """
    resp = requests.get(
        "https://api.openalex.org/works",
        params={"search": query, "sort": "cited_by_count:desc", "per_page": limit},
        timeout=10,
    )
    # Fail loudly on rate limits / server errors instead of parsing an error body.
    resp.raise_for_status()
    results = []
    for w in resp.json().get("results", []):
        # OpenAlex may send explicit nulls for "title" and "doi"; .get()'s
        # default does NOT apply then, so `None.replace(...)` would crash.
        results.append({
            "title": w.get("title") or "N/A",
            "year": w.get("publication_year"),
            "citations": w.get("cited_by_count", 0),
            "doi": (w.get("doi") or "").replace("https://doi.org/", ""),
            "source": "OpenAlex",
        })
    return results
def search_semantic_scholar(query, limit=3):
    """Search the Semantic Scholar Graph API for *query*.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to return.

    Returns:
        List of dicts with title/year/citations/tldr/source keys.

    Raises:
        requests.HTTPError: On a non-2xx API response (S2 rate-limits with
            429 frequently; without this check the caller would silently
            receive an empty result list).
    """
    resp = requests.get(
        "https://api.semanticscholar.org/graph/v1/paper/search",
        params={
            "query": query,
            "limit": limit,
            "fields": "title,year,citationCount,tldr",
        },
        timeout=10,
    )
    resp.raise_for_status()
    results = []
    for p in resp.json().get("data", []):
        # "tldr" may be absent or an explicit null; only dig into it when
        # it is a real object.
        tldr = p.get("tldr") or {}
        results.append({
            "title": p.get("title") or "N/A",
            "year": p.get("year"),
            "citations": p.get("citationCount", 0),
            "tldr": tldr.get("text", "") or "",
            "source": "Semantic Scholar",
        })
    return results
def search_crossref(query, limit=3):
    """Search Crossref works for *query*, sorted by relevance.

    Args:
        query: Free-text search string.
        limit: Maximum number of results to return.

    Returns:
        List of dicts with title/year/citations/doi/source keys.

    Raises:
        requests.HTTPError: On a non-2xx API response.
    """
    resp = requests.get(
        "https://api.crossref.org/works",
        params={"query": query, "rows": limit, "sort": "relevance"},
        timeout=10,
    )
    resp.raise_for_status()
    results = []
    for w in resp.json().get("message", {}).get("items", []):
        # Crossref scatters dates over several fields; online-only works
        # have no "published-print" at all, so fall back through the
        # alternatives. Also guard against empty "date-parts" arrays.
        year = None
        for date_field in ("published-print", "published-online", "published", "issued"):
            parts = (w.get(date_field) or {}).get("date-parts") or []
            if parts and parts[0] and parts[0][0] is not None:
                year = parts[0][0]
                break
        # "title" is a (possibly empty) list of strings.
        titles = w.get("title") or ["N/A"]
        results.append({
            "title": titles[0],
            "year": year,
            "citations": w.get("is-referenced-by-count", 0),
            "doi": w.get("DOI", ""),
            "source": "Crossref",
        })
    return results
def search_arxiv(query, limit=3):
    """Search the arXiv Atom API for *query*.

    Args:
        query: Free-text search string (queried across all fields).
        limit: Maximum number of results to return.

    Returns:
        List of dicts with title/year/citations/source keys. The Atom feed
        carries no citation counts, so "citations" is always 0.

    Raises:
        requests.HTTPError: On a non-2xx API response.
    """
    import xml.etree.ElementTree as ET
    resp = requests.get(
        # Use TLS; the plain-http endpoint just redirects here anyway.
        "https://export.arxiv.org/api/query",
        params={
            "search_query": f"all:{query}",
            "max_results": limit,
            "sortBy": "relevance",
        },
        timeout=10,
    )
    resp.raise_for_status()
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    root = ET.fromstring(resp.text)
    results = []
    for entry in root.findall("atom:entry", ns):
        # Guard against missing/empty elements: .find() returns None for an
        # absent child, and .text is None for an empty one.
        title_el = entry.find("atom:title", ns)
        published_el = entry.find("atom:published", ns)
        # arXiv hard-wraps titles with "\n " — collapse all runs of
        # whitespace to single spaces instead of leaving double spaces.
        title = (
            " ".join(title_el.text.split())
            if title_el is not None and title_el.text
            else "N/A"
        )
        year = (
            published_el.text[:4]
            if published_el is not None and published_el.text
            else None
        )
        results.append({
            "title": title,
            "year": year,
            "citations": 0,
            "source": "arXiv",
        })
    return results
def search_europe_pmc(query, limit=3):
    """Query the Europe PMC REST search endpoint and normalize the hits."""
    params = {
        "query": query,
        "resultType": "core",
        "pageSize": limit,
        "format": "json",
    }
    resp = requests.get(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params=params,
        timeout=10,
    )
    hits = resp.json().get("resultList", {}).get("result", [])
    return [
        {
            "title": hit.get("title", "N/A"),
            "year": hit.get("pubYear"),
            "citations": hit.get("citedByCount", 0),
            "doi": hit.get("doi", ""),
            "source": "Europe PMC",
        }
        for hit in hits
    ]
def unified_search(query, limit_per_api=3):
    """Fan *query* out to all five APIs in parallel and merge the results.

    Args:
        query: Search string passed verbatim to every API.
        limit_per_api: Maximum results requested from each source.

    Returns:
        Combined list of normalized result dicts, most-cited first.
        A failing API is reported on stdout and skipped rather than
        aborting the whole search.
    """
    apis = [
        ("OpenAlex", search_openalex),
        ("Semantic Scholar", search_semantic_scholar),
        ("Crossref", search_crossref),
        ("arXiv", search_arxiv),
        ("Europe PMC", search_europe_pmc),
    ]
    all_results = []
    # One worker per API so every request starts immediately.
    with ThreadPoolExecutor(max_workers=len(apis)) as executor:
        futures = {executor.submit(fn, query, limit_per_api): name for name, fn in apis}
        for future in as_completed(futures):
            api_name = futures[future]
            try:
                results = future.result()
                all_results.extend(results)
                print(f" ✓ {api_name}: {len(results)} results")
            except Exception as e:
                # One flaky API must not sink the whole search.
                print(f" ✗ {api_name}: {e}")
    # Sort by citations (most cited first). A source can report a null
    # citation count; `.get("citations", 0)` still returns None for an
    # explicit null, and comparing None with int raises TypeError — so
    # coerce falsy values to 0.
    all_results.sort(key=lambda r: r.get("citations") or 0, reverse=True)
    return all_results
if __name__ == "__main__":
    # Fall back to a demo query when the user just presses Enter.
    query = input("\nSearch academic papers: ") or "transformer attention mechanism"
    print(f"\nSearching 5 APIs for: '{query}'...\n")
    papers = unified_search(query)
    print(f"\n{'=' * 80}")
    print(f"Found {len(papers)} papers across 5 APIs\n")
    # Show only the top 15 merged results.
    for rank, paper in enumerate(papers[:15], start=1):
        print(f"{rank}. [{paper.get('year', '?')}] {paper['title']}")
        print(f" Citations: {paper.get('citations', 0)} | Source: {paper['source']}")
        if paper.get("doi"):
            print(f" DOI: {paper['doi']}")
        if paper.get("tldr"):
            print(f" TLDR: {paper['tldr'][:120]}...")
        print()
How It Works
- 5 API calls run in parallel using `ThreadPoolExecutor` — a full search takes ~2 seconds
- Results are normalized into a common format (title, year, citations, source)
- Merged results sorted by citation count
- Each result shows which API it came from
Sample Output
Searching 5 APIs for: 'transformer attention mechanism'...
✓ arXiv: 3 results
✓ OpenAlex: 3 results
✓ Europe PMC: 3 results
✓ Crossref: 3 results
✓ Semantic Scholar: 3 results
================================================================================
Found 15 papers across 5 APIs
1. [2017] Attention Is All You Need
Citations: 128,451 | Source: OpenAlex
DOI: 10.48550/arxiv.1706.03762
2. [2018] BERT: Pre-training of Deep Bidirectional Transformers
Citations: 98,234 | Source: Semantic Scholar
TLDR: We introduce BERT, a new language representation model...
Why Not Just Use Google Scholar?
| Feature | This Script | Google Scholar |
|---|---|---|
| API access | Yes (5 APIs) | No official API |
| Rate limits | Generous | Will block you |
| Citation data | Structured | Scraping needed |
| AI summaries | Yes (via S2) | No |
| Parallel search | 5 sources at once | 1 source |
| Reproducible | Fully automated | Manual only |
| Open access links | Yes | Sometimes |
Get the Code
The full script is on GitHub, along with 50+ other data scripts:
python-data-scripts — Copy-paste Python scripts for APIs, scraping, and automation.
More academic API tutorials:
What tools do you use for literature research? Building anything cool with these APIs? Share in the comments!
I write about free APIs and data tools every week. Follow for more.
More from me: 10 Dev Tools I Use Daily | 77 Scrapers on a Schedule | 150+ Free APIs
Top comments (0)