DEV Community

Alex Spinov

I Built a Research Paper Finder That Searches 5 APIs at Once (Python, 100 Lines)

The Problem

Every time I research a topic, I open 5 tabs: Google Scholar, arXiv, Semantic Scholar, OpenAlex, Crossref. Search the same query. Compare results. Copy DOIs back and forth.

So I built a single Python script that fires the same query at five APIs in parallel and merges everything into one ranked list. The lineup swaps Google Scholar (no official API) for Europe PMC; the other four are OpenAlex, Semantic Scholar, Crossref, and arXiv.

100 lines. No frameworks. The only third-party dependency is requests (pip install requests); everything else, including concurrent.futures, is standard library.

The Code

#!/usr/bin/env python3
"""Search 5 academic APIs at once — unified paper finder"""
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def search_openalex(query, limit=3):
    resp = requests.get("https://api.openalex.org/works", params={
        "search": query, "sort": "cited_by_count:desc", "per_page": limit
    }, timeout=10)
    results = []
    for w in resp.json().get("results", []):
        results.append({
            "title": w.get("title") or "N/A",  # title can be null
            "year": w.get("publication_year"),
            "citations": w.get("cited_by_count", 0),
            # doi can be null, and OpenAlex returns it as a full URL, so strip the prefix
            "doi": (w.get("doi") or "").replace("https://doi.org/", ""),
            "source": "OpenAlex"
        })
    return results

def search_semantic_scholar(query, limit=3):
    resp = requests.get("https://api.semanticscholar.org/graph/v1/paper/search", params={
        "query": query, "limit": limit,
        "fields": "title,year,citationCount,tldr"
    }, timeout=10)
    results = []
    for p in resp.json().get("data", []):
        results.append({
            "title": p["title"],
            "year": p.get("year"),
            "citations": p.get("citationCount", 0),
            "tldr": p.get("tldr", {}).get("text", "") if p.get("tldr") else "",
            "source": "Semantic Scholar"
        })
    return results

def search_crossref(query, limit=3):
    resp = requests.get("https://api.crossref.org/works", params={
        "query": query, "rows": limit, "sort": "relevance"
    }, timeout=10)
    results = []
    for w in resp.json().get("message", {}).get("items", []):
        # not everything has a print date; fall back to the online publication date
        published = w.get("published-print") or w.get("published") or {}
        results.append({
            "title": (w.get("title") or ["N/A"])[0],  # title can be missing or an empty list
            "year": published.get("date-parts", [[None]])[0][0],
            "citations": w.get("is-referenced-by-count", 0),
            "doi": w.get("DOI", ""),
            "source": "Crossref"
        })
    return results

def search_arxiv(query, limit=3):
    import xml.etree.ElementTree as ET
    resp = requests.get("http://export.arxiv.org/api/query", params={
        "search_query": f"all:{query}", "max_results": limit,
        "sortBy": "relevance"
    }, timeout=10)
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    root = ET.fromstring(resp.text)
    results = []
    for entry in root.findall("atom:entry", ns):
        results.append({
            "title": entry.find("atom:title", ns).text.strip().replace("\n", " "),
            "year": entry.find("atom:published", ns).text[:4],
            "citations": 0,
            "source": "arXiv"
        })
    return results

def search_europe_pmc(query, limit=3):
    resp = requests.get("https://www.ebi.ac.uk/europepmc/webservices/rest/search", params={
        "query": query, "resultType": "core", "pageSize": limit, "format": "json"
    }, timeout=10)
    results = []
    for r in resp.json().get("resultList", {}).get("result", []):
        results.append({
            "title": r.get("title", "N/A"),
            "year": r.get("pubYear"),
            "citations": r.get("citedByCount", 0),
            "doi": r.get("doi", ""),
            "source": "Europe PMC"
        })
    return results

def unified_search(query, limit_per_api=3):
    apis = [
        ("OpenAlex", search_openalex),
        ("Semantic Scholar", search_semantic_scholar),
        ("Crossref", search_crossref),
        ("arXiv", search_arxiv),
        ("Europe PMC", search_europe_pmc),
    ]

    all_results = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(fn, query, limit_per_api): name for name, fn in apis}
        for future in as_completed(futures):
            api_name = futures[future]
            try:
                results = future.result()
                all_results.extend(results)
                print(f"{api_name}: {len(results)} results")
            except Exception as e:
                print(f"{api_name}: {e}")

    # Sort by citations (most cited first)
    all_results.sort(key=lambda x: x.get("citations", 0), reverse=True)
    return all_results

if __name__ == "__main__":
    query = input("\nSearch academic papers: ") or "transformer attention mechanism"
    print(f"\nSearching 5 APIs for: '{query}'...\n")

    results = unified_search(query)

    print(f"\n{'='*80}")
    print(f"Found {len(results)} papers across 5 APIs\n")

    for i, r in enumerate(results[:15], 1):
        print(f"{i}. [{r.get('year', '?')}] {r['title']}")
        print(f"   Citations: {r.get('citations', 0)} | Source: {r['source']}")
        if r.get("doi"):
            print(f"   DOI: {r['doi']}")
        if r.get("tldr"):
            print(f"   TLDR: {r['tldr'][:120]}...")
        print()

How It Works

  1. The 5 API calls run in parallel via ThreadPoolExecutor, so total wall time is roughly the slowest single call (about 2 seconds) rather than the sum of all five
  2. Results are normalized into a common format (title, year, citations, source)
  3. Merged results sorted by citation count
  4. Each result shows which API it came from
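One thing the script deliberately skips is deduplication: the same paper often comes back from two or three APIs, and after the citation sort those duplicates end up adjacent in the list. A minimal pass you could bolt on (dedupe is a hypothetical helper, not part of the 100 lines) keys on the DOI when present and on a normalized title otherwise:

```python
def dedupe(results):
    """Keep the first (i.e. highest-cited, post-sort) entry per paper.

    Keys on the DOI when one is present, otherwise on a
    case- and whitespace-normalized title.
    """
    seen = set()
    unique = []
    for r in results:
        key = r.get("doi") or " ".join(r["title"].lower().split())
        if key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
```

Calling dedupe(unified_search(query)) collapses, for example, the OpenAlex and Crossref copies of the same DOI into one row. It won't catch a DOI-less arXiv entry whose title differs slightly from the published version; fuzzy title matching is the obvious next step if that bites you.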

Sample Output

Searching 5 APIs for: 'transformer attention mechanism'...

  ✓ arXiv: 3 results
  ✓ OpenAlex: 3 results
  ✓ Europe PMC: 3 results
  ✓ Crossref: 3 results
  ✓ Semantic Scholar: 3 results

================================================================================
Found 15 papers across 5 APIs

1. [2017] Attention Is All You Need
   Citations: 128,451 | Source: OpenAlex
   DOI: 10.48550/arxiv.1706.03762

2. [2018] BERT: Pre-training of Deep Bidirectional Transformers
   Citations: 98,234 | Source: Semantic Scholar
   TLDR: We introduce BERT, a new language representation model...
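Because the merged list is plain dicts, archiving a run for later comparison is one json.dump away. A small sketch (save_results is a hypothetical helper, not part of the script above):

```python
import json
from datetime import date

def save_results(query, results, path):
    """Write one search run to disk so it can be re-checked or diffed later."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "query": query,
            "date": date.today().isoformat(),  # when the search was run
            "results": results,
        }, f, indent=2, ensure_ascii=False)
```

Dropping save_results(query, results, "run.json") at the end of the __main__ block is enough to make every search reproducible after the fact.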

Why Not Just Use Google Scholar?

| Feature | This Script | Google Scholar |
| --- | --- | --- |
| API access | Yes (5 APIs) | No official API |
| Rate limits | Generous | Will block you |
| Citation data | Structured | Scraping needed |
| AI summaries | Yes (Semantic Scholar TLDRs) | No |
| Parallel search | 5 sources at once | 1 source |
| Reproducible | Fully automated | Manual only |
| Open access links | Yes | Sometimes |
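"Generous" gets more generous if you identify yourself: OpenAlex and Crossref both route requests that carry a contact email into a faster "polite pool". Crossref looks for a mailto in the User-Agent header; OpenAlex reads a mailto query parameter. A sketch of both (the email is a placeholder, swap in your own):

```python
CONTACT = "you@example.com"  # placeholder: use your real address

def polite_headers():
    """Crossref's polite pool keys off a mailto: in the User-Agent header."""
    return {"User-Agent": f"paper-finder/1.0 (mailto:{CONTACT})"}

def openalex_params(query, limit=3):
    """OpenAlex reads a mailto= query parameter instead of a header."""
    return {"search": query, "sort": "cited_by_count:desc",
            "per_page": limit, "mailto": CONTACT}
```

Wiring this in is just passing headers=polite_headers() to the Crossref requests.get call and swapping openalex_params(query, limit) into the OpenAlex one.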

Get the Code

The full script is on GitHub, along with 50+ other data scripts:

python-data-scripts — Copy-paste Python scripts for APIs, scraping, and automation.


What tools do you use for literature research? Building anything cool with these APIs? Share in the comments!

I write about free APIs and data tools every week. Follow for more.


More from me: 10 Dev Tools I Use Daily | 77 Scrapers on a Schedule | 150+ Free APIs
