Academic databases contain millions of papers. Scraping them automates literature reviews, citation tracking, and trend identification.
PubMed with E-utilities API
import requests
import xml.etree.ElementTree as ET
class PubMedScraper:
    """Small client for the NCBI E-utilities ``esearch`` and ``efetch`` endpoints.

    Args:
        email: Contact address sent with every request in the ``email``
            parameter.
    """

    # Seconds to wait on a request before giving up instead of hanging forever.
    TIMEOUT = 30

    def __init__(self, email):
        self.base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
        self.email = email

    def search(self, query, max_results=100):
        """Return the list of PMIDs (as strings) matching ``query``.

        Args:
            query: Search term passed to esearch as ``term``.
            max_results: Upper bound on returned ids (``retmax``).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = requests.get(
            f'{self.base}/esearch.fcgi',
            params={
                'db': 'pubmed', 'term': query, 'retmax': max_results,
                'retmode': 'json', 'email': self.email,
            },
            timeout=self.TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()['esearchresult']['idlist']

    def fetch(self, pmids):
        """Download and parse the records for ``pmids``.

        Args:
            pmids: Iterable of PubMed ids (strings or ints).

        Returns:
            A list of article dicts as produced by :meth:`parse`; an empty
            list when ``pmids`` is empty (no request is made).

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        ids = [str(pmid) for pmid in pmids]
        if not ids:
            # An empty id list is not a meaningful efetch query; skip the call.
            return []
        resp = requests.get(
            f'{self.base}/efetch.fcgi',
            params={
                'db': 'pubmed', 'id': ','.join(ids),
                'retmode': 'xml', 'email': self.email,
            },
            timeout=self.TIMEOUT,
        )
        resp.raise_for_status()
        # Hand the raw bytes to the XML parser so it honours the encoding
        # declared in the document rather than a guess made from HTTP headers.
        return self.parse(resp.content)

    @staticmethod
    def parse(xml_text):
        """Parse efetch XML (``str`` or ``bytes``) into a list of article dicts.

        Each dict has the keys ``pmid``, ``title``, ``abstract``, ``authors``
        (list of ``"Last, Fore"`` strings), ``journal`` and ``url``.
        ``PubmedArticle`` elements lacking a ``MedlineCitation``/``Article``
        child are skipped.
        """

        def flat_text(element):
            # itertext() also collects text inside inline markup such as
            # <i>/<sub>/<sup>; .text alone would stop at the first child tag.
            if element is None:
                return ''
            return ' '.join(''.join(element.itertext()).split())

        articles = []
        for art in ET.fromstring(xml_text).findall('.//PubmedArticle'):
            citation = art.find('.//MedlineCitation')
            article = citation.find('.//Article') if citation is not None else None
            if article is None:
                continue

            sections = (flat_text(p) for p in article.findall('.//AbstractText'))
            abstract = ' '.join(s for s in sections if s)

            authors = []
            for author in article.findall('.//Author'):
                last = author.findtext('LastName', '')
                if not last:
                    continue
                fore = author.findtext('ForeName', '')
                authors.append(f"{last}, {fore}" if fore else last)

            pmid = citation.findtext('.//PMID', '')
            articles.append({
                'pmid': pmid,
                'title': flat_text(article.find('.//ArticleTitle')),
                'abstract': abstract,
                'authors': authors,
                'journal': article.findtext('.//Journal/Title', ''),
                'url': f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/',
            })
        return articles
arXiv API
import feedparser
class ArxivScraper:
    """Thin wrapper around the arXiv Atom query API."""

    API_URL = 'https://export.arxiv.org/api/query'
    # Seconds to wait on a request before giving up instead of hanging forever.
    TIMEOUT = 30

    def search(self, query, max_results=50):
        """Full-text search (``all:<query>``), newest submissions first.

        Returns:
            A list of dicts with keys ``id``, ``title``, ``authors``,
            ``categories``, ``published`` and ``pdf``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        entries = self._query(f'all:{query}', max_results, 'submittedDate')
        return [{
            'id': e.id.split('/abs/')[-1],
            'title': self._clean(e.title),
            'authors': [a['name'] for a in e.get('authors', [])],
            'categories': [t['term'] for t in e.get('tags', [])],
            'published': e.published,
            'pdf': e.id.replace('/abs/', '/pdf/'),
        } for e in entries]

    def by_category(self, category, n=25):
        """List the ``n`` most recently updated papers in ``category``.

        Returns:
            A list of dicts with keys ``title`` and ``authors``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        entries = self._query(f'cat:{category}', n, 'lastUpdatedDate')
        return [{
            'title': self._clean(e.title),
            'authors': [a['name'] for a in e.get('authors', [])],
        } for e in entries]

    def _query(self, search_query, max_results, sort_by):
        """Run one API query (descending order) and return the feed entries."""
        resp = requests.get(
            self.API_URL,
            params={
                'search_query': search_query, 'max_results': max_results,
                'sortBy': sort_by, 'sortOrder': 'descending',
            },
            timeout=self.TIMEOUT,
        )
        resp.raise_for_status()
        # Pass bytes so feedparser detects the encoding from the XML itself.
        return feedparser.parse(resp.content).entries

    @staticmethod
    def _clean(text):
        """Collapse newlines and runs of whitespace into single spaces.

        Replacing only the newline would leave the indentation of a wrapped
        title behind as a run of spaces.
        """
        return ' '.join(text.split())
Google Scholar Scraping
from bs4 import BeautifulSoup
import re
class ScholarScraper:
    """Scrape title, link and citation count from Google Scholar result pages."""

    # Seconds to wait on a request before giving up instead of hanging forever.
    TIMEOUT = 30
    _CITED_BY = re.compile(r'Cited by (\d+)')

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})

    def search(self, query, num=10):
        """Return up to ``num`` results for ``query``.

        Returns:
            A list of dicts with keys ``title``, ``url`` and ``citations``
            (0 when no "Cited by N" link is present).

        Raises:
            requests.HTTPError: If the server answers with an error status,
                rather than silently returning an empty list.
        """
        resp = self.session.get(
            'https://scholar.google.com/scholar',
            params={'q': query, 'hl': 'en', 'num': num},
            timeout=self.TIMEOUT,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        results = []
        for div in soup.select('div.gs_r.gs_or.gs_scl'):
            title_el = div.select_one('h3.gs_rt a')
            # The footer holds several links and "Cited by" is not guaranteed
            # to be the first one, so every footer link is inspected.
            footer_texts = (a.get_text() for a in div.select('div.gs_fl a'))
            results.append({
                'title': title_el.get_text(strip=True) if title_el else '',
                'url': title_el.get('href', '') if title_el else '',
                'citations': self._citation_count(footer_texts),
            })
        return results

    @staticmethod
    def _citation_count(link_texts):
        """Return N from the first "Cited by N" text in ``link_texts``, else 0."""
        for text in link_texts:
            match = ScholarScraper._CITED_BY.search(text)
            if match:
                return int(match.group(1))
        return 0
Literature Review Pipeline
import pandas as pd
from collections import Counter
class LitReview:
    """Combine PubMed and arXiv results into one list and summarise authors.

    Args:
        email: Contact address handed to ``PubMedScraper``. Defaults to the
            placeholder ``'you@university.edu'``.
    """

    def __init__(self, email='you@university.edu'):
        self.pubmed = PubMedScraper(email)
        self.arxiv = ArxivScraper()

    def comprehensive(self, query, n=50):
        """Search both sources for ``query`` and return the merged records.

        Each returned dict is the scraper's own record plus a ``source`` key
        set to ``'pubmed'`` or ``'arxiv'``. PubMed records come first.
        """
        papers = []
        pmids = self.pubmed.search(query, n)
        for paper in self.pubmed.fetch(pmids):
            paper['source'] = 'pubmed'
            papers.append(paper)
        for paper in self.arxiv.search(query, n):
            paper['source'] = 'arxiv'
            papers.append(paper)
        return papers

    def trends(self, papers):
        """Print the ten most frequent leading authors and return a DataFrame.

        Only the first three authors of each paper are counted. An empty
        ``papers`` list, or records without an ``authors`` field, produce an
        empty ranking instead of a ``KeyError``.

        Returns:
            ``pandas.DataFrame`` built from ``papers``.
        """
        df = pd.DataFrame(papers)
        authors = []
        # DataFrame([]) has no columns at all, so guard the column lookup.
        if 'authors' in df.columns:
            for names in df['authors']:
                # Rows that lacked the key show up as NaN, not as a list.
                if isinstance(names, list):
                    authors.extend(names[:3])
        print("Top authors:", Counter(authors).most_common(10))
        return df
Scaling
For large-scale collection, ScraperAPI handles proxy rotation and CAPTCHAs. ThorData provides residential proxies, and ScrapeOps monitors research pipelines.
Conclusion
PubMed and arXiv offer excellent free APIs. Google Scholar has no official API and tends to block automated traffic, so scrape it sparingly and expect requests to fail. Combine all three for comprehensive literature reviews.
Top comments (0)