Scraping Academic Data: PubMed, arXiv, and Google Scholar

#python #tutorial #webdev #programming

Academic databases contain millions of papers. Scraping them automates literature reviews, citation tracking, and trend identification.

PubMed with E-utilities API

import requests
import xml.etree.ElementTree as ET

class PubMedScraper:
    """Small client for the PubMed endpoints of the NCBI E-utilities API.

    Args:
        email: Contact address sent as the ``email`` parameter of every request.
        timeout: Seconds to wait for each HTTP response before giving up.
    """

    def __init__(self, email, timeout=30):
        self.base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
        self.email = email
        self.timeout = timeout

    @staticmethod
    def _text(element):
        """Return all text inside ``element``, including text nested in child tags."""
        # Titles and abstracts may contain inline markup (<i>, <sub>, <sup>);
        # ``element.text`` alone would stop at the first child tag.
        return '' if element is None else ''.join(element.itertext())

    def search(self, query, max_results=100):
        """Search PubMed and return the matching PMIDs.

        Args:
            query: PubMed search term.
            max_results: Maximum number of IDs to request (``retmax``).

        Returns:
            The ``idlist`` of the JSON ``esearchresult``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = requests.get(
            f'{self.base}/esearch.fcgi',
            params={
                'db': 'pubmed', 'term': query, 'retmax': max_results,
                'retmode': 'json', 'email': self.email,
            },
            timeout=self.timeout,
        )
        resp.raise_for_status()
        return resp.json()['esearchresult']['idlist']

    def fetch(self, pmids):
        """Download and parse the records for ``pmids``.

        Args:
            pmids: Iterable of PubMed IDs (strings or ints).

        Returns:
            A list of dicts with the keys ``pmid``, ``title``, ``abstract``,
            ``authors`` (``"Last, First"`` strings), ``journal`` and ``url``.
            An empty ``pmids`` yields an empty list without any request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        ids = [str(pmid) for pmid in pmids]
        if not ids:
            # Nothing to look up; avoid sending a request with an empty id list.
            return []

        resp = requests.get(
            f'{self.base}/efetch.fcgi',
            params={
                'db': 'pubmed', 'id': ','.join(ids),
                'retmode': 'xml', 'email': self.email,
            },
            timeout=self.timeout,
        )
        resp.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding.
        root = ET.fromstring(resp.content)

        articles = []
        for art in root.findall('.//PubmedArticle'):
            citation = art.find('.//MedlineCitation')
            a = citation.find('.//Article') if citation is not None else None
            if a is None:
                # Record without article metadata: nothing useful to extract.
                continue
            title = self._text(a.find('.//ArticleTitle'))
            abstract = ' '.join(
                self._text(p) for p in a.findall('.//AbstractText')
            )
            authors = [
                f"{au.findtext('LastName', '')}, {au.findtext('ForeName', '')}"
                for au in a.findall('.//Author')
                if au.findtext('LastName')
            ]
            pmid = citation.findtext('.//PMID', '')
            articles.append({
                'pmid': pmid, 'title': title, 'abstract': abstract,
                'authors': authors,
                'journal': a.findtext('.//Journal/Title', ''),
                'url': f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/',
            })
        return articles

arXiv API

import feedparser

class ArxivScraper:
    """Client for the arXiv Atom query API."""

    API_URL = 'http://export.arxiv.org/api/query'
    TIMEOUT = 30  # seconds to wait for each HTTP response

    @staticmethod
    def _squash(text):
        """Collapse every run of whitespace in ``text`` to a single space."""
        # Long arXiv titles are wrapped with a newline plus indentation, so
        # replacing only '\n' would leave runs of spaces behind.
        return ' '.join(text.split())

    def _entries(self, search_query, max_results, sort_by):
        """Run one API query and return the parsed feed entries."""
        resp = requests.get(
            self.API_URL,
            params={
                'search_query': search_query, 'max_results': max_results,
                'sortBy': sort_by, 'sortOrder': 'descending',
            },
            timeout=self.TIMEOUT,
        )
        resp.raise_for_status()
        return feedparser.parse(resp.text).entries

    def search(self, query, max_results=50):
        """Search all fields for ``query``, newest submissions first.

        Args:
            query: Search text, sent as ``all:<query>``.
            max_results: Maximum number of entries to request.

        Returns:
            A list of dicts with the keys ``id``, ``title``, ``authors``,
            ``categories``, ``published`` and ``pdf``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        entries = self._entries(f'all:{query}', max_results, 'submittedDate')
        return [{
            'id': e.id.split('/abs/')[-1],
            'title': self._squash(e.title),
            'authors': [a['name'] for a in e.get('authors', [])],
            'categories': [t['term'] for t in e.get('tags', [])],
            'published': e.published,
            'pdf': e.id.replace('/abs/', '/pdf/'),
        } for e in entries]

    def by_category(self, category, n=25):
        """Return the most recently updated papers of one arXiv category.

        Args:
            category: Category code, sent as ``cat:<category>``.
            n: Maximum number of entries to request.

        Returns:
            A list of dicts with the keys ``title`` and ``authors``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        entries = self._entries(f'cat:{category}', n, 'lastUpdatedDate')
        return [{
            'title': self._squash(e.title),
            'authors': [a['name'] for a in e.get('authors', [])],
        } for e in entries]

Google Scholar Scraping

from bs4 import BeautifulSoup
import re

class ScholarScraper:
    """Scrapes result pages of Google Scholar's HTML search.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.
    """

    _CITED_BY = re.compile(r'Cited by (\d+)')

    def __init__(self, timeout=30):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        self.timeout = timeout

    @classmethod
    def _parse_cited_by(cls, link_texts):
        """Return the count from the first ``Cited by N`` text, or 0 if none."""
        for text in link_texts:
            m = cls._CITED_BY.search(text)
            if m:
                return int(m.group(1))
        return 0

    def search(self, query, num=10):
        """Search Google Scholar and return the parsed results.

        Args:
            query: Search text.
            num: Number of results requested per page.

        Returns:
            A list of dicts with the keys ``title``, ``url`` and ``citations``.
            ``title`` and ``url`` are empty strings when a result has no title
            link; ``citations`` is 0 when no "Cited by" link is present.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = self.session.get(
            'https://scholar.google.com/scholar',
            params={'q': query, 'hl': 'en', 'num': num},
            timeout=self.timeout,
        )
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        results = []
        for div in soup.select('div.gs_r.gs_or.gs_scl'):
            title_el = div.select_one('h3.gs_rt a')
            # Scan every footer link: the "Cited by N" link is not necessarily
            # the first anchor inside a ``gs_fl`` container.
            citations = self._parse_cited_by(
                a.get_text() for a in div.select('div.gs_fl a')
            )
            results.append({
                'title': title_el.get_text(strip=True) if title_el else '',
                'url': title_el.get('href', '') if title_el else '',
                'citations': citations,
            })
        return results

Literature Review Pipeline

import pandas as pd
from collections import Counter

class LitReview:
    """Combines PubMed and arXiv searches into one literature-review helper.

    Args:
        email: Contact address handed to the PubMed scraper.
    """

    def __init__(self, email='you@university.edu'):
        self.pubmed = PubMedScraper(email)
        self.arxiv = ArxivScraper()

    def comprehensive(self, query, n=50):
        """Search both sources and return one combined list of papers.

        Args:
            query: Search text passed to both scrapers.
            n: Maximum number of results requested from each source.

        Returns:
            PubMed records followed by arXiv records; each dict gets a
            ``source`` key set to ``'pubmed'`` or ``'arxiv'``.
        """
        papers = []
        pmids = self.pubmed.search(query, n)
        for paper in self.pubmed.fetch(pmids):
            paper['source'] = 'pubmed'
            papers.append(paper)
        for paper in self.arxiv.search(query, n):
            paper['source'] = 'arxiv'
            papers.append(paper)
        return papers

    def trends(self, papers, top_n=10, authors_per_paper=3):
        """Print the most frequent leading authors and return the papers as a table.

        Args:
            papers: List of paper dicts, e.g. the output of ``comprehensive``.
            top_n: How many authors to include in the printed ranking.
            authors_per_paper: How many leading authors of each paper to count.

        Returns:
            A ``pandas.DataFrame`` built from ``papers`` (empty if ``papers`` is).
        """
        df = pd.DataFrame(papers)
        authors = []
        # An empty or author-less input produces no 'authors' column at all.
        if 'authors' in df.columns:
            for names in df['authors']:
                # Rows lacking the key hold NaN rather than a list; skip them.
                if isinstance(names, list):
                    authors.extend(names[:authors_per_paper])
        print("Top authors:", Counter(authors).most_common(top_n))
        return df

Scaling

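For large-scale collection, start by respecting each service's rate limits: NCBI allows about three E-utilities requests per second without an API key, and arXiv asks for no more than one request every three seconds. Beyond that, ScraperAPI handles proxy rotation and CAPTCHAs, ThorData provides residential proxies, and ScrapeOps monitors research pipelines.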