DEV Community

agenthustler
agenthustler

Posted on

Scraping Academic Data: PubMed, arXiv, and Google Scholar

Academic databases contain millions of papers. Collecting their records programmatically lets you automate literature reviews, citation tracking, and trend identification.

PubMed with E-utilities API

import requests
import xml.etree.ElementTree as ET

class PubMedScraper:
    """Small client for the PubMed endpoints of the NCBI E-utilities API.

    Args:
        email: Contact address sent with every request in the ``email``
            parameter.
        timeout: Seconds to wait for each HTTP response before giving up.
    """

    def __init__(self, email, timeout=30):
        self.base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
        self.email = email
        self.timeout = timeout

    def search(self, query, max_results=100):
        """Return the list of PMIDs (as strings) matching ``query``.

        Args:
            query: Search term passed to ``esearch.fcgi`` as ``term``.
            max_results: Upper bound on returned ids (``retmax``).

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the JSON reply has no ``esearchresult``/``idlist``.
        """
        resp = requests.get(f'{self.base}/esearch.fcgi', params={
            'db': 'pubmed', 'term': query, 'retmax': max_results,
            'retmode': 'json', 'email': self.email
        }, timeout=self.timeout)
        resp.raise_for_status()
        return resp.json()['esearchresult']['idlist']

    def fetch(self, pmids):
        """Download the given PMIDs and return one dict per article.

        Each dict has the keys ``pmid``, ``title``, ``abstract``,
        ``authors`` (list of ``"Last, Fore"`` strings), ``journal`` and
        ``url``.

        Args:
            pmids: Iterable of PubMed ids (strings or ints).

        Returns:
            A list of article dicts; empty when ``pmids`` is empty.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        ids = [str(p) for p in pmids]
        if not ids:
            # Nothing to ask for: avoid sending an empty-id request.
            return []
        resp = requests.get(f'{self.base}/efetch.fcgi', params={
            'db': 'pubmed', 'id': ','.join(ids), 'retmode': 'xml', 'email': self.email
        }, timeout=self.timeout)
        resp.raise_for_status()
        # Hand the raw bytes to the XML parser so it applies the encoding
        # declared in the document instead of a guessed HTTP charset.
        return self._parse_articles(resp.content)

    @staticmethod
    def _full_text(element):
        """Return all text inside ``element``, including nested inline tags."""
        if element is None:
            return ''
        return ''.join(element.itertext()).strip()

    @staticmethod
    def _parse_articles(xml_data):
        """Convert an efetch XML document (str or bytes) into article dicts."""
        root = ET.fromstring(xml_data)
        articles = []
        for art in root.findall('.//PubmedArticle'):
            citation = art.find('.//MedlineCitation')
            a = citation.find('.//Article') if citation is not None else None
            if a is None:
                # Record without bibliographic data: nothing to extract.
                continue
            # itertext() keeps text that follows inline markup such as
            # <i> or <sub>; .text/findtext would stop at the first child tag.
            title = PubMedScraper._full_text(a.find('.//ArticleTitle'))
            abstract = ' '.join(
                PubMedScraper._full_text(p) for p in a.findall('.//AbstractText')
            )
            authors = []
            for au in a.findall('.//Author'):
                last = au.findtext('LastName', '')
                if not last:
                    # Entries without a LastName are skipped.
                    continue
                fore = au.findtext('ForeName', '')
                # Join only the parts that exist so a missing ForeName does
                # not leave a dangling ", ".
                authors.append(', '.join(part for part in (last, fore) if part))
            pmid = citation.findtext('.//PMID', '')
            articles.append({
                'pmid': pmid, 'title': title, 'abstract': abstract,
                'authors': authors, 'journal': a.findtext('.//Journal/Title', ''),
                'url': f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/'
            })
        return articles
Enter fullscreen mode Exit fullscreen mode

arXiv API

import feedparser

class ArxivScraper:
    """Small client for the arXiv Atom query API."""

    # Endpoint and per-request timeout (seconds) shared by every query.
    API_URL = 'https://export.arxiv.org/api/query'
    TIMEOUT = 30

    @staticmethod
    def _collapse_whitespace(text):
        """Replace every run of whitespace (including newlines) with one space."""
        return ' '.join(text.split())

    def _query(self, search_query, max_results, sort_by):
        """Run one API query and return the parsed feed entries.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        resp = requests.get(self.API_URL, params={
            'search_query': search_query, 'max_results': max_results,
            'sortBy': sort_by, 'sortOrder': 'descending'
        }, timeout=self.TIMEOUT)
        resp.raise_for_status()
        return feedparser.parse(resp.text).entries

    def search(self, query, max_results=50):
        """Search all fields for ``query``, newest submissions first.

        Returns:
            A list of dicts with the keys ``id``, ``title``, ``authors``,
            ``categories``, ``published`` and ``pdf``.
        """
        entries = self._query(f'all:{query}', max_results, 'submittedDate')
        return [{
            'id': e.id.split('/abs/')[-1],
            # Titles can span several lines in the feed; collapse the wrap so
            # no double spaces remain.
            'title': self._collapse_whitespace(e.title),
            'authors': [a['name'] for a in e.get('authors', [])],
            'categories': [t['term'] for t in e.get('tags', [])],
            'published': e.published,
            'pdf': e.id.replace('/abs/', '/pdf/')
        } for e in entries]

    def by_category(self, category, n=25):
        """List the ``n`` most recently updated papers in ``category``.

        Returns:
            A list of dicts with the keys ``title`` and ``authors``.
        """
        entries = self._query(f'cat:{category}', n, 'lastUpdatedDate')
        return [{
            'title': self._collapse_whitespace(e.title),
            'authors': [a['name'] for a in e.get('authors', [])]
        } for e in entries]
Enter fullscreen mode Exit fullscreen mode

Google Scholar Scraping

from bs4 import BeautifulSoup
import re

class ScholarScraper:
    """Scrapes the Google Scholar result page for titles, links and citation counts.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
    """

    # Compiled once; matches the footer link text that carries the count.
    _CITED_BY = re.compile(r'Cited by (\d+)')

    def __init__(self, timeout=30):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'Mozilla/5.0'})
        self.timeout = timeout

    @classmethod
    def _citation_count(cls, link_texts):
        """Return the number from the first "Cited by N" text, or 0 if none."""
        for text in link_texts:
            m = cls._CITED_BY.search(text)
            if m:
                return int(m.group(1))
        return 0

    def search(self, query, num=10):
        """Fetch one result page for ``query`` and parse its entries.

        Args:
            query: Search string sent as ``q``.
            num: Requested number of results (``num`` parameter).

        Returns:
            A list of dicts with the keys ``title``, ``url`` and
            ``citations``.

        Raises:
            requests.HTTPError: If the server answers with an error status,
                so a refused request is not mistaken for "no results".
        """
        resp = self.session.get('https://scholar.google.com/scholar',
            params={'q': query, 'hl': 'en', 'num': num},
            timeout=self.timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        results = []
        for div in soup.select('div.gs_r.gs_or.gs_scl'):
            title_el = div.select_one('h3.gs_rt a')
            # Check every footer link: the "Cited by N" link is not
            # necessarily the first one, so looking only at the first link
            # would leave the count at 0.
            citations = self._citation_count(
                link.get_text() for link in div.select('div.gs_fl a')
            )
            results.append({
                'title': title_el.get_text(strip=True) if title_el else '',
                'url': title_el.get('href', '') if title_el else '',
                'citations': citations
            })
        return results
Enter fullscreen mode Exit fullscreen mode

Literature Review Pipeline

import pandas as pd
from collections import Counter

class LitReview:
    """Combines PubMed and arXiv results into one list for a literature review.

    Args:
        email: Contact address handed to ``PubMedScraper``.
    """

    def __init__(self, email='you@university.edu'):
        self.pubmed = PubMedScraper(email)
        self.arxiv = ArxivScraper()

    def comprehensive(self, query, n=50):
        """Search both sources for ``query`` and return the merged papers.

        Every returned dict gets a ``source`` key set to ``'pubmed'`` or
        ``'arxiv'``.

        Args:
            query: Search string passed to both scrapers.
            n: Maximum number of results requested from each source.
        """
        papers = []
        pmids = self.pubmed.search(query, n)
        # Skip the fetch round-trip when the search found nothing.
        if pmids:
            for paper in self.pubmed.fetch(pmids):
                paper['source'] = 'pubmed'
                papers.append(paper)
        for paper in self.arxiv.search(query, n):
            paper['source'] = 'arxiv'
            papers.append(paper)
        return papers

    def trends(self, papers, authors_per_paper=3, top=10):
        """Print the most frequent leading authors and return a DataFrame.

        Args:
            papers: List of paper dicts, e.g. from ``comprehensive``.
            authors_per_paper: How many leading authors of each paper count.
            top: How many authors to print.

        Returns:
            ``pandas.DataFrame`` built from ``papers`` (empty for no papers).
        """
        df = pd.DataFrame(papers)
        authors = []
        # An empty paper list yields a frame without an 'authors' column.
        if 'authors' in df.columns:
            for names in df['authors']:
                # Papers lacking authors show up as NaN, not as a list.
                if isinstance(names, list):
                    authors.extend(names[:authors_per_paper])
        print("Top authors:", Counter(authors).most_common(top))
        return df
Enter fullscreen mode Exit fullscreen mode

Scaling

For large-scale collection, ScraperAPI handles proxy rotation and CAPTCHAs. ThorData provides residential proxies, and ScrapeOps monitors research pipelines.

Conclusion

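PubMed and arXiv offer excellent free APIs. Google Scholar has no official API, so it has to be scraped from its HTML pages, slowly and with the expectation of rate limits and CAPTCHAs. The pipeline above merges PubMed and arXiv; add Google Scholar citation counts to combine all three for comprehensive literature reviews.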

Top comments (0)