Scraping Scientific Preprint Servers: bioRxiv, medRxiv, SSRN

#python #tutorial #webdev #programming

Scientific preprint servers host cutting-edge research before peer review. Building a scraper for bioRxiv, medRxiv, and SSRN lets you track emerging research trends, monitor specific fields, and build datasets for meta-analysis.

Why Scrape Preprints?

Preprint servers publish research weeks or months before journals. Tracking them gives you early signals on breakthrough findings, emerging methodologies, and shifting research priorities.

Setting Up

pip install requests beautifulsoup4 pandas lxml

Scraping bioRxiv and medRxiv

Both servers share the same infrastructure and offer content pages we can parse:

from datetime import datetime, timedelta
from urllib.parse import quote, urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

def scrape_biorxiv(subject="bioinformatics", days_back=7, server="biorxiv"):
    """Scrape recent search results from bioRxiv or medRxiv through ScraperAPI.

    Args:
        subject: Search phrase. It is percent-encoded before being placed in
            the search path, so multi-word phrases are safe.
        days_back: Size of the date window, in days, ending today.
        server: Which preprint server to search: "biorxiv" (default) or
            "medrxiv" (case-insensitive).

    Returns:
        A list of dicts with the keys "title", "authors", "date", "doi",
        "server" and "subject". A field whose element is missing from a
        search result is an empty string.

    Raises:
        ValueError: If ``server`` is not one of the supported servers.
        requests.RequestException: If the request fails, times out, or
            ScraperAPI answers with an error status.
    """
    # jcode -> (host, display label).
    # NOTE(review): assumes medRxiv uses the same search URL scheme and
    # result markup as bioRxiv — confirm against a live page.
    servers = {
        "biorxiv": ("www.biorxiv.org", "bioRxiv"),
        "medrxiv": ("www.medrxiv.org", "medRxiv"),
    }
    jcode = server.lower()
    if jcode not in servers:
        raise ValueError(
            f"Unsupported server {server!r}; expected one of {sorted(servers)}"
        )
    host, label = servers[jcode]

    papers = []
    base_url = "https://api.scraperapi.com"

    # Read the clock once so both ends of the window share the same "now".
    now = datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - timedelta(days=days_back)).strftime("%Y-%m-%d")

    # The search terms live in the URL path, so the subject must be
    # percent-encoded (spaces, slashes, etc.) to keep the URL well-formed.
    encoded_subject = quote(subject, safe="")
    target = (
        f"https://{host}/search/{encoded_subject}"
        f"%20jcode%3A{jcode}"
        f"%20limit_from%3A{start_date}"
        f"%20limit_to%3A{end_date}"
        f"%20numresults%3A50"
    )

    params = {
        "api_key": "YOUR_SCRAPERAPI_KEY",
        "url": target
    }

    # A timeout keeps a stalled proxy request from hanging forever, and
    # raise_for_status() stops an error page (e.g. a rejected API key) from
    # being silently parsed as "0 papers".
    response = requests.get(base_url, params=params, timeout=70)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for item in soup.select(".search-result"):
        title_el = item.select_one(".highwire-cite-title")
        authors_el = item.select_one(".highwire-citation-authors")
        date_el = item.select_one(".highwire-cite-metadata-date")
        doi_el = item.select_one(".highwire-cite-metadata-doi")

        papers.append({
            "title": title_el.text.strip() if title_el else "",
            "authors": authors_el.text.strip() if authors_el else "",
            "date": date_el.text.strip() if date_el else "",
            "doi": doi_el.text.strip() if doi_el else "",
            "server": label,
            "subject": subject
        })

    return papers

# Example run: neuroscience preprints posted during the past week.
results = scrape_biorxiv(subject="neuroscience", days_back=7)
print(f"Found {len(results)} papers")

Scraping SSRN

SSRN covers social sciences, economics, and law:

def scrape_ssrn(query="machine learning finance", page=1):
    """Scrape one page of SSRN search results through ScraperAPI.

    Args:
        query: Free-text search phrase. It is URL-encoded, so spaces and
            reserved characters such as ``&`` or ``#`` are safe.
        page: Value sent as the ``npage`` query parameter.

    Returns:
        A list of dicts with the keys "title", "url", "authors", "date",
        "downloads" and "server". A missing element yields an empty string,
        except "downloads", which falls back to "0".

    Raises:
        requests.RequestException: If the request fails, times out, or
            ScraperAPI answers with an error status.
    """
    papers = []

    # Build the query string with urlencode so the user-supplied query
    # cannot break or inject parameters into the target URL.
    search = urlencode({
        "RequestTimeout": 50000000,
        "txtKey_Words": query,
        "npage": page,
    })
    target = f"https://papers.ssrn.com/sol3/results.cfm?{search}"

    params = {
        "api_key": "YOUR_SCRAPERAPI_KEY",
        "url": target,
        "render": "true"
    }

    # A timeout keeps a stalled request from hanging forever, and
    # raise_for_status() stops an error page from being parsed as "no results".
    response = requests.get(
        "https://api.scraperapi.com", params=params, timeout=70
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    for item in soup.select(".result-item"):
        title = item.select_one(".title a")
        authors = item.select_one(".authors-list")
        date = item.select_one(".submission-date")
        downloads = item.select_one(".download-count")

        papers.append({
            "title": title.text.strip() if title else "",
            # .get() avoids a KeyError on an anchor without an href attribute.
            "url": title.get("href", "") if title else "",
            "authors": authors.text.strip() if authors else "",
            "date": date.text.strip() if date else "",
            "downloads": downloads.text.strip() if downloads else "0",
            "server": "SSRN"
        })

    return papers

# Example run: first page of SSRN hits for an AI-regulation query.
ssrn_results = scrape_ssrn(query="artificial intelligence regulation")

Building a Research Tracker

Combine all sources into a unified tracker:

def research_tracker(topics, days_back=7):
    """Collect preprints for several topics into one de-duplicated table.

    For every topic this calls ``scrape_biorxiv`` and ``scrape_ssrn``, merges
    the results, drops rows with a repeated title, and writes the table to
    ``preprints_<YYYYMMDD>.csv`` in the current directory.

    Args:
        topics: Iterable of search phrases (must support ``len()``).
        days_back: Date window, in days, forwarded to ``scrape_biorxiv``.

    Returns:
        A pandas DataFrame with one row per unique title. If nothing was
        scraped, an empty DataFrame with the columns shared by both scrapers
        is returned and no CSV file is written.
    """
    all_papers = []

    for topic in topics:
        sources = (
            ("bioRxiv", lambda: scrape_biorxiv(topic, days_back)),
            ("SSRN", lambda: scrape_ssrn(topic)),
        )
        for source_name, fetch in sources:
            # One failed request should not discard everything collected
            # so far; report it and move on to the next source/topic.
            try:
                all_papers.extend(fetch())
            except requests.RequestException as exc:
                print(f"Skipping {source_name} for {topic!r}: {exc}")

    if not all_papers:
        # A frame built from an empty list has no columns, so de-duplicating
        # on "title" would raise KeyError. Return a typed-empty frame instead
        # so callers can still group or filter on the usual columns.
        print(f"Tracked 0 unique papers across {len(topics)} topics")
        return pd.DataFrame(columns=["title", "authors", "date", "server"])

    df = pd.DataFrame(all_papers).drop_duplicates(subset=["title"])
    df.to_csv(f"preprints_{datetime.now().strftime('%Y%m%d')}.csv", index=False)

    print(f"Tracked {len(df)} unique papers across {len(topics)} topics")
    return df

# Example run: track three topics, then count the papers kept per server.
topics = [
    "CRISPR",
    "large language models",
    "quantum computing",
]
df = research_tracker(topics=topics)
print(df.groupby("server").size())

Analysis: Finding Trending Topics

from collections import Counter

def find_trending_keywords(df, top_n=20):
    all_words = " ".join(df["title"].str.lower()).split()
    stop_words = {"the", "of", "and", "in", "a", "to", "for", "with", "on", "is"}
    filtered = [w for w in all_words if w not in stop_words and len(w) > 3]

    trends = Counter(filtered).most_common(top_n)
    for word, count in trends:
        print(f"  {word}: {count} papers")

    return trends

# Print and return the 20 most frequent title keywords.
find_trending_keywords(df, top_n=20)

Scaling Your Preprint Scraper

- Handle anti-bot protections with ScraperAPI — essential for JavaScript-heavy academic sites.
- Use ThorData residential proxies for consistent access to rate-limited servers.
- Track your scraper's health with ScrapeOps monitoring.

Conclusion

Preprint scraping gives researchers and data scientists a competitive edge in tracking emerging science. Build your pipeline incrementally — start with one server, nail the parsing, then expand. Always check each server's robots.txt and terms of service before scraping at scale.