Court records are public data, but accessing them programmatically is surprisingly difficult. PACER (Public Access to Court Electronic Records) charges per page, and state systems vary wildly. Let's build scrapers that extract court docket data efficiently.
Why Court Data Matters
Legal professionals, journalists, and researchers need court data for:
- Case research: Finding relevant precedents
- Due diligence: Background checks on companies and individuals
- Journalism: Investigating patterns in litigation
- Legal analytics: Predicting case outcomes based on historical data
Understanding PACER
PACER is the federal court system's electronic records platform. It charges $0.10 per page (capped at $3.00 per document). While scraping PACER directly requires an account, we can work with free alternatives and state court systems.
Setting Up
pip install requests beautifulsoup4 pandas selenium
For court websites with heavy JavaScript, ScraperAPI handles rendering:
import requests
from bs4 import BeautifulSoup
import re
import time
# ScraperAPI credential used by fetch_rendered(); replace the placeholder
# with a real key (and keep it out of version control).
SCRAPER_KEY = "YOUR_SCRAPERAPI_KEY"
def fetch_rendered(url):
    """Fetch a JavaScript-rendered court page via ScraperAPI.

    Args:
        url: Court page URL to render and fetch.

    Returns:
        BeautifulSoup: parsed HTML of the rendered page.

    Raises:
        requests.HTTPError: if ScraperAPI responds with a non-2xx status.
    """
    resp = requests.get(
        "http://api.scraperapi.com",
        params={
            "api_key": SCRAPER_KEY,
            "url": url,
            "render": "true",  # ask ScraperAPI to execute page JavaScript
        },
        timeout=90,  # rendered pages can take a long time to produce
    )
    # Fail loudly instead of silently parsing an error page as if it
    # were the court docket.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")
Scraping Free Federal Court Data (CourtListener)
CourtListener (RECAP project) mirrors many PACER documents for free:
def search_courtlistener(query, court="all"):
    """Search CourtListener's free RECAP archive.

    Args:
        query: Full-text search query, e.g. "patent infringement".
        court: Court identifier (e.g. "cafc"), or "all" for every court.

    Returns:
        list[dict]: one dict per hit with case name, court, filing date,
        docket number, description, and an absolute CourtListener URL.

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status
            (rate limiting, outages), instead of crashing in ``.json()``.
    """
    url = "https://www.courtlistener.com/api/rest/v4/search/"
    params = {
        "q": query,
        "type": "r",  # RECAP documents
        "court": court,
        "order_by": "score desc",
        "format": "json"
    }
    resp = requests.get(url, params=params, timeout=30)
    # Surface HTTP-level failures early with a clear exception.
    resp.raise_for_status()
    data = resp.json()
    results = []
    for item in data.get("results", []):
        results.append({
            "case_name": item.get("caseName", ""),
            "court": item.get("court", ""),
            "date_filed": item.get("dateFiled", ""),
            "docket_number": item.get("docketNumber", ""),
            "description": item.get("description", ""),
            "url": f"https://www.courtlistener.com{item.get('absolute_url', '')}"
        })
    return results
# Search for patent cases
# Example usage — performs a live network request against the
# CourtListener API; "cafc" presumably selects the Federal Circuit
# (patent appeals) — confirm against the API's court-id list.
patent_cases = search_courtlistener("patent infringement", court="cafc")
print(f"Found {len(patent_cases)} cases")
Scraping State Court Systems
State courts have different systems. Here's a pattern for common formats:
def scrape_state_court(state, case_type="civil"):
    """Generic state court docket scraper.

    Args:
        state: two-letter state code ("ny", "ca", "tx").
        case_type: reserved for future filtering; currently unused.

    Returns:
        list[dict]: one dict per case row found on the search page.

    Raises:
        ValueError: if no scraper template exists for ``state``.
    """
    # Template URLs only — every state's search portal differs.
    search_pages = {
        "ny": "https://iapps.courts.state.ny.us/nyscef/CaseSearch",
        "ca": "https://www.courts.ca.gov/find-a-court.htm",
        "tx": "https://www.txcourts.gov/case-search/",
    }
    page_url = search_pages.get(state)
    if page_url is None:
        raise ValueError(f"No scraper for state: {state}")

    soup = fetch_rendered(page_url)

    def first_text(node, selector):
        """Text of the first match for selector, or '' when absent."""
        found = node.select_one(selector)
        return found.text.strip() if found else ""

    results = []
    # Selectors cover the common markup variants across states.
    for row in soup.select("tr.case-row, .search-result"):
        number_node = row.select_one(".case-number, .docket-number")
        if number_node is None:
            # Rows without a case number are headers/noise, not cases.
            continue
        results.append({
            "case_number": number_node.text.strip(),
            "title": first_text(row, ".case-title, .party-names"),
            "date_filed": first_text(row, ".file-date, .date-filed"),
            "status": first_text(row, ".case-status, .status"),
            "state": state,
        })
    return results
Extracting Docket Entries
def get_docket_entries(case_url):
    """Extract individual docket entries from a case page.

    Args:
        case_url: URL of the case's docket page.

    Returns:
        list[dict]: entry number, date, description, and document URL
        (``None`` when the entry has no document link).
    """
    soup = fetch_rendered(case_url)

    def text_of(node):
        """Stripped text of a node, or '' when the node is missing."""
        return node.text.strip() if node is not None else ""

    parsed = []
    for row in soup.select(".docket-entry, tr.entry-row"):
        link = row.select_one("a[href*='doc'], a[href*='pdf']")
        parsed.append({
            "entry_number": text_of(row.select_one(".entry-number, .doc-num")),
            "date": text_of(row.select_one(".entry-date")),
            "description": text_of(
                row.select_one(".entry-description, .docket-text")
            ),
            # NOTE(review): href may be relative to the court site —
            # confirm that callers resolve it against the base URL.
            "document_url": link["href"] if link else None,
        })
    return parsed
Building a Case Database
import sqlite3
import pandas as pd
def create_case_db(db="court_cases.db"):
    """Initialize the case tracking database.

    Creates the ``cases`` and ``docket_entries`` tables plus an index on
    ``case_number`` (all idempotent via IF NOT EXISTS) and returns the
    open connection.

    Args:
        db: path to the SQLite file (":memory:" works for testing).

    Returns:
        sqlite3.Connection: open connection with the schema in place.
    """
    connection = sqlite3.connect(db)
    schema = """
        CREATE TABLE IF NOT EXISTS cases (
            id INTEGER PRIMARY KEY,
            case_number TEXT UNIQUE,
            title TEXT, court TEXT,
            date_filed TEXT, status TEXT,
            case_type TEXT, state TEXT
        );
        CREATE TABLE IF NOT EXISTS docket_entries (
            id INTEGER PRIMARY KEY,
            case_id INTEGER,
            entry_number TEXT,
            entry_date TEXT,
            description TEXT,
            FOREIGN KEY (case_id) REFERENCES cases(id)
        );
        CREATE INDEX IF NOT EXISTS idx_case_number
            ON cases(case_number);
    """
    connection.executescript(schema)
    return connection
def save_case(conn, case_data, entries):
    """Save a case and its docket entries; safe to call repeatedly.

    Bug fixed: the original read ``cursor.lastrowid`` after
    ``INSERT OR IGNORE``. When the case already existed the insert was
    ignored and ``lastrowid`` pointed at an unrelated (or zero) row, so
    docket entries were attached to the wrong case. It also re-inserted
    every entry on each call, duplicating the docket.

    Args:
        conn: open sqlite3 connection with the create_case_db() schema.
        case_data: dict with case_number/title/court/date_filed/status
            and optional "type" and "state" keys.
        entries: iterable of dicts with entry_number/date/description.
    """
    conn.execute(
        "INSERT OR IGNORE INTO cases VALUES (NULL,?,?,?,?,?,?,?)",
        (case_data["case_number"], case_data["title"],
         case_data["court"], case_data["date_filed"],
         case_data["status"], case_data.get("type", ""),
         case_data.get("state", ""))
    )
    # Look the id up explicitly: lastrowid is meaningless when the
    # insert was ignored because the case already existed.
    case_id = conn.execute(
        "SELECT id FROM cases WHERE case_number = ?",
        (case_data["case_number"],)
    ).fetchone()[0]
    # Skip entries already recorded for this case so re-scrapes don't
    # duplicate the docket.
    existing = {
        row[0] for row in conn.execute(
            "SELECT entry_number FROM docket_entries WHERE case_id = ?",
            (case_id,)
        )
    }
    for entry in entries:
        if entry["entry_number"] in existing:
            continue
        conn.execute(
            "INSERT INTO docket_entries VALUES (NULL,?,?,?,?)",
            (case_id, entry["entry_number"],
             entry["date"], entry["description"])
        )
    conn.commit()
Monitoring Active Cases
def monitor_case(case_url, db="court_cases.db", case_id=None):
    """Check for new docket entries in a tracked case.

    Bug fixed: the original bound the case *URL* as the ``case_id`` SQL
    parameter. The URL string never matches an integer primary key, so
    the "existing" set was always empty and every entry was reported as
    new on every run. Pass the case's row id via the new ``case_id``
    keyword for correct per-case dedup; without it we fall back to
    deduping against all stored entry numbers. The connection is also
    now closed deterministically.

    Args:
        case_url: URL of the case's docket page to scrape.
        db: path to the SQLite database created by create_case_db().
        case_id: optional ``cases.id`` of the tracked case.

    Returns:
        list[dict]: scraped entries not yet present in the database.
    """
    entries = get_docket_entries(case_url)
    conn = sqlite3.connect(db)
    try:
        if case_id is not None:
            rows = conn.execute(
                "SELECT entry_number FROM docket_entries WHERE case_id = ?",
                (case_id,)
            ).fetchall()
        else:
            # Fallback: dedupe against every stored entry number. Less
            # precise than scoping by case_id, but never silently wrong.
            rows = conn.execute(
                "SELECT entry_number FROM docket_entries"
            ).fetchall()
    finally:
        conn.close()
    existing_nums = {row[0] for row in rows}
    new_entries = [
        e for e in entries
        if e["entry_number"] not in existing_nums
    ]
    if new_entries:
        print(f"{len(new_entries)} new docket entries found!")
        for entry in new_entries:
            print(f"  [{entry['date']}] {entry['description'][:80]}")
    return new_entries
Scaling Court Data Collection
Court websites are notoriously fragile and vary between jurisdictions. Use ThorData for residential proxies when scraping state courts, and ScrapeOps to track which scrapers are failing.
Ethical Considerations
- Court records are public, but bulk downloading may violate terms of service
- PACER has specific rules about redistribution
- Some sealed or expunged records should not be collected
- Be mindful of using personal information from court records
Conclusion
Court data scraping opens up powerful possibilities for legal research and analytics. Start with CourtListener's free API for federal cases, then build state-specific scrapers as needed. The key challenge is handling the diversity of court systems, each with its own website structure and access rules.
Always respect court system terms of service and be mindful of the sensitivity of legal data.
Top comments (0)