Scraping Court Records and Legal Data with Python
Access to legal data powers everything from legal tech startups to investigative journalism. This guide covers scraping publicly available court records, case filings, and legal databases using Python.
Legal Data Sources
- PACER (Public Access to Court Electronic Records) — federal courts
- State court websites — each state has its own system
- CourtListener — free legal research platform with API
- Google Scholar — case law search
- SEC EDGAR — corporate filings and litigation
Important Disclaimer
Always verify that scraping is permitted. Many court systems have specific terms about automated access. PACER charges per page. Use official APIs when available.
Setting Up
pip install requests beautifulsoup4 selenium pandas
Scraping CourtListener (Free Legal Data)
import requests
import time
class CourtListenerScraper:
    """Client for the CourtListener REST API (v4).

    Keeps a persistent ``requests.Session``; an optional API token is sent
    as ``Authorization: Token <key>`` to raise rate limits.
    """

    # Fail fast instead of hanging forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_token=None):
        self.base_url = "https://www.courtlistener.com/api/rest/v4"
        self.session = requests.Session()
        if api_token:
            self.session.headers["Authorization"] = f"Token {api_token}"

    def search_opinions(self, query, court=None, date_after=None):
        """Search court opinions by keyword.

        Args:
            query: Full-text search string.
            court: Optional court identifier (e.g. "scotus").
            date_after: Optional ISO date (YYYY-MM-DD) lower bound on filing.

        Returns:
            Decoded JSON response as a dict.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        params = {
            "q": query,
            "type": "o",  # "o" = opinions
            "format": "json",
        }
        if court:
            params["court"] = court
        if date_after:
            params["filed_after"] = date_after
        response = self.session.get(
            f"{self.base_url}/search/",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Surface HTTP errors instead of failing later inside .json().
        response.raise_for_status()
        return response.json()

    def get_case_details(self, case_id):
        """Return the full opinion record for *case_id* as a dict."""
        response = self.session.get(
            f"{self.base_url}/opinions/{case_id}/",
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def search_dockets(self, case_name=None, court=None):
        """Search case dockets by case name and/or court; returns a dict."""
        params = {"format": "json"}
        if case_name:
            params["case_name"] = case_name
        if court:
            params["court"] = court
        response = self.session.get(
            f"{self.base_url}/dockets/",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()
# Usage: print the five most recent matches for a query (network I/O).
scraper = CourtListenerScraper()
results = scraper.search_opinions("data privacy", date_after="2024-01-01")
for case in results.get("results", [])[:5]:
    # Use .get() so one record with a missing field doesn't abort the listing.
    print(f"{case.get('caseName')} - {case.get('court')}")
    print(f"  Filed: {case.get('dateFiled')}")
    print(f"  URL: {case.get('absolute_url')}")
Scraping State Court Records
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# NOTE: original declared `class StateCourt Scraper:` — a space in the class
# name is a SyntaxError; renamed to StateCourtScraper.
class StateCourtScraper:
    """Selenium-based scraper for state court search portals.

    NOTE(review): the field IDs ("party-name", "case-number") and the result
    CSS classes below are portal-specific placeholders — adjust the selectors
    for each target site.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=options)

    def search_cases(self, base_url, name=None, case_number=None):
        """Submit a case search form and return parsed results.

        Args:
            base_url: URL of the court's search page.
            name: Optional party name to search for.
            case_number: Optional case number to search for.

        Returns:
            pandas.DataFrame with one row per result.
        """
        self.driver.get(base_url)
        if name:
            # Wait for the form to render before typing.
            name_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "party-name"))
            )
            name_field.send_keys(name)
        if case_number:
            case_field = self.driver.find_element(By.ID, "case-number")
            case_field.send_keys(case_number)
        submit = self.driver.find_element(By.CSS_SELECTOR, "button[type=submit]")
        submit.click()
        # Wait for the results container before parsing.
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        return self.parse_results()

    def parse_results(self):
        """Extract one record per visible result row into a DataFrame."""
        results = []
        rows = self.driver.find_elements(By.CSS_SELECTOR, ".result-row")
        for row in rows:
            results.append({
                "case_number": row.find_element(By.CLASS_NAME, "case-num").text,
                "title": row.find_element(By.CLASS_NAME, "case-title").text,
                "date": row.find_element(By.CLASS_NAME, "file-date").text,
                "status": row.find_element(By.CLASS_NAME, "case-status").text,
            })
        return pd.DataFrame(results)

    def close(self):
        """Shut down the browser; always call when finished."""
        self.driver.quit()
Scraping SEC EDGAR Filings
import requests
from bs4 import BeautifulSoup
class SECEdgarScraper:
    """Client for SEC EDGAR full-text search and filing downloads.

    SEC requires a descriptive User-Agent with contact info on every
    request; replace the placeholder below with your own details.
    """

    # Fail fast instead of hanging on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = "https://efts.sec.gov/LATEST"
        self.headers = {
            "User-Agent": "YourApp your@email.com",
            "Accept-Encoding": "gzip, deflate"
        }

    def search_filings(self, company, filing_type="10-K"):
        """Search EDGAR full-text search for a company's filings.

        Args:
            company: Company name or search phrase.
            filing_type: SEC form type (default "10-K").

        Returns:
            Decoded JSON response (Elasticsearch-style hits) as a dict.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        url = f"{self.base_url}/search-index"
        params = {
            "q": company,
            "forms": filing_type,
            "dateRange": "custom",
            "startdt": "2024-01-01"
        }
        response = requests.get(
            url, params=params, headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_filing_text(self, accession_number):
        """Download raw filing text for *accession_number*.

        NOTE(review): EDGAR Archives paths are normally
        /Archives/edgar/data/<CIK>/<accession>/... — building the path from
        the accession number alone likely 404s; confirm the required CIK
        segment against the EDGAR documentation.
        """
        clean_number = accession_number.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{clean_number}"
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text
# Example: list Apple's five most recent 10-K filings (network I/O).
edgar = SECEdgarScraper()
filings = edgar.search_filings("Apple Inc", "10-K")
for filing in filings.get("hits", {}).get("hits", [])[:5]:
    # Use .get() so a hit with a missing field doesn't abort the listing.
    source = filing.get("_source", {})
    print(f"{source.get('entity_name')} - {source.get('file_date')}")
Handling Protected Legal Sites
Many legal databases have anti-bot protections:
- ScraperAPI — handles CAPTCHAs and JS rendering needed for court portals
- ThorData — residential proxies to avoid IP blocks from government sites
- ScrapeOps — monitor your legal data pipeline for failures
Building a Legal Data Pipeline
import schedule
import json
def daily_legal_scan():
    """Search CourtListener for tracked keywords and dump results to JSON.

    Writes ``legal_scan_YYYYMMDD.json`` in the working directory; intended
    to run once per day via ``schedule``.
    """
    # Fix: `datetime` was used below but never imported anywhere in the
    # article — a function-scope import keeps the example self-contained.
    from datetime import datetime

    scraper = CourtListenerScraper()
    keywords = ["artificial intelligence", "data privacy", "cryptocurrency"]
    all_results = []
    for keyword in keywords:
        results = scraper.search_opinions(keyword, date_after="2024-01-01")
        all_results.extend(results.get("results", []))
        time.sleep(2)  # be polite: throttle between API calls
    out_name = f"legal_scan_{datetime.now().strftime('%Y%m%d')}.json"
    with open(out_name, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"Found {len(all_results)} new cases")


# Registers the job only — a `while True: schedule.run_pending(); time.sleep(60)`
# loop is still needed for it to actually fire.
schedule.every().day.at("08:00").do(daily_legal_scan)
Conclusion
Legal data scraping opens doors for legal tech innovation, from case research tools to compliance monitoring. Always respect the data source's terms and handle sensitive legal information responsibly.
Follow for more Python data scraping guides!
Top comments (0)