Scraping Court Records and Legal Data with Python
Access to legal data powers everything from legal tech startups to investigative journalism. This guide covers scraping publicly available court records, case filings, and legal databases using Python.
Legal Data Sources
- PACER (Public Access to Court Electronic Records) — federal courts
- State court websites — each state has its own system
- CourtListener — free legal research platform with API
- Google Scholar — case law search
- SEC EDGAR — corporate filings and litigation
Important Disclaimer
Always verify that scraping is permitted. Many court systems have specific terms about automated access. PACER charges per page. Use official APIs when available.
Setting Up
pip install requests beautifulsoup4 selenium pandas
Scraping CourtListener (Free Legal Data)
import requests
import time
class CourtListenerScraper:
    """Client for the CourtListener REST API (v4).

    Keeps a persistent ``requests.Session``; an optional API token is sent
    as ``Authorization: Token <key>`` to raise rate limits.
    """

    # Fail fast instead of hanging forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_token=None):
        self.base_url = "https://www.courtlistener.com/api/rest/v4"
        self.session = requests.Session()
        if api_token:
            self.session.headers["Authorization"] = f"Token {api_token}"

    def search_opinions(self, query, court=None, date_after=None):
        """Search court opinions by keyword.

        Args:
            query: Full-text search string.
            court: Optional court identifier (e.g. "scotus").
            date_after: Optional ISO date (YYYY-MM-DD) lower bound on filing.

        Returns:
            Decoded JSON response as a dict.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        params = {
            "q": query,
            "type": "o",  # "o" = opinions
            "format": "json",
        }
        if court:
            params["court"] = court
        if date_after:
            params["filed_after"] = date_after
        response = self.session.get(
            f"{self.base_url}/search/",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Surface HTTP errors instead of failing later inside .json().
        response.raise_for_status()
        return response.json()

    def get_case_details(self, case_id):
        """Return the full opinion record for *case_id* as a dict."""
        response = self.session.get(
            f"{self.base_url}/opinions/{case_id}/",
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def search_dockets(self, case_name=None, court=None):
        """Search case dockets by case name and/or court; returns a dict."""
        params = {"format": "json"}
        if case_name:
            params["case_name"] = case_name
        if court:
            params["court"] = court
        response = self.session.get(
            f"{self.base_url}/dockets/",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()
# Usage: print the five most recent matches for a query (network I/O).
scraper = CourtListenerScraper()
results = scraper.search_opinions("data privacy", date_after="2024-01-01")
for case in results.get("results", [])[:5]:
    # Use .get() so one record with a missing field doesn't abort the listing.
    print(f"{case.get('caseName')} - {case.get('court')}")
    print(f"  Filed: {case.get('dateFiled')}")
    print(f"  URL: {case.get('absolute_url')}")
Scraping State Court Records
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
# NOTE: original declared `class StateCourt Scraper:` — a space in the class
# name is a SyntaxError; renamed to StateCourtScraper.
class StateCourtScraper:
    """Selenium-based scraper for state court search portals.

    NOTE(review): the field IDs ("party-name", "case-number") and the result
    CSS classes below are portal-specific placeholders — adjust the selectors
    for each target site.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=options)

    def search_cases(self, base_url, name=None, case_number=None):
        """Submit a case search form and return parsed results.

        Args:
            base_url: URL of the court's search page.
            name: Optional party name to search for.
            case_number: Optional case number to search for.

        Returns:
            pandas.DataFrame with one row per result.
        """
        self.driver.get(base_url)
        if name:
            # Wait for the form to render before typing.
            name_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "party-name"))
            )
            name_field.send_keys(name)
        if case_number:
            case_field = self.driver.find_element(By.ID, "case-number")
            case_field.send_keys(case_number)
        submit = self.driver.find_element(By.CSS_SELECTOR, "button[type=submit]")
        submit.click()
        # Wait for the results container before parsing.
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )
        return self.parse_results()

    def parse_results(self):
        """Extract one record per visible result row into a DataFrame."""
        results = []
        rows = self.driver.find_elements(By.CSS_SELECTOR, ".result-row")
        for row in rows:
            results.append({
                "case_number": row.find_element(By.CLASS_NAME, "case-num").text,
                "title": row.find_element(By.CLASS_NAME, "case-title").text,
                "date": row.find_element(By.CLASS_NAME, "file-date").text,
                "status": row.find_element(By.CLASS_NAME, "case-status").text,
            })
        return pd.DataFrame(results)

    def close(self):
        """Shut down the browser; always call when finished."""
        self.driver.quit()
Scraping SEC EDGAR Filings
import requests
from bs4 import BeautifulSoup
class SECEdgarScraper:
    """Client for SEC EDGAR full-text search and filing downloads.

    SEC requires a descriptive User-Agent with contact info on every
    request; replace the placeholder below with your own details.
    """

    # Fail fast instead of hanging on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = "https://efts.sec.gov/LATEST"
        self.headers = {
            "User-Agent": "YourApp your@email.com",
            "Accept-Encoding": "gzip, deflate"
        }

    def search_filings(self, company, filing_type="10-K"):
        """Search EDGAR full-text search for a company's filings.

        Args:
            company: Company name or search phrase.
            filing_type: SEC form type (default "10-K").

        Returns:
            Decoded JSON response (Elasticsearch-style hits) as a dict.

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        url = f"{self.base_url}/search-index"
        params = {
            "q": company,
            "forms": filing_type,
            "dateRange": "custom",
            "startdt": "2024-01-01"
        }
        response = requests.get(
            url, params=params, headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_filing_text(self, accession_number):
        """Download raw filing text for *accession_number*.

        NOTE(review): EDGAR Archives paths are normally
        /Archives/edgar/data/<CIK>/<accession>/... — building the path from
        the accession number alone likely 404s; confirm the required CIK
        segment against the EDGAR documentation.
        """
        clean_number = accession_number.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{clean_number}"
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text
# Example: list Apple's five most recent 10-K filings (network I/O).
edgar = SECEdgarScraper()
filings = edgar.search_filings("Apple Inc", "10-K")
for filing in filings.get("hits", {}).get("hits", [])[:5]:
    # Use .get() so a hit with a missing field doesn't abort the listing.
    source = filing.get("_source", {})
    print(f"{source.get('entity_name')} - {source.get('file_date')}")
Handling Protected Legal Sites
Many legal databases have anti-bot protections:
- ScraperAPI — handles CAPTCHAs and JS rendering needed for court portals
- ThorData — residential proxies to avoid IP blocks from government sites
- ScrapeOps — monitor your legal data pipeline for failures
Building a Legal Data Pipeline
import schedule
import json
def daily_legal_scan():
    """Search CourtListener for tracked keywords and dump results to JSON.

    Writes ``legal_scan_YYYYMMDD.json`` in the working directory; intended
    to run once per day via ``schedule``.
    """
    # Fix: `datetime` was used below but never imported anywhere in the
    # article — a function-scope import keeps the example self-contained.
    from datetime import datetime

    scraper = CourtListenerScraper()
    keywords = ["artificial intelligence", "data privacy", "cryptocurrency"]
    all_results = []
    for keyword in keywords:
        results = scraper.search_opinions(keyword, date_after="2024-01-01")
        all_results.extend(results.get("results", []))
        time.sleep(2)  # be polite: throttle between API calls
    out_name = f"legal_scan_{datetime.now().strftime('%Y%m%d')}.json"
    with open(out_name, "w") as f:
        json.dump(all_results, f, indent=2)
    print(f"Found {len(all_results)} new cases")


# Registers the job only — a `while True: schedule.run_pending(); time.sleep(60)`
# loop is still needed for it to actually fire.
schedule.every().day.at("08:00").do(daily_legal_scan)
Conclusion
Legal data scraping opens doors for legal tech innovation, from case research tools to compliance monitoring. Always respect the data source's terms and handle sensitive legal information responsibly.
Follow for more Python data scraping guides!
Top comments (0)