DEV Community

agenthustler
agenthustler

Posted on

Scraping Court Records and Legal Data with Python

Scraping Court Records and Legal Data with Python

Access to legal data powers everything from legal tech startups to investigative journalism. This guide covers scraping publicly available court records, case filings, and legal databases using Python.

Legal Data Sources

  • PACER (Public Access to Court Electronic Records) — federal courts
  • State court websites — each state has its own system
  • CourtListener — free legal research platform with API
  • Google Scholar — case law search
  • SEC EDGAR — corporate filings and litigation

Important Disclaimer

Always verify that scraping is permitted. Many court systems have specific terms about automated access. PACER charges per page. Use official APIs when available.

Setting Up

pip install requests beautifulsoup4 selenium pandas
Enter fullscreen mode Exit fullscreen mode

Scraping CourtListener (Free Legal Data)

import requests
import time

class CourtListenerScraper:
    """Thin client for the CourtListener REST API (v4).

    Unauthenticated requests work but are heavily rate-limited; pass an
    API token for higher limits.
    """

    def __init__(self, api_token=None):
        # Base endpoint for CourtListener's v4 REST API.
        self.base_url = "https://www.courtlistener.com/api/rest/v4"
        self.session = requests.Session()
        if api_token:
            # CourtListener uses "Token <key>" auth, not "Bearer".
            self.session.headers["Authorization"] = f"Token {api_token}"

    def search_opinions(self, query, court=None, date_after=None):
        """Search court opinions by keyword.

        Args:
            query: Full-text search string.
            court: Optional court identifier to restrict results.
            date_after: Optional ISO date (YYYY-MM-DD); only opinions
                filed after this date are returned.

        Returns:
            The parsed JSON response as a dict.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        params = {
            "q": query,
            "type": "o",  # "o" = opinions in the search API
            "format": "json",
        }
        if court:
            params["court"] = court
        if date_after:
            params["filed_after"] = date_after

        response = self.session.get(
            f"{self.base_url}/search/",
            params=params
        )
        # Fail loudly on 4xx/5xx instead of handing an HTML error page
        # to .json() (which raises an opaque JSONDecodeError).
        response.raise_for_status()
        return response.json()

    def get_case_details(self, case_id):
        """Fetch the full opinion record for a specific opinion ID."""
        response = self.session.get(
            f"{self.base_url}/opinions/{case_id}/"
        )
        response.raise_for_status()
        return response.json()

    def search_dockets(self, case_name=None, court=None):
        """Search case dockets, optionally filtered by case name and court."""
        params = {"format": "json"}
        if case_name:
            params["case_name"] = case_name
        if court:
            params["court"] = court

        response = self.session.get(
            f"{self.base_url}/dockets/",
            params=params
        )
        response.raise_for_status()
        return response.json()

# Usage: print the five most recent hits for a privacy-related query.
scraper = CourtListenerScraper()
results = scraper.search_opinions("data privacy", date_after="2024-01-01")
top_hits = results.get("results", [])[:5]
for hit in top_hits:
    print(f"{hit['caseName']} - {hit['court']}")
    print(f"  Filed: {hit['dateFiled']}")
    print(f"  URL: {hit['absolute_url']}")
Enter fullscreen mode Exit fullscreen mode

Scraping State Court Records

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

class StateCourtScraper:
    """Generic Selenium scraper for state court case-search portals.

    The original declared ``class StateCourt Scraper`` (a space in the
    name), which is a SyntaxError; fixed here. The element IDs/classes
    below (party-name, case-number, search-results, ...) are examples —
    adapt the selectors to each court portal.
    """

    def __init__(self):
        # Headless Chrome; --no-sandbox is commonly required inside
        # containers/CI environments.
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=options)

    def search_cases(self, base_url, name=None, case_number=None):
        """Submit a case search form and return the parsed results.

        Args:
            base_url: URL of the court's search form.
            name: Optional party name to search by.
            case_number: Optional case number to search by.

        Returns:
            pandas.DataFrame of result rows (see parse_results).
        """
        self.driver.get(base_url)

        if name:
            # Wait for the form to render before typing.
            name_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "party-name"))
            )
            name_field.send_keys(name)

        if case_number:
            case_field = self.driver.find_element(By.ID, "case-number")
            case_field.send_keys(case_number)

        submit = self.driver.find_element(By.CSS_SELECTOR, "button[type=submit]")
        submit.click()

        # Wait for results to appear before scraping the DOM.
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "search-results"))
        )

        return self.parse_results()

    def parse_results(self):
        """Extract case data from the rendered search-result rows."""
        results = []
        rows = self.driver.find_elements(By.CSS_SELECTOR, ".result-row")

        for row in rows:
            results.append({
                "case_number": row.find_element(By.CLASS_NAME, "case-num").text,
                "title": row.find_element(By.CLASS_NAME, "case-title").text,
                "date": row.find_element(By.CLASS_NAME, "file-date").text,
                "status": row.find_element(By.CLASS_NAME, "case-status").text
            })

        return pd.DataFrame(results)

    def close(self):
        """Shut down the browser session and release its resources."""
        self.driver.quit()
Enter fullscreen mode Exit fullscreen mode

Scraping SEC EDGAR Filings

import requests
from bs4 import BeautifulSoup

class SECEdgarScraper:
    """Client for SEC EDGAR full-text search and filing downloads.

    The SEC requires a descriptive User-Agent identifying you and a
    contact email; requests without one are rejected.
    """

    def __init__(self):
        self.base_url = "https://efts.sec.gov/LATEST"
        self.headers = {
            "User-Agent": "YourApp your@email.com",
            "Accept-Encoding": "gzip, deflate"
        }

    def search_filings(self, company, filing_type="10-K"):
        """Search SEC EDGAR full-text search for company filings.

        Args:
            company: Company name or search phrase.
            filing_type: SEC form type (e.g. "10-K", "8-K").

        Returns:
            Parsed JSON search response (Elasticsearch-style hits).

        Raises:
            requests.HTTPError: If EDGAR returns an error status.
        """
        url = f"{self.base_url}/search-index"
        params = {
            "q": company,
            "forms": filing_type,
            "dateRange": "custom",
            "startdt": "2024-01-01"
        }
        response = requests.get(url, params=params, headers=self.headers)
        # Surface throttling/denial (SEC rate-limits aggressively)
        # instead of failing later inside .json().
        response.raise_for_status()
        return response.json()

    def get_filing_text(self, accession_number):
        """Download the raw filing index/text for an accession number.

        NOTE(review): EDGAR Archives paths are
        /Archives/edgar/data/<CIK>/<accession-no-dashes>/...; building the
        URL from the accession number alone (as below) is likely wrong —
        confirm against a real filing and include the CIK if needed.
        """
        clean_number = accession_number.replace("-", "")
        url = f"https://www.sec.gov/Archives/edgar/data/{clean_number}"
        response = requests.get(url, headers=self.headers)
        response.raise_for_status()
        return response.text

# Search for Apple's recent 10-K filings
edgar = SECEdgarScraper()
filings = edgar.search_filings("Apple Inc", "10-K")
hits = filings.get("hits", {}).get("hits", [])
for filing in hits[:5]:
    source = filing["_source"]
    print(f"{source['entity_name']} - {source['file_date']}")
Enter fullscreen mode Exit fullscreen mode

Handling Protected Legal Sites

Many legal databases have anti-bot protections:

  1. ScraperAPI — handles CAPTCHAs and JS rendering needed for court portals
  2. ThorData — residential proxies to avoid IP blocks from government sites
  3. ScrapeOps — monitor your legal data pipeline for failures

Building a Legal Data Pipeline

import schedule
import json

def daily_legal_scan():
    """Search CourtListener for a fixed set of topics and save results.

    Writes all matching opinions to a date-stamped JSON file and prints
    a summary count. Relies on CourtListenerScraper and `time` from the
    earlier sections of this article.
    """
    # Was missing in the original snippet: datetime.now() is used below
    # but `datetime` was never imported, causing a NameError at runtime.
    from datetime import datetime

    scraper = CourtListenerScraper()
    keywords = ["artificial intelligence", "data privacy", "cryptocurrency"]

    all_results = []
    for keyword in keywords:
        results = scraper.search_opinions(keyword, date_after="2024-01-01")
        all_results.extend(results.get("results", []))
        time.sleep(2)  # be polite: pause between API calls

    with open(f"legal_scan_{datetime.now().strftime('%Y%m%d')}.json", "w") as f:
        json.dump(all_results, f, indent=2)

    print(f"Found {len(all_results)} new cases")

# Register the daily job. NOTE: `schedule` only fires jobs while a runner
# loop is executing, e.g.:  while True: schedule.run_pending(); time.sleep(60)
schedule.every().day.at("08:00").do(daily_legal_scan)
Enter fullscreen mode Exit fullscreen mode

Conclusion

Legal data scraping opens doors for legal tech innovation, from case research tools to compliance monitoring. Always respect the data source's terms and handle sensitive legal information responsibly.


Follow for more Python data scraping guides!

Top comments (0)