Introduction
Legal research is tedious and expensive. Tools like Westlaw and LexisNexis charge thousands per year, but much of the same case law is publicly available through government court systems and open-source platforms like CourtListener. In this tutorial, we'll build a Python scraper that collects case law data for legal research and analysis.
Setup
import json
import time
from collections import deque
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
# --- Configuration ---
# CourtListener offers a free REST API (token required). Scraping the
# individual state-court websites directly typically needs a proxy /
# JS-rendering service, configured below.
SCRAPER_API_KEY = "your_key_here"           # proxy/rendering service key
BASE_URL = "http://api.scraperapi.com"      # proxy service endpoint

# CourtListener API (free, requires token)
CL_BASE = "https://www.courtlistener.com/api/rest/v4"
CL_TOKEN = "your_courtlistener_token"
Using the CourtListener API
CourtListener provides a well-documented REST API:
def search_opinions(query, court=None, after_date=None):
    """Search court opinions via the CourtListener search API.

    Args:
        query: Full-text search query string.
        court: Optional court identifier (e.g. "scotus") to filter by.
        after_date: Optional ISO date string; only opinions filed after
            this date are returned.

    Returns:
        A list of result dicts from the API, or [] on any non-200 response.
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    params = {
        "q": query,
        "type": "o",  # "o" = opinions (vs. dockets, people, etc.)
        "order_by": "score desc",
        "format": "json"
    }
    if court:
        params["court"] = court
    if after_date:
        params["filed_after"] = after_date
    response = requests.get(
        f"{CL_BASE}/search/",
        headers=headers,
        params=params,
        # requests has NO default timeout; without one a stalled
        # connection hangs the caller forever.
        timeout=30,
    )
    if response.status_code == 200:
        data = response.json()
        return data.get("results", [])
    return []
def get_opinion_details(opinion_id):
    """Fetch full opinion text and metadata for a single opinion.

    Args:
        opinion_id: CourtListener numeric opinion id.

    Returns:
        The opinion JSON dict on success, or None on any non-200 response.
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    response = requests.get(
        f"{CL_BASE}/opinions/{opinion_id}/",
        headers=headers,
        timeout=30,  # requests has no default timeout; avoid hanging forever
    )
    if response.status_code == 200:
        return response.json()
    return None
Scraping Court Dockets
def scrape_federal_dockets(case_type="cv", district="nysd"):
    """Fetch federal court dockets via RECAP (the free mirror of PACER).

    Args:
        case_type: Nature-of-suit filter (e.g. "cv" for civil).
        district: Court identifier (e.g. "nysd" for S.D.N.Y.).

    Returns:
        A list of docket dicts, newest first, or [] on any non-200 response.
    """
    url = f"{CL_BASE}/dockets/"
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    params = {
        "court": district,
        "nature_of_suit": case_type,
        "order_by": "-date_filed",  # newest filings first
        "format": "json"
    }
    # requests has no default timeout; a stalled socket would hang forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code == 200:
        return response.json().get("results", [])
    return []
def scrape_state_courts(state="california"):
    """Scrape case listings from a state court search page.

    State court sites often require JavaScript rendering, so the request
    is routed through the proxy/rendering service configured at the top
    of the file.

    Args:
        state: One of "california", "new_york", "texas".

    Returns:
        A list of case dicts (case_number, title, date_filed, status,
        state), or [] for unknown states or failed requests.
    """
    state_urls = {
        "california": "https://appellatecases.courtinfo.ca.gov/search.cfm",
        "new_york": "https://iapps.courts.state.ny.us/nyscef/CaseSearch",
        "texas": "https://search.txcourts.gov/CaseSearch.aspx"
    }
    url = state_urls.get(state)
    if not url:
        return []
    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    # Rendering can be slow, so allow a generous (but finite) timeout.
    response = requests.get(BASE_URL, params=params, timeout=90)
    if response.status_code != 200:
        # Don't parse proxy error pages as if they were case listings.
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    cases = []
    for row in soup.select("table.case-list tr"):
        cols = row.select("td")
        if len(cols) >= 4:  # skip header rows / malformed rows
            cases.append({
                "case_number": cols[0].text.strip(),
                "title": cols[1].text.strip(),
                "date_filed": cols[2].text.strip(),
                "status": cols[3].text.strip(),
                "state": state
            })
    return cases
Citation Network Analysis
def build_citation_network(seed_opinion_id, depth=2):
    """Breadth-first construction of a citation network around a seed case.

    Args:
        seed_opinion_id: CourtListener opinion id to start from (depth 0).
        depth: Maximum BFS depth to expand from the seed.

    Returns:
        Dict with "nodes" (opinion metadata plus BFS depth) and "edges"
        ({"from": visited opinion id, "to": id of an opinion citing it}).
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    visited = set()
    network = {"nodes": [], "edges": []}
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue.
    queue = deque([(seed_opinion_id, 0)])
    while queue:
        opinion_id, current_depth = queue.popleft()
        if opinion_id in visited or current_depth > depth:
            continue
        visited.add(opinion_id)
        opinion = get_opinion_details(opinion_id)
        if opinion:
            network["nodes"].append({
                "id": opinion_id,
                "case_name": opinion.get("case_name", ""),
                "date_filed": opinion.get("date_filed", ""),
                "court": opinion.get("court", ""),
                "depth": current_depth
            })
            # Reverse citation lookup: opinions that cite this one.
            citing_url = f"{CL_BASE}/opinions/?cites={opinion_id}"
            citing_resp = requests.get(citing_url, headers=headers, timeout=30)
            if citing_resp.status_code == 200:
                for cited in citing_resp.json().get("results", []):
                    cited_id = cited["id"]
                    network["edges"].append({
                        "from": opinion_id,
                        "to": cited_id
                    })
                    if current_depth < depth:
                        queue.append((cited_id, current_depth + 1))
        time.sleep(1)  # Respect rate limits
    return network
Data Storage and Search
import sqlite3
def store_cases(cases, db_path="caselaw.db"):
    """Persist scraped case dicts to a SQLite database.

    Args:
        cases: List of dicts whose keys match the ``cases`` table columns
            (case_number, title, court, date_filed, full_text, ...).
        db_path: Path to the SQLite file; created if it does not exist.
    """
    if not cases:
        # pd.DataFrame([]) has no columns, which breaks to_sql — and
        # there is nothing to store anyway.
        return
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS cases (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        case_number TEXT,
        title TEXT,
        court TEXT,
        date_filed TEXT,
        full_text TEXT,
        scraped_at TEXT
        )
        """)
        conn.commit()  # ensure the DDL is flushed before pandas writes
        df = pd.DataFrame(cases)
        df["scraped_at"] = datetime.now().isoformat()
        df.to_sql("cases", conn, if_exists="append", index=False)
    finally:
        # Always release the file handle, even if the insert fails.
        conn.close()
Conclusion
CourtListener's free API makes legal research accessible to everyone. By combining API access with web scraping for state courts, you can build comprehensive case law databases. Use ScraperAPI for state court sites that require JavaScript rendering, and always respect rate limits and terms of service.
Top comments (0)