Introduction
Legal research is tedious and expensive. Tools like Westlaw and LexisNexis charge thousands per year, but much of the same case law is publicly available through government court systems and open-source platforms like CourtListener. In this tutorial, we'll build a Python scraper that collects case law data for legal research and analysis.
Setup
import json
import time
from collections import deque
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
# --- Configuration ---
# CourtListener offers a free REST API (token required). Scraping the
# individual state-court websites directly typically needs a proxy /
# JS-rendering service, configured below.
SCRAPER_API_KEY = "your_key_here"           # proxy/rendering service key
BASE_URL = "http://api.scraperapi.com"      # proxy service endpoint

# CourtListener API (free, requires token)
CL_BASE = "https://www.courtlistener.com/api/rest/v4"
CL_TOKEN = "your_courtlistener_token"
Using the CourtListener API
CourtListener provides a well-documented REST API:
def search_opinions(query, court=None, after_date=None):
    """Search court opinions via the CourtListener search API.

    Args:
        query: Full-text search query string.
        court: Optional court identifier (e.g. "scotus") to filter by.
        after_date: Optional ISO date string; only opinions filed after
            this date are returned.

    Returns:
        A list of result dicts from the API, or [] on any non-200 response.
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    params = {
        "q": query,
        "type": "o",  # "o" = opinions (vs. dockets, people, etc.)
        "order_by": "score desc",
        "format": "json"
    }
    if court:
        params["court"] = court
    if after_date:
        params["filed_after"] = after_date
    response = requests.get(
        f"{CL_BASE}/search/",
        headers=headers,
        params=params,
        # requests has NO default timeout; without one a stalled
        # connection hangs the caller forever.
        timeout=30,
    )
    if response.status_code == 200:
        data = response.json()
        return data.get("results", [])
    return []
def get_opinion_details(opinion_id):
    """Fetch full opinion text and metadata for a single opinion.

    Args:
        opinion_id: CourtListener numeric opinion id.

    Returns:
        The opinion JSON dict on success, or None on any non-200 response.
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    response = requests.get(
        f"{CL_BASE}/opinions/{opinion_id}/",
        headers=headers,
        timeout=30,  # requests has no default timeout; avoid hanging forever
    )
    if response.status_code == 200:
        return response.json()
    return None
Scraping Court Dockets
def scrape_federal_dockets(case_type="cv", district="nysd"):
    """Fetch federal court dockets via RECAP (the free mirror of PACER).

    Args:
        case_type: Nature-of-suit filter (e.g. "cv" for civil).
        district: Court identifier (e.g. "nysd" for S.D.N.Y.).

    Returns:
        A list of docket dicts, newest first, or [] on any non-200 response.
    """
    url = f"{CL_BASE}/dockets/"
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    params = {
        "court": district,
        "nature_of_suit": case_type,
        "order_by": "-date_filed",  # newest filings first
        "format": "json"
    }
    # requests has no default timeout; a stalled socket would hang forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    if response.status_code == 200:
        return response.json().get("results", [])
    return []
def scrape_state_courts(state="california"):
    """Scrape case listings from a state court search page.

    State court sites often require JavaScript rendering, so the request
    is routed through the proxy/rendering service configured at the top
    of the file.

    Args:
        state: One of "california", "new_york", "texas".

    Returns:
        A list of case dicts (case_number, title, date_filed, status,
        state), or [] for unknown states or failed requests.
    """
    state_urls = {
        "california": "https://appellatecases.courtinfo.ca.gov/search.cfm",
        "new_york": "https://iapps.courts.state.ny.us/nyscef/CaseSearch",
        "texas": "https://search.txcourts.gov/CaseSearch.aspx"
    }
    url = state_urls.get(state)
    if not url:
        return []
    params = {"api_key": SCRAPER_API_KEY, "url": url, "render": "true"}
    # Rendering can be slow, so allow a generous (but finite) timeout.
    response = requests.get(BASE_URL, params=params, timeout=90)
    if response.status_code != 200:
        # Don't parse proxy error pages as if they were case listings.
        return []
    soup = BeautifulSoup(response.text, "html.parser")
    cases = []
    for row in soup.select("table.case-list tr"):
        cols = row.select("td")
        if len(cols) >= 4:  # skip header rows / malformed rows
            cases.append({
                "case_number": cols[0].text.strip(),
                "title": cols[1].text.strip(),
                "date_filed": cols[2].text.strip(),
                "status": cols[3].text.strip(),
                "state": state
            })
    return cases
Citation Network Analysis
def build_citation_network(seed_opinion_id, depth=2):
    """Breadth-first construction of a citation network around a seed case.

    Args:
        seed_opinion_id: CourtListener opinion id to start from (depth 0).
        depth: Maximum BFS depth to expand from the seed.

    Returns:
        Dict with "nodes" (opinion metadata plus BFS depth) and "edges"
        ({"from": visited opinion id, "to": id of an opinion citing it}).
    """
    headers = {"Authorization": f"Token {CL_TOKEN}"}
    visited = set()
    network = {"nodes": [], "edges": []}
    # deque gives O(1) popleft; list.pop(0) is O(n) per dequeue.
    queue = deque([(seed_opinion_id, 0)])
    while queue:
        opinion_id, current_depth = queue.popleft()
        if opinion_id in visited or current_depth > depth:
            continue
        visited.add(opinion_id)
        opinion = get_opinion_details(opinion_id)
        if opinion:
            network["nodes"].append({
                "id": opinion_id,
                "case_name": opinion.get("case_name", ""),
                "date_filed": opinion.get("date_filed", ""),
                "court": opinion.get("court", ""),
                "depth": current_depth
            })
            # Reverse citation lookup: opinions that cite this one.
            citing_url = f"{CL_BASE}/opinions/?cites={opinion_id}"
            citing_resp = requests.get(citing_url, headers=headers, timeout=30)
            if citing_resp.status_code == 200:
                for cited in citing_resp.json().get("results", []):
                    cited_id = cited["id"]
                    network["edges"].append({
                        "from": opinion_id,
                        "to": cited_id
                    })
                    if current_depth < depth:
                        queue.append((cited_id, current_depth + 1))
        time.sleep(1)  # Respect rate limits
    return network
Data Storage and Search
import sqlite3
def store_cases(cases, db_path="caselaw.db"):
    """Persist scraped case dicts to a SQLite database.

    Args:
        cases: List of dicts whose keys match the ``cases`` table columns
            (case_number, title, court, date_filed, full_text, ...).
        db_path: Path to the SQLite file; created if it does not exist.
    """
    if not cases:
        # pd.DataFrame([]) has no columns, which breaks to_sql — and
        # there is nothing to store anyway.
        return
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS cases (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        case_number TEXT,
        title TEXT,
        court TEXT,
        date_filed TEXT,
        full_text TEXT,
        scraped_at TEXT
        )
        """)
        conn.commit()  # ensure the DDL is flushed before pandas writes
        df = pd.DataFrame(cases)
        df["scraped_at"] = datetime.now().isoformat()
        df.to_sql("cases", conn, if_exists="append", index=False)
    finally:
        # Always release the file handle, even if the insert fails.
        conn.close()
Conclusion
CourtListener's free API makes legal research accessible to everyone. By combining API access with web scraping for state courts, you can build comprehensive case law databases. Use ScraperAPI for state court sites that require JavaScript rendering, and always respect rate limits and terms of service.
Top comments (0)