Web Scraping for Journalists: Investigating Data Stories with Python
Data journalism has transformed how newsrooms uncover stories. Instead of relying solely on press releases, journalists now extract structured data from government portals, corporate filings, and public databases.
Why Journalists Need Web Scraping
Public records are increasingly published online — court filings, campaign finance data, environmental reports. But they're rarely in convenient formats. Scraping lets you:
- Monitor government websites for policy changes
- Aggregate data across sources to find patterns
- Build datasets that don't yet exist by assembling scattered public information
Setting Up Your Investigative Toolkit
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
class InvestigativeScraper:
    """Fetch and parse public web pages for data-journalism investigations.

    Wraps a persistent ``requests.Session`` carrying an identifying
    User-Agent. When an API key is supplied, requests are routed through
    the ScraperAPI proxy endpoint (which handles IP rotation / CAPTCHAs
    for large crawls); otherwise pages are fetched directly.
    """

    def __init__(self, api_key=None):
        self.session = requests.Session()
        # Identify ourselves honestly; many public portals log scrapers.
        self.session.headers.update({
            'User-Agent': 'DataJournalism/1.0 (research purposes)'
        })
        self.api_key = api_key
        # Prefix for proxied requests; the target URL is appended
        # (percent-encoded) as the `url=` query parameter.
        self.proxy = f"http://api.scraperapi.com?api_key={api_key}&url=" if api_key else ""

    def scrape_portal(self, url, selector):
        """Scrape rows matched by CSS *selector* at *url* into a DataFrame.

        Each matched row contributes one record built from the stripped
        text of its <td>/<th> cells. Returns an unlabeled DataFrame
        (callers assign column names).
        """
        # Bug fix: the target URL must be percent-encoded when embedded as
        # the `url=` query parameter of the proxy endpoint — a raw '&' or
        # '?' in the target would otherwise be parsed as part of the proxy
        # request and truncate the target.
        target = f"{self.proxy}{quote(url, safe='')}" if self.api_key else url
        response = self.session.get(target)
        soup = BeautifulSoup(response.text, 'html.parser')
        records = []
        for row in soup.select(selector):
            cells = row.find_all(['td', 'th'])
            records.append([c.get_text(strip=True) for c in cells])
        return pd.DataFrame(records)

    def scrape_filings(self, base_url, pages=5):
        """Collect filing entries across paginated listing pages.

        Fetches ``{base_url}?page=1`` .. ``?page={pages}`` and returns a
        DataFrame with 'title', 'date' and 'link' columns. A missing
        sub-element yields None for that field instead of raising, so one
        malformed entry cannot abort a long crawl.
        """
        all_filings = []
        for page in range(1, pages + 1):
            resp = self.session.get(f"{base_url}?page={page}")
            soup = BeautifulSoup(resp.text, 'html.parser')
            for entry in soup.select('.filing-entry'):
                title = entry.select_one('.title')
                date = entry.select_one('.date')
                link = entry.find('a')
                all_filings.append({
                    # Bug fix: select_one()/find() return None when the
                    # element is absent; calling .get_text()/['href'] on
                    # None crashed the whole scrape.
                    'title': title.get_text(strip=True) if title else None,
                    'date': date.get_text(strip=True) if date else None,
                    'link': link['href'] if link and link.has_attr('href') else None,
                })
            time.sleep(2)  # rate limit — don't overwhelm small public servers
        return pd.DataFrame(all_filings)
Tracking Campaign Finance Data
def track_contributions(toolkit, url):
    """Summarize campaign contributions scraped from *url*.

    Uses ``toolkit.scrape_portal`` with a fixed selector and expects each
    row to have exactly five cells: Donor, Amount, Date, Employer, City.
    Prints the 20 top donors by total amount and a count of large
    (> $5000) individual donations.

    Returns the cleaned DataFrame with a float 'Amount' column; an empty
    scrape is returned unchanged (nothing to label or summarize).
    """
    df = toolkit.scrape_portal(url, 'table.contributions tbody tr')
    # Guard clause: an empty frame has no columns to assign and would
    # raise on df.columns = [...].
    if df.empty:
        return df
    df.columns = ['Donor', 'Amount', 'Date', 'Employer', 'City']
    # regex=False makes the literal replacement explicit: older pandas
    # defaulted to regex=True, where '$' is an end-anchor and would strip
    # nothing, leaving astype(float) to fail on '$1,000.00'.
    df['Amount'] = (df['Amount']
                    .str.replace('$', '', regex=False)
                    .str.replace(',', '', regex=False)
                    .astype(float))
    top = df.groupby('Donor')['Amount'].sum().sort_values(ascending=False)
    print("Top Donors:")
    print(top.head(20))
    print(f"\nLarge donations (>$5000): {len(df[df['Amount'] > 5000])}")
    return df
Building a Change Detection System
import hashlib, json
from datetime import datetime
class PageChangeTracker:
    """Detect content changes on monitored pages via SHA-256 fingerprints.

    Fingerprints are persisted to a small JSON file so that change
    detection survives restarts of the monitoring script.
    """

    def __init__(self, storage='page_hashes.json'):
        self.storage = storage
        try:
            with open(storage) as fh:
                self.hashes = json.load(fh)
        except FileNotFoundError:
            # First run: start with an empty fingerprint store.
            self.hashes = {}

    def check(self, url, session):
        """Fetch *url* via *session* and report whether its body changed.

        Returns ``{'url': ..., 'changed_at': <ISO timestamp>}`` when the
        page body differs from the stored fingerprint, otherwise
        ``{'url': ..., 'status': 'unchanged'}``. A page seen for the
        first time counts as unchanged (there is nothing to compare
        against), but its fingerprint is recorded for next time.
        """
        response = session.get(url)
        digest = hashlib.sha256(response.text.encode()).hexdigest()
        previous = self.hashes.get(url)
        # Persist the new fingerprint before reporting, so a crash after
        # this point cannot lose the observation.
        self.hashes[url] = digest
        with open(self.storage, 'w') as fh:
            json.dump(self.hashes, fh)
        if previous is None or digest == previous:
            return {'url': url, 'status': 'unchanged'}
        return {'url': url, 'changed_at': datetime.now().isoformat()}
Ethical Considerations
- Public interest — scraping public records serves democracy
- Respect robots.txt where reasonable
- Rate limit requests — don't overwhelm small servers
- Verify data — cross-reference with other sources
- Document methodology — transparency builds trust
Scaling Your Investigation
For large investigations across multiple government sources, ScraperAPI handles proxy rotation and CAPTCHAs. ThorData provides clean residential IP pools. ScrapeOps gives monitoring dashboards to track jobs.
Conclusion
Web scraping transforms journalists into active investigators. With Python and the right tools, anyone can build datasets, detect changes, and uncover stories hidden in public data.
Follow for more Python data extraction tutorials.
Top comments (0)