Scraping Dark Patterns in Cookie Consent: A Research Dataset
Cookie consent banners have become a battleground for user privacy. Many sites use dark patterns — deceptive design choices that manipulate users into accepting tracking. In this guide, we'll build a scraper that analyzes cookie consent implementations across thousands of websites and identifies dark patterns systematically.
What Are Cookie Consent Dark Patterns?
Common dark patterns in cookie consent flows include:
- Pre-checked boxes — tracking enabled by default
- Asymmetric choices — "Accept All" is prominent, "Reject" is hidden
- Forced action — blocking content until you accept
- Confusing language — using double negatives or jargon
- Hidden settings — burying granular controls behind multiple clicks
Building the Scraper
We need JavaScript rendering to detect cookie banners since they're typically injected by third-party scripts. ScraperAPI handles headless rendering:
import requests
from bs4 import BeautifulSoup
import re
import json
# ScraperAPI credential placeholder — replace with your own key before running.
SCRAPER_API_KEY = "YOUR_KEY"
def scrape_consent_banner(url):
    """Fetch *url* through ScraperAPI with JS rendering and extract likely
    cookie-consent banner elements.

    Args:
        url: The page URL to scan.

    Returns:
        dict: ``{"url": url, "banners": [...]}`` where each banner entry has
        ``html`` (truncated to 5000 chars), ``text`` (truncated to 2000),
        ``tag``, ``classes``, and ``id``.

    Raises:
        requests.HTTPError: if ScraperAPI responds with an error status.
        requests.Timeout: if rendering exceeds the 60s timeout.
    """
    response = requests.get(
        "http://api.scraperapi.com",
        params={
            "api_key": SCRAPER_API_KEY,
            "url": url,
            # Consent banners are typically injected by third-party JS,
            # so headless rendering is required to see them at all.
            "render": "true",
            "wait_for_selector": "[class*='cookie'], [id*='cookie'], [class*='consent']"
        },
        timeout=60
    )
    # Fail loudly on API errors rather than parsing an error page as a banner.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    consent_selectors = [
        "[class*='cookie']", "[id*='cookie']",
        "[class*='consent']", "[id*='consent']",
        "[class*='gdpr']", "[id*='gdpr']",
        "[class*='privacy']"
    ]
    banners = []
    # An element such as class="cookie-consent" matches several selectors;
    # track element identity so each node is recorded only once.
    seen = set()
    for selector in consent_selectors:
        for el in soup.select(selector):
            if id(el) in seen:
                continue
            seen.add(id(el))
            banners.append({
                "html": str(el)[:5000],
                "text": el.get_text(strip=True)[:2000],
                "tag": el.name,
                "classes": el.get("class", []),
                "id": el.get("id", "")
            })
    return {"url": url, "banners": banners}
Dark Pattern Detection Engine
Now we build classifiers for each type of dark pattern:
def detect_dark_patterns(banner_data):
    """Classify dark patterns present in a single consent banner.

    Args:
        banner_data: dict with ``"html"`` (raw banner markup) and ``"text"``
            (visible banner text), as produced by ``scrape_consent_banner``.

    Returns:
        list[dict]: one entry per detected pattern, each with ``type``,
        ``severity``, and ``detail`` keys; empty list when none found.
    """
    patterns_found = []
    html = banner_data.get("html", "")
    text = banner_data.get("text", "").lower()
    # 1. Asymmetric buttons: accept is a styled <button> while reject is
    # demoted to a plain text link.
    accept_btns = re.findall(
        r'<button[^>]*class="[^"]*(?:primary|accept|agree|allow)[^"]*"[^>]*>',
        html, re.IGNORECASE
    )
    reject_btns = re.findall(
        r'<button[^>]*class="[^"]*(?:secondary|reject|decline|deny)[^"]*"[^>]*>',
        html, re.IGNORECASE
    )
    reject_links = re.findall(
        r'<a[^>]*>.*?(?:reject|decline|deny|refuse).*?</a>',
        html, re.IGNORECASE
    )
    if accept_btns and (reject_links and not reject_btns):
        patterns_found.append({
            "type": "ASYMMETRIC_CHOICES",
            "severity": "high",
            "detail": "Accept is a button, reject is a text link"
        })
    # 2. Pre-checked tracking boxes.
    # NOTE(review): counts every checked checkbox in the banner; may include
    # non-tracking toggles — acceptable for a banner-scoped heuristic.
    prechecked = re.findall(
        r'<input[^>]*type="checkbox"[^>]*checked[^>]*>',
        html, re.IGNORECASE
    )
    if prechecked:
        patterns_found.append({
            "type": "PRE_CHECKED",
            "severity": "high",
            "detail": f"{len(prechecked)} pre-checked tracking checkboxes"
        })
    # 3. No reject option. Leading word boundaries are essential: without
    # them the bare alternative "ok" matches inside "cookie(s)", flagging
    # virtually every banner as having an accept option.
    has_accept = bool(re.search(r'\b(?:accept|agree|allow|got it|ok)', text))
    has_reject = bool(re.search(r'\b(?:reject|decline|deny|refuse)', text))
    if has_accept and not has_reject:
        patterns_found.append({
            "type": "NO_REJECT_OPTION",
            "severity": "critical",
            "detail": "Accept option present but no visible reject"
        })
    # 4. Confusing double negatives, e.g. "don't ... opt out".
    double_neg = re.findall(
        r'(?:don.t|do not).{0,20}(?:not|disable|opt.out|uncheck)', text
    )
    if double_neg:
        patterns_found.append({
            "type": "CONFUSING_LANGUAGE",
            "severity": "medium",
            "detail": f"Double negative detected: {double_neg[0]}"
        })
    # 5. Wall/blocker pattern: a fixed full-page overlay (or very high
    # z-index) combined with no visible reject means forced consent.
    overlay = re.search(
        r'(?:position:\s*fixed|z-index:\s*[5-9]\d{3,})', html, re.IGNORECASE
    )
    if overlay and not has_reject:
        patterns_found.append({
            "type": "FORCED_ACTION",
            "severity": "critical",
            "detail": "Full-page overlay with no reject option"
        })
    return patterns_found
Building the Research Dataset
Scrape the top 10,000 sites and compile results:
import pandas as pd
def build_dataset(urls, output_file="dark_patterns_dataset.csv"):
    """Scan each URL for consent banners, classify dark patterns, and
    write the results to a CSV dataset.

    Args:
        urls: sequence of page URLs to scan.
        output_file: path of the CSV file written via pandas.

    Returns:
        pandas.DataFrame with one row per detected banner (columns: url,
        banner_text, patterns_found, pattern_count, max_severity, has_reject).
    """
    severity_rank = {"none": 0, "low": 1, "medium": 2, "high": 3, "critical": 4}
    rows = []
    site_total = len(urls)
    for idx, url in enumerate(urls):
        print(f"Scanning {idx+1}/{site_total}: {url}")
        # Best-effort: a failure on one site is logged and skipped so a
        # long crawl isn't aborted partway through.
        try:
            scan = scrape_consent_banner(url)
            for banner in scan["banners"]:
                found = detect_dark_patterns(banner)
                worst = max(
                    (p["severity"] for p in found),
                    default="none",
                    key=severity_rank.__getitem__
                )
                rows.append({
                    "url": url,
                    "banner_text": banner["text"][:500],
                    "patterns_found": json.dumps(found),
                    "pattern_count": len(found),
                    "max_severity": worst,
                    "has_reject": bool(re.search(r'reject|decline|deny', banner["text"], re.I))
                })
        except Exception as e:
            print(f" Error: {e}")
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    print(f"Dataset saved: {len(rows)} entries from {len(urls)} sites")
    return df
Scaling to Thousands of Sites
For large-scale analysis, you need proxy diversity. ThorData offers residential proxies from multiple regions — essential for testing how consent banners vary by geography. ScrapeOps monitors success rates and helps optimize scraping performance.
Research Applications
This dataset enables research into GDPR compliance, regulatory enforcement, and the effectiveness of consent regulations. Publish your findings responsibly and consider contributing to existing dark pattern databases.
The code is research-oriented — use it to improve the web, not to replicate dark patterns in your own products.
Top comments (0)