Scraping Dark Patterns in Cookie Consent: A Research Dataset
Cookie consent banners have become a battleground for user privacy. Many sites use dark patterns — deceptive design choices that manipulate users into accepting tracking. In this guide, we'll build a scraper that analyzes cookie consent implementations across thousands of websites and identifies dark patterns systematically.
What Are Cookie Consent Dark Patterns?
Common dark patterns in cookie consent flows include:
- Pre-checked boxes — tracking enabled by default
- Asymmetric choices — "Accept All" is prominent, "Reject" is hidden
- Forced action — blocking content until you accept
- Confusing language — using double negatives or jargon
- Hidden settings — burying granular controls behind multiple clicks
Building the Scraper
We need JavaScript rendering to detect cookie banners since they're typically injected by third-party scripts. ScraperAPI handles headless rendering:
import requests
from bs4 import BeautifulSoup
import re
import json
# ScraperAPI credential placeholder — replace with your own key before running.
SCRAPER_API_KEY = "YOUR_KEY"
def scrape_consent_banner(url):
    """Fetch *url* through ScraperAPI with JS rendering and extract likely
    cookie-consent banner elements.

    Args:
        url: The page URL to scan.

    Returns:
        dict: ``{"url": url, "banners": [...]}`` where each banner entry has
        ``html`` (truncated to 5000 chars), ``text`` (truncated to 2000),
        ``tag``, ``classes``, and ``id``.

    Raises:
        requests.HTTPError: if ScraperAPI responds with an error status.
        requests.Timeout: if rendering exceeds the 60s timeout.
    """
    response = requests.get(
        "http://api.scraperapi.com",
        params={
            "api_key": SCRAPER_API_KEY,
            "url": url,
            # Consent banners are typically injected by third-party JS,
            # so headless rendering is required to see them at all.
            "render": "true",
            "wait_for_selector": "[class*='cookie'], [id*='cookie'], [class*='consent']"
        },
        timeout=60
    )
    # Fail loudly on API errors rather than parsing an error page as a banner.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    consent_selectors = [
        "[class*='cookie']", "[id*='cookie']",
        "[class*='consent']", "[id*='consent']",
        "[class*='gdpr']", "[id*='gdpr']",
        "[class*='privacy']"
    ]
    banners = []
    # An element such as class="cookie-consent" matches several selectors;
    # track element identity so each node is recorded only once.
    seen = set()
    for selector in consent_selectors:
        for el in soup.select(selector):
            if id(el) in seen:
                continue
            seen.add(id(el))
            banners.append({
                "html": str(el)[:5000],
                "text": el.get_text(strip=True)[:2000],
                "tag": el.name,
                "classes": el.get("class", []),
                "id": el.get("id", "")
            })
    return {"url": url, "banners": banners}
Dark Pattern Detection Engine
Now we build classifiers for each type of dark pattern:
def detect_dark_patterns(banner_data):
    """Classify dark patterns present in a single consent banner.

    Args:
        banner_data: dict with ``"html"`` (raw banner markup) and ``"text"``
            (visible banner text), as produced by ``scrape_consent_banner``.

    Returns:
        list[dict]: one entry per detected pattern, each with ``type``,
        ``severity``, and ``detail`` keys; empty list when none found.
    """
    patterns_found = []
    html = banner_data.get("html", "")
    text = banner_data.get("text", "").lower()
    # 1. Asymmetric buttons: accept is a styled <button> while reject is
    # demoted to a plain text link.
    accept_btns = re.findall(
        r'<button[^>]*class="[^"]*(?:primary|accept|agree|allow)[^"]*"[^>]*>',
        html, re.IGNORECASE
    )
    reject_btns = re.findall(
        r'<button[^>]*class="[^"]*(?:secondary|reject|decline|deny)[^"]*"[^>]*>',
        html, re.IGNORECASE
    )
    reject_links = re.findall(
        r'<a[^>]*>.*?(?:reject|decline|deny|refuse).*?</a>',
        html, re.IGNORECASE
    )
    if accept_btns and (reject_links and not reject_btns):
        patterns_found.append({
            "type": "ASYMMETRIC_CHOICES",
            "severity": "high",
            "detail": "Accept is a button, reject is a text link"
        })
    # 2. Pre-checked tracking boxes.
    # NOTE(review): counts every checked checkbox in the banner; may include
    # non-tracking toggles — acceptable for a banner-scoped heuristic.
    prechecked = re.findall(
        r'<input[^>]*type="checkbox"[^>]*checked[^>]*>',
        html, re.IGNORECASE
    )
    if prechecked:
        patterns_found.append({
            "type": "PRE_CHECKED",
            "severity": "high",
            "detail": f"{len(prechecked)} pre-checked tracking checkboxes"
        })
    # 3. No reject option. Leading word boundaries are essential: without
    # them the bare alternative "ok" matches inside "cookie(s)", flagging
    # virtually every banner as having an accept option.
    has_accept = bool(re.search(r'\b(?:accept|agree|allow|got it|ok)', text))
    has_reject = bool(re.search(r'\b(?:reject|decline|deny|refuse)', text))
    if has_accept and not has_reject:
        patterns_found.append({
            "type": "NO_REJECT_OPTION",
            "severity": "critical",
            "detail": "Accept option present but no visible reject"
        })
    # 4. Confusing double negatives, e.g. "don't ... opt out".
    double_neg = re.findall(
        r'(?:don.t|do not).{0,20}(?:not|disable|opt.out|uncheck)', text
    )
    if double_neg:
        patterns_found.append({
            "type": "CONFUSING_LANGUAGE",
            "severity": "medium",
            "detail": f"Double negative detected: {double_neg[0]}"
        })
    # 5. Wall/blocker pattern: a fixed full-page overlay (or very high
    # z-index) combined with no visible reject means forced consent.
    overlay = re.search(
        r'(?:position:\s*fixed|z-index:\s*[5-9]\d{3,})', html, re.IGNORECASE
    )
    if overlay and not has_reject:
        patterns_found.append({
            "type": "FORCED_ACTION",
            "severity": "critical",
            "detail": "Full-page overlay with no reject option"
        })
    return patterns_found
Building the Research Dataset
Scrape the top 10,000 sites and compile results:
import pandas as pd
def build_dataset(urls, output_file="dark_patterns_dataset.csv"):
    """Scan each URL for consent banners, classify dark patterns, and
    write the results to a CSV dataset.

    Args:
        urls: sequence of page URLs to scan.
        output_file: path of the CSV file written via pandas.

    Returns:
        pandas.DataFrame with one row per detected banner (columns: url,
        banner_text, patterns_found, pattern_count, max_severity, has_reject).
    """
    severity_rank = {"none": 0, "low": 1, "medium": 2, "high": 3, "critical": 4}
    rows = []
    site_total = len(urls)
    for idx, url in enumerate(urls):
        print(f"Scanning {idx+1}/{site_total}: {url}")
        # Best-effort: a failure on one site is logged and skipped so a
        # long crawl isn't aborted partway through.
        try:
            scan = scrape_consent_banner(url)
            for banner in scan["banners"]:
                found = detect_dark_patterns(banner)
                worst = max(
                    (p["severity"] for p in found),
                    default="none",
                    key=severity_rank.__getitem__
                )
                rows.append({
                    "url": url,
                    "banner_text": banner["text"][:500],
                    "patterns_found": json.dumps(found),
                    "pattern_count": len(found),
                    "max_severity": worst,
                    "has_reject": bool(re.search(r'reject|decline|deny', banner["text"], re.I))
                })
        except Exception as e:
            print(f" Error: {e}")
    df = pd.DataFrame(rows)
    df.to_csv(output_file, index=False)
    print(f"Dataset saved: {len(rows)} entries from {len(urls)} sites")
    return df
Scaling to Thousands of Sites
For large-scale analysis, you need proxy diversity. ThorData offers residential proxies from multiple regions — essential for testing how consent banners vary by geography. ScrapeOps monitors success rates and helps optimize scraping performance.
Research Applications
This dataset enables research into GDPR compliance, regulatory enforcement, and the effectiveness of consent regulations. Publish your findings responsibly and consider contributing to existing dark pattern databases.
The code is research-oriented — use it to improve the web, not to replicate dark patterns in your own products.
Top comments (0)