Some sites can't be scraped with requests or httpx alone. They render content with JavaScript, load data via XHR calls, and throw CAPTCHAs when they detect automation. For these sites, you need a real browser.
Here's my production setup using Playwright with stealth techniques and automatic CAPTCHA solving.
Why Playwright Over Selenium
I switched from Selenium to Playwright two years ago. The reasons:
- Auto-wait: Playwright waits for elements automatically. No more time.sleep() or explicit waits everywhere
- Multiple contexts: One browser, many isolated sessions (like incognito tabs). Much lighter than multiple browser instances
- Network interception: Easily mock or modify requests
- Better stealth: Fewer automation fingerprints out of the box
pip install playwright passxapi
playwright install chromium
The Detection Problem
Websites detect Playwright through:
# These are what sites check:
navigator.webdriver # true in automation
navigator.plugins.length # 0 in headless
navigator.languages # missing or wrong
window.chrome # undefined in headless
navigator.permissions.query # behaves differently
You need to patch these before loading any page.
Stealth Setup
from playwright.async_api import async_playwright
async def create_stealth_browser():
    """Start Playwright and return a stealth-configured (playwright, browser, context) triple.

    The caller owns all three objects and is responsible for closing the
    context/browser and stopping the driver. If any step after ``start()``
    fails, everything created so far is torn down before the exception
    propagates — the original version leaked the driver and browser process
    in that case.
    """
    p = await async_playwright().start()
    browser = None
    try:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                # Removes the main navigator.webdriver automation hint.
                "--disable-blink-features=AutomationControlled",
                "--disable-features=IsolateOrigins,site-per-process",
                # Avoids /dev/shm exhaustion when running in containers.
                "--disable-dev-shm-usage",
            ]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36",
            locale="en-US",
            timezone_id="America/New_York",
        )
        # Patch automation indicators before any page script runs.
        await context.add_init_script("""
        // Hide webdriver flag
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        // Fake plugins
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });
        // Fake languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en']
        });
        // Add chrome object
        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: {},
        };
        """)
        return p, browser, context
    except Exception:
        # Clean up partial state so the browser process doesn't leak.
        if browser is not None:
            await browser.close()
        await p.stop()
        raise
Context Pooling for Performance
Don't create a new browser for every page. Use a pool of contexts:
import asyncio
class BrowserPool:
    """Reuse one Chromium process across many isolated browser contexts.

    A semaphore bounds the number of concurrently open contexts. Callers
    must pair every ``get_page()`` with a ``release()`` (use try/finally).
    """

    def __init__(self, max_contexts=10):
        # Upper bound on concurrently open contexts; each get_page() takes
        # one slot from the semaphore and release() gives it back.
        self.max_contexts = max_contexts
        self.semaphore = asyncio.Semaphore(max_contexts)
        self.browser = None
        self.playwright = None

    async def start(self):
        """Launch the shared Chromium instance. Call once before get_page()."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"]
        )

    async def get_page(self):
        """Acquire a pool slot and return a fresh (page, context) pair."""
        await self.semaphore.acquire()
        try:
            context = await self.browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            )
            await context.add_init_script(STEALTH_SCRIPT)
            page = await context.new_page()
            return page, context
        except Exception:
            # Bug fix: the original leaked the semaphore slot if context or
            # page creation failed, eventually deadlocking the pool.
            self.semaphore.release()
            raise

    async def release(self, page, context):
        """Close a (page, context) pair and free its pool slot."""
        try:
            await page.close()
            await context.close()
        finally:
            # Free the slot even if closing raised.
            self.semaphore.release()

    async def close(self):
        """Shut down the shared browser and the Playwright driver."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
Usage:
pool = BrowserPool(max_contexts=5)
await pool.start()
page, ctx = await pool.get_page()
try:
await page.goto("https://example.com")
content = await page.content()
finally:
await pool.release(page, ctx)
CAPTCHA Detection and Solving
When Playwright hits a CAPTCHA, you need to detect it, solve it via API, and inject the token:
import os
from passxapi import AsyncClient
# Shared CAPTCHA-solver client; reads the API key from the environment.
solver = AsyncClient(api_key=os.getenv("PASSXAPI_KEY"))
async def _read_sitekey(page, selector):
    """Return the data-sitekey of the first element matching *selector*, or None.

    ``eval_on_selector`` raises when no element matches. The original code
    tried JS-style ``.catch(...)`` on the coroutine — Python awaitables have
    no ``.catch`` method, so that raised AttributeError; use try/except.
    """
    try:
        return await page.eval_on_selector(selector, "el => el.dataset.sitekey")
    except Exception:
        return None


async def detect_captcha(page):
    """Detect what type of CAPTCHA is on the page.

    Returns a dict with "type" and "sitekey" keys, or None when no known
    CAPTCHA is found. Detection is heuristic: look for vendor keywords in
    the HTML, then read the widget's sitekey.
    """
    import re

    content = await page.content()
    content_lower = content.lower()

    if "recaptcha" in content_lower or "g-recaptcha" in content_lower:
        sitekey = await _read_sitekey(page, "[data-sitekey]")
        if not sitekey:
            # reCAPTCHA v3 embeds the sitekey in the api.js script URL
            # (?render=<sitekey>) rather than a data attribute.
            match = re.search(r'render=([0-9A-Za-z_-]+)', content)
            sitekey = match.group(1) if match else None
        if sitekey:
            return {"type": "recaptcha_v2", "sitekey": sitekey}

    if "hcaptcha" in content_lower:
        sitekey = await _read_sitekey(page, "[data-sitekey]")
        if sitekey:
            return {"type": "hcaptcha", "sitekey": sitekey}

    if "cf-turnstile" in content_lower:
        sitekey = await _read_sitekey(page, ".cf-turnstile[data-sitekey]")
        if sitekey:
            return {"type": "turnstile", "sitekey": sitekey}

    return None
async def solve_and_inject(page, captcha_info):
    """Solve the CAPTCHA via the external API and inject the token into the page.

    Args:
        page: Playwright page currently showing the CAPTCHA.
        captcha_info: dict from detect_captcha() with "type" and "sitekey".

    Returns:
        The solved token string.

    The token is passed to page.evaluate() as an argument rather than
    f-string-interpolated into the script — the original form would break
    (or allow script injection) if the token contained quotes/backslashes.
    Missing response fields are null-guarded instead of raising TypeError.
    """
    result = await solver.solve(
        captcha_type=captcha_info["type"],
        sitekey=captcha_info["sitekey"],
        url=page.url,
    )
    token = result["token"]
    ctype = captcha_info["type"]

    if ctype in ("recaptcha_v2", "recaptcha_v3"):
        await page.evaluate(
            """(token) => {
                const field = document.getElementById('g-recaptcha-response');
                if (field) field.value = token;
                // Trigger the site's callback if one is registered.
                // NOTE(review): client.aa.l is a minified internal of the
                // reCAPTCHA config object and may shift between versions.
                if (typeof ___grecaptcha_cfg !== 'undefined') {
                    Object.keys(___grecaptcha_cfg.clients).forEach(key => {
                        const client = ___grecaptcha_cfg.clients[key];
                        try { client.aa.l.callback(token); } catch {}
                    });
                }
            }""",
            token,
        )
    elif ctype == "hcaptcha":
        await page.evaluate(
            """(token) => {
                // hCaptcha mirrors its token into both response fields.
                for (const name of ['h-captcha-response', 'g-recaptcha-response']) {
                    const field = document.querySelector(`[name="${name}"]`);
                    if (field) field.value = token;
                }
            }""",
            token,
        )
    elif ctype == "turnstile":
        await page.evaluate(
            """(token) => {
                const field = document.querySelector('[name="cf-turnstile-response"]');
                if (field) field.value = token;
            }""",
            token,
        )
    return token
Putting It All Together
Here's a complete scraper for a JS-rendered site with CAPTCHA protection:
import asyncio
import json
import os
from playwright.async_api import async_playwright
from passxapi import AsyncClient
# Minimal init script injected into every context: hides the most common
# headless-automation fingerprints before any page script runs.
STEALTH_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3]});
window.chrome = {runtime: {}};
"""
# Shared CAPTCHA-solver client; reads the API key from the environment.
solver = AsyncClient(api_key=os.getenv("PASSXAPI_KEY"))
async def scrape_product(page, url):
    """Navigate to a product page, clear any CAPTCHA, and extract its fields.

    Returns a dict with title, price, description, and image URLs, all read
    from the JS-rendered DOM in a single evaluate round-trip.
    """
    await page.goto(url, wait_until="networkidle")

    # Deal with any challenge before touching the page content.
    challenge = await detect_captcha(page)
    if challenge:
        print(f" CAPTCHA detected: {challenge['type']}")
        await solve_and_inject(page, challenge)
        # Some sites require an explicit submit after the token is injected.
        submit = page.locator("button[type=submit], input[type=submit]")
        if await submit.count() > 0:
            await submit.first.click()
            await page.wait_for_load_state("networkidle")

    # Pull everything out of the rendered DOM in one pass.
    return await page.evaluate("""
        () => {
            const title = document.querySelector('h1')?.textContent?.trim();
            const price = document.querySelector('.price')?.textContent?.trim();
            const description = document.querySelector('.description')?.textContent?.trim();
            const images = [...document.querySelectorAll('.product-image img')].map(img => img.src);
            return { title, price, description, images };
        }
    """)
async def main():
    """Scrape a list of product URLs concurrently and save them as JSON."""
    urls = [
        "https://example.com/product/1",
        "https://example.com/product/2",
        # ... more URLs
    ]

    pool = BrowserPool(max_contexts=5)
    await pool.start()
    results = []

    async def worker(target_url):
        # Borrow a page from the pool; always give it back.
        page, ctx = await pool.get_page()
        try:
            record = await scrape_product(page, target_url)
            record["url"] = target_url
            results.append(record)
            print(f"Scraped: {record.get('title', 'unknown')}")
        except Exception as exc:
            print(f"Failed {target_url}: {exc}")
        finally:
            await pool.release(page, ctx)

    # Fan out over all URLs; concurrency is bounded by the pool's semaphore.
    await asyncio.gather(*(worker(u) for u in urls))
    await pool.close()

    # Persist whatever succeeded.
    with open("products.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"Scraped {len(results)}/{len(urls)} products")

asyncio.run(main())
Intercepting API Calls
Many JS sites load data via XHR/fetch. Instead of parsing the DOM, intercept the API response:
async def scrape_via_api_intercept(page, url):
    """Intercept the underlying API call instead of parsing HTML.

    Listens for the XHR/fetch response that carries the product data and
    returns its parsed JSON body (or None if it never fired).

    Fixes over the original: the handler no longer crashes on non-JSON or
    unavailable response bodies, and the listener is detached before
    returning so it doesn't keep firing for the page's later navigations.
    """
    api_response = None

    async def handle_response(response):
        nonlocal api_response
        # Catch the API call that loads product data.
        if "/api/products/" in response.url:
            try:
                api_response = await response.json()
            except Exception:
                # Body unavailable (e.g. redirect) or not JSON — keep waiting.
                pass

    page.on("response", handle_response)
    try:
        await page.goto(url, wait_until="networkidle")

        # Handle CAPTCHA if needed, then reload to trigger the API call again.
        captcha = await detect_captcha(page)
        if captcha:
            await solve_and_inject(page, captcha)
            await page.reload(wait_until="networkidle")
    finally:
        # Detach the handler so it doesn't outlive this call.
        page.remove_listener("response", handle_response)

    return api_response
This is often faster and more reliable than DOM parsing — you get clean JSON instead of fragile CSS selectors.
Performance Tips
- Use networkidle sparingly — it waits for all network activity to stop. Use domcontentloaded when possible
- Block unnecessary resources — images, fonts, and analytics slow things down
async def block_resources(page):
    """Block images, fonts, and tracking scripts."""
    async def _abort(route):
        await route.abort()

    # Same handler for every pattern: static assets and analytics beacons
    # are dropped before they hit the network.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,svg,woff,woff2}",
        "**/analytics**",
        "**/google-analytics**",
    )
    for pattern in blocked_patterns:
        await page.route(pattern, _abort)
- Reuse contexts — creating browser contexts is expensive. Pool them.
- Set timeouts — don't let slow pages block your scraper forever
Wrapping Up
Playwright + stealth patches + API-based CAPTCHA solving is the most powerful combo for scraping JavaScript-heavy, CAPTCHA-protected sites. The key is keeping your browser fingerprint clean and solving CAPTCHAs via API instead of trying to click through them.
Full CAPTCHA solving SDK: passxapi-python on GitHub
What's the hardest site you've had to scrape with Playwright? Share your war stories below.
Top comments (0)