Some sites can't be scraped with requests or httpx alone. They render content with JavaScript, load data via XHR calls, and throw CAPTCHAs when they detect automation. For these sites, you need a real browser.
Here's my production setup using Playwright with stealth techniques and automatic CAPTCHA solving.
Why Playwright Over Selenium
I switched from Selenium to Playwright two years ago. The reasons:
- Auto-wait: Playwright waits for elements automatically. No more time.sleep() or explicit waits everywhere
- Multiple contexts: One browser, many isolated sessions (like incognito tabs). Much lighter than multiple browser instances
- Network interception: Easily mock or modify requests
- Better stealth: Fewer automation fingerprints out of the box
pip install playwright passxapi
playwright install chromium
The Detection Problem
Websites detect Playwright through:
# These are what sites check:
navigator.webdriver # true in automation
navigator.plugins.length # 0 in headless
navigator.languages # missing or wrong
window.chrome # undefined in headless
navigator.permissions.query # behaves differently
You need to patch these before loading any page.
Stealth Setup
from playwright.async_api import async_playwright
async def create_stealth_browser():
    """Start Playwright and return a stealth-configured (playwright, browser, context) triple.

    The caller owns all three objects and is responsible for closing the
    context/browser and stopping the driver. If any step after ``start()``
    fails, everything created so far is torn down before the exception
    propagates — the original version leaked the driver and browser process
    in that case.
    """
    p = await async_playwright().start()
    browser = None
    try:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                # Removes the main navigator.webdriver automation hint.
                "--disable-blink-features=AutomationControlled",
                "--disable-features=IsolateOrigins,site-per-process",
                # Avoids /dev/shm exhaustion when running in containers.
                "--disable-dev-shm-usage",
            ]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0.0.0 Safari/537.36",
            locale="en-US",
            timezone_id="America/New_York",
        )
        # Patch automation indicators before any page script runs.
        await context.add_init_script("""
        // Hide webdriver flag
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        // Fake plugins
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5]
        });
        // Fake languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['en-US', 'en']
        });
        // Add chrome object
        window.chrome = {
            runtime: {},
            loadTimes: function() {},
            csi: function() {},
            app: {},
        };
        """)
        return p, browser, context
    except Exception:
        # Clean up partial state so the browser process doesn't leak.
        if browser is not None:
            await browser.close()
        await p.stop()
        raise
Context Pooling for Performance
Don't create a new browser for every page. Use a pool of contexts:
import asyncio
class BrowserPool:
    """Reuse one Chromium process across many isolated browser contexts.

    A semaphore bounds the number of concurrently open contexts. Callers
    must pair every ``get_page()`` with a ``release()`` (use try/finally).
    """

    def __init__(self, max_contexts=10):
        # Upper bound on concurrently open contexts; each get_page() takes
        # one slot from the semaphore and release() gives it back.
        self.max_contexts = max_contexts
        self.semaphore = asyncio.Semaphore(max_contexts)
        self.browser = None
        self.playwright = None

    async def start(self):
        """Launch the shared Chromium instance. Call once before get_page()."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"]
        )

    async def get_page(self):
        """Acquire a pool slot and return a fresh (page, context) pair."""
        await self.semaphore.acquire()
        try:
            context = await self.browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            )
            await context.add_init_script(STEALTH_SCRIPT)
            page = await context.new_page()
            return page, context
        except Exception:
            # Bug fix: the original leaked the semaphore slot if context or
            # page creation failed, eventually deadlocking the pool.
            self.semaphore.release()
            raise

    async def release(self, page, context):
        """Close a (page, context) pair and free its pool slot."""
        try:
            await page.close()
            await context.close()
        finally:
            # Free the slot even if closing raised.
            self.semaphore.release()

    async def close(self):
        """Shut down the shared browser and the Playwright driver."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
Usage:
pool = BrowserPool(max_contexts=5)
await pool.start()
page, ctx = await pool.get_page()
try:
await page.goto("https://example.com")
content = await page.content()
finally:
await pool.release(page, ctx)
CAPTCHA Detection and Solving
When Playwright hits a CAPTCHA, you need to detect it, solve it via API, and inject the token:
import os
from passxapi import AsyncClient
# Shared CAPTCHA-solver client; reads the API key from the environment.
solver = AsyncClient(api_key=os.getenv("PASSXAPI_KEY"))
async def _read_sitekey(page, selector):
    """Return the data-sitekey of the first element matching *selector*, or None.

    ``eval_on_selector`` raises when no element matches. The original code
    tried JS-style ``.catch(...)`` on the coroutine — Python awaitables have
    no ``.catch`` method, so that raised AttributeError; use try/except.
    """
    try:
        return await page.eval_on_selector(selector, "el => el.dataset.sitekey")
    except Exception:
        return None


async def detect_captcha(page):
    """Detect what type of CAPTCHA is on the page.

    Returns a dict with "type" and "sitekey" keys, or None when no known
    CAPTCHA is found. Detection is heuristic: look for vendor keywords in
    the HTML, then read the widget's sitekey.
    """
    import re

    content = await page.content()
    content_lower = content.lower()

    if "recaptcha" in content_lower or "g-recaptcha" in content_lower:
        sitekey = await _read_sitekey(page, "[data-sitekey]")
        if not sitekey:
            # reCAPTCHA v3 embeds the sitekey in the api.js script URL
            # (?render=<sitekey>) rather than a data attribute.
            match = re.search(r'render=([0-9A-Za-z_-]+)', content)
            sitekey = match.group(1) if match else None
        if sitekey:
            return {"type": "recaptcha_v2", "sitekey": sitekey}

    if "hcaptcha" in content_lower:
        sitekey = await _read_sitekey(page, "[data-sitekey]")
        if sitekey:
            return {"type": "hcaptcha", "sitekey": sitekey}

    if "cf-turnstile" in content_lower:
        sitekey = await _read_sitekey(page, ".cf-turnstile[data-sitekey]")
        if sitekey:
            return {"type": "turnstile", "sitekey": sitekey}

    return None
async def solve_and_inject(page, captcha_info):
    """Solve the CAPTCHA via the external API and inject the token into the page.

    Args:
        page: Playwright page currently showing the CAPTCHA.
        captcha_info: dict from detect_captcha() with "type" and "sitekey".

    Returns:
        The solved token string.

    The token is passed to page.evaluate() as an argument rather than
    f-string-interpolated into the script — the original form would break
    (or allow script injection) if the token contained quotes/backslashes.
    Missing response fields are null-guarded instead of raising TypeError.
    """
    result = await solver.solve(
        captcha_type=captcha_info["type"],
        sitekey=captcha_info["sitekey"],
        url=page.url,
    )
    token = result["token"]
    ctype = captcha_info["type"]

    if ctype in ("recaptcha_v2", "recaptcha_v3"):
        await page.evaluate(
            """(token) => {
                const field = document.getElementById('g-recaptcha-response');
                if (field) field.value = token;
                // Trigger the site's callback if one is registered.
                // NOTE(review): client.aa.l is a minified internal of the
                // reCAPTCHA config object and may shift between versions.
                if (typeof ___grecaptcha_cfg !== 'undefined') {
                    Object.keys(___grecaptcha_cfg.clients).forEach(key => {
                        const client = ___grecaptcha_cfg.clients[key];
                        try { client.aa.l.callback(token); } catch {}
                    });
                }
            }""",
            token,
        )
    elif ctype == "hcaptcha":
        await page.evaluate(
            """(token) => {
                // hCaptcha mirrors its token into both response fields.
                for (const name of ['h-captcha-response', 'g-recaptcha-response']) {
                    const field = document.querySelector(`[name="${name}"]`);
                    if (field) field.value = token;
                }
            }""",
            token,
        )
    elif ctype == "turnstile":
        await page.evaluate(
            """(token) => {
                const field = document.querySelector('[name="cf-turnstile-response"]');
                if (field) field.value = token;
            }""",
            token,
        )
    return token
Putting It All Together
Here's a complete scraper for a JS-rendered site with CAPTCHA protection:
import asyncio
import json
import os
from playwright.async_api import async_playwright
from passxapi import AsyncClient
# Minimal init script injected into every context: hides the most common
# headless-automation fingerprints before any page script runs.
STEALTH_SCRIPT = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3]});
window.chrome = {runtime: {}};
"""
# Shared CAPTCHA-solver client; reads the API key from the environment.
solver = AsyncClient(api_key=os.getenv("PASSXAPI_KEY"))
async def scrape_product(page, url):
    """Navigate to a product page, clear any CAPTCHA, and extract its fields.

    Returns a dict with title, price, description, and image URLs, all read
    from the JS-rendered DOM in a single evaluate round-trip.
    """
    await page.goto(url, wait_until="networkidle")

    # Deal with any challenge before touching the page content.
    challenge = await detect_captcha(page)
    if challenge:
        print(f" CAPTCHA detected: {challenge['type']}")
        await solve_and_inject(page, challenge)
        # Some sites require an explicit submit after the token is injected.
        submit = page.locator("button[type=submit], input[type=submit]")
        if await submit.count() > 0:
            await submit.first.click()
            await page.wait_for_load_state("networkidle")

    # Pull everything out of the rendered DOM in one pass.
    return await page.evaluate("""
        () => {
            const title = document.querySelector('h1')?.textContent?.trim();
            const price = document.querySelector('.price')?.textContent?.trim();
            const description = document.querySelector('.description')?.textContent?.trim();
            const images = [...document.querySelectorAll('.product-image img')].map(img => img.src);
            return { title, price, description, images };
        }
    """)
async def main():
    """Scrape a list of product URLs concurrently and save them as JSON."""
    urls = [
        "https://example.com/product/1",
        "https://example.com/product/2",
        # ... more URLs
    ]

    pool = BrowserPool(max_contexts=5)
    await pool.start()
    results = []

    async def worker(target_url):
        # Borrow a page from the pool; always give it back.
        page, ctx = await pool.get_page()
        try:
            record = await scrape_product(page, target_url)
            record["url"] = target_url
            results.append(record)
            print(f"Scraped: {record.get('title', 'unknown')}")
        except Exception as exc:
            print(f"Failed {target_url}: {exc}")
        finally:
            await pool.release(page, ctx)

    # Fan out over all URLs; concurrency is bounded by the pool's semaphore.
    await asyncio.gather(*(worker(u) for u in urls))
    await pool.close()

    # Persist whatever succeeded.
    with open("products.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"Scraped {len(results)}/{len(urls)} products")

asyncio.run(main())
Intercepting API Calls
Many JS sites load data via XHR/fetch. Instead of parsing the DOM, intercept the API response:
async def scrape_via_api_intercept(page, url):
    """Intercept the underlying API call instead of parsing HTML.

    Listens for the XHR/fetch response that carries the product data and
    returns its parsed JSON body (or None if it never fired).

    Fixes over the original: the handler no longer crashes on non-JSON or
    unavailable response bodies, and the listener is detached before
    returning so it doesn't keep firing for the page's later navigations.
    """
    api_response = None

    async def handle_response(response):
        nonlocal api_response
        # Catch the API call that loads product data.
        if "/api/products/" in response.url:
            try:
                api_response = await response.json()
            except Exception:
                # Body unavailable (e.g. redirect) or not JSON — keep waiting.
                pass

    page.on("response", handle_response)
    try:
        await page.goto(url, wait_until="networkidle")

        # Handle CAPTCHA if needed, then reload to trigger the API call again.
        captcha = await detect_captcha(page)
        if captcha:
            await solve_and_inject(page, captcha)
            await page.reload(wait_until="networkidle")
    finally:
        # Detach the handler so it doesn't outlive this call.
        page.remove_listener("response", handle_response)

    return api_response
This is often faster and more reliable than DOM parsing — you get clean JSON instead of fragile CSS selectors.
Performance Tips
- Use networkidle sparingly — it waits for all network activity to stop. Use domcontentloaded when possible
- Block unnecessary resources — images, fonts, and analytics slow things down
async def block_resources(page):
    """Block images, fonts, and tracking scripts."""
    async def _abort(route):
        await route.abort()

    # Same handler for every pattern: static assets and analytics beacons
    # are dropped before they hit the network.
    blocked_patterns = (
        "**/*.{png,jpg,jpeg,gif,svg,woff,woff2}",
        "**/analytics**",
        "**/google-analytics**",
    )
    for pattern in blocked_patterns:
        await page.route(pattern, _abort)
- Reuse contexts — creating browser contexts is expensive. Pool them.
- Set timeouts — don't let slow pages block your scraper forever
Wrapping Up
Playwright + stealth patches + API-based CAPTCHA solving is the most powerful combo for scraping JavaScript-heavy, CAPTCHA-protected sites. The key is keeping your browser fingerprint clean and solving CAPTCHAs via API instead of trying to click through them.
Full CAPTCHA solving SDK: passxapi-python on GitHub
What's the hardest site you've had to scrape with Playwright? Share your war stories below.
Top comments (0)