You spin up 20 Playwright browsers for parallel scraping. After an hour, your server runs out of memory. Or one browser crashes and takes your whole pipeline with it.
Browser-based scraping at scale needs more than just `browser.launch()`. You need a browser pool — with lifecycle management, memory limits, crash recovery, and integrated CAPTCHA handling.
Why You Need a Browser Pool
Single browser problems:
- Memory leaks — Chrome eats ~100-300MB per instance, pages leak more
- Zombie processes — crashed browsers leave orphan processes
- No concurrency control — 50 pages in one browser = tab soup
- No isolation — one bad page crashes everything
A pool solves all of these.
Step 1: The Basic Pool
# browser_pool.py
import asyncio
from playwright.async_api import async_playwright
from dataclasses import dataclass, field
from datetime import datetime
import os
@dataclass
class BrowserInstance:
    """State for one pooled Chromium browser and its context."""

    browser: object  # playwright Browser
    context: object  # playwright BrowserContext
    # Creation time; the pool uses this to age-out old browsers.
    created_at: datetime = field(
        default_factory=datetime.utcnow
    )
    pages_processed: int = 0  # pages served since creation
    is_busy: bool = False     # True while checked out of the pool
    pid: int = 0              # OS pid of the browser process (0 if unknown)
class BrowserPool:
    """Fixed-size pool of headless Chromium browsers.

    Browsers are recycled after ``max_pages_per_browser`` pages or
    ``max_age_seconds`` seconds to contain Chrome's memory growth.
    A semaphore caps concurrent checkouts to the pool size.
    """

    def __init__(
        self,
        size: int = 5,
        max_pages_per_browser: int = 50,
        max_age_seconds: int = 300,
    ):
        self.size = size
        self.max_pages = max_pages_per_browser
        self.max_age = max_age_seconds
        self._pool: list[BrowserInstance] = []
        self._lock = asyncio.Lock()
        self._playwright = None
        self._semaphore = asyncio.Semaphore(size)

    async def start(self):
        """Launch Playwright and pre-warm ``size`` browser instances."""
        self._playwright = await async_playwright().start()
        for _ in range(self.size):
            instance = await self._create_instance()
            self._pool.append(instance)

    async def _create_instance(self) -> "BrowserInstance":
        """Launch one hardened headless Chromium plus a fresh context."""
        browser = await self._playwright.chromium.launch(
            headless=True,
            args=[
                "--disable-dev-shm-usage",
                "--disable-gpu",
                "--no-sandbox",
                "--disable-setuid-sandbox",
                # Limit memory per process
                "--max-old-space-size=256",
                "--disable-blink-features="
                "AutomationControlled",
            ]
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/121.0.0.0 Safari/537.36"
            ),
        )
        # Record the browser PID so a memory monitor can inspect it.
        pid = browser.process.pid if browser.process else 0
        return BrowserInstance(
            browser=browser,
            context=context,
            pid=pid,
        )

    async def acquire(self) -> "BrowserInstance":
        """Check out an idle browser; blocks while all are busy."""
        await self._semaphore.acquire()
        async with self._lock:
            for instance in self._pool:
                if not instance.is_busy:
                    if self._needs_recycle(instance):
                        # BUG FIX: use the fresh replacement returned by
                        # _recycle — the original returned the instance
                        # that had just been closed.
                        instance = await self._recycle(instance)
                    instance.is_busy = True
                    return instance
        # Unreachable while the semaphore matches the pool size, but
        # release the permit so a logic error can't leak capacity.
        self._semaphore.release()
        raise RuntimeError("No available browsers")

    async def release(self, instance: "BrowserInstance"):
        """Return a checked-out browser to the pool."""
        async with self._lock:
            instance.is_busy = False
            instance.pages_processed += 1
        self._semaphore.release()

    def _needs_recycle(self, instance: "BrowserInstance") -> bool:
        """True when the browser exceeded its page or age budget."""
        age = (
            datetime.utcnow() - instance.created_at
        ).total_seconds()
        return (
            instance.pages_processed >= self.max_pages
            or age > self.max_age
        )

    async def _recycle(self, instance: "BrowserInstance") -> "BrowserInstance":
        """Close an old browser, swap in a fresh one, and return it.

        Caller must hold ``self._lock``.
        """
        idx = self._pool.index(instance)
        try:
            await instance.context.close()
            await instance.browser.close()
        except Exception:
            pass  # Browser may already be dead
        new_instance = await self._create_instance()
        self._pool[idx] = new_instance
        return new_instance

    async def shutdown(self):
        """Close all browsers and stop Playwright."""
        for instance in self._pool:
            try:
                await instance.context.close()
                await instance.browser.close()
            except Exception:
                pass
        if self._playwright:
            await self._playwright.stop()
Step 2: Context Manager for Safe Usage
from contextlib import asynccontextmanager
class BrowserPool:
    # ... (previous code) ...

    @asynccontextmanager
    async def get_page(self):
        """Yield a fresh page from a pooled browser; always releases.

        If the exception looks like a browser crash, the dead browser
        is replaced in the pool before the exception propagates.
        """
        instance = await self.acquire()
        page = None
        try:
            page = await instance.context.new_page()
            yield page
        except Exception as e:
            # Browser might have crashed
            if self._is_crash(e):
                await self._handle_crash(instance)
            raise
        finally:
            if page:
                try:
                    await page.close()
                except Exception:
                    pass  # page may belong to a dead browser
            await self.release(instance)

    def _is_crash(self, error: Exception) -> bool:
        """Heuristic: does the error message indicate a dead browser?"""
        crash_indicators = [
            "Target closed",
            "Browser closed",
            "Protocol error",
            "Connection refused",
        ]
        return any(
            indicator in str(error)
            for indicator in crash_indicators
        )

    async def _handle_crash(self, instance: "BrowserInstance"):
        """Replace a crashed browser with a new one."""
        async with self._lock:
            # Narrowed try: only .index() raises ValueError here.
            try:
                idx = self._pool.index(instance)
            except ValueError:
                return  # Already replaced
            # Kill zombie process if needed
            if instance.pid:
                try:
                    os.kill(instance.pid, 9)
                except ProcessLookupError:
                    pass
            new_instance = await self._create_instance()
            self._pool[idx] = new_instance
            print(
                f"Replaced crashed browser "
                f"(PID {instance.pid})"
            )
Step 3: Memory Monitoring
Chrome is a memory hog. Monitor and kill browsers that get too fat:
import psutil
class MemoryMonitor:
    """Background task that recycles browsers exceeding a memory cap.

    Every ``check_interval`` seconds it measures each pooled browser's
    RSS (including child renderer processes) via psutil, and asks the
    pool to recycle any idle browser above ``max_memory_mb``.
    """

    def __init__(
        self,
        pool: "BrowserPool",
        max_memory_mb: int = 500,
        check_interval: int = 30,
    ):
        self.pool = pool
        self.max_memory = max_memory_mb * 1024 * 1024  # bytes
        self.interval = check_interval
        self._task = None

    async def start(self):
        """Start the periodic monitoring task."""
        self._task = asyncio.create_task(
            self._monitor_loop()
        )

    async def stop(self):
        """Cancel the monitoring task (FIX: there was no way to stop it)."""
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None

    async def _monitor_loop(self):
        while True:
            await asyncio.sleep(self.interval)
            try:
                await self._check_memory()
            except Exception:
                # FIX: never let one bad poll kill the monitor loop.
                pass

    async def _check_memory(self):
        """Recycle any idle browser whose process tree exceeds the cap."""
        # Iterate a snapshot: _recycle mutates pool._pool in place.
        for instance in list(self.pool._pool):
            if not instance.pid:
                continue
            try:
                proc = psutil.Process(instance.pid)
                memory = proc.memory_info().rss
                # Include child processes
                for child in proc.children(recursive=True):
                    memory += child.memory_info().rss
                memory_mb = memory / (1024 * 1024)
                if memory > self.max_memory:
                    print(
                        f"Browser PID {instance.pid} "
                        f"using {memory_mb:.0f}MB "
                        f"(limit: "
                        f"{self.max_memory/1024/1024:.0f}MB)"
                        f" — recycling"
                    )
                    # FIX: recycle under the pool lock so we can't race
                    # acquire() handing this instance out concurrently.
                    async with self.pool._lock:
                        if not instance.is_busy:
                            await self.pool._recycle(instance)
            except (
                psutil.NoSuchProcess,
                psutil.AccessDenied
            ):
                pass

    async def get_stats(self) -> dict:
        """Snapshot per-browser memory, page counts and busy flags."""
        total_memory = 0
        stats = []
        for i, instance in enumerate(self.pool._pool):
            if not instance.pid:
                # FIX: pid 0 would make psutil inspect the wrong process.
                stats.append({"id": i, "status": "unknown"})
                continue
            try:
                proc = psutil.Process(instance.pid)
                mem = proc.memory_info().rss
                for child in proc.children(recursive=True):
                    mem += child.memory_info().rss
                total_memory += mem
                stats.append({
                    "id": i,
                    "pid": instance.pid,
                    "memory_mb": mem / (1024 * 1024),
                    "pages": instance.pages_processed,
                    "busy": instance.is_busy,
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                stats.append({
                    "id": i, "status": "dead"
                })
        return {
            "browsers": stats,
            "total_memory_mb": total_memory / (1024*1024),
            "pool_size": self.pool.size,
        }
Step 4: CAPTCHA-Aware Scraper Using the Pool
# scraper.py
from browser_pool import BrowserPool, MemoryMonitor
class PooledScraper:
def __init__(
self,
pool_size: int = 5,
captcha_solver = None,
):
self.pool = BrowserPool(
size=pool_size,
max_pages_per_browser=50,
max_age_seconds=300,
)
self.monitor = MemoryMonitor(
self.pool, max_memory_mb=500
)
self.solver = captcha_solver
self.stats = {
"scraped": 0, "captchas": 0,
"errors": 0, "crashes": 0,
}
async def start(self):
await self.pool.start()
await self.monitor.start()
async def scrape(self, url: str) -> dict:
async with self.pool.get_page() as page:
try:
await page.goto(
url,
wait_until="domcontentloaded",
timeout=30000
)
# Check for CAPTCHA
captcha = await self._detect_captcha(page)
if captcha and self.solver:
await self._solve_captcha(
page, captcha
)
self.stats["captchas"] += 1
# Extract data
content = await page.content()
self.stats["scraped"] += 1
return {
"url": url,
"html": content,
"captcha_solved": bool(captcha),
}
except Exception as e:
self.stats["errors"] += 1
return {"url": url, "error": str(e)}
async def _detect_captcha(self, page) -> dict | None:
selectors = {
"recaptcha_v2": ".g-recaptcha",
"hcaptcha": ".h-captcha",
"turnstile": ".cf-turnstile",
}
for ctype, selector in selectors.items():
el = await page.query_selector(selector)
if el:
sitekey = await el.get_attribute(
"data-sitekey"
)
return {
"type": ctype,
"sitekey": sitekey
}
return None
async def _solve_captcha(self, page, captcha):
token = await self.solver.solve(
captcha_type=captcha["type"],
sitekey=captcha["sitekey"],
url=page.url,
)
# Inject token
field_map = {
"recaptcha_v2": "g-recaptcha-response",
"hcaptcha": "h-captcha-response",
"turnstile": "cf-turnstile-response",
}
field = field_map.get(
captcha["type"], "captcha-response"
)
await page.evaluate(f"""() => {{
const el = document.querySelector(
'textarea[name="{field}"]'
);
if (el) el.value = '{token}';
}}""")
async def scrape_many(
self, urls: list[str]
) -> list[dict]:
tasks = [self.scrape(url) for url in urls]
return await asyncio.gather(*tasks)
async def shutdown(self):
mem_stats = await self.monitor.get_stats()
print(f"Memory: {mem_stats}")
print(f"Scraper stats: {self.stats}")
await self.pool.shutdown()
Step 5: Using It
import asyncio
async def main():
    """Demo: scrape 1000 product pages through a 5-browser pool."""
    # Setup
    # NOTE(review): CaptchaSolver is not imported or defined anywhere in
    # this snippet — import it from your solver client library.
    solver = CaptchaSolver(
        api_base="https://www.passxapi.com"
    )
    scraper = PooledScraper(
        pool_size=5,
        captcha_solver=solver,
    )
    await scraper.start()

    # Load URLs
    urls = [
        f"https://shop.example.com/product/{i}"
        for i in range(1, 1001)
    ]

    # Scrape in batches so progress is visible between gathers.
    batch_size = 20
    all_results = []
    for i in range(0, len(urls), batch_size):
        batch = urls[i:i + batch_size]
        results = await scraper.scrape_many(batch)
        all_results.extend(results)
        print(
            f"Batch {i//batch_size + 1}: "
            f"{scraper.stats}"
        )

    # Cleanup
    await scraper.shutdown()

    # Results
    success = [
        r for r in all_results if "html" in r
    ]
    print(
        f"Done: {len(success)}/{len(urls)} succeeded"
    )


if __name__ == "__main__":
    # FIX: guard the entry point so importing this module has no side effects.
    asyncio.run(main())
Resource Limits for Docker
When running in containers, set hard limits:
# docker-compose.yml
services:
  scraper:
    build: .
    deploy:
      resources:
        limits:
          memory: 2G  # Hard cap
          cpus: "2.0"
        reservations:
          memory: 512M
    environment:
      - POOL_SIZE=5
      - MAX_MEMORY_PER_BROWSER=400
    # Shared memory for Chrome
    # Chromium uses /dev/shm heavily; without a larger shm_size it
    # crashes with "Out of shared memory" errors.
    shm_size: "1gb"
Without `shm_size`, Chrome will crash with "Out of shared memory" errors.
Health Endpoint
from fastapi import FastAPI

app = FastAPI()


@app.get("/health")
async def health():
    """Report pool size, aggregate browser memory, and scrape counters.

    NOTE(review): relies on a module-level ``scraper`` (PooledScraper)
    created at startup — not shown in this snippet.
    """
    mem = await scraper.monitor.get_stats()
    return {
        "status": "healthy",
        "pool": {
            "size": scraper.pool.size,
            "total_memory_mb": mem["total_memory_mb"],
        },
        "scraper": scraper.stats,
    }
Key Takeaways
- Pool browsers, don't spawn per-request — reuse reduces overhead from 2s to 50ms
- Recycle after N pages or N minutes — prevents memory leaks
- Monitor memory per-process — kill fat browsers before OOM
- Handle crashes gracefully — replace dead browsers, don't restart everything
- Use context managers — auto-release on success or failure
- Set Docker shm_size — Chrome needs shared memory
- CAPTCHA solving is just another step — detect, solve via API, inject, continue
For the CAPTCHA-solving integration, check out passxapi-python — it provides async clients that work seamlessly inside browser pool workflows.
How many concurrent browsers do you run? What's your memory limit? Share your setup in the comments.
Top comments (0)