Synchronous scraping is slow. A single-threaded scraper processing 1 page/sec takes about 17 minutes for 1,000 URLs.
With asyncio and Playwright, you can scrape 50-100 pages concurrently. Same 1,000 URLs: under 2 minutes.
The Problem With Synchronous Scraping
# Synchronous baseline - every request blocks the whole program.
import requests

page_urls = [f"https://example.com/page/{i}" for i in range(1000)]
bodies = []
for page_url in page_urls:
    response = requests.get(page_url)  # blocks here until the server answers
    bodies.append(response.text)
# Takes ~17 minutes at 1 req/sec
Each request blocks the entire program.
Solution: Async Playwright with Semaphore
import asyncio
from playwright.async_api import async_playwright
from typing import List, Dict
async def scrape_page(page, url: str) -> Dict:
    """Navigate *page* to *url* and report the outcome as a dict.

    On success the dict carries the page title and ``status: "ok"``;
    any failure (navigation error, timeout, or a failing ``title()``
    call) is caught and reported as ``status: "failed"`` with the
    error text, so one bad URL never aborts the whole batch.
    """
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        title = await page.title()
    except Exception as exc:  # deliberate catch-all: scraping must keep going
        return {"url": url, "error": str(exc), "status": "failed"}
    return {"url": url, "title": title, "status": "ok"}
async def scrape_batch(urls: List[str], concurrency: int = 50) -> List[Dict]:
    """Scrape *urls* concurrently with one shared headless Chromium.

    A semaphore caps the number of simultaneously open pages at
    *concurrency*. Returns one result dict per URL whose task completed
    normally; tasks that raised (surfaced by ``return_exceptions=True``)
    are filtered out of the result list.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            semaphore = asyncio.Semaphore(concurrency)

            async def bounded_scrape(url: str) -> Dict:
                # Hold a semaphore slot for the page's whole lifetime so at
                # most `concurrency` pages exist at any moment.
                async with semaphore:
                    page = await browser.new_page()
                    try:
                        return await scrape_page(page, url)
                    finally:
                        await page.close()  # always release the page

            tasks = [bounded_scrape(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            # Close the browser even if gather is cancelled mid-flight;
            # the original leaked the process in that case.
            await browser.close()
        return [r for r in results if isinstance(r, dict)]
# Entry point: scrape 1,000 example pages at 50-way concurrency.
target_urls = [f"https://example.com/page/{i}" for i in range(1000)]
scraped = asyncio.run(scrape_batch(target_urls, concurrency=50))
print(f"Scraped {len(scraped)} pages")
Speed Comparison
| Method | 1,000 URLs | Time |
|---|---|---|
| requests sync | 1/sec | 17 min |
| httpx async | 5-8/sec | 3 min |
| Playwright async (50) | 11/sec | 90 sec |
| Playwright async (100) | 20/sec | 50 sec |
Rate Limiting for Anti-Detection
Higher concurrency = higher detection risk. 50 concurrent is the sweet spot.
import asyncio
import random
async def scrape_with_delay(page, url: str) -> Dict:
    """Fetch *url* after a randomized pause and return its title.

    The jittered 0.5-2.0 s delay makes request timing look less
    machine-like, lowering the chance of tripping rate-based bot
    detection. Note: unlike scrape_page, errors are NOT caught here.
    """
    pause = random.uniform(0.5, 2.0)
    await asyncio.sleep(pause)
    await page.goto(url, wait_until="domcontentloaded")
    return {"url": url, "title": await page.title()}
Handling Failures
async def scrape_with_retry(page, url: str, max_retries: int = 3) -> Dict:
    """Fetch *url*, retrying with exponential backoff plus jitter.

    A visible "Access Denied" marker is treated as a failure (bot wall)
    and retried like any other error. After *max_retries* failed
    attempts the last error is returned in a ``status: "failed"`` dict.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15000)
            # Bot-detection check: an "Access Denied" wall counts as failure.
            if await page.locator("text=Access Denied").count() > 0:
                raise Exception("Bot detection triggered")
            return {"url": url, "status": "ok"}
        except Exception as exc:
            if attempt == max_retries - 1:
                return {"url": url, "error": str(exc), "status": "failed"}
            # Exponential backoff with jitter: 1-2 s, 2-3 s, 4-5 s, ...
            await asyncio.sleep((2 ** attempt) + random.uniform(0, 1))
        attempt += 1
    # Only reachable when max_retries <= 0.
    return {"url": url, "status": "failed"}
Memory Management for Large Batches
Long-running Playwright browser sessions accumulate memory, so restart the browser every 200 pages:
async def scrape_large_batch(urls: List[str], batch_size: int = 200) -> List[Dict]:
    """Scrape *urls* in batches, restarting the browser between batches.

    Restarting every *batch_size* pages bounds the browser process's
    memory growth on long runs. Each batch runs up to 50 pages
    concurrently; per-URL failures are handled inside scrape_with_retry.
    """
    all_results: List[Dict] = []
    for start in range(0, len(urls), batch_size):
        batch = urls[start:start + batch_size]
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                semaphore = asyncio.Semaphore(50)

                async def process(url: str) -> Dict:
                    async with semaphore:
                        page = await browser.new_page()
                        try:
                            return await scrape_with_retry(page, url)
                        finally:
                            await page.close()

                # return_exceptions=True (consistent with scrape_batch) keeps
                # one crashed task from aborting the batch; the isinstance
                # filter below then actually has something to filter.
                results = await asyncio.gather(
                    *[process(u) for u in batch], return_exceptions=True
                )
            finally:
                # Guarantee the browser process dies even on cancellation;
                # the original skipped close() if gather raised.
                await browser.close()
        all_results.extend(r for r in results if isinstance(r, dict))
        print(f"Batch {start//batch_size + 1} complete: {len(batch)} pages")
    return all_results
Real Production Numbers
After switching our SERP tracker to async:
- Before: 500 keywords, 45 minutes
- After: 500 keywords, 4 minutes
- Improvement: 11x faster
- Memory: Stable at 2GB with batch restarts
Want 12 production-grade async scrapers already optimized for concurrency? Google SERP, LinkedIn, Amazon, and more.
Apify Scrapers Bundle — 29 EUR -> https://vhubster3.gumroad.com/l/fjmtqn
Top comments (0)