Synchronous scraping is slow. A single-threaded scraper processing 1 page/sec takes about 17 minutes for 1,000 URLs.
With asyncio and Playwright, you can scrape 50-100 pages concurrently. Same 1,000 URLs: under 2 minutes.
The Problem With Synchronous Scraping
# Synchronous baseline - every request blocks the whole program.
import requests

page_urls = [f"https://example.com/page/{i}" for i in range(1000)]
bodies = []
for page_url in page_urls:
    response = requests.get(page_url)  # blocks here until the server answers
    bodies.append(response.text)
# Takes ~17 minutes at 1 req/sec
Each request blocks the entire program.
Solution: Async Playwright with Semaphore
import asyncio
from playwright.async_api import async_playwright
from typing import List, Dict
async def scrape_page(page, url: str) -> Dict:
    """Navigate *page* to *url* and report the outcome as a dict.

    On success the dict carries the page title and ``status: "ok"``;
    any failure (navigation error, timeout, or a failing ``title()``
    call) is caught and reported as ``status: "failed"`` with the
    error text, so one bad URL never aborts the whole batch.
    """
    try:
        await page.goto(url, wait_until="domcontentloaded", timeout=15000)
        title = await page.title()
    except Exception as exc:  # deliberate catch-all: scraping must keep going
        return {"url": url, "error": str(exc), "status": "failed"}
    return {"url": url, "title": title, "status": "ok"}
async def scrape_batch(urls: List[str], concurrency: int = 50) -> List[Dict]:
    """Scrape *urls* concurrently with one shared headless Chromium.

    A semaphore caps the number of simultaneously open pages at
    *concurrency*. Returns one result dict per URL whose task completed
    normally; tasks that raised (surfaced by ``return_exceptions=True``)
    are filtered out of the result list.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            semaphore = asyncio.Semaphore(concurrency)

            async def bounded_scrape(url: str) -> Dict:
                # Hold a semaphore slot for the page's whole lifetime so at
                # most `concurrency` pages exist at any moment.
                async with semaphore:
                    page = await browser.new_page()
                    try:
                        return await scrape_page(page, url)
                    finally:
                        await page.close()  # always release the page

            tasks = [bounded_scrape(url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        finally:
            # Close the browser even if gather is cancelled mid-flight;
            # the original leaked the process in that case.
            await browser.close()
        return [r for r in results if isinstance(r, dict)]
# Entry point: scrape 1,000 example pages at 50-way concurrency.
target_urls = [f"https://example.com/page/{i}" for i in range(1000)]
scraped = asyncio.run(scrape_batch(target_urls, concurrency=50))
print(f"Scraped {len(scraped)} pages")
Speed Comparison
| Method | 1,000 URLs | Time |
|---|---|---|
| requests sync | 1/sec | 17 min |
| httpx async | 5-8/sec | 3 min |
| Playwright async (50) | 11/sec | 90 sec |
| Playwright async (100) | 20/sec | 50 sec |
Rate Limiting for Anti-Detection
Higher concurrency = higher detection risk. 50 concurrent is the sweet spot.
import asyncio
import random
async def scrape_with_delay(page, url: str) -> Dict:
    """Fetch *url* after a randomized pause and return its title.

    The jittered 0.5-2.0 s delay makes request timing look less
    machine-like, lowering the chance of tripping rate-based bot
    detection. Note: unlike scrape_page, errors are NOT caught here.
    """
    pause = random.uniform(0.5, 2.0)
    await asyncio.sleep(pause)
    await page.goto(url, wait_until="domcontentloaded")
    return {"url": url, "title": await page.title()}
Handling Failures
async def scrape_with_retry(page, url: str, max_retries: int = 3) -> Dict:
    """Fetch *url*, retrying with exponential backoff plus jitter.

    A visible "Access Denied" marker is treated as a failure (bot wall)
    and retried like any other error. After *max_retries* failed
    attempts the last error is returned in a ``status: "failed"`` dict.
    """
    attempt = 0
    while attempt < max_retries:
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=15000)
            # Bot-detection check: an "Access Denied" wall counts as failure.
            if await page.locator("text=Access Denied").count() > 0:
                raise Exception("Bot detection triggered")
            return {"url": url, "status": "ok"}
        except Exception as exc:
            if attempt == max_retries - 1:
                return {"url": url, "error": str(exc), "status": "failed"}
            # Exponential backoff with jitter: 1-2 s, 2-3 s, 4-5 s, ...
            await asyncio.sleep((2 ** attempt) + random.uniform(0, 1))
        attempt += 1
    # Only reachable when max_retries <= 0.
    return {"url": url, "status": "failed"}
Memory Management for Large Batches
Long-running Playwright browser sessions accumulate memory, so restart the browser every 200 pages:
async def scrape_large_batch(urls: List[str], batch_size: int = 200) -> List[Dict]:
    """Scrape *urls* in batches, restarting the browser between batches.

    Restarting every *batch_size* pages bounds the browser process's
    memory growth on long runs. Each batch runs up to 50 pages
    concurrently; per-URL failures are handled inside scrape_with_retry.
    """
    all_results: List[Dict] = []
    for start in range(0, len(urls), batch_size):
        batch = urls[start:start + batch_size]
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                semaphore = asyncio.Semaphore(50)

                async def process(url: str) -> Dict:
                    async with semaphore:
                        page = await browser.new_page()
                        try:
                            return await scrape_with_retry(page, url)
                        finally:
                            await page.close()

                # return_exceptions=True (consistent with scrape_batch) keeps
                # one crashed task from aborting the batch; the isinstance
                # filter below then actually has something to filter.
                results = await asyncio.gather(
                    *[process(u) for u in batch], return_exceptions=True
                )
            finally:
                # Guarantee the browser process dies even on cancellation;
                # the original skipped close() if gather raised.
                await browser.close()
        all_results.extend(r for r in results if isinstance(r, dict))
        print(f"Batch {start//batch_size + 1} complete: {len(batch)} pages")
    return all_results
Real Production Numbers
After switching our SERP tracker to async:
- Before: 500 keywords, 45 minutes
- After: 500 keywords, 4 minutes
- Improvement: 11x faster
- Memory: Stable at 2GB with batch restarts
Want 12 production-grade async scrapers already optimized for concurrency? Google SERP, LinkedIn, Amazon, and more.
Apify Scrapers Bundle — 29 EUR -> https://vhubster3.gumroad.com/l/fjmtqn
Top comments (0)