DEV Community

Vhub Systems
Vhub Systems

Posted on

Async Web Scraping in Python: httpx + asyncio for 10x Faster Data Collection

Async Web Scraping in Python: httpx + asyncio for 10x Faster Data Collection

Synchronous scraping makes requests one at a time. While you wait for one response, you're doing nothing. Async scraping makes 10-50 requests simultaneously — same time, 10-50x the output.

Here's how to actually implement it, with real benchmarks.

Why Async? The Numbers

Scraping 100 pages, each taking 1 second to respond:

Synchronous:  100 × 1s = 100 seconds
Async (10x):  10 × 1s  = 10 seconds (10 concurrent)
Async (50x):  2 × 1s   = 2 seconds  (50 concurrent)
Enter fullscreen mode Exit fullscreen mode

The catch: servers rate-limit you if you go too fast. The sweet spot is usually 5-20 concurrent requests.

Setup

pip install httpx beautifulsoup4
Enter fullscreen mode Exit fullscreen mode

We'll use httpx — it supports both sync and async, has HTTP/2, and works well with curl_cffi for anti-bot when needed.

Basic Async Scraper

import asyncio
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict

async def fetch_page(client: httpx.AsyncClient, url: str) -> dict:
    """Fetch one page and summarize it; failures come back as error dicts, never exceptions."""
    try:
        response = await client.get(url, timeout=15.0)
        response.raise_for_status()

        parsed = BeautifulSoup(response.text, 'html.parser')
        title_tag = parsed.find('title')

        summary = {
            'url': url,
            'status': response.status_code,
            'title': title_tag.text.strip() if title_tag else '',
            'content_length': len(response.text),
        }
        return summary
    except httpx.HTTPStatusError as e:
        # Non-2xx response: report the real status code alongside the error.
        return {'url': url, 'status': e.response.status_code, 'error': str(e)}
    except Exception as e:
        # Transport-level failure (timeout, DNS, ...): no status available.
        return {'url': url, 'status': 0, 'error': str(e)}

async def scrape_all(urls: List[str], concurrency: int = 10) -> List[dict]:
    """Fetch every URL concurrently, with at most `concurrency` requests in flight."""

    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }

    # The semaphore is the concurrency cap: each task must acquire it first.
    gate = asyncio.Semaphore(concurrency)

    async def bounded_fetch(client, url):
        async with gate:
            return await fetch_page(client, url)

    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        results = await asyncio.gather(
            *(bounded_fetch(client, url) for url in urls),
            return_exceptions=True,
        )

    # fetch_page already converts failures into dicts; drop anything else.
    return [item for item in results if isinstance(item, dict)]

# Run it
# Demo: scrape the first 20 pages of Hacker News and time the whole run.
urls = [f"https://news.ycombinator.com/news?p={i}" for i in range(1, 21)]  # 20 pages

import time

t0 = time.time()
results = asyncio.run(scrape_all(urls, concurrency=10))
elapsed = time.time() - t0

print(f"Scraped {len(results)} pages in {elapsed:.1f}s")
for r in results[:3]:
    print(f"  [{r['status']}] {r['url'][-30:]}{r.get('title', '')[:40]}")
Enter fullscreen mode Exit fullscreen mode

Benchmark: Sync vs Async

import asyncio, httpx, requests, time

urls = [f"https://httpbin.org/delay/0.5" for _ in range(20)]  # 0.5s each

# --- Synchronous baseline: one request at a time, waits dominate ---
start = time.time()
session = requests.Session()
for url in urls:
    session.get(url)
sync_time = time.time() - start
print(f"Sync: {sync_time:.1f}s for 20 requests")

# --- Async: all 20 requests in flight simultaneously ---
async def async_bench():
    async with httpx.AsyncClient() as client:
        await asyncio.gather(*(client.get(url) for url in urls))

start = time.time()
asyncio.run(async_bench())
async_time = time.time() - start
print(f"Async: {async_time:.1f}s for 20 requests")
print(f"Speedup: {sync_time/async_time:.1f}x")
# Output: Sync: 10.2s, Async: 0.9s, Speedup: 11.3x
Enter fullscreen mode Exit fullscreen mode

Production-Grade Async Scraper with Rate Limiting

import asyncio
import httpx
from bs4 import BeautifulSoup
import csv, json
from typing import List, Dict, Optional
import logging

# One module-level logger; INFO level so per-run summaries are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class AsyncScraper:
    """Reusable async scraper with bounded concurrency, retries, and CSV export.

    Subclass and override parse() to extract your own fields. Call run(urls)
    to scrape; results accumulate in self.results.
    """

    def __init__(
        self,
        concurrency: int = 10,
        delay_between_requests: float = 0.5,
        max_retries: int = 3,
        timeout: float = 15.0
    ):
        # Semaphore caps the number of simultaneously in-flight requests.
        self.semaphore = asyncio.Semaphore(concurrency)
        self.delay = delay_between_requests
        self.max_retries = max_retries
        self.timeout = timeout
        self.results: List[dict] = []
        self._success_count = 0
        self._error_count = 0

    async def fetch(self, client: httpx.AsyncClient, url: str) -> Optional[str]:
        """Fetch URL with retries and rate limiting.

        Returns the response body on HTTP 200, or None after exhausting
        retries / on a non-retryable status (403, 404).
        """
        for attempt in range(self.max_retries):
            try:
                response = await client.get(url, timeout=self.timeout)

                if response.status_code == 200:
                    return response.text
                elif response.status_code == 429:
                    wait = 2 ** attempt * 5  # Exponential backoff
                    logger.warning(f"Rate limited on {url}, waiting {wait}s")
                    await asyncio.sleep(wait)
                elif response.status_code in (403, 404):
                    # Forbidden / missing: retrying will not help.
                    logger.warning(f"Got {response.status_code} for {url}")
                    return None
                else:
                    # BUG FIX: other statuses (e.g. 500/502) previously looped
                    # and retried instantly with no pause or log entry.
                    logger.warning(f"Got {response.status_code} for {url} (attempt {attempt+1})")
                    await asyncio.sleep(2 ** attempt)

            except (httpx.TimeoutException, httpx.ConnectError) as e:
                logger.warning(f"Connection error for {url} (attempt {attempt+1}): {e}")
                await asyncio.sleep(2 ** attempt)

        return None

    def parse(self, html: str, url: str) -> dict:
        """Override this method to extract your data."""
        soup = BeautifulSoup(html, 'html.parser')
        # Prefer <title>, fall back to the first <h1>, else empty string.
        heading = soup.find('title') or soup.find('h1')
        return {
            'url': url,
            'title': heading.text.strip() if heading else '',
            'word_count': len(soup.get_text().split()),
        }

    async def process_url(self, client: httpx.AsyncClient, url: str):
        """Fetch + parse a single URL under semaphore control; record the outcome."""
        async with self.semaphore:
            html = await self.fetch(client, url)

            if html:
                data = self.parse(html, url)
                self.results.append(data)
                self._success_count += 1
            else:
                self._error_count += 1
                self.results.append({'url': url, 'error': True})

            # Rate limiting delay (held inside the semaphore so it also
            # spaces out when the next slot frees up).
            if self.delay > 0:
                await asyncio.sleep(self.delay)

    async def run(self, urls: List[str]) -> List[dict]:
        """Run the scraper over all URLs; returns the accumulated results list."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        }

        async with httpx.AsyncClient(
            headers=headers,
            follow_redirects=True,
            http2=True  # Enable HTTP/2 for better performance
        ) as client:
            tasks = [self.process_url(client, url) for url in urls]
            await asyncio.gather(*tasks)

        logger.info(f"Done: {self._success_count} success, {self._error_count} errors")
        return self.results

    def save_csv(self, filename: str):
        """Save results to CSV.

        Columns are the union of keys across all rows (success rows and
        error rows have different shapes); missing cells are left blank.
        """
        if not self.results:
            return
        # BUG FIX: fieldnames were taken from results[0] only, so a mix of
        # success and error rows made DictWriter raise ValueError on the
        # first row with an unexpected key.
        fieldnames: List[str] = []
        for row in self.results:
            for key in row:
                if key not in fieldnames:
                    fieldnames.append(key)
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
            writer.writeheader()
            writer.writerows(self.results)
        # BUG FIX: message previously logged the literal text "(unknown)"
        # instead of the actual filename.
        logger.info(f"Saved {len(self.results)} rows to {filename}")

# Usage: subclass and override parse()
class EcommerceScraper(AsyncScraper):
    """Example subclass: extracts product name and price via common CSS selectors."""

    def parse(self, html: str, url: str) -> dict:
        doc = BeautifulSoup(html, 'html.parser')

        # Try the most specific selector first, then progressively generic ones.
        name_node = doc.select_one('h1.product-title, .product-name, h1')
        price_node = doc.select_one('.price, .product-price, [data-price]')

        record = {'url': url}
        record['name'] = name_node.text.strip() if name_node else ''
        record['price'] = price_node.text.strip() if price_node else ''
        return record

# Demo: run the subclass against a scrape-friendly practice site.
urls = [
    "https://books.toscrape.com/catalogue/page-1.html",
    "https://books.toscrape.com/catalogue/page-2.html",
]
scraper = EcommerceScraper(concurrency=8, delay_between_requests=0.5)

results = asyncio.run(scraper.run(urls))
scraper.save_csv('products.csv')
Enter fullscreen mode Exit fullscreen mode

Async with Proxies

import asyncio, httpx, random

# Pool of authenticated HTTP proxies to rotate through.
PROXIES = [
    "http://user:pass@proxy1:8080",
    "http://user:pass@proxy2:8080",
    "http://user:pass@proxy3:8080",
]

async def fetch_with_proxy(url: str, proxy: str) -> str:
    """Fetch `url` through `proxy` with a throwaway client; return the body text."""
    async with httpx.AsyncClient(proxy=proxy) as client:
        resp = await client.get(url)
        return resp.text

async def scrape_with_rotating_proxies(urls: list) -> list:
    """Scrape URLs sequentially, rotating round-robin through PROXIES.

    On a failure, retries once through a different, randomly chosen proxy;
    if that also fails, records the original error for that URL.
    """
    results = []

    for i, url in enumerate(urls):
        proxy = PROXIES[i % len(PROXIES)]  # Rotate proxies

        try:
            html = await fetch_with_proxy(url, proxy)
            results.append({'url': url, 'html': html})
        except Exception as e:
            # Try different proxy on failure
            other_proxy = random.choice([p for p in PROXIES if p != proxy])
            try:
                html = await fetch_with_proxy(url, other_proxy)
                results.append({'url': url, 'html': html})
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallows
                # BaseException subclasses such as asyncio.CancelledError
                # and KeyboardInterrupt, silently breaking task cancellation.
                results.append({'url': url, 'error': str(e)})

    return results
Enter fullscreen mode Exit fullscreen mode

Progress Tracking for Long Runs

import asyncio, httpx
from tqdm.asyncio import tqdm_asyncio  # pip install tqdm

async def scrape_with_progress(urls: list) -> list:
    """Scrape `urls` (max 10 concurrent) while tqdm renders a live progress bar."""
    limiter = asyncio.Semaphore(10)
    collected = []

    async def worker(client, url):
        async with limiter:
            try:
                resp = await client.get(url, timeout=10)
                collected.append({'url': url, 'status': resp.status_code})
            except Exception as exc:
                collected.append({'url': url, 'error': str(exc)})

    async with httpx.AsyncClient(follow_redirects=True) as client:
        # tqdm_asyncio.gather behaves like asyncio.gather plus a progress bar.
        await tqdm_asyncio.gather(
            *(worker(client, url) for url in urls), desc="Scraping"
        )

    return collected

# NOTE(review): `your_urls` is a placeholder — supply your own list of URL strings.
results = asyncio.run(scrape_with_progress(your_urls))
Enter fullscreen mode Exit fullscreen mode

When NOT to Use Async

Async is overkill when:

  • You're scraping < 20 URLs (sync is simpler, barely slower)
  • The bottleneck is parsing, not network I/O (async doesn't help)
  • You need rate limiting as strict as 1 req/5s (with concurrency forced to 1, async is no faster than sync)
  • The target site aggressively limits per-IP (async gets you banned faster)

The rule: If network wait time > 50% of total time → async helps. If CPU/parsing is the bottleneck → async doesn't help.


Related Articles


Skip the setup

Pre-built async-ready scrapers with proxy rotation built in:

Apify Scrapers Bundle — $29 one-time

30 production actors. Instant download.

n8n AI Automation Pack ($39) — 5 production-ready workflows

Related Tools

Top comments (0)