Sequential scraping is slow. A scraper that fetches 10 URLs one at a time takes 10× longer than one that fetches them concurrently. Python's asyncio makes concurrent HTTP requests straightforward — here's how to use it correctly.
Why asyncio for scraping
Scraping is I/O-bound: you spend most of your time waiting for network responses. Asyncio lets Python do other work (like starting new requests) while waiting for responses.
Synchronous (slow):
import requests
import time

# 100 pages, downloaded strictly one after another.
urls = [f"https://example.com/page/{i}" for i in range(100)]

t0 = time.time()
for url in urls:
    resp = requests.get(url)  # blocks until this response arrives
    # Process response
print(f"Time: {time.time() - t0:.1f}s")  # ~100 seconds (1s per request)
Async (fast):
import asyncio
import aiohttp
import time


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    """Download a single URL and return its body as text."""
    async with session.get(url) as response:
        return await response.text()


async def scrape_all(urls: list) -> list:
    """Fetch every URL concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        # gather() runs all fetches at once and preserves input order.
        return await asyncio.gather(*(fetch(session, u) for u in urls))


t0 = time.time()
results = asyncio.run(scrape_all(urls))
print(f"Time: {time.time() - t0:.1f}s")  # ~2-5 seconds (all concurrent)
aiohttp: the standard async HTTP library
import asyncio
import aiohttp
from typing import Optional
async def fetch_url(
    session: aiohttp.ClientSession,
    url: str,
    headers: Optional[dict] = None
) -> dict:
    """Fetch a URL and return a uniform result record.

    Always returns a dict with keys url/status/content/error so callers
    can treat successes and failures with the same shape.
    """
    result = {"url": url, "status": None, "content": None, "error": None}
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with session.get(url, headers=headers, timeout=timeout) as resp:
            result["status"] = resp.status
            # Only read the body on a clean 200; other statuses keep content=None.
            if resp.status == 200:
                result["content"] = await resp.text()
    except asyncio.TimeoutError:
        result["error"] = "timeout"
    except Exception as e:
        result["error"] = str(e)
    return result
async def scrape_urls(urls: list, concurrency: int = 10) -> list:
    """Scrape multiple URLs, never exceeding `concurrency` requests in flight."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"
    }
    # At most `concurrency` coroutines may hold the semaphore at any moment,
    # which caps the number of simultaneous HTTP requests.
    semaphore = asyncio.Semaphore(concurrency)

    async def bounded_fetch(session, url):
        # Acquire a slot before issuing the request; released on exit.
        async with semaphore:
            return await fetch_url(session, url, headers)

    async with aiohttp.ClientSession() as session:
        pending = [bounded_fetch(session, u) for u in urls]
        return list(await asyncio.gather(*pending))
# Usage: 20 requests against a 1-second endpoint, 5 at a time.
urls = ["https://httpbin.org/delay/1"] * 20
results = asyncio.run(scrape_urls(urls, concurrency=5))

success = [r for r in results if r["status"] == 200]
failed = [r for r in results if r["error"]]
print(f"Success: {len(success)} | Failed: {len(failed)}")
httpx: the modern alternative with sync/async parity
httpx works both synchronously and asynchronously with the same API:
import httpx
import asyncio
async def scrape_with_httpx(urls: list) -> list:
    """Fetch all URLs concurrently with httpx and return per-URL result dicts."""
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
    async with httpx.AsyncClient(
        limits=limits,
        timeout=15.0,
        headers={"User-Agent": "Mozilla/5.0 Chrome/122.0.0.0"}
    ) as client:
        # return_exceptions=True converts failures into values instead of
        # cancelling the whole gather.
        responses = await asyncio.gather(
            *(client.get(u) for u in urls), return_exceptions=True
        )
        results = []
        for url, resp in zip(urls, responses):
            if isinstance(resp, Exception):
                results.append({"url": url, "error": str(resp)})
                continue
            body = resp.text if resp.status_code == 200 else None
            results.append({"url": url, "status": resp.status_code, "content": body})
        return results


results = asyncio.run(scrape_with_httpx(["https://example.com"] * 20))
Rate limiting and polite scraping
Concurrent requests can overwhelm servers. Add delays and rate limits:
import asyncio
import aiohttp
import time
from asyncio import Semaphore
class RateLimitedScraper:
    """Async scraper that caps concurrent requests and enforces a minimum
    delay between consecutive request starts.

    `concurrency` bounds in-flight requests via a semaphore; `delay` is the
    minimum number of seconds between two request starts.
    """

    def __init__(self, concurrency: int = 5, delay: float = 0.5):
        self.semaphore = Semaphore(concurrency)
        self.delay = delay
        self._last_request = 0
        # BUGFIX: the check-sleep-update sequence must be serialized. Without
        # a lock, several coroutines can read the same _last_request, sleep
        # the same amount, and then all fire at once — defeating the delay.
        self._lock = asyncio.Lock()

    async def _wait_for_rate_limit(self):
        """Ensure minimum delay between requests (one coroutine at a time)."""
        async with self._lock:
            now = time.monotonic()
            time_since_last = now - self._last_request
            if time_since_last < self.delay:
                await asyncio.sleep(self.delay - time_since_last)
            self._last_request = time.monotonic()

    async def fetch(self, session: aiohttp.ClientSession, url: str) -> dict:
        """Fetch one URL, respecting both the concurrency cap and the delay.

        Returns {"url", "status", "content"} on success, {"url", "error"}
        on any failure (timeout, connection error, etc.).
        """
        async with self.semaphore:
            await self._wait_for_rate_limit()
            try:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    return {"url": url, "status": resp.status, "content": await resp.text()}
            except Exception as e:
                return {"url": url, "error": str(e)}

    async def scrape_all(self, urls: list) -> list:
        """Fetch every URL through the rate limiter; results keep input order."""
        async with aiohttp.ClientSession() as session:
            tasks = [self.fetch(session, url) for url in urls]
            return await asyncio.gather(*tasks)
# 3 requests in flight at most, with at least 1 second between starts.
scraper = RateLimitedScraper(concurrency=3, delay=1.0)
results = asyncio.run(scraper.scrape_all(urls))
Parsing HTML in async scrapers
BeautifulSoup is synchronous — that's fine. Use it after awaiting the response:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
async def scrape_and_parse(url: str, session: aiohttp.ClientSession) -> dict:
    """Download a page and extract its title, first 10 links, and <p> count."""
    async with session.get(url) as response:
        html = await response.text()
    # BeautifulSoup parsing happens after the await — no issue
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string if soup.title else None
    links = [a.get("href") for a in soup.find_all("a", href=True)][:10]
    return {
        "url": url,
        "title": title,
        "links": links,
        "paragraphs": len(soup.find_all("p")),
    }
async def main():
    """Scrape a handful of sites concurrently and print a one-line summary each."""
    targets = [
        "https://example.com",
        "https://httpbin.org",
        "https://python.org"
    ]
    async with aiohttp.ClientSession() as session:
        coros = [scrape_and_parse(u, session) for u in targets]
        results = await asyncio.gather(*coros)
    for r in results:
        print(f"{r['url']}: '{r['title']}' - {len(r['links'])} links")


asyncio.run(main())
Handling errors and retries
import asyncio
import aiohttp
async def fetch_with_retry(
    session: aiohttp.ClientSession,
    url: str,
    max_retries: int = 3,
    backoff: float = 2.0
) -> dict:
    """Fetch a URL with retries.

    Honors Retry-After on 429, uses exponential backoff (backoff ** attempt)
    on 5xx responses and network/timeout errors. Returns {"url", "status",
    "content"} on success or {"url", "error"} after exhausting retries.
    """
    for attempt in range(max_retries):
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 429:  # Rate limited
                    # BUGFIX: Retry-After may be an HTTP-date rather than an
                    # integer (RFC 9110 §10.2.3); int() on a date string raised
                    # ValueError here. Fall back to 60s when not numeric.
                    header = resp.headers.get("Retry-After", "60")
                    try:
                        retry_after = int(header)
                    except ValueError:
                        retry_after = 60
                    await asyncio.sleep(retry_after)
                    continue
                elif resp.status >= 500:  # Server error
                    await asyncio.sleep(backoff ** attempt)
                    continue
                return {"url": url, "status": resp.status, "content": await resp.text()}
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            if attempt == max_retries - 1:
                return {"url": url, "error": str(e)}
            await asyncio.sleep(backoff ** attempt)
    return {"url": url, "error": "max retries exceeded"}
Performance comparison
Tested against 100 URLs with 1-second server response time:
| Method | Time | Notes |
|---|---|---|
| requests (sequential) | ~100s | 1 request at a time |
| asyncio (concurrent=10) | ~12s | 10 concurrent |
| asyncio (concurrent=50) | ~3s | 50 concurrent |
| asyncio (concurrent=100) | ~1.5s | All at once |
Diminishing returns above 20-50 concurrent for most real-world sites (they'll rate limit you).
When asyncio isn't enough
Asyncio is pure HTTP — it doesn't run JavaScript. For JS-rendered pages:
- Playwright async: `async with async_playwright() as p:` — a real browser with an async API
- Managed actors: for major platforms at scale, use the Apify Scrapers Bundle ($29) — it handles concurrency, proxies, and anti-bot measures internally
For most scraping tasks that don't need a browser, asyncio + aiohttp or httpx is the fastest option available in Python.
Production-Ready Scrapers
For scraping at scale without managing infrastructure, consider a managed option such as the n8n AI Automation Pack ($39) — 5 production-ready workflows.