Sequential scraping is slow. A scraper that fetches 10 URLs one at a time takes 10× longer than one that fetches them concurrently. Python's asyncio makes concurrent HTTP requests straightforward — here's how to use it correctly.
Why asyncio for scraping
Scraping is I/O-bound: you spend most of your time waiting for network responses. Asyncio lets Python do other work (like starting new requests) while waiting for responses.
Synchronous (slow):
import requests
import time

# 100 pages, downloaded strictly one after another.
urls = [f"https://example.com/page/{i}" for i in range(100)]

t0 = time.time()
for url in urls:
    resp = requests.get(url)  # blocks until this response arrives
    # Process response
print(f"Time: {time.time() - t0:.1f}s")  # ~100 seconds (1s per request)
Async (fast):
import asyncio
import aiohttp
import time


async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    """Download a single URL and return its body as text."""
    async with session.get(url) as response:
        return await response.text()


async def scrape_all(urls: list) -> list:
    """Fetch every URL concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        # gather() runs all fetches at once and preserves input order.
        return await asyncio.gather(*(fetch(session, u) for u in urls))


t0 = time.time()
results = asyncio.run(scrape_all(urls))
print(f"Time: {time.time() - t0:.1f}s")  # ~2-5 seconds (all concurrent)
aiohttp: the standard async HTTP library
import asyncio
import aiohttp
from typing import Optional
async def fetch_url(
    session: aiohttp.ClientSession,
    url: str,
    headers: Optional[dict] = None
) -> dict:
    """Fetch a URL and return a uniform result record.

    Always returns a dict with keys url/status/content/error so callers
    can treat successes and failures with the same shape.
    """
    result = {"url": url, "status": None, "content": None, "error": None}
    try:
        timeout = aiohttp.ClientTimeout(total=15)
        async with session.get(url, headers=headers, timeout=timeout) as resp:
            result["status"] = resp.status
            # Only read the body on a clean 200; other statuses keep content=None.
            if resp.status == 200:
                result["content"] = await resp.text()
    except asyncio.TimeoutError:
        result["error"] = "timeout"
    except Exception as e:
        result["error"] = str(e)
    return result
async def scrape_urls(urls: list, concurrency: int = 10) -> list:
    """Scrape multiple URLs, never exceeding `concurrency` requests in flight."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/122.0.0.0"
    }
    # At most `concurrency` coroutines may hold the semaphore at any moment,
    # which caps the number of simultaneous HTTP requests.
    semaphore = asyncio.Semaphore(concurrency)

    async def bounded_fetch(session, url):
        # Acquire a slot before issuing the request; released on exit.
        async with semaphore:
            return await fetch_url(session, url, headers)

    async with aiohttp.ClientSession() as session:
        pending = [bounded_fetch(session, u) for u in urls]
        return list(await asyncio.gather(*pending))
# Usage: 20 requests against a 1-second endpoint, 5 at a time.
urls = ["https://httpbin.org/delay/1"] * 20
results = asyncio.run(scrape_urls(urls, concurrency=5))

success = [r for r in results if r["status"] == 200]
failed = [r for r in results if r["error"]]
print(f"Success: {len(success)} | Failed: {len(failed)}")
httpx: the modern alternative with sync/async parity
httpx works both synchronously and asynchronously with the same API:
import httpx
import asyncio
async def scrape_with_httpx(urls: list) -> list:
    """Fetch all URLs concurrently with httpx and return per-URL result dicts."""
    limits = httpx.Limits(max_connections=10, max_keepalive_connections=5)
    async with httpx.AsyncClient(
        limits=limits,
        timeout=15.0,
        headers={"User-Agent": "Mozilla/5.0 Chrome/122.0.0.0"}
    ) as client:
        # return_exceptions=True converts failures into values instead of
        # cancelling the whole gather.
        responses = await asyncio.gather(
            *(client.get(u) for u in urls), return_exceptions=True
        )
        results = []
        for url, resp in zip(urls, responses):
            if isinstance(resp, Exception):
                results.append({"url": url, "error": str(resp)})
                continue
            body = resp.text if resp.status_code == 200 else None
            results.append({"url": url, "status": resp.status_code, "content": body})
        return results


results = asyncio.run(scrape_with_httpx(["https://example.com"] * 20))
Rate limiting and polite scraping
Concurrent requests can overwhelm servers. Add delays and rate limits:
import asyncio
import aiohttp
import time
from asyncio import Semaphore
class RateLimitedScraper:
    """Async scraper that caps concurrent requests and enforces a minimum
    delay between consecutive request starts.

    `concurrency` bounds in-flight requests via a semaphore; `delay` is the
    minimum number of seconds between two request starts.
    """

    def __init__(self, concurrency: int = 5, delay: float = 0.5):
        self.semaphore = Semaphore(concurrency)
        self.delay = delay
        self._last_request = 0
        # BUGFIX: the check-sleep-update sequence must be serialized. Without
        # a lock, several coroutines can read the same _last_request, sleep
        # the same amount, and then all fire at once — defeating the delay.
        self._lock = asyncio.Lock()

    async def _wait_for_rate_limit(self):
        """Ensure minimum delay between requests (one coroutine at a time)."""
        async with self._lock:
            now = time.monotonic()
            time_since_last = now - self._last_request
            if time_since_last < self.delay:
                await asyncio.sleep(self.delay - time_since_last)
            self._last_request = time.monotonic()

    async def fetch(self, session: aiohttp.ClientSession, url: str) -> dict:
        """Fetch one URL, respecting both the concurrency cap and the delay.

        Returns {"url", "status", "content"} on success, {"url", "error"}
        on any failure (timeout, connection error, etc.).
        """
        async with self.semaphore:
            await self._wait_for_rate_limit()
            try:
                async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    return {"url": url, "status": resp.status, "content": await resp.text()}
            except Exception as e:
                return {"url": url, "error": str(e)}

    async def scrape_all(self, urls: list) -> list:
        """Fetch every URL through the rate limiter; results keep input order."""
        async with aiohttp.ClientSession() as session:
            tasks = [self.fetch(session, url) for url in urls]
            return await asyncio.gather(*tasks)
# 3 requests in flight at most, with at least 1 second between starts.
scraper = RateLimitedScraper(concurrency=3, delay=1.0)
results = asyncio.run(scraper.scrape_all(urls))
Parsing HTML in async scrapers
BeautifulSoup is synchronous — that's fine. Use it after awaiting the response:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
async def scrape_and_parse(url: str, session: aiohttp.ClientSession) -> dict:
    """Download a page and extract its title, first 10 links, and <p> count."""
    async with session.get(url) as response:
        html = await response.text()
    # BeautifulSoup parsing happens after the await — no issue
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string if soup.title else None
    links = [a.get("href") for a in soup.find_all("a", href=True)][:10]
    return {
        "url": url,
        "title": title,
        "links": links,
        "paragraphs": len(soup.find_all("p")),
    }
async def main():
    """Scrape a handful of sites concurrently and print a one-line summary each."""
    targets = [
        "https://example.com",
        "https://httpbin.org",
        "https://python.org"
    ]
    async with aiohttp.ClientSession() as session:
        coros = [scrape_and_parse(u, session) for u in targets]
        results = await asyncio.gather(*coros)
    for r in results:
        print(f"{r['url']}: '{r['title']}' - {len(r['links'])} links")


asyncio.run(main())
Handling errors and retries
import asyncio
import aiohttp
async def fetch_with_retry(
    session: aiohttp.ClientSession,
    url: str,
    max_retries: int = 3,
    backoff: float = 2.0
) -> dict:
    """Fetch a URL with retries.

    Honors Retry-After on 429, uses exponential backoff (backoff ** attempt)
    on 5xx responses and network/timeout errors. Returns {"url", "status",
    "content"} on success or {"url", "error"} after exhausting retries.
    """
    for attempt in range(max_retries):
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as resp:
                if resp.status == 429:  # Rate limited
                    # BUGFIX: Retry-After may be an HTTP-date rather than an
                    # integer (RFC 9110 §10.2.3); int() on a date string raised
                    # ValueError here. Fall back to 60s when not numeric.
                    header = resp.headers.get("Retry-After", "60")
                    try:
                        retry_after = int(header)
                    except ValueError:
                        retry_after = 60
                    await asyncio.sleep(retry_after)
                    continue
                elif resp.status >= 500:  # Server error
                    await asyncio.sleep(backoff ** attempt)
                    continue
                return {"url": url, "status": resp.status, "content": await resp.text()}
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            if attempt == max_retries - 1:
                return {"url": url, "error": str(e)}
            await asyncio.sleep(backoff ** attempt)
    return {"url": url, "error": "max retries exceeded"}
Performance comparison
Tested against 100 URLs with 1-second server response time:
| Method | Time | Notes |
|---|---|---|
| requests (sequential) | ~100s | 1 request at a time |
| asyncio (concurrent=10) | ~12s | 10 concurrent |
| asyncio (concurrent=50) | ~3s | 50 concurrent |
| asyncio (concurrent=100) | ~1.5s | All at once |
Diminishing returns above 20-50 concurrent for most real-world sites (they'll rate limit you).
When asyncio isn't enough
Asyncio is pure HTTP — it doesn't run JavaScript. For JS-rendered pages:
- Playwright async: `async with async_playwright() as p:` — a real browser with an async API
- Managed actors: for major platforms at scale, use the Apify Scrapers Bundle ($29) — it handles concurrency, proxies, and anti-bot measures internally
For most scraping tasks that don't need a browser, asyncio + aiohttp or httpx is the fastest option available in Python.
Production-Ready Scrapers
For scraping at scale without managing infrastructure, consider a managed option such as the n8n AI Automation Pack ($39) — 5 production-ready workflows.