Async Web Scraping in Python: httpx + asyncio for 10x Faster Data Collection
Synchronous scraping makes requests one at a time. While you wait for one response, you're doing nothing. Async scraping makes 10-50 requests simultaneously — same time, 10-50x the output.
Here's how to actually implement it, with real benchmarks.
Why Async? The Numbers
Scraping 100 pages, each taking 1 second to respond:
Synchronous: 100 × 1s = 100 seconds
Async (10x): 10 × 1s = 10 seconds (10 concurrent)
Async (50x): 2 × 1s = 2 seconds (50 concurrent)
The catch: servers rate-limit you if you go too fast. The sweet spot is usually 5-20 concurrent requests.
Setup
pip install httpx beautifulsoup4
We'll use httpx — it supports both sync and async, has HTTP/2, and works well with curl_cffi for anti-bot when needed.
Basic Async Scraper
import asyncio
import httpx
from bs4 import BeautifulSoup
from typing import List, Dict
async def fetch_page(client: httpx.AsyncClient, url: str) -> dict:
    """Download one page and extract basic metadata.

    Returns a dict with the URL, HTTP status, page title, and body length;
    on failure the dict carries an 'error' key instead.
    """
    try:
        response = await client.get(url, timeout=15.0)
        response.raise_for_status()
        page_title = BeautifulSoup(response.text, 'html.parser').find('title')
        return {
            'url': url,
            'status': response.status_code,
            'title': page_title.text.strip() if page_title else '',
            'content_length': len(response.text),
        }
    except httpx.HTTPStatusError as e:
        # The server answered, but with an error status — record it.
        return {'url': url, 'status': e.response.status_code, 'error': str(e)}
    except Exception as e:
        # Timeouts, DNS failures, parse errors — status 0 marks "no response".
        return {'url': url, 'status': 0, 'error': str(e)}
async def scrape_all(urls: List[str], concurrency: int = 10) -> List[dict]:
    """Fetch every URL concurrently, capped at `concurrency` in flight."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    }
    # The semaphore caps how many requests are in flight at any moment.
    gate = asyncio.Semaphore(concurrency)

    async def bounded_fetch(client: httpx.AsyncClient, target: str) -> dict:
        async with gate:
            return await fetch_page(client, target)

    async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
        pending = [bounded_fetch(client, u) for u in urls]
        outcomes = await asyncio.gather(*pending, return_exceptions=True)
        # Drop anything that surfaced as a raw exception; keep result dicts.
        return [item for item in outcomes if isinstance(item, dict)]
# Run it
import time

page_urls = [f"https://news.ycombinator.com/news?p={i}" for i in range(1, 21)]  # 20 pages

t0 = time.time()
results = asyncio.run(scrape_all(page_urls, concurrency=10))
elapsed = time.time() - t0

print(f"Scraped {len(results)} pages in {elapsed:.1f}s")
for r in results[:3]:
    print(f" [{r['status']}] {r['url'][-30:]} — {r.get('title', '')[:40]}")
Benchmark: Sync vs Async
import asyncio, httpx, requests, time

# Every URL waits 0.5s server-side before responding.
urls = ["https://httpbin.org/delay/0.5" for _ in range(20)]  # 0.5s each

# Synchronous: requests go out one after another.
start = time.time()
session = requests.Session()
for url in urls:
    session.get(url)
sync_time = time.time() - start
print(f"Sync: {sync_time:.1f}s for 20 requests")

# Async: all requests overlap their wait time.
async def async_bench():
    async with httpx.AsyncClient() as client:
        await asyncio.gather(*(client.get(u) for u in urls))

start = time.time()
asyncio.run(async_bench())
async_time = time.time() - start
print(f"Async: {async_time:.1f}s for 20 requests")
print(f"Speedup: {sync_time/async_time:.1f}x")
# Output: Sync: 10.2s, Async: 0.9s, Speedup: 11.3x
Production-Grade Async Scraper with Rate Limiting
import asyncio
import httpx
from bs4 import BeautifulSoup
import csv, json
from typing import List, Dict, Optional
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class AsyncScraper:
    """Concurrency-limited async scraper with retries, backoff, and rate limiting.

    Subclass and override `parse()` to extract site-specific data.
    """

    def __init__(
        self,
        concurrency: int = 10,
        delay_between_requests: float = 0.5,
        max_retries: int = 3,
        timeout: float = 15.0
    ):
        self.semaphore = asyncio.Semaphore(concurrency)  # caps in-flight requests
        self.delay = delay_between_requests              # polite pause after each request
        self.max_retries = max_retries
        self.timeout = timeout
        self.results: List[dict] = []
        self._success_count = 0
        self._error_count = 0

    async def fetch(self, client: httpx.AsyncClient, url: str) -> Optional[str]:
        """Fetch URL with retries and rate limiting.

        Returns the response body on HTTP 200, or None after exhausting
        retries or hitting a non-retryable status (403/404).
        """
        for attempt in range(self.max_retries):
            try:
                response = await client.get(url, timeout=self.timeout)
                if response.status_code == 200:
                    return response.text
                elif response.status_code == 429:
                    wait = 2 ** attempt * 5  # Exponential backoff
                    logger.warning(f"Rate limited on {url}, waiting {wait}s")
                    await asyncio.sleep(wait)
                elif response.status_code in (403, 404):
                    # Permanent failures — retrying won't help.
                    logger.warning(f"Got {response.status_code} for {url}")
                    return None
                else:
                    # Other statuses (e.g. 5xx): back off briefly, then retry.
                    # (Previously these retried in a tight loop with no sleep.)
                    logger.warning(f"Got {response.status_code} for {url} (attempt {attempt+1})")
                    await asyncio.sleep(2 ** attempt)
            except (httpx.TimeoutException, httpx.ConnectError) as e:
                logger.warning(f"Connection error for {url} (attempt {attempt+1}): {e}")
                await asyncio.sleep(2 ** attempt)
        return None

    def parse(self, html: str, url: str) -> dict:
        """Override this method to extract your data."""
        soup = BeautifulSoup(html, 'html.parser')
        # Prefer <title>, fall back to <h1>, else empty string.
        heading = soup.find('title') or soup.find('h1')
        return {
            'url': url,
            'title': heading.text.strip() if heading else '',
            'word_count': len(soup.get_text().split()),
        }

    async def process_url(self, client: httpx.AsyncClient, url: str):
        """Process a single URL with semaphore control."""
        async with self.semaphore:
            html = await self.fetch(client, url)
            if html:
                data = self.parse(html, url)
                self.results.append(data)
                self._success_count += 1
            else:
                self._error_count += 1
                self.results.append({'url': url, 'error': True})
            # Rate limiting delay (held inside the semaphore on purpose,
            # so overall request rate stays bounded).
            if self.delay > 0:
                await asyncio.sleep(self.delay)

    async def run(self, urls: List[str]) -> List[dict]:
        """Run the scraper on all URLs and return accumulated results."""
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        }
        async with httpx.AsyncClient(
            headers=headers,
            follow_redirects=True,
            http2=True  # Enable HTTP/2 for better performance (requires `pip install httpx[h2]`'s h2 extra)
        ) as client:
            tasks = [self.process_url(client, url) for url in urls]
            await asyncio.gather(*tasks)
        logger.info(f"Done: {self._success_count} success, {self._error_count} errors")
        return self.results

    def save_csv(self, filename: str):
        """Save results to CSV.

        Uses the union of all row keys so success rows and error rows
        (which have different fields) can share one file; missing cells
        are written as empty strings.
        """
        if not self.results:
            return
        fieldnames: List[str] = []
        for row in self.results:
            for key in row:
                if key not in fieldnames:
                    fieldnames.append(key)
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
            writer.writeheader()
            writer.writerows(self.results)
        logger.info(f"Saved {len(self.results)} rows to {filename}")
# Usage: subclass and override parse()
class EcommerceScraper(AsyncScraper):
    """Product-page scraper: pulls name and price via common CSS selectors."""

    def parse(self, html: str, url: str) -> dict:
        soup = BeautifulSoup(html, 'html.parser')
        price = soup.select_one('.price, .product-price, [data-price]')
        name = soup.select_one('h1.product-title, .product-name, h1')
        return {
            'url': url,
            'name': name.text.strip() if name else '',
            'price': price.text.strip() if price else '',
        }
# Two catalogue pages from a scraping-friendly demo site.
scraper = EcommerceScraper(concurrency=8, delay_between_requests=0.5)
urls = [
    "https://books.toscrape.com/catalogue/page-1.html",
    "https://books.toscrape.com/catalogue/page-2.html",
]
results = asyncio.run(scraper.run(urls))
scraper.save_csv('products.csv')
Async with Proxies
import asyncio, httpx, random
# Pool of authenticated HTTP proxies to rotate through (placeholder credentials).
PROXIES = [
    "http://user:pass@proxy1:8080",
    "http://user:pass@proxy2:8080",
    "http://user:pass@proxy3:8080",
]
async def fetch_with_proxy(url: str, proxy: str) -> str:
    """Fetch `url` through a single proxy and return the response body."""
    async with httpx.AsyncClient(proxy=proxy) as session:
        resp = await session.get(url)
        return resp.text
async def scrape_with_rotating_proxies(urls: list) -> list:
    """Fetch each URL, rotating round-robin through PROXIES.

    On failure the request is retried once through a different,
    randomly-chosen proxy; if that also fails, the retry's error is
    recorded against the URL.
    """
    results = []
    for i, url in enumerate(urls):
        proxy = PROXIES[i % len(PROXIES)]  # Rotate proxies
        try:
            html = await fetch_with_proxy(url, proxy)
            results.append({'url': url, 'html': html})
        except Exception:
            # Try a different proxy on failure. Guard against a one-proxy
            # pool, where the "other proxy" list would be empty.
            fallbacks = [p for p in PROXIES if p != proxy]
            other_proxy = random.choice(fallbacks) if fallbacks else proxy
            try:
                html = await fetch_with_proxy(url, other_proxy)
                results.append({'url': url, 'html': html})
            except Exception as retry_err:
                # NOTE: a bare `except:` here would also swallow
                # asyncio.CancelledError and break task cancellation.
                results.append({'url': url, 'error': str(retry_err)})
    return results
Progress Tracking for Long Runs
import asyncio, httpx
from tqdm.asyncio import tqdm_asyncio # pip install tqdm
async def scrape_with_progress(urls: list) -> list:
    """Scrape `urls` (max 10 concurrent) while rendering a tqdm progress bar."""
    limiter = asyncio.Semaphore(10)
    collected = []

    async def worker(client, target):
        async with limiter:
            try:
                resp = await client.get(target, timeout=10)
                collected.append({'url': target, 'status': resp.status_code})
            except Exception as exc:
                collected.append({'url': target, 'error': str(exc)})

    async with httpx.AsyncClient(follow_redirects=True) as client:
        # tqdm_asyncio.gather drives the tasks and draws the progress bar.
        await tqdm_asyncio.gather(*[worker(client, u) for u in urls], desc="Scraping")
    return collected
# `your_urls` is a placeholder — supply your own list of target URLs.
results = asyncio.run(scrape_with_progress(your_urls))
When NOT to Use Async
Async is overkill when:
- You're scraping < 20 URLs (sync is simpler, barely slower)
- The bottleneck is parsing, not network I/O (async doesn't help)
- You need rate limiting to 1 req/5s (concurrency=1 = same as sync)
- The target site aggressively limits per-IP (async gets you banned faster)
The rule: If network wait time > 50% of total time → async helps. If CPU/parsing is the bottleneck → async doesn't help.
Related Articles
- Python Web Scraping Tutorial for Beginners 2026 — Start here if new to scraping
- Web Scraping Tools Comparison 2026 — Full tool comparison
- Web Scraping Without Getting Banned in 2026 — Anti-detection for async scrapers
Skip the setup
Pre-built async-ready scrapers with proxy rotation built in:
Apify Scrapers Bundle — $29 one-time
30 production actors. Instant download.
Top comments (0)