Scraping 1000 Pages in 10 Seconds: Python Async HTTP Guide
Synchronous requests are fine for 5 pages. For 1000, you need async. Here's how.
The Problem With Requests
import requests
import time
start = time.time()
for page in range(1, 101):
    response = requests.get(f"https://api.example.com/data?page={page}")
    data = response.json()
    # Process data...
print(f"Time: {time.time() - start:.1f}s")
# Time: 85.3s (100 sequential requests)
85 seconds for 100 pages. At this rate, 1000 pages = 14 minutes.
Enter Async
import asyncio
import aiohttp
import time
async def fetch_page(session, page):
    async with session.get(f"https://api.example.com/data?page={page}") as resp:
        return await resp.json()

async def main():
    start = time.time()
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, page) for page in range(1, 101)]
        results = await asyncio.gather(*tasks)
    print(f"Time: {time.time() - start:.1f}s")
    # Time: 2.1s (100 concurrent requests)
    # 40x faster!

asyncio.run(main())
100 pages: 85s → 2.1s. That's a 40x speedup.
The Complete Production Scraper
import asyncio
import aiohttp
import json
import logging
from datetime import datetime
from pathlib import Path
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
class AsyncScraper:
    def __init__(
        self,
        max_concurrent=20,
        timeout=30,
        retries=3,
        delay=0.1,
        headers=None,
    ):
        self.max_concurrent = max_concurrent
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.retries = retries
        self.delay = delay
        self.headers = headers or {
            "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0)",
            "Accept": "application/json",
        }
        self.results = []
        self.errors = []
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch(self, session, url, params=None):
        """Fetch a single URL with retry logic."""
        for attempt in range(self.retries):
            try:
                async with self.semaphore:
                    async with session.get(
                        url,
                        params=params,
                        headers=self.headers,
                        timeout=self.timeout,
                    ) as response:
                        response.raise_for_status()
                        # Respect rate limits
                        await asyncio.sleep(self.delay)
                        return {
                            "url": str(response.url),
                            "status": response.status,
                            "data": await response.json(),
                        }
            except asyncio.TimeoutError:
                logger.warning(f"Timeout: {url} (attempt {attempt + 1}/{self.retries})")
            except aiohttp.ClientError as e:
                logger.warning(f"Error: {url} — {e} (attempt {attempt + 1}/{self.retries})")
            except Exception as e:
                logger.error(f"Unexpected: {url} — {e}")
                break  # Don't retry unexpected errors
            if attempt < self.retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff
        self.errors.append({"url": url, "params": params})
        return None

    async def scrape_all(self, urls):
        """Scrape multiple URLs concurrently."""
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            tasks = [self.fetch(session, url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)
        # Filter out errors and None results
        self.results = [r for r in results if r is not None and not isinstance(r, Exception)]
        logger.info(f"Scraped {len(self.results)}/{len(urls)} URLs")
        if self.errors:
            logger.warning(f"Failed: {len(self.errors)} URLs")
        return self.results

    def save_results(self, filepath="results.json"):
        """Save results to a JSON file."""
        Path(filepath).write_text(
            json.dumps({
                "timestamp": datetime.now().isoformat(),
                "total": len(self.results),
                "errors": len(self.errors),
                "results": self.results,
            }, indent=2, default=str)
        )
        logger.info(f"Saved {len(self.results)} results to {filepath}")

# === Usage Examples ===
async def example_pagination():
    """Scrape all pages of a paginated API."""
    scraper = AsyncScraper(max_concurrent=10, delay=0.2)
    urls = [f"https://api.example.com/items?page={p}" for p in range(1, 101)]
    results = await scraper.scrape_all(urls)
    scraper.save_results("items.json")
    return results

async def example_different_urls():
    """Scrape a list of different URLs."""
    scraper = AsyncScraper(max_concurrent=5, timeout=15)
    urls = [
        "https://api.github.com/repos/python/cpython",
        "https://api.github.com/repos/nodejs/node",
        "https://api.github.com/repos/rust-lang/rust",
        "https://api.github.com/repos/golang/go",
        "https://api.github.com/repos/microsoft/vscode",
    ]
    results = await scraper.scrape_all(urls)
    for r in results:
        print(f"{r['data']['full_name']}: ⭐ {r['data']['stargazers_count']}")

async def example_with_proxy():
    """Scrape through a proxy (for sites that block datacenter IPs)."""
    scraper = AsyncScraper(
        max_concurrent=5,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        },
    )
    # Use proxy if available
    proxy = "http://user:pass@proxy.example.com:8080"
    async with aiohttp.ClientSession(timeout=scraper.timeout) as session:
        async with session.get(
            "https://httpbin.org/ip",
            proxy=proxy,
        ) as resp:
            print(await resp.json())

# Scrape HTML pages (not just JSON APIs)
async def scrape_html(session, url):
    """Scrape HTML and extract data with regex or simple parsing."""
    async with session.get(url) as resp:
        text = await resp.text()
    # Use re, BeautifulSoup, or lxml to extract data
    # Simple example: extract all links
    import re
    links = re.findall(r'href="([^"]+)"', text)
    return {"url": url, "links": links}

if __name__ == "__main__":
    asyncio.run(example_pagination())
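The regex in scrape_html is fine for quick link extraction, but a real parser handles malformed HTML far better. Here is a minimal sketch of the same helper using BeautifulSoup; the scrape_html_bs4 name is mine, and it assumes beautifulsoup4 is installed:

from bs4 import BeautifulSoup  # pip install beautifulsoup4

async def scrape_html_bs4(session, url):
    """Same idea as scrape_html above, but with a proper HTML parser."""
    async with session.get(url) as resp:
        text = await resp.text()
    soup = BeautifulSoup(text, "html.parser")
    # Collect every href attribute from anchor tags, plus the page title
    links = [a["href"] for a in soup.find_all("a", href=True)]
    title = soup.title.string if soup.title else None
    return {"url": url, "title": title, "links": links}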
Performance Comparison
| Approach | 100 URLs | 1000 URLs | Complexity |
|---|---|---|---|
| requests (sequential) | 85s | 850s | Simple |
| requests + ThreadPool | 12s | 120s | Medium |
| aiohttp (async) | 2.1s | 21s | Medium |
| aiohttp + proxy rotation | 8s | 80s | Complex |
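The requests + ThreadPool row refers to running the synchronous code on a thread pool. A rough sketch of that middle ground, reusing the example API from above; the 20-worker count is a placeholder, not a benchmarked value:

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch_page_sync(page):
    resp = requests.get(f"https://api.example.com/data?page={page}", timeout=30)
    resp.raise_for_status()
    return resp.json()

# 20 worker threads fetch pages in parallel; map() preserves input order
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(fetch_page_sync, range(1, 101)))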
Common Pitfalls
1. Too Many Concurrent Requests
# ❌ BAD: 1000 concurrent requests = server blocks you
tasks = [fetch(session, url) for url in thousand_urls]
await asyncio.gather(*tasks)
# ✅ GOOD: Limit concurrency with semaphore
semaphore = asyncio.Semaphore(20)
async def limited_fetch(session, url):
    async with semaphore:
        return await fetch(session, url)
2. No Rate Limiting
# ❌ BAD: Hammering the server with no delay
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")

# ✅ GOOD: Respect rate limits
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")
    await asyncio.sleep(0.1)  # 100ms between requests = 10 req/sec max
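Note that a per-request sleep only throttles a sequential loop like the one above. Once requests run concurrently through gather, combine the semaphore from pitfall #1 with a short delay, which is roughly what the AsyncScraper class does. A sketch, assuming an aiohttp session already exists and the endpoint returns JSON:

semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight at once

async def polite_fetch(session, url):
    async with semaphore:
        async with session.get(url) as resp:
            data = await resp.json()
        await asyncio.sleep(0.1)  # hold the slot briefly to cap the overall rate
        return data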
3. Not Handling HTTP Errors
# ❌ BAD: 404 crashes everything
data = await response.json()
# ✅ GOOD: Check status code
if response.status == 200:
    data = await response.json()
elif response.status == 429:
    retry_after = int(response.headers.get('Retry-After', 60))
    logger.warning(f"Rate limited. Waiting {retry_after}s")
    await asyncio.sleep(retry_after)
else:
    logger.error(f"HTTP {response.status}: {response.url}")
4. Forgetting session.close()
# ❌ BAD: Resource leak
session = aiohttp.ClientSession()
# ... do stuff ...
# Forgot to close!
# ✅ GOOD: Context manager
async with aiohttp.ClientSession() as session:
    # ... do stuff ...
    pass  # Auto-closed
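If the session has to outlive a single with block (for example, it lives on a long-running object), close it explicitly instead:

session = aiohttp.ClientSession()
try:
    ...  # do stuff
finally:
    await session.close()  # always release connections, even on errors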
When to Use What
| Tool | Best For |
|---|---|
| requests | Simple scripts, < 50 pages |
| aiohttp | Bulk data collection, APIs |
| Playwright | JavaScript-rendered pages |
| Scrapy | Large-scale crawling projects |
| httpx | Modern alternative to requests (supports async) |
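For reference, the httpx row follows the same gather pattern with a requests-like API. A quick sketch, assuming httpx is installed (pip install httpx):

import asyncio
import httpx

async def main():
    async with httpx.AsyncClient(timeout=30) as client:
        urls = [f"https://httpbin.org/get?id={i}" for i in range(100)]
        # client.get() is awaitable, so the same gather pattern applies
        responses = await asyncio.gather(*(client.get(url) for url in urls))
        print(f"Fetched {len(responses)} pages")

asyncio.run(main())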
Getting Started
pip install aiohttp
# Or if you also need HTML parsing:
pip install aiohttp beautifulsoup4 lxml
# Your first async scraper, in roughly a dozen lines
import asyncio
import aiohttp

async def fetch(session, url):
    # Read the body inside the context manager so the connection is released
    async with session.get(url) as resp:
        return await resp.text()

async def main():
    async with aiohttp.ClientSession() as session:
        urls = [f"https://httpbin.org/get?id={i}" for i in range(100)]
        pages = await asyncio.gather(*(fetch(session, url) for url in urls))
        print(f"Fetched {len(pages)} pages")

asyncio.run(main())
Follow @armorbreak for more Python and web scraping tutorials.