Alex Chen

Scraping 1000 Pages in 10 Seconds: Python Async HTTP Guide

Synchronous requests are fine for 5 pages. For 1000, you need async. Here's how.

The Problem With Requests

import requests
import time

start = time.time()

for page in range(1, 101):
    response = requests.get(f"https://api.example.com/data?page={page}")
    data = response.json()
    # Process data...

print(f"Time: {time.time() - start:.1f}s")
# Time: 85.3s (100 sequential requests)

85 seconds for 100 pages. At that rate, 1000 pages would take about 14 minutes.

Enter Async

import asyncio
import aiohttp
import time

async def fetch_page(session, page):
    async with session.get(f"https://api.example.com/data?page={page}") as resp:
        return await resp.json()

async def main():
    start = time.time()

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, page) for page in range(1, 101)]
        results = await asyncio.gather(*tasks)

    print(f"Time: {time.time() - start:.1f}s")
    # Time: 2.1s (100 concurrent requests)
    # 40x faster!

asyncio.run(main())

100 pages: 85s → 2.1s. That's a 40x speedup.

The Complete Production Scraper

import asyncio
import aiohttp
import json
import logging
from datetime import datetime
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

class AsyncScraper:
    def __init__(
        self,
        max_concurrent=20,
        timeout=30,
        retries=3,
        delay=0.1,
        headers=None,
    ):
        self.max_concurrent = max_concurrent
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.retries = retries
        self.delay = delay
        self.headers = headers or {
            "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0)",
            "Accept": "application/json",
        }
        self.results = []
        self.errors = []
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch(self, session, url, params=None):
        """Fetch a single URL with retry logic."""
        for attempt in range(self.retries):
            try:
                async with self.semaphore:
                    async with session.get(
                        url,
                        params=params,
                        headers=self.headers,
                        timeout=self.timeout,
                    ) as response:
                        response.raise_for_status()

                        # Respect rate limits
                        await asyncio.sleep(self.delay)

                        return {
                            "url": str(response.url),
                            "status": response.status,
                            "data": await response.json(),
                        }

            except asyncio.TimeoutError:
                logger.warning(f"Timeout: {url} (attempt {attempt + 1}/{self.retries})")
            except aiohttp.ClientError as e:
                logger.warning(f"Error: {url}{e} (attempt {attempt + 1}/{self.retries})")
            except Exception as e:
                logger.error(f"Unexpected: {url}{e}")
                break  # Don't retry unexpected errors

            if attempt < self.retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

        self.errors.append({"url": url, "params": params})
        return None

    async def scrape_all(self, urls):
        """Scrape multiple URLs concurrently."""
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            tasks = [self.fetch(session, url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out errors and None results
        self.results = [r for r in results if r is not None and not isinstance(r, Exception)]

        logger.info(f"Scraped {len(self.results)}/{len(urls)} URLs")
        if self.errors:
            logger.warning(f"Failed: {len(self.errors)} URLs")

        return self.results

    def save_results(self, filepath="results.json"):
        """Save results to a JSON file."""
        Path(filepath).write_text(
            json.dumps({
                "timestamp": datetime.now().isoformat(),
                "total": len(self.results),
                "errors": len(self.errors),
                "results": self.results,
            }, indent=2, default=str)
        )
        logger.info(f"Saved {len(self.results)} results to {filepath}")


# === Usage Examples ===

async def example_pagination():
    """Scrape all pages of a paginated API."""
    scraper = AsyncScraper(max_concurrent=10, delay=0.2)

    urls = [f"https://api.example.com/items?page={p}" for p in range(1, 101)]
    results = await scraper.scrape_all(urls)
    scraper.save_results("items.json")

    return results

async def example_different_urls():
    """Scrape a list of different URLs."""
    scraper = AsyncScraper(max_concurrent=5, timeout=15)

    urls = [
        "https://api.github.com/repos/python/cpython",
        "https://api.github.com/repos/nodejs/node",
        "https://api.github.com/repos/rust-lang/rust",
        "https://api.github.com/repos/golang/go",
        "https://api.github.com/repos/microsoft/vscode",
    ]
    results = await scraper.scrape_all(urls)

    for r in results:
        print(f"{r['data']['full_name']}: ⭐ {r['data']['stargazers_count']}")


async def example_with_proxy():
    """Scrape through a proxy (for sites that block datacenter IPs)."""
    scraper = AsyncScraper(
        max_concurrent=5,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        },
    )

    # Placeholder proxy URL; substitute your own
    proxy = "http://user:pass@proxy.example.com:8080"

    async with aiohttp.ClientSession(timeout=scraper.timeout) as session:
        async with session.get(
            "https://httpbin.org/ip",
            headers=scraper.headers,
            proxy=proxy,  # aiohttp accepts a per-request proxy URL
        ) as resp:
            print(await resp.json())


# Scrape HTML pages (not just JSON APIs)
async def scrape_html(session, url):
    """Scrape HTML and extract data with regex or simple parsing."""
    async with session.get(url) as resp:
        text = await resp.text()
        # Use re, BeautifulSoup, or lxml to extract data
        # Simple example: extract all links
        import re
        links = re.findall(r'href="([^"]+)"', text)
        return {"url": url, "links": links}


if __name__ == "__main__":
    asyncio.run(example_pagination())

Performance Comparison

Approach                    100 URLs    1000 URLs    Complexity
requests (sequential)       85s         850s         Simple
requests + ThreadPool       12s         120s         Medium
aiohttp (async)             2.1s        21s          Medium
aiohttp + proxy rotation    8s          80s          Complex
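
For reference, the "requests + ThreadPool" row corresponds to a pattern like the one below: the standard library's ThreadPoolExecutor runs blocking requests calls in parallel threads. This is a minimal sketch against the same placeholder API; actual timings depend on your network and the target server.

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Each worker thread blocks on its own request while the others keep going
    return requests.get(url, timeout=30).json()

urls = [f"https://api.example.com/data?page={p}" for p in range(1, 101)]

# 20 worker threads roughly matches the "requests + ThreadPool" row above
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(fetch, urls))

print(f"Fetched {len(results)} pages")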

Common Pitfalls

1. Too Many Concurrent Requests

# ❌ BAD: 1000 concurrent requests = server blocks you
tasks = [fetch(session, url) for url in thousand_urls]
await asyncio.gather(*tasks)

# ✅ GOOD: Limit concurrency with semaphore
semaphore = asyncio.Semaphore(20)

async def limited_fetch(session, url):
    async with semaphore:
        return await fetch(session, url)
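
You still launch everything with gather; the semaphore just caps how many requests are actually in flight at any moment:

# At most 20 requests run concurrently; the rest wait on the semaphore
results = await asyncio.gather(*(limited_fetch(session, url) for url in thousand_urls))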

2. No Rate Limiting

# ❌ BAD: Hammering the server
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")

# ✅ GOOD: Respect rate limits
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")
    await asyncio.sleep(0.1)  # 100ms between requests = 10 req/sec max
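
The fixed sleep works when you fetch pages one after another. When tasks run concurrently (gather + semaphore), the limit has to be shared between them. Here is a minimal sketch of a shared limiter built only on asyncio, not a production-grade token bucket:

import asyncio
import time

class RateLimiter:
    """Cap total throughput at `rate` requests per second, shared by all tasks."""

    def __init__(self, rate):
        self.interval = 1.0 / rate
        self.lock = asyncio.Lock()
        self.next_slot = 0.0

    async def acquire(self):
        async with self.lock:
            now = time.monotonic()
            wait = max(0.0, self.next_slot - now)
            # Reserve the next available slot for this caller
            self.next_slot = max(now, self.next_slot) + self.interval
        if wait:
            await asyncio.sleep(wait)

limiter = RateLimiter(rate=10)  # at most ~10 requests/second across all tasks

async def polite_fetch(session, url):
    await limiter.acquire()
    async with session.get(url) as resp:
        return await resp.json()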

3. Not Handling HTTP Errors

# ❌ BAD: 404 crashes everything
data = await response.json()

# ✅ GOOD: Check status code
if response.status == 200:
    data = await response.json()
elif response.status == 429:
    retry_after = int(response.headers.get('Retry-After', 60))
    logger.warning(f"Rate limited. Waiting {retry_after}s")
    await asyncio.sleep(retry_after)
else:
    logger.error(f"HTTP {response.status}: {response.url}")
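
To fold the 429 handling into a retry loop like the one in AsyncScraper.fetch, something along these lines works. It is a sketch that reuses the logger from the scraper above and assumes Retry-After is given in seconds:

async def fetch_with_backoff(session, url, retries=3):
    """Retry on 429, honoring the server's Retry-After header when present."""
    for attempt in range(retries):
        async with session.get(url) as response:
            if response.status != 429:
                response.raise_for_status()
                return await response.json()
            # Rate limited: use the server's hint, fall back to exponential backoff
            retry_after = int(response.headers.get("Retry-After", 2 ** attempt))
        logger.warning(f"Rate limited on {url}, waiting {retry_after}s")
        await asyncio.sleep(retry_after)
    return None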

4. Forgetting session.close()

# ❌ BAD: Resource leak
session = aiohttp.ClientSession()
# ... do stuff ...
# Forgot to close!

# ✅ GOOD: Context manager
async with aiohttp.ClientSession() as session:
    # ... do stuff ...
    pass  # Auto-closed

When to Use What

Tool          Best For
requests      Simple scripts, < 50 pages
aiohttp       Bulk data collection, APIs
Playwright    JavaScript-rendered pages
Scrapy        Large-scale crawling projects
httpx         Modern alternative to requests (supports async)
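
If you like the requests API, httpx (last row of the table) gives you the same async pattern behind a requests-style interface. A minimal sketch against the same placeholder API:

import asyncio
import httpx

async def main():
    urls = [f"https://api.example.com/data?page={p}" for p in range(1, 101)]

    async with httpx.AsyncClient(timeout=30) as client:
        responses = await asyncio.gather(*(client.get(url) for url in urls))

    data = [r.json() for r in responses if r.status_code == 200]
    print(f"Fetched {len(data)} pages")

asyncio.run(main())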

Getting Started

pip install aiohttp

# Or if you also need HTML parsing:
pip install aiohttp beautifulsoup4 lxml
# Your first async scraper in 10 lines
import asyncio, aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        urls = [f"https://httpbin.org/get?id={i}" for i in range(100)]
        tasks = [session.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        print(f"Fetched {len(responses)} pages")

asyncio.run(main())

Follow @armorbreak for more Python and web scraping tutorials.
