Alex Chen

Scraping 1000 Pages in 10 Seconds: Python Async HTTP Guide

Synchronous requests are fine for 5 pages. For 1000, you need async. Here's how.

The Problem With Requests

import requests
import time

start = time.time()

for page in range(1, 101):
    response = requests.get(f"https://api.example.com/data?page={page}")
    data = response.json()
    # Process data...

print(f"Time: {time.time() - start:.1f}s")
# Time: 85.3s (100 sequential requests)

85 seconds for 100 pages. At that rate, 1000 pages would take about 14 minutes.

Enter Async

import asyncio
import aiohttp
import time

async def fetch_page(session, page):
    async with session.get(f"https://api.example.com/data?page={page}") as resp:
        return await resp.json()

async def main():
    start = time.time()

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, page) for page in range(1, 101)]
        results = await asyncio.gather(*tasks)

    print(f"Time: {time.time() - start:.1f}s")
    # Time: 2.1s (100 concurrent requests)
    # 40x faster!

asyncio.run(main())

100 pages: 85s → 2.1s. That's a 40x speedup.

The Complete Production Scraper

import asyncio
import aiohttp
import json
import logging
from datetime import datetime
from pathlib import Path

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

class AsyncScraper:
    def __init__(
        self,
        max_concurrent=20,
        timeout=30,
        retries=3,
        delay=0.1,
        headers=None,
    ):
        self.max_concurrent = max_concurrent
        self.timeout = aiohttp.ClientTimeout(total=timeout)
        self.retries = retries
        self.delay = delay
        self.headers = headers or {
            "User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0)",
            "Accept": "application/json",
        }
        self.results = []
        self.errors = []
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def fetch(self, session, url, params=None):
        """Fetch a single URL with retry logic."""
        for attempt in range(self.retries):
            try:
                async with self.semaphore:
                    async with session.get(
                        url,
                        params=params,
                        headers=self.headers,
                        timeout=self.timeout,
                    ) as response:
                        response.raise_for_status()

                        # Respect rate limits
                        await asyncio.sleep(self.delay)

                        return {
                            "url": str(response.url),
                            "status": response.status,
                            "data": await response.json(),
                        }

            except asyncio.TimeoutError:
                logger.warning(f"Timeout: {url} (attempt {attempt + 1}/{self.retries})")
            except aiohttp.ClientError as e:
                logger.warning(f"Error: {url}{e} (attempt {attempt + 1}/{self.retries})")
            except Exception as e:
                logger.error(f"Unexpected: {url}{e}")
                break  # Don't retry unexpected errors

            if attempt < self.retries - 1:
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

        self.errors.append({"url": url, "params": params})
        return None

    async def scrape_all(self, urls):
        """Scrape multiple URLs concurrently."""
        async with aiohttp.ClientSession(timeout=self.timeout) as session:
            tasks = [self.fetch(session, url) for url in urls]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        # Filter out errors and None results
        self.results = [r for r in results if r is not None and not isinstance(r, Exception)]

        logger.info(f"Scraped {len(self.results)}/{len(urls)} URLs")
        if self.errors:
            logger.warning(f"Failed: {len(self.errors)} URLs")

        return self.results

    def save_results(self, filepath="results.json"):
        """Save results to a JSON file."""
        Path(filepath).write_text(
            json.dumps({
                "timestamp": datetime.now().isoformat(),
                "total": len(self.results),
                "errors": len(self.errors),
                "results": self.results,
            }, indent=2, default=str)
        )
        logger.info(f"Saved {len(self.results)} results to {filepath}")


# === Usage Examples ===

async def example_pagination():
    """Scrape all pages of a paginated API."""
    scraper = AsyncScraper(max_concurrent=10, delay=0.2)

    urls = [f"https://api.example.com/items?page={p}" for p in range(1, 101)]
    results = await scraper.scrape_all(urls)
    scraper.save_results("items.json")

    return results

async def example_different_urls():
    """Scrape a list of different URLs."""
    scraper = AsyncScraper(max_concurrent=5, timeout=15)

    urls = [
        "https://api.github.com/repos/python/cpython",
        "https://api.github.com/repos/nodejs/node",
        "https://api.github.com/repos/rust-lang/rust",
        "https://api.github.com/repos/golang/go",
        "https://api.github.com/repos/microsoft/vscode",
    ]
    results = await scraper.scrape_all(urls)

    for r in results:
        print(f"{r['data']['full_name']}: ⭐ {r['data']['stargazers_count']}")


async def example_with_proxy():
    """Scrape through a proxy (for sites that block datacenter IPs)."""
    scraper = AsyncScraper(
        max_concurrent=5,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        },
    )

    # Placeholder proxy URL; substitute your own
    proxy = "http://user:pass@proxy.example.com:8080"

    async with aiohttp.ClientSession(timeout=scraper.timeout) as session:
        async with session.get(
            "https://httpbin.org/ip",
            headers=scraper.headers,
            proxy=proxy,  # aiohttp accepts a per-request proxy URL
        ) as resp:
            print(await resp.json())


# Scrape HTML pages (not just JSON APIs)
async def scrape_html(session, url):
    """Scrape HTML and extract data with regex or simple parsing."""
    async with session.get(url) as resp:
        text = await resp.text()
        # Use re, BeautifulSoup, or lxml to extract data
        # Simple example: extract all links
        import re
        links = re.findall(r'href="([^"]+)"', text)
        return {"url": url, "links": links}


if __name__ == "__main__":
    asyncio.run(example_pagination())

Performance Comparison

Approach                    100 URLs    1000 URLs    Complexity
requests (sequential)       85s         850s         Simple
requests + ThreadPool       12s         120s         Medium
aiohttp (async)             2.1s        21s          Medium
aiohttp + proxy rotation    8s          80s          Complex
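
For reference, the "requests + ThreadPool" row corresponds to a pattern like the one below: the standard library's ThreadPoolExecutor runs blocking requests calls in parallel threads. This is a minimal sketch against the same placeholder API; actual timings depend on your network and the target server.

import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Each worker thread blocks on its own request while the others keep going
    return requests.get(url, timeout=30).json()

urls = [f"https://api.example.com/data?page={p}" for p in range(1, 101)]

# 20 worker threads roughly matches the "requests + ThreadPool" row above
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(fetch, urls))

print(f"Fetched {len(results)} pages")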

Common Pitfalls

1. Too Many Concurrent Requests

# ❌ BAD: 1000 concurrent requests = server blocks you
tasks = [fetch(session, url) for url in thousand_urls]
await asyncio.gather(*tasks)

# ✅ GOOD: Limit concurrency with semaphore
semaphore = asyncio.Semaphore(20)

async def limited_fetch(session, url):
    async with semaphore:
        return await fetch(session, url)
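
You still launch everything with gather; the semaphore just caps how many requests are actually in flight at any moment:

# At most 20 requests run concurrently; the rest wait on the semaphore
results = await asyncio.gather(*(limited_fetch(session, url) for url in thousand_urls))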

2. No Rate Limiting

# ❌ BAD: Hammering the server
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")

# ✅ GOOD: Respect rate limits
for page in range(1000):
    await fetch(session, f"https://api.example.com/data?page={page}")
    await asyncio.sleep(0.1)  # 100ms between requests = 10 req/sec max
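
The fixed sleep works when you fetch pages one after another. When tasks run concurrently (gather + semaphore), the limit has to be shared between them. Here is a minimal sketch of a shared limiter built only on asyncio, not a production-grade token bucket:

import asyncio
import time

class RateLimiter:
    """Cap total throughput at `rate` requests per second, shared by all tasks."""

    def __init__(self, rate):
        self.interval = 1.0 / rate
        self.lock = asyncio.Lock()
        self.next_slot = 0.0

    async def acquire(self):
        async with self.lock:
            now = time.monotonic()
            wait = max(0.0, self.next_slot - now)
            # Reserve the next available slot for this caller
            self.next_slot = max(now, self.next_slot) + self.interval
        if wait:
            await asyncio.sleep(wait)

limiter = RateLimiter(rate=10)  # at most ~10 requests/second across all tasks

async def polite_fetch(session, url):
    await limiter.acquire()
    async with session.get(url) as resp:
        return await resp.json()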

3. Not Handling HTTP Errors

# ❌ BAD: 404 crashes everything
data = await response.json()

# ✅ GOOD: Check status code
if response.status == 200:
    data = await response.json()
elif response.status == 429:
    retry_after = int(response.headers.get('Retry-After', 60))
    logger.warning(f"Rate limited. Waiting {retry_after}s")
    await asyncio.sleep(retry_after)
else:
    logger.error(f"HTTP {response.status}: {response.url}")
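
To fold the 429 handling into a retry loop like the one in AsyncScraper.fetch, something along these lines works. It is a sketch that reuses the logger from the scraper above and assumes Retry-After is given in seconds:

async def fetch_with_backoff(session, url, retries=3):
    """Retry on 429, honoring the server's Retry-After header when present."""
    for attempt in range(retries):
        async with session.get(url) as response:
            if response.status != 429:
                response.raise_for_status()
                return await response.json()
            # Rate limited: use the server's hint, fall back to exponential backoff
            retry_after = int(response.headers.get("Retry-After", 2 ** attempt))
        logger.warning(f"Rate limited on {url}, waiting {retry_after}s")
        await asyncio.sleep(retry_after)
    return None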

4. Forgetting session.close()

# ❌ BAD: Resource leak
session = aiohttp.ClientSession()
# ... do stuff ...
# Forgot to close!

# ✅ GOOD: Context manager
async with aiohttp.ClientSession() as session:
    # ... do stuff ...
    pass  # Auto-closed

When to Use What

Tool          Best For
requests      Simple scripts, < 50 pages
aiohttp       Bulk data collection, APIs
Playwright    JavaScript-rendered pages
Scrapy        Large-scale crawling projects
httpx         Modern alternative to requests (supports async)
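
If you like the requests API, httpx (last row of the table) gives you the same async pattern behind a requests-style interface. A minimal sketch against the same placeholder API:

import asyncio
import httpx

async def main():
    urls = [f"https://api.example.com/data?page={p}" for p in range(1, 101)]

    async with httpx.AsyncClient(timeout=30) as client:
        responses = await asyncio.gather(*(client.get(url) for url in urls))

    data = [r.json() for r in responses if r.status_code == 200]
    print(f"Fetched {len(data)} pages")

asyncio.run(main())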

Getting Started

pip install aiohttp

# Or if you also need HTML parsing:
pip install aiohttp beautifulsoup4 lxml
# Your first async scraper in 10 lines
import asyncio, aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        urls = [f"https://httpbin.org/get?id={i}" for i in range(100)]
        tasks = [session.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        print(f"Fetched {len(responses)} pages")

asyncio.run(main())

Follow @armorbreak for more Python and web scraping tutorials.
