Scaling web scraping operations requires balancing efficiency with ethical considerations. These Python techniques have helped me manage large data volumes while staying respectful of target servers and keeping pipelines reliable. Each one addresses a specific scaling challenge I've encountered in production environments.
Asynchronous HTTP requests dramatically improve throughput by handling multiple connections simultaneously. Here's how I manage concurrent fetches while controlling load:
```python
import aiohttp
import asyncio
from selectolax.parser import HTMLParser
import random

USER_AGENTS = ["Mozilla/5.0...", "Opera/9.80..."]  # 50+ realistic agents

async def fetch_with_retry(session, url, max_attempts=3):
    for attempt in range(max_attempts):
        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            async with session.get(url, headers=headers, timeout=15) as response:
                if response.status == 200:
                    return await response.text()
            await asyncio.sleep(2 ** attempt)  # Exponential backoff on non-200 responses
        except (aiohttp.ClientError, asyncio.TimeoutError):
            await asyncio.sleep(1)
    return None

async def process_results(html):
    tree = HTMLParser(html)
    product_data = {
        "name": tree.css_first("h1").text(strip=True),
        "price": float(tree.css_first("[itemprop=price]").attributes["content"])
    }
    # Add data validation here
    return product_data

async def scaled_crawler(url_list, max_concurrent=15):
    connector = aiohttp.TCPConnector(limit=20, ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        semaphore = asyncio.Semaphore(max_concurrent)

        async def worker(url):
            async with semaphore:
                html = await fetch_with_retry(session, url)
                if html:
                    return await process_results(html)

        tasks = [worker(url) for url in url_list]
        return await asyncio.gather(*tasks, return_exceptions=True)
```
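Running the crawler is then a single `asyncio.run` call. Here is a minimal driver sketch; the URL list and the result filtering are placeholders for whatever source and storage your pipeline actually uses:

```python
# Hypothetical entry point: in practice the URLs come from a sitemap or database.
async def main():
    urls = [f"https://example.com/products/{i}" for i in range(1, 101)]
    results = await scaled_crawler(urls, max_concurrent=15)
    # gather(return_exceptions=True) mixes dicts, None, and exceptions, so filter before storing
    products = [r for r in results if isinstance(r, dict)]
    print(f"Scraped {len(products)} of {len(urls)} pages successfully")

if __name__ == "__main__":
    asyncio.run(main())
```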
JavaScript-heavy sites need browser automation. I prefer Playwright for its reliability across dynamic content:
```python
from playwright.async_api import async_playwright

async def scrape_js_site(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        )
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=45000)
            await page.wait_for_selector(".product-grid", state="visible", timeout=10000)
            results = await page.evaluate('''() => {
                const items = [];
                document.querySelectorAll(".product-card").forEach(card => {
                    items.push({
                        id: card.dataset.id,
                        name: card.querySelector(".title").innerText,
                        price: card.querySelector(".price").innerText
                    });
                });
                return items;
            }''')
            return results
        finally:
            await browser.close()
```
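Browser sessions are far heavier than plain HTTP connections, so batching JS pages calls for a much lower concurrency cap. A minimal sketch reusing the same semaphore pattern; the URL list here is hypothetical:

```python
import asyncio

async def scrape_js_batch(urls, max_browsers=3):
    # Keep this limit low: each scrape_js_site call launches a full Chromium instance
    semaphore = asyncio.Semaphore(max_browsers)

    async def bounded(url):
        async with semaphore:
            return await scrape_js_site(url)

    return await asyncio.gather(*(bounded(u) for u in urls), return_exceptions=True)

# results = asyncio.run(scrape_js_batch(["https://example.com/c/shoes", "https://example.com/c/bags"]))
```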
Proxy rotation is essential for large-scale jobs. This class manages IP cycling with automatic failure detection:
```python
import itertools
import aiohttp
from aiohttp_socks import ProxyConnector

class ProxyManager:
    def __init__(self, proxy_list):
        self.proxies = itertools.cycle(proxy_list)
        self.working_proxies = set()
        self.failed_proxies = set()

    def get_next_proxy(self):
        while True:
            proxy = next(self.proxies)
            if proxy not in self.failed_proxies:
                return proxy

    async def test_proxy(self, proxy, test_url="http://example.com"):
        try:
            connector = ProxyConnector.from_url(proxy)
            async with aiohttp.ClientSession(connector=connector) as session:
                async with session.get(test_url, timeout=10) as response:
                    if response.status == 200:
                        self.working_proxies.add(proxy)
                        return True
        except Exception:
            self.failed_proxies.add(proxy)
        return False

    async def get_valid_session(self):
        while True:
            proxy = self.get_next_proxy()
            if await self.test_proxy(proxy):
                return aiohttp.ClientSession(
                    connector=ProxyConnector.from_url(proxy)
                )
```
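Wiring the proxy manager into the earlier fetch logic is mostly a matter of swapping the session. A hedged sketch; the proxy URLs are placeholders, and real lists usually come from a provider API:

```python
async def crawl_through_proxies(urls):
    manager = ProxyManager([
        "socks5://user:pass@10.0.0.1:1080",   # placeholder proxy endpoints
        "http://user:pass@10.0.0.2:8080",
    ])
    session = await manager.get_valid_session()
    try:
        # fetch_with_retry is the helper from the first example
        return [await fetch_with_retry(session, url) for url in urls]
    finally:
        await session.close()
```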
Parsing efficiency becomes critical at scale. Selectolax, a thin binding over the Modest and Lexbor C engines, parses quickly and copes well with malformed HTML:
```python
from selectolax.parser import HTMLParser

def parse_complex_page(html):
    tree = HTMLParser(html)
    # Handle multiple content patterns
    title = tree.css_first("h1#title") or tree.css_first("title")
    description = tree.css_first('meta[property="og:description"]') or \
                  tree.css_first('meta[name="description"]')
    # Extract structured data
    data = {
        "title": title.text(strip=True) if title else None,
        "description": description.attributes["content"] if description else "",
        "sections": []
    }
    for section in tree.css("div.content-section"):
        heading = section.css_first("h2")
        data["sections"].append({
            "heading": heading.text(strip=True) if heading else "",
            "content": [p.text(strip=True) for p in section.css("p")]
        })
    # Data quality checks
    if not data["title"] or len(data["sections"]) == 0:
        raise ValueError("Incomplete page structure")
    return data
```
Distributed systems require task queues. I use Celery with Redis for large-scale deployments:
```python
from celery import Celery
from celery.schedules import crontab

app = Celery('scraper',
             broker='redis://localhost:6379/0',
             backend='redis://localhost:6379/1')

@app.task(autoretry_for=(TimeoutError,),
          retry_backoff=30,
          max_retries=5)
def process_page(url):
    # Extraction logic with error handling
    return {"url": url, "data": extracted_content}

@app.task(name='schedule_scraping')
def schedule_scraping():
    urls = get_crawl_list()  # From database or sitemap
    for url in urls:
        process_page.apply_async(args=[url], queue='scraping')

app.conf.beat_schedule = {
    'nightly-crawl': {
        'task': 'schedule_scraping',  # Matches the explicit task name above
        'schedule': crontab(hour=3, minute=0),
    },
}
```
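Running this needs at least one worker consuming the `scraping` queue (`celery -A <module> worker -Q scraping`) plus a beat process for the schedule. Since results land in the Redis backend, collecting them afterwards can be as simple as the sketch below; the blocking `get` calls are fine in a one-off script, but should never run inside another task:

```python
def dispatch_and_collect(urls, timeout=300):
    # Queue one task per URL and keep the AsyncResult handles
    handles = [process_page.apply_async(args=[url], queue='scraping') for url in urls]
    collected = []
    for handle in handles:
        try:
            collected.append(handle.get(timeout=timeout))  # Blocks until the worker finishes
        except Exception:
            continue  # Tasks that exhausted their retries or timed out are skipped here
    return collected
```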
Intelligent throttling adapts to server behavior. This implementation respects rate limits:
```python
import asyncio

class RateLimiter:
    def __init__(self, requests_per_minute=60):
        self.delay = 60.0 / requests_per_minute
        self.last_request = 0
        self.penalty = 0

    async def wait(self):
        elapsed = asyncio.get_event_loop().time() - self.last_request
        wait_time = max(0, self.delay + self.penalty - elapsed)
        await asyncio.sleep(wait_time)
        self.last_request = asyncio.get_event_loop().time()

    def update(self, response):
        if response.status == 429:
            self.penalty += 5.0  # Back off hard after "Too Many Requests"
        elif response.status == 200:
            self.penalty = max(0, self.penalty - 1.0)
```
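Plugging the limiter into a fetch call is straightforward; a minimal sketch, assuming one limiter per target host:

```python
limiter = RateLimiter(requests_per_minute=30)

async def polite_fetch(session, url):
    await limiter.wait()                     # Pace the request before sending it
    async with session.get(url) as response:
        limiter.update(response)             # Adjust the penalty from the status code
        if response.status == 200:
            return await response.text()
        return None
```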
Data normalization pipelines ensure consistency. I combine Pandas with custom validators:
```python
import pandas as pd
from datetime import datetime

def clean_scraped_data(raw_data):
    df = pd.DataFrame(raw_data)
    # Standardize formats
    df['price'] = pd.to_numeric(
        df['price'].str.replace('[$,]', '', regex=True), errors='coerce'
    )
    df['in_stock'] = df['availability'].apply(
        lambda x: "in stock" in x.lower() if isinstance(x, str) else False
    )
    # Handle missing values
    df['category'] = df['category'].fillna('uncategorized')
    df = df.dropna(subset=['product_id'])
    # Deduplication
    df = df.drop_duplicates(subset='product_id', keep='last')
    # Type conversion
    df['scrape_date'] = pd.to_datetime(datetime.now().isoformat())
    return df
```
Error resilience separates prototypes from production systems. This decorator adds robust retry logic:
```python
import asyncio
import functools

def retryable(max_retries=3, delay=1, backoff=2, exceptions=(Exception,)):
    def decorator(func):
        @functools.wraps(func)  # Preserve the wrapped function's name and docstring
        async def wrapper(*args, **kwargs):
            _delay = delay
            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)
                except exceptions:
                    if attempt == max_retries:
                        raise
                    await asyncio.sleep(_delay)
                    _delay *= backoff
        return wrapper
    return decorator

@retryable(exceptions=(TimeoutError, IOError))
async def fetch_data(url):
    # Wrapped fetch logic
    ...
```
These methods form a comprehensive approach to large-scale extraction. I start projects with synchronous prototypes, then layer in complexity as needed. The key is implementing just enough sophistication for each project's requirements while maintaining readability. Throughput often improves more from smart throttling than raw concurrency. Always verify your scraping activities comply with both legal requirements and robots.txt directives. Regular monitoring of success rates and data quality helps catch issues before they compromise entire datasets.
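For the robots.txt part, the standard library already covers the basics. A minimal sketch with `urllib.robotparser`; the URL and user agent string are placeholders:

```python
from urllib import robotparser

parser = robotparser.RobotFileParser()
parser.set_url("https://example.com/robots.txt")
parser.read()  # Fetches and parses the file

if parser.can_fetch("MyScraperBot/1.0", "https://example.com/products/42"):
    print("Allowed by robots.txt")
else:
    print("Disallowed - skip this URL")
```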