Rate limiting is one of the most critical aspects of web scraping. Without proper rate limiting, your scraper will get blocked, your IP banned, and you'll waste time debugging instead of collecting data.
Here's a comprehensive guide to implementing rate limiting in your Python scrapers.
Why Rate Limiting Matters
- Avoid IP bans: Websites detect and block aggressive scrapers
- Respect servers: Don't crash someone's website
- Better data: Rate-limited scrapers collect MORE data long-term
- Ethical scraping: Be a good citizen of the internet
Token Bucket Rate Limiter
A sophisticated approach that allows burst requests while maintaining an average rate:
import time
import threading
import random
class TokenBucket:
    """Thread-safe token-bucket rate limiter.

    Tokens accrue continuously at ``rate`` per second up to ``capacity``,
    allowing short bursts while enforcing the long-term average rate.
    """

    def __init__(self, rate, capacity):
        self.rate = rate          # tokens added per second
        self.capacity = capacity  # maximum burst size
        self.tokens = capacity    # start full so an initial burst is allowed
        self.last_refill = time.monotonic()
        self.lock = threading.Lock()

    def _refill(self):
        """Credit tokens earned since the last refill. Caller must hold the lock."""
        now = time.monotonic()
        elapsed = now - self.last_refill
        self.tokens = min(self.capacity, self.tokens + elapsed * self.rate)
        self.last_refill = now

    def acquire(self):
        """Block until a token is available, consume it, and return True.

        Bug fix vs. the naive version: we release the lock before sleeping,
        so waiting for a token does not serialize every other thread, and we
        re-check token availability after waking instead of assuming we won
        the token (another thread may have raced us while we slept).
        """
        while True:
            with self.lock:
                self._refill()
                if self.tokens >= 1:
                    self.tokens -= 1
                    return True
                # Time until one full token has accrued.
                wait_time = (1 - self.tokens) / self.rate
            # Sleep outside the lock, then loop to retry.
            time.sleep(wait_time)


limiter = TokenBucket(rate=2, capacity=5)
Exponential Backoff
When a request fails, back off exponentially:
import requests
def request_with_backoff(url, max_retries=5):
    """GET *url*, retrying transient failures with jittered exponential backoff.

    Retries on HTTP 429 (honouring the Retry-After header when present),
    on 5xx responses, and on timeouts / connection errors. Any other
    response (2xx/3xx/4xx) is returned to the caller as-is.

    Returns the Response, or None once all retries are exhausted.

    Bug fix vs. the naive version: no backoff sleep is performed after the
    FINAL attempt — previously the function wasted a full backoff delay
    before returning None anyway.
    """
    for attempt in range(max_retries):
        # Jittered exponential delay; jitter avoids thundering-herd retries.
        backoff = (2 ** attempt) + random.uniform(0, 1)
        last_attempt = attempt == max_retries - 1
        try:
            response = requests.get(url, timeout=30)
        except requests.exceptions.Timeout:
            if not last_attempt:
                time.sleep(backoff)
            continue
        except requests.exceptions.ConnectionError:
            if not last_attempt:
                time.sleep((2 ** attempt) * 2)  # connection errors back off harder
            continue
        if response.status_code == 429:
            if not last_attempt:
                # Prefer the server-specified delay when provided.
                retry_after = response.headers.get("Retry-After")
                wait_time = int(retry_after) if retry_after else backoff
                print(f"Rate limited. Waiting {wait_time:.1f}s")
                time.sleep(wait_time)
            continue
        if response.status_code >= 500:
            if not last_attempt:
                time.sleep(backoff)
            continue
        # Success or a non-retryable client status: hand it to the caller.
        return response
    return None
Respecting robots.txt
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse
class RobotsChecker:
    """Per-host robots.txt checker with a parser cache."""

    def __init__(self):
        # Maps robots.txt URL -> RobotFileParser, one entry per host.
        self.parsers = {}

    def can_fetch(self, url, user_agent="*"):
        """Return True if *user_agent* may fetch *url* per the host's robots.txt.

        Fails open: if robots.txt cannot be retrieved, everything on that
        host is allowed. Bug fix vs. the naive version: the failure is now
        cached too — previously a failed fetch was never stored, so an
        unreachable robots.txt was re-downloaded on every single call.
        """
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        if robots_url not in self.parsers:
            parser = RobotFileParser()
            parser.set_url(robots_url)
            try:
                parser.read()
            except Exception:
                # Could not fetch robots.txt: allow everything for this host.
                # (A never-read RobotFileParser would deny everything, so we
                # must mark it permissive before caching.)
                parser.allow_all = True
            self.parsers[robots_url] = parser
        return self.parsers[robots_url].can_fetch(user_agent, url)
Adaptive Rate Limiting
Adjust your rate based on server responses:
class AdaptiveRateLimiter:
    """Delay-based limiter that speeds up on success and slows down on errors.

    The current inter-request delay starts at ``initial_delay`` seconds and
    is always clamped to the [min_delay, max_delay] range.
    """

    def __init__(self, initial_delay=1.0, min_delay=0.5, max_delay=30.0):
        self.delay = initial_delay
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.consecutive_success = 0

    def record_success(self):
        """Note a successful request; after a streak of 10, shrink the delay 10%."""
        self.consecutive_success += 1
        if self.consecutive_success < 10:
            return
        self.consecutive_success = 0
        self.delay = max(self.min_delay, self.delay * 0.9)

    def record_error(self, status_code):
        """Note a failed request and grow the delay (3x for HTTP 429, else 1.5x)."""
        self.consecutive_success = 0
        factor = 3 if status_code == 429 else 1.5
        self.delay = min(self.max_delay, self.delay * factor)

    def wait(self):
        """Sleep for the current delay plus up to 10% random jitter."""
        time.sleep(self.delay + random.uniform(0, self.delay * 0.1))
Using Proxies for Distributed Rate Limiting
The most effective approach combines rate limiting with proxy rotation:
class ProxyRotator:
    """Round-robin rotation over a fixed list of proxy URLs."""

    def __init__(self, proxy_list):
        """proxy_list: non-empty list of proxy URLs, e.g. "http://host:port".

        Raises ValueError on an empty list — failing fast here is clearer
        than the ZeroDivisionError the naive version raised later inside
        get_next().
        """
        if not proxy_list:
            raise ValueError("proxy_list must not be empty")
        self.proxies = proxy_list
        self.index = 0

    def get_next(self):
        """Return a requests-style proxies dict for the next proxy in rotation."""
        proxy = self.proxies[self.index]
        # Wrap the cursor instead of letting it grow without bound.
        self.index = (self.index + 1) % len(self.proxies)
        return {"http": proxy, "https": proxy}
For production proxy infrastructure, ThorData provides residential proxies with built-in rotation — perfect for distributed rate limiting across thousands of IPs.
Complete Rate-Limited Scraper
class PoliteScraper:
    """Scraper combining token-bucket limiting, adaptive delays and robots.txt.

    Every fetch is gated by three layers: a robots.txt check, a token-bucket
    rate limiter, and an adaptive per-request delay.
    """

    def __init__(self, requests_per_second=2):
        self.limiter = TokenBucket(rate=requests_per_second, capacity=5)
        self.adaptive = AdaptiveRateLimiter()
        self.robots = RobotsChecker()
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "ResearchBot/1.0 (contact@example.com)"
        })

    def fetch(self, url):
        """Politely fetch *url*; returns a Response, or None when disallowed/failed."""
        if not self.robots.can_fetch(url):
            return None
        self.limiter.acquire()
        self.adaptive.wait()
        response = request_with_backoff(url)
        if response is None:
            return None
        # Feed the outcome back into the adaptive limiter.
        if response.status_code == 200:
            self.adaptive.record_success()
        else:
            self.adaptive.record_error(response.status_code)
        return response

    def scrape_all(self, urls):
        """Fetch every URL in *urls*; returns [{"url": ..., "content": ...}, ...]."""
        collected = []
        total = len(urls)
        for position, target in enumerate(urls):
            reply = self.fetch(target)
            if reply:
                collected.append({"url": target, "content": reply.text})
            if position % 50 == 0:
                print(f"Progress: {position}/{total}")
        return collected
Best Practices Summary
- Always implement rate limiting — never scrape without it
- Check robots.txt before scraping any domain
- Use exponential backoff for failed requests
- Rotate proxies with services like ThorData for high-volume scraping
- Adapt your rate based on server responses
- Add jitter to prevent thundering herd problems
- Set timeouts on all requests
Conclusion
Rate limiting isn't just about avoiding bans — it's about building scrapers that work reliably over time. A well-rate-limited scraper will collect more data in the long run than an aggressive one that gets blocked after 100 requests.
Happy (polite) scraping!
Top comments (0)